In [7]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
import torch
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless it is changed.
file_path = "/home/patrick/dev/LING582_Project/train.csv"
data = pd.read_csv(file_path)

# Single lemmatizer instance shared by every preprocess() call.
# NOTE(review): WordNetLemmatizer and word_tokenize need the NLTK data
# packages to be installed locally — presumably 'wordnet' and 'punkt'; confirm.
lemmatizer = WordNetLemmatizer()
# Hand-maintained set of lowercase stop words that preprocess() filters out.
# Negations such as 'not', 'no' and "don't" are part of the set, so they are
# removed from the text as well.
stop_words = {'am', 'she', "it's", 'herself', 'hasn', 'll', 'they', 'do', 'he', 
              'before', 'where', 'its', 'this', 'can', 'them', 'but', 'these', 'so', 'after', 
              'couldn', 'himself', 'has', 'once', 'had', 'were', 'by', 'just', 'if', 
              'of',  'needn', 'here', 'be', 'there',  'didn', 'more', 'on', 
              'your', 'again',  'will', 'yourselves', 'should', 
              'his', 'their', 'aren', "haven't", 'off', "you'll", 'as', 'we', 'few', 'been', 'doing', 'own', 
              'me', 'between', 'through', 'when', 'down', 'you', 'does', 'because', 'for', 'him', 'the',
              "don't", 'very', 'an', 'ours', 'at', 'hers', 'is', 'have', 'about', 'themselves', 
              'any', 'from', 'against', 'i', 'to', 'how', 'it', 'yours', 'theirs', 'not', 'my', 
              'with', 'in', 'up', 'a', 'what', "didn't", 'that',  'ourselves', 'whom', 'during', 'same', 
              'other', 'and', 'while', 'don', 'all', 'o', 'those', 'into', 'under', 'now', 'too', 'further', 
              'then', 'itself', 'having', 'who', 'isn', 'most', 'her', 
              'or', 'did', 'each', 'why', 'above', 'was', 'than', 'are', 'which', 't', 'yourself', 'myself', 
              'our', 'some', 'out', 'only', 'ma', 'no'}

def preprocess(text, marker='[SNIPPET]'):
    """Normalize a raw text into a space-joined string of lemmatized tokens.

    The text is lowercased, tokenized with ``word_tokenize``, stripped of every
    token that is not purely alphanumeric (punctuation, but also any token that
    contains a non-alphanumeric character), filtered against ``stop_words`` and
    lemmatized with the shared ``lemmatizer``.

    Args:
        text: The raw string to clean.
        marker: Literal separator between the spans of one row. When it occurs
            in ``text`` each side is cleaned on its own and the marker is put
            back verbatim, surrounded by single spaces. Pass ``None`` (or an
            empty string) to clean the text without any special handling.

    Returns:
        The cleaned text as a single string.
    """
    if marker and marker in text:
        # Lowercasing and punctuation removal would otherwise turn the marker
        # into the ordinary word "snippet", so a later split on the literal
        # marker could never find it. Clean the pieces separately instead.
        cleaned_parts = (preprocess(part, marker=None) for part in text.split(marker))
        return f' {marker} '.join(cleaned_parts)

    tokens = word_tokenize(text.lower())

    # Drop non-alphanumeric tokens and stop words in a single pass.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

def safe_preprocess(text):
    """Run ``preprocess`` on one cell value without ever raising.

    Args:
        text: The cell value; expected to be a string, but missing values
            (``None`` / NaN) and other non-string objects are tolerated.

    Returns:
        The preprocessed string. Non-string input yields ``""`` so that later
        string operations on the column do not fail. If ``preprocess`` raises
        on a string, the error is reported on stdout and the original text is
        returned unchanged.
    """
    if not isinstance(text, str):
        # Missing or malformed cell: nothing to clean, and handing the raw
        # object back would break later string handling of the column.
        return ""
    try:
        return preprocess(text)
    except Exception as e:
        print(f"Error with text: {text}")
        print(f"Exception: {e}")
        return text  # best effort: keep the unprocessed text

# Clean every row of the TEXT column, overwriting the raw text in place.
data['TEXT'] = data['TEXT'].map(safe_preprocess)





In [8]:
# Display the column labels of the loaded frame.
data.columns

Index(['ID', 'TEXT', 'LABEL'], dtype='object')

In [10]:
SNIPPET_MARKER = '[SNIPPET]'


def split_spans(text, marker=SNIPPET_MARKER):
    """Split one row's text into two spans around the first ``marker``.

    Args:
        text: The value of the TEXT column for one row.
        marker: Literal separator between the two spans.

    Returns:
        A ``(span_1, span_2)`` tuple. ``span_2`` is ``None`` when the marker
        does not occur in ``text``. Non-string (missing) values give
        ``(None, None)`` instead of raising.
    """
    if not isinstance(text, str):
        return None, None
    first, found, second = text.partition(marker)
    return first, (second if found else None)


# NOTE(review): the split only finds something if the literal marker is still
# present in data['TEXT'] at this point — verify that SPAN_2 is not None for
# every row.
span_pairs = [split_spans(text) for text in data['TEXT']]
data['SPAN_1'] = [first for first, _ in span_pairs]
data['SPAN_2'] = [second for _, second in span_pairs]

In [11]:
# Tokenization & Conversion to Input IDs and Attention Masks
# Name of the pretrained checkpoint whose vocabulary the tokenizer loads.
PRETRAINED_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)

def encode_text(text, max_length=512):
    """Tokenize one span into fixed-length model inputs.

    Args:
        text: The span to encode. ``None`` and any other non-string value
            (e.g. NaN from a missing cell) are encoded as the empty string.
        max_length: Length every encoding is padded or truncated to.

    Returns:
        A ``(input_ids, attention_mask)`` tuple of 1-D tensors, each of length
        ``max_length``.
    """
    if not isinstance(text, str):
        text = ""
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
    return encoded['input_ids'][0], encoded['attention_mask'][0]

# Encode both span columns; each yields one column of input ids and one of
# attention masks.
for span_col, ids_col, mask_col in (
    ('SPAN_1', 'input_ids_1', 'attention_mask_1'),
    ('SPAN_2', 'input_ids_2', 'attention_mask_2'),
):
    encoded_ids, encoded_masks = zip(*data[span_col].apply(encode_text))
    data[ids_col] = encoded_ids
    data[mask_col] = encoded_masks

# Dataset Splitting
# A fixed seed keeps the train/validation partition identical across runs, so
# results are reproducible after a kernel restart.
SPLIT_SEED = 42
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data[['input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2']],
    data['LABEL'],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Batching
BATCH_SIZE = 16


def _stack_split(features, labels):
    """Turn one split into ``(input_ids, attention_mask, labels)`` tensors.

    Args:
        features: Frame holding one tensor per row in the ``input_ids_1`` and
            ``attention_mask_1`` columns.
        labels: Series of labels aligned with ``features``.

    Returns:
        Three tensors; the first two have one row per example.
    """
    # torch.stack builds the (rows, length) matrix directly, so the encoding
    # length does not have to be hard-coded here.
    input_ids = torch.stack(features['input_ids_1'].tolist())
    attention_mask = torch.stack(features['attention_mask_1'].tolist())
    return input_ids, attention_mask, torch.tensor(labels.tolist())


# NOTE(review): only the span-1 encodings go into the datasets; the
# input_ids_2 / attention_mask_2 columns are never used here — confirm that
# this is intended.
input_ids_tensor, attention_mask_tensor, labels_tensor = _stack_split(train_texts, train_labels)

train_dataset = TensorDataset(input_ids_tensor, attention_mask_tensor, labels_tensor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

val_input_ids_tensor, val_attention_mask_tensor, val_labels_tensor = _stack_split(val_texts, val_labels)

val_dataset = TensorDataset(val_input_ids_tensor, val_attention_mask_tensor, val_labels_tensor)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)