In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
import torch
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


In [None]:
# Path to the training CSV; a bare filename, so it is resolved against the
# current working directory.
file_path = "train.csv"
# Load the whole file into the DataFrame that the following cells add columns to.
data = pd.read_csv(file_path)

# One WordNetLemmatizer instance at module level; preprocess() calls
# lemmatizer.lemmatize on each token.
lemmatizer = WordNetLemmatizer()
# Custom stop-word list used by preprocess(). The words are kept as plain
# whitespace-separated text and split into a set, so membership tests stay
# constant-time while the list itself is easy to scan and edit.
stop_words = set("""
    am she it's herself hasn ll they do he
    before where its this can them but these so after
    couldn himself has once had were by just if
    of needn here be there didn more on
    your again will yourselves should
    his their aren haven't off you'll as we few been doing own
    me between through when down you does because for him the
    don't very an ours at hers is have about themselves
    any from against i to how it yours theirs not my
    with in up a what didn't that ourselves whom during same
    other and while don all o those into under now too further
    then itself having who isn most her
    or did each why above was than are which t yourself myself
    our some out only ma no
""".split())

def _clean_segment(segment):
    """Lower-case, tokenise, filter and lemmatise one marker-free piece of text."""
    tokens = word_tokenize(segment.lower())
    # Keep purely alphanumeric tokens (this drops punctuation) that are not stop words.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)


def preprocess(text, marker='[SNIPPET]'):
    """Normalise raw text into a space-joined string of lemmatised content words.

    Every segment of ``text`` is lower-cased, tokenised with ``word_tokenize``,
    stripped of non-alphanumeric tokens and of ``stop_words``, and lemmatised
    with the module-level ``lemmatizer``.

    ``marker`` is carried through to the output verbatim. If it were cleaned
    like ordinary text it would be lower-cased and its brackets removed by the
    ``isalnum`` filter, leaving just the word ``snippet``; a later split on the
    literal ``[SNIPPET]`` would then never match and no second span would
    ever be produced.

    Args:
        text: Raw input string.
        marker: Literal, case-sensitive delimiter to preserve. Pass ``None``
            or ``''`` to clean the whole text as a single segment.

    Returns:
        The cleaned text. When ``marker`` occurs in ``text`` the cleaned
        segments are re-joined with ``' ' + marker + ' '``; text without the
        marker is returned exactly as a single cleaned segment.

    Raises:
        AttributeError: if ``text`` is not a string (e.g. ``None`` or NaN).
    """
    if not marker:
        return _clean_segment(text)

    # Clean each side of the marker on its own, then put the marker back untouched.
    return f' {marker} '.join(_clean_segment(part) for part in text.split(marker))

def safe_preprocess(text):
    """Best-effort wrapper around ``preprocess`` for use with ``Series.apply``.

    If preprocessing raises for any reason, the offending value and the
    exception are printed and the input is handed back unchanged, so one bad
    row does not abort the whole column.

    Args:
        text: Value to preprocess (normally a string).

    Returns:
        The preprocessed string, or ``text`` itself when preprocessing failed.
    """
    try:
        cleaned = preprocess(text)
    except Exception as exc:
        print(f"Error with text: {text}", f"Exception: {exc}", sep="\n")
        cleaned = text  # fall back to the raw value (an empty string is the alternative)
    return cleaned


In [None]:
# Clean every TEXT entry; this overwrites the raw column with its preprocessed form.
data['TEXT'] = data['TEXT'].map(safe_preprocess)

In [None]:
# Cut TEXT into at most two pieces around the first [SNIPPET] marker.
split_data = data['TEXT'].str.split(r'\[SNIPPET\]', n=1, expand=True)

# When no row contains the marker the split yields a single column,
# so create the second one explicitly (filled with None).
if 1 not in split_data.columns:
    split_data[1] = None

# Column 0 becomes SPAN_1, column 1 becomes SPAN_2.
data[['SPAN_1', 'SPAN_2']] = split_data


In [None]:
# Drop tokens that begin with a hash mark. These are presumably the '##'
# continuation pieces emitted by the sub-word tokenizer; filtering them out was
# judged easier than tracing where each one comes from.
def rm_hash(list_of_tokens):
    """Return the tokens that do not start with ``'#'``.

    Only a leading hash mark counts: a token such as ``'a#b'`` is kept.

    Args:
        list_of_tokens: Iterable of string tokens.

    Returns:
        A new list with every ``'#'``-prefixed token removed, order preserved.
    """
    # A plain prefix test replaces the per-call regex and avoids shadowing the
    # builtin ``hash`` with a local variable.
    return [token for token in list_of_tokens if not token.startswith('#')]


In [None]:
# Name of the pretrained checkpoint whose vocabulary the tokenizer is loaded from.
PRETRAINED_MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [None]:
def _tokenize_span(span):
    """Tokenise one span with ``tokenizer`` and drop '#'-prefixed pieces via ``rm_hash``.

    Missing spans (``None`` or NaN) give an empty list. Passing them through
    ``str()`` would instead tokenise the literal text "None" / "nan" as if it
    were real content.
    """
    if span is None or (isinstance(span, float) and span != span):  # NaN != NaN
        return []
    return rm_hash(tokenizer.tokenize(str(span)))


data['TOKENIZED_SPAN_1'] = data['SPAN_1'].apply(_tokenize_span)

In [None]:
# Peek at the token lists of the first ten rows.
data['TOKENIZED_SPAN_1'].head(10)

In [None]:
# First ten rows of the processed frame, all columns.
data.head(10)

In [None]:
# Show the untouched source file for comparison with the processed frame.
# Only the first 10 rows are read, matching the 10-row previews in the
# preceding cells, so the CSV is not parsed in full a second time.
pd.read_csv(file_path, nrows=10)

In [None]:
# 1. Tokenization & conversion to input IDs and attention masks
def encode_text(text, max_length=512):
    """Encode one text span into fixed-length model inputs.

    The span is tokenised with the module-level ``tokenizer``, wrapped in the
    special '[CLS]' / '[SEP]' tokens, then padded or truncated to exactly
    ``max_length`` positions.

    Args:
        text: The span to encode. ``None`` or NaN (a span that is absent after
            splitting) is encoded as the empty string instead of raising.
        max_length: Target sequence length, special tokens included.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of PyTorch tensors, each with
        a leading batch dimension of 1.
    """
    # Rows without a second span carry None/NaN; encode them as empty text.
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        text = ''

    encoded = tokenizer.encode_plus(
        text,                       # Text to encode
        add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
        max_length=max_length,      # Length every sequence is brought to
        padding='max_length',       # Pad short sequences up to max_length
        truncation=True,            # Cut long sequences; not implied by max_length once padding is set
        return_attention_mask=True, # Construct attention masks
        return_tensors='pt',        # Return pytorch tensors
    )

    return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Encode both spans of every row; each span gets its own ids / mask columns.
for span_col, ids_col, mask_col in (
    ('SPAN_1', 'input_ids_1', 'attention_mask_1'),
    ('SPAN_2', 'input_ids_2', 'attention_mask_2'),
):
    # encode_text returns an (input_ids, attention_mask) pair per row;
    # zip(*...) regroups those pairs into one tuple of ids and one of masks.
    ids, masks = zip(*data[span_col].apply(encode_text))
    data[ids_col] = ids
    data[mask_col] = masks

In [19]:
# Debug probe: unpacking the SPAN_1 strings into zip() transposes them, so each
# printed tuple holds the k-th character of every span, stopping at the
# shortest span.
# NOTE(review): this looks like leftover debugging and prints one element per
# row for every character position; consider removing it before sharing.
x = zip(*data['SPAN_1'])
print([*x])

[('i', 'r', 'g', 'w', 'r', 'i', 'm', 't', 'h', 's', 's', 'q', 'f', 's', 'b', 'k', 'h', 'b', 'r', 'c', 'k', 's', 's', 'l', 's', 's', 't', 'd', 'l', 'r', 'j', 'p', 'n', 'g', 'k', 'g', 'q', 's', 'c', 's', 'b', 'd', 's', 'a', 't', 'w', 'r', 'b', 'p', 'y', 'f', 'r', 'm', 'u', 'r', 'a', 't', 'a', 't', 's', 's', 'u', 'w', 'l', 'c', 'c', 'm', 'm', 'g', 'v', 'f', 'c', 'i', 'l', 'w', 'm', 's', 'r', 'p', 'c', 'p', 't', 'm', 'a', 's', 's', 'w', 'w', 'c', 's', 'm', 'd', 'f', 's', 'f', 'c', 'h', 'o', 's', 'w', 'u', 'r', 'l', 'e', 'w', 'g', 's', 'w', 'g', 'c', 'h', 'w', 'm', 'd', 'k', 's', 'u', 'n', 'f', 'p', 'w', 'g', 'b', 'h', 'a', 'd', 'e', 'g', 'n', 'd', 'e', 'm', 'd', 'j', 's', 'a', 'c', 'o', 's', 'd', 'm', 'w', 's', 't', 't', 'h', 'r', 'w', 's', 'p', 'a', 's', 'o', 's', 'b', 'o', 'o', 'i', 'p', 's', 't', 'c', 's', 'p', 'm', 's', 's', 'm', 'c', 'e', 's', 'f', 'r', 'c', 'h', 'a', 'm', 'm', 'j', 'u', 'm', 'a', 'a', 'e', 't', 's', 'o', 'c', 'd', 't', 'g', 'c', 'g', 'g', 's', 'b', 'p', 'h', 'g', 'n'