In [60]:
import pandas as pd
from transformers import BertTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from transformers import DistilBertTokenizer
import torch
import re


In [61]:
file_path = "train.csv"
data = pd.read_csv(file_path)

lemmatizer = WordNetLemmatizer()
stop_words = {'am', 'she', "it's", 'herself', 'hasn', 'll', 'they', 'do', 'he', 
              'before', 'where', 'its', 'this', 'can', 'them', 'but', 'these', 'so', 'after', 
              'couldn', 'himself', 'has', 'once', 'had', 'were', 'by', 'just', 'if', 
              'of',  'needn', 'here', 'be', 'there',  'didn', 'more', 'on', 
              'your', 'again',  'will', 'yourselves', 'should', 
              'his', 'their', 'aren', "haven't", 'off', "you'll", 'as', 'we', 'few', 'been', 'doing', 'own', 
              'me', 'between', 'through', 'when', 'down', 'you', 'does', 'because', 'for', 'him', 'the',
              "don't", 'very', 'an', 'ours', 'at', 'hers', 'is', 'have', 'about', 'themselves', 
              'any', 'from', 'against', 'i', 'to', 'how', 'it', 'yours', 'theirs', 'not', 'my', 
              'with', 'in', 'up', 'a', 'what', "didn't", 'that',  'ourselves', 'whom', 'during', 'same', 
              'other', 'and', 'while', 'don', 'all', 'o', 'those', 'into', 'under', 'now', 'too', 'further', 
              'then', 'itself', 'having', 'who', 'isn', 'most', 'her', 
              'or', 'did', 'each', 'why', 'above', 'was', 'than', 'are', 'which', 't', 'yourself', 'myself', 
              'our', 'some', 'out', 'only', 'ma', 'no'}

def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    
    # Tokenization
    words = word_tokenize(text)

    # Remove punctuation
    words = [word for word in words if word.isalnum()]

    # Remove Stopwords
    words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(lemmatized_words)

def safe_preprocess(text):
    try:
        return preprocess(text)
    except Exception as e:
        print(f"Error with text: {text}")
        print(f"Exception: {e}")
        return text  # or return an empty string


In [62]:
data['TEXT'] = data['TEXT'].apply(safe_preprocess)

In [63]:
# Split the TEXT column
split_data = data['TEXT'].str.split(r'\[SNIPPET\]', n=1, expand=True)

# If the split results in only one column, fill the second column with NaN
split_data[1] = split_data.get(1, None)

# Assign the split data to the original dataframe
data[['SPAN_1', 'SPAN_2']] = split_data


In [64]:
# Remove annoying strings with hashmarks in them (this is easier than figuring out where they come from)
def rm_hash(list_of_tokens):

    hash = re.compile(r"(#+)")
    
    return [token for token in list_of_tokens if not re.match(hash, token)]


In [66]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

['idea', 'space', 'traveler', 'visiting', 'earth', 'learn', 'new', 'dance', 'fantastic', 'idea', 'being', 'wayne', 'turned', 'face', 'door', 'shouted', 'hey', 'dance', 'want', 'u', 'teach', 'dance', 'called', 'high', 'dragon', 'bump', 'muffled', 'metallic', 'voice', 'side', 'said', 'nod', 'dan', 'bump', 'hug', 'qui', 'wayne', 'shrugged', 'grinned', 'weakly', 's', 'said', 'earth', 'blasted', 'away', 'destroyed', 'ci', 'ship', 'therefore', 'well', 'therefore', 'first', 'place', 'actually', 'said', 'rid', 'earth']


In [69]:
data['TOKENIZED_SPAN_1'] = data['SPAN_1'].apply(lambda x: rm_hash(tokenizer.tokenize(str(x))))

In [73]:
data['TOKENIZED_SPAN_1'][0:10]

0    [idea, space, traveler, visiting, earth, learn...
1    [raised, dignity, god, accidental, death, caus...
2    [got, one, right, thought, drop, over, sort, b...
3    [want, sign, paper, joshua, lee, gorman, held,...
4    [reason, love, instinct, different, men, earth...
5    [inc, word, all, specifically, charles, course...
6    [must, something, left, at, lance, shot, u, lo...
7    [tell, run, thing, get, want, reached, slowly,...
8    [happy, nora, merry, always, kind, home, nothi...
9    [spoke, softly, came, room, would, impossible,...
Name: TOKENIZED_SPAN_1, dtype: object

In [75]:
data[0:10]

Unnamed: 0,ID,TEXT,LABEL,SPAN_1,SPAN_2,TOKENIZED_SPAN_1
0,0,idea space traveler visiting earth learn new d...,0,idea space traveler visiting earth learn new d...,,"[idea, space, traveler, visiting, earth, learn..."
1,1,raised dignity godhead accidental death causin...,0,raised dignity godhead accidental death causin...,,"[raised, dignity, god, accidental, death, caus..."
2,2,got one right thought drop over sort break ice...,1,got one right thought drop over sort break ice...,,"[got, one, right, thought, drop, over, sort, b..."
3,3,want sign paper joshua lee gorman held pen pus...,1,want sign paper joshua lee gorman held pen pus...,,"[want, sign, paper, joshua, lee, gorman, held,..."
4,4,reason love instinct different men earth born ...,1,reason love instinct different men earth born ...,,"[reason, love, instinct, different, men, earth..."
5,5,incoherent word alluded specifically charles c...,1,incoherent word alluded specifically charles c...,,"[inc, word, all, specifically, charles, course..."
6,6,must something left atlatl lance shot u looked...,0,must something left atlatl lance shot u looked...,,"[must, something, left, at, lance, shot, u, lo..."
7,7,tell run thing get want reached slowly forward...,0,tell run thing get want reached slowly forward...,,"[tell, run, thing, get, want, reached, slowly,..."
8,8,happy nora merry always kind home nothing play...,0,happy nora merry always kind home nothing play...,,"[happy, nora, merry, always, kind, home, nothi..."
9,9,spoke softly came room would impossible buy pr...,1,spoke softly came room would impossible buy pr...,,"[spoke, softly, came, room, would, impossible,..."


In [76]:
pd.read_csv(file_path)

Unnamed: 0,ID,TEXT,LABEL
0,0,The idea of space travelers visiting earth to ...,0
1,1,"""He whom we raised to the dignity of godhead o...",0
2,2,"""I've got one right here. Thought I'd drop ove...",1
3,3,"""I want you to sign these papers, Joshua."" Lee...",1
4,4,"""Is that reason why we should not love?"" ""No. ...",1
...,...,...,...
1596,1596,"Held prisoner with Joan, top of Robbins Buildi...",0
1597,1597,What Ramsey had done was as clear to him now a...,0
1598,1598,"“Yes, indeed, what has happened?” exclaimed Eu...",0
1599,1599,"Are you all right, darling? Did I forget anyth...",1
