# Pre-processing

## Imports & Data Loading

In [2]:
from nltk.corpus import stopwords
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [3]:
df = pd.read_pickle("data/data_original.pkl")

# undersample the data in 10%, distributing the classes proportionally in a stratified way and with a fixed seed
df = df.groupby("emotions", group_keys=False).apply(lambda x: x.sample(frac=0.1, random_state=42))

print(df.shape)

# print proportion of each class
print(df["emotions"].value_counts(normalize=True))


(41681, 2)
emotions
joy         0.338452
sadness     0.290756
anger       0.137521
fear        0.114465
love        0.082891
surprise    0.035916
Name: proportion, dtype: float64


## Cleaning

In [3]:
# Do some basic cleaning using regex
df['text'] = df['text'].str.lower()

# remove href link
df['text'] = df['text'].apply(lambda x: re.sub(r'href', '', x)) 
df['text'] = df['text'].apply(lambda x: re.sub(r'http', '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'www', '', x))

# remove punctuation, numbers, and extra spaces
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'\d+', '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'\n', ' ', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x))
df['text'] = df['text'].str.strip()


### Dealing with stopwords and negations

Because the dataset is very badly written, we need to consider words written with apostrophes, spaces or just connected to other words.

In [4]:
stop_words = set(stopwords.words('english'))

# if there is a stopword with an apostrophe, add the word without the apostrophe and a space instead of the apostrophe, and the word without the apostrophe
more_stop_words = set()
for word in stop_words:
    if "'" in word:
        more_stop_words.add(word.replace("'", " "))
        more_stop_words.add(word.replace("'", ""))
stop_words = stop_words.union(more_stop_words)

print(stop_words)

{'be', 'any', 'your', "haven't", "isn't", 'it', "aren't", 'and', 'youd', 'or', 'under', 'll', 'youll', 'should', 'mustn t', "don't", 'some', 'what', 'if', 'each', 'wouldn', 'neednt', 'i', 'youre', 'of', 'do', 'don', 'through', 'weren t', 'those', 'wouldnt', 'dont', "mightn't", 'don t', 'where', 'at', 'thatll', 'just', 'shes', 'its', "hasn't", 'wasn', 'couldn', 'having', 'he', 'ma', 'you re', 'same', 'didnt', 'than', 'our', 'for', 'about', 'havent', 'haven t', "shan't", 'wouldn t', 'with', 'themselves', 'we', 'shant', 'doesnt', 'between', 'such', "won't", 'here', 'am', "didn't", 'yours', 'as', 'over', 'weren', 'shan t', 'these', 'himself', 'it s', 'd', 'his', 'mustn', 'isn t', 'myself', 'other', 'me', 'on', 'did', "should've", 'o', 'm', 'this', 'shouldve', 'until', 'mightnt', 'during', 'below', 're', "doesn't", "needn't", 'won', 'are', 'him', 'before', 'both', 'herself', 'their', 'was', 'then', "weren't", 'hasnt', 'ours', 'shan', "you're", 'because', 's', 'hasn t', 'y', 'itself', 'that 

In the case of negation words, we need to consider the words that are connected to them, so we can preserve the meaning of the sentence by preappending the negation word with a "not_" prefix. 

In [5]:
# remove negative words from stop_words list

neg_stop_words = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "can't", "cant"}

more_neg_words = set()
# for each negative word with an apostrophe, we will add the word without the apostrophe and a space instead of the apostrophe
for word in neg_stop_words:
    if "'" in word:
        more_neg_words.add(word.replace("'", " "))
        more_neg_words.add(word.replace("'", ""))

neg_stop_words = neg_stop_words.union(more_neg_words)

print(neg_stop_words)

# remove from stop_words the negative words
stop_words = stop_words - neg_stop_words


{'isnt', "haven't", "isn't", "aren't", 'doesn t', "shouldn't", 'mustn t', 'mustnt', 'couldn t', 'wasn t', 'wouldn', 'neednt', 'wouldnt', 'weren t', 'didn', "mightn't", "hadn't", 'haven', 'wasn', "hasn't", 'couldn', 'shouldnt', 'ma', 'couldnt', 'isn', 'mightn', 'hasn', 'didnt', 'wont', 'aren', 'havent', 'haven t', "shan't", 'won t', 'wouldn t', 'ain', 'wasnt', 'shant', 'shouldn', 'doesn', 'doesnt', "couldn't", "won't", "wasn't", 'cant', "didn't", 'aren t', 'weren', 'needn t', 'arent', 'shan t', 'didn t', 'isn t', 'no', 'mustn', 'werent', 'needn', 'mightn t', 'mightnt', 'nor', "doesn't", "needn't", 'won', "wouldn't", "weren't", 'hasnt', 'shan', 'hasn t', 'hadn', "can't", 'shouldn t', 'hadnt', 'hadn t', 'can t', 'not', "mustn't"}


In [6]:
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# if a word is in neg_stop_words, we will add a prefix "not_" to the word that follows it and remove the negative word
def add_not_prefix(text):
    words = text.split()
    new_words = []
    i = 0
    while i < len(words):
        if words[i] in neg_stop_words:
            try:
                new_words.append('not_' + words[i+1])
            except:
                new_words.append('not') # if the negative word is the last word, just add "not"
            i += 2
        else:
            new_words.append(words[i])
            i += 1
    return ' '.join(new_words)

df['text'] = df['text'].apply(add_not_prefix)

### Dealing with small words

Focusing now on small words, they are not very informative and are in many instances just noise. We can remove them from the dataset.

In [7]:
# Display percentage of words in the dataset that are less than 3 characters 

words = df['text'].str.split(expand=True).stack()

# Get set of unique small words
small_words = words[words.str.len() < 2]

print(f"The percentage of small words is {round(small_words[small_words.str.len() < 2].count() / words.count() * 100, 2)}%") 

# show the 30 most common small words
print(f"The 30 most common small words are:\n{small_words.value_counts().head(30)}") 

# remove small words from the dataset and keep the ones in keep_small_words
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >= 2]))

The percentage of small words is 0.13%
The 30 most common small words are:
u    72
n    56
b    54
c    48
e    45
w    42
p    31
x    25
f    21
k    19
r    19
g    18
j    14
l    14
h    12
v     9
q     8
z     3
Name: count, dtype: int64


### Dealing with bad written words

A thing that we noticed in the dataset is that there are a lot of words that are not written correctly. We can use a spell checker to correct them. But first, one thing that we can do is fix words that are written with more than one letter repeated, like "soooooo" or "amaaaaaazinggggg".

In [8]:
# remove all consecutive same letters
def replace_repetitive(text):
    pattern = r'(\w)\1{2,}'
    replaced_text = re.sub(pattern, r'\1\1', text)
    return replaced_text

df['text'] = df['text'].apply(replace_repetitive)

In [11]:
# Stemming and lemmatization

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def lemmatize(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def stem(text):
    return " ".join([stemmer.stem(word) for word in text.split()])


df["text"] = df["text"].apply(stem)
# df["text"] = df["text"].apply(lemmatize)


In [13]:
# Show the first 10 rows
print(df["text"].head(10))

19132                           feel irrit kinda hate feel
51533    id rather home feel violent lone im not_tri so...
44351                suggest wait discuss feel less resent
51299                                wrong feel royal piss
55778    im tierd talk like there hope hell care unders...
17018    feel frustrat honest like not_get money worth ...
45995    tri make chang feel urg happen particularli di...
45862                    truli good feel fight ever bother
48826                                          feel jealou
14602                       feel angri think like elsewher
Name: text, dtype: object


In [14]:
# save df to pickle
df.to_pickle("data/data_processed.pkl") # NOTE: ONLY SAVING 10% OF THE DATA

## Spacy

In [15]:
# apply spacy and save to pickle
import spacy
nlp = spacy.load("en_core_web_sm")

# Does the following:
# 1. Remove non alpha characters
# 2. Convert to lower-case
# 3. Tokenize the sentence
# 4. POS Tagging
# 5. Dependency Parsing
# 6. Named Entity Recognition
# 7. Lemmatization
# 8. Remove stop words
# 9. Sentence Boundary Detection
def spacy_pre_process(sentence):
    global nlp
    return nlp(sentence)

df['text'] = df['text'].apply(spacy_pre_process) # TAKES A LOT OF TIME

df.to_pickle("data/data_spacy_processed.pkl") # NOTE: ONLY SAVING 10% OF THE DATA

