# Pre-processing

## Imports & Data Loading

In [1]:
from nltk.corpus import stopwords
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [2]:
df = pd.read_pickle("data/data_original.pkl")

# undersample the data in 10%, distributing the classes proportionally in a stratified way and with a fixed seed
# df = df.groupby("emotions", group_keys=False).apply(lambda x: x.sample(frac=0.1, random_state=42))

print(df.shape)

# print proportion of each class
print(df["emotions"].value_counts(normalize=True))


(416809, 2)
emotions
joy         0.338445
sadness     0.290749
anger       0.137514
fear        0.114470
love        0.082901
surprise    0.035921
Name: proportion, dtype: float64


## Cleaning

In [3]:
# Do some basic cleaning using regex
df['text'] = df['text'].str.lower()

# remove href link
df['text'] = df['text'].apply(lambda x: re.sub(r'href', '', x)) 
df['text'] = df['text'].apply(lambda x: re.sub(r'http', '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'www', '', x))

# remove punctuation, numbers, and extra spaces
df['text'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'\d+', '', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'\n', ' ', x))
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x))
df['text'] = df['text'].str.strip()


### Dealing with stopwords and negations

Because the dataset is very badly written, we need to consider words written with apostrophes, spaces or just connected to other words.

In [4]:
stop_words = set(stopwords.words('english'))

# if there is a stopword with an apostrophe, add the word without the apostrophe and a space instead of the apostrophe, and the word without the apostrophe
more_stop_words = set()
for word in stop_words:
    if "'" in word:
        more_stop_words.add(word.replace("'", " "))
        more_stop_words.add(word.replace("'", ""))
stop_words = stop_words.union(more_stop_words)

print(stop_words)

{'once', 'aren t', 'has', "haven't", 'again', 'which', 'yours', 'does', "couldn't", 'their', 'any', 'only', 'shan t', 'werent', 'they', 'shant', 'whom', 'me', 'a', 'down', 'didnt', 'too', "you're", 'don', 'wouldnt', 'he', 'd', 'against', 'isnt', 'to', 'it s', 'this', 'youve', 'shouldnt', 'won', 'didn t', 'above', "wouldn't", 'ourselves', "mustn't", 'mightn', 'she', 'my', 'into', "it's", 'who', "you'd", "needn't", 'same', 'mightn t', 'further', 'youre', 'haven t', 'being', 'youll', 'shan', 'doing', 'some', 've', "shouldn't", 'should ve', 'we', 'as', 'couldn', "mightn't", 'under', 'o', 'mightnt', 'on', 'doesn', 'youd', 'yourself', 'before', "wasn't", 'dont', 'why', 'them', 'didn', 'when', 'what', 'there', 'doesnt', 'wouldn', 'won t', 'have', 'hadn t', 'out', 'through', 'yourselves', 'below', 'just', 'mustn', 'weren t', 'are', 'during', 'than', 'had', 'not', 'then', "doesn't", "didn't", 'or', 'you', 'theirs', "won't", 'where', 'such', 'weren', 'that ll', 'hasn', 'for', 'that', 'wouldn t',

In the case of negation words, we need to consider the words that are connected to them, so we can preserve the meaning of the sentence by preappending the negation word with a "not_" prefix. 

In [5]:
# remove negative words from stop_words list

neg_stop_words = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "can't", "cant"}

more_neg_words = set()
# for each negative word with an apostrophe, we will add the word without the apostrophe and a space instead of the apostrophe
for word in neg_stop_words:
    if "'" in word:
        more_neg_words.add(word.replace("'", " "))
        more_neg_words.add(word.replace("'", ""))

neg_stop_words = neg_stop_words.union(more_neg_words)

print(neg_stop_words)

# remove from stop_words the negative words
stop_words = stop_words - neg_stop_words


{'aren t', "hadn't", "haven't", 'hadnt', "couldn't", 'needn t', 'shan t', 'werent', 'shant', 'wasnt', 'hadn', 'wasn t', 'didnt', 'needn', 'neednt', 'wouldnt', 'ain', 'isnt', 'shouldnt', 'wasn', 'arent', 'won', 'shouldn t', 'didn t', "wouldn't", 'shouldn', 'hasnt', 'couldnt', "mustn't", 'mightn', "shan't", "isn't", 'no', 'wont', "needn't", 'mightn t', 'couldn t', 'haven t', "weren't", 'shan', "shouldn't", 'doesn t', 'nor', 'cant', 'couldn', "mightn't", 'mightnt', 'mustn t', 'doesn', "aren't", 'ma', "wasn't", 'havent', 'didn', "hasn't", 'hasn t', 'hadn t', 'doesnt', 'wouldn', 'won t', 'isn', 'weren t', 'mustn', "can't", 'mustnt', 'isn t', 'not', "doesn't", "didn't", 'haven', 'can t', 'aren', 'weren', "won't", 'hasn', 'wouldn t'}


In [6]:
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# if a word is in neg_stop_words, we will add a prefix "not_" to the word that follows it and remove the negative word
def add_not_prefix(text):
    words = text.split()
    new_words = []
    i = 0
    while i < len(words):
        if words[i] in neg_stop_words:
            try:
                new_words.append('not_' + words[i+1])
            except:
                new_words.append('not') # if the negative word is the last word, just add "not"
            i += 2
        else:
            new_words.append(words[i])
            i += 1
    return ' '.join(new_words)

df['text'] = df['text'].apply(add_not_prefix)

### Dealing with small words

Focusing now on small words, they are not very informative and are in many instances just noise. We can remove them from the dataset.

In [7]:
# Display percentage of words in the dataset that are less than 3 characters 

words = df['text'].str.split(expand=True).stack()

# Get set of unique small words
small_words = words[words.str.len() < 2]

print(f"The percentage of small words is {round(small_words[small_words.str.len() < 2].count() / words.count() * 100, 2)}%") 

# show the 30 most common small words
print(f"The 30 most common small words are:\n{small_words.value_counts().head(30)}") 

# remove small words from the dataset and keep the ones in keep_small_words
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >= 2]))

The percentage of small words is 0.13%
The 30 most common small words are:
u    798
b    608
n    520
c    432
e    404
p    384
x    339
w    281
k    246
r    212
f    203
j    165
g    153
l    138
h    126
v     85
z     55
q     38
Name: count, dtype: int64


### Dealing with bad written words

A thing that we noticed in the dataset is that there are a lot of words that are not written correctly. We can use a spell checker to correct them. But first, one thing that we can do is fix words that are written with more than one letter repeated, like "soooooo" or "amaaaaaazinggggg".

In [8]:
# remove all consecutive same letters
def replace_repetitive(text):
    pattern = r'(\w)\1{2,}'
    replaced_text = re.sub(pattern, r'\1\1', text)
    return replaced_text

df['text'] = df['text'].apply(replace_repetitive)

In [9]:
# Stemming and lemmatization

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def lemmatize(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def stem(text):
    return " ".join([stemmer.stem(word) for word in text.split()])


df["text"] = df["text"].apply(stem)
# df["text"] = df["text"].apply(lemmatize)


In [10]:
# Show the first 10 rows
print(df["text"].head(10))

27383              feel aw job get posit succeed not_happen
110083                                      im alon feel aw
140764    ive probabl mention realli feel proud actual k...
100071                              feel littl low day back
2837          beleiv much sensit peopl feel tend compassion
18231     find frustrat christian feel constantli talk l...
10714             one peopl feel like go gym worthwhil hour
35177                      feel especi pleas long time come
122177    struggl aw feel say sweet thing not_deserv sis...
26723                              feel enrag helpless time
Name: text, dtype: object


In [12]:
df.describe()

Unnamed: 0,text,emotions
count,416809,416809
unique,379880,6
top,feel accept,joy
freq,65,141067


In [14]:
# save df to pickle
df.to_pickle("data/data_processed.pkl") # NOTE: ONLY SAVING 10% OF THE DATA

## Spacy

In [15]:
# apply spacy and save to pickle
import spacy
nlp = spacy.load("en_core_web_sm")

# Does the following:
# 1. Remove non alpha characters
# 2. Convert to lower-case
# 3. Tokenize the sentence
# 4. POS Tagging
# 5. Dependency Parsing
# 6. Named Entity Recognition
# 7. Lemmatization
# 8. Remove stop words
# 9. Sentence Boundary Detection
def spacy_pre_process(sentence):
    global nlp
    return nlp(sentence)

df['text'] = df['text'].apply(spacy_pre_process) # TAKES A LOT OF TIME

df.to_pickle("data/data_spacy_processed.pkl") # NOTE: ONLY SAVING 10% OF THE DATA

