In [50]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [51]:
# NOTE(review): unpickling can execute arbitrary code; only load "data.pkl"
# if it comes from a trusted source.
df = pd.read_pickle("data.pkl")

# undersample the data: keep a random 10% of the rows.
# A fixed random_state makes the draw identical on every Restart & Run All,
# so all downstream cells operate on the same rows.
df = df.sample(frac=0.1, random_state=42)

In [52]:
# Do some basic cleaning

# Lower-casing throws away emphasis expressed through capital letters.
# TODO: check whether capitalised words are frequently used to express emotions
# before discarding that signal.
df['text'] = df['text'].str.lower()

# regex=True is passed explicitly on every call: since pandas 2.0,
# Series.str.replace treats the pattern as a literal string by default, which
# would silently turn each of these substitutions into a no-op.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)  # drop everything that is not a word char or whitespace
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)      # drop runs of digits
df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)      # newlines -> spaces
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True)     # collapse whitespace runs to a single space
df['text'] = df['text'].str.strip()


In [53]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


# Negation words are taken out of the stop-word list so that they survive the
# filtering below and can later be folded into the following token.

neg_stop_words = {'no', 'nor', 'not', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"}

# The cleaning cell strips punctuation from the text before this point, so a
# contraction such as "isn't" arrives here as "isnt" and would never match the
# apostrophe spellings above. Add the apostrophe-free variants so contracted
# negations are still recognised.
# NOTE(review): "don"/"don't" are not in this set — confirm whether that
# omission is intentional.
neg_stop_words |= {word.replace("'", "") for word in neg_stop_words}

stop_words = stop_words - neg_stop_words

# Drop every remaining stop word; tokens are split on whitespace and re-joined
# with single spaces.
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# if a word is in neg_stop_words, we will add a prefix "not_" to the word that follows it and remove the negative word

def add_not_prefix(text, negations=None):
    """Fold each negation word into the token that follows it.

    The text is split on whitespace. Whenever a token is a negation word, it is
    dropped and the next token is emitted with a ``not_`` prefix (both tokens
    are consumed). A negation that is the last token has nothing to attach to
    and is emitted as the plain word ``not``.

    Args:
        text: Whitespace-separated string to process.
        negations: Optional collection of negation words. Defaults to the
            module-level ``neg_stop_words`` set.

    Returns:
        The processed tokens joined with single spaces.
    """
    if negations is None:
        negations = neg_stop_words

    words = text.split()
    new_words = []
    i = 0
    while i < len(words):
        if words[i] in negations:
            # Explicit bounds check instead of a bare ``except`` so that
            # unrelated errors are not silently swallowed.
            if i + 1 < len(words):
                new_words.append('not_' + words[i + 1])
            else:
                new_words.append('not')
            # Skip the negation and the token it was attached to.
            i += 2
        else:
            new_words.append(words[i])
            i += 1
    return ' '.join(new_words)

# Rewrite every row so negations are merged into the token that follows them.
text_column = df['text']
df['text'] = text_column.map(add_not_prefix)




In [54]:
# Stemming and lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Single shared instances, reused for every row by lemmatize() and stem().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed through the module-level ``lemmatizer`` and the
    results are joined back together with single spaces.
    """
    tokens = text.split()
    return " ".join(map(lemmatizer.lemmatize, tokens))

def stem(text):
    """Stem every whitespace-separated token of ``text``.

    Each token is passed through the module-level ``stemmer`` and the results
    are joined back together with single spaces.
    """
    stemmed_tokens = []
    for token in text.split():
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)


# Normalise tokens row by row: lemmatize first, then stem the lemmatized text.
df["text"] = df["text"].apply(lemmatize).apply(stem)


In [55]:
# Show the first 10 rows whose emotion label is 'joy'
is_joy = df['emotions'] == 'joy'
print(df[is_joy].head(10))


                                                     text emotions
79890       groan inwardli let feel inkl preciou one gone      joy
133001  im feel realli festiv silveri glitter pencil p...      joy
109575  still feel excit im still struggl still want c...      joy
41109            feel divin union find part soul inaccess      joy
32538                                    feel calm compos      joy
135898            im not_feel particularli creativ moment      joy
116819                   want final job make feel respect      joy
37351              feel good write excit finish next year      joy
2982    alway alway alway come away feel invigor ecsta...      joy
63118    feel lucki seen transit older sister wife mother      joy


# TODO List

- Stopwords
- Stemming
- Lemmatization
- N-grams
- TF-IDF
- Word Embeddings
