# Discover the data

In [30]:
import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [15]:
# Load the emotions CSV (path is relative to the notebook's working directory)
df = pd.read_csv("./src/emotions.csv")

In [16]:
# Peek at 5 random rows; the fixed seed makes the same rows appear on every run
df.sample(5, random_state=42)

Unnamed: 0,Text,Emotion
9573,i take a long sip and feel the cold sensation ...,anger
5679,i wrote it feels slightly strange starting to ...,fear
13675,i also feel that seeing how the body reacts is...,happy
12948,i will have spontaneous bouts of needing to fe...,happy
12737,i feel like people are taking these stages of ...,sadness


In [17]:
# first 5 rows of the frame
df.head(5)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [18]:
# number of rows per emotion label (shows how balanced the classes are)
df["Emotion"].value_counts()

happy       7029
sadness     6265
anger       2993
fear        2652
love        1641
surprise     879
Name: Emotion, dtype: int64

In [19]:
# show the 5 longest texts, measured by whitespace-separated word count
df["Text"].apply(lambda x: len(x.split())).sort_values(ascending=False).head(5)

6322    66
7222    64
9618    64
9626    64
46      64
Name: Text, dtype: int64

In [20]:
# Rename the columns to lowercase. Rebinding `df` to the renamed frame (rather
# than using `inplace=True`) avoids mutating the frame in place and keeps the
# step chainable.
df = df.rename(columns={"Text": "text", "Emotion": "emotion"})

In [21]:
# confirm the column names after the rename
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# Using NLTK package

In [26]:
# Download the NLTK data packages used by the preprocessing cells below
nltk.download("stopwords") # for stopwords
nltk.download("wordnet") # for lemmatization
nltk.download("punkt") # for tokenization

[nltk_data] Downloading package stopwords to /Users/ant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ant/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [38]:
# Build the English stop-word set through the fully qualified corpus reader.
# This cell rebinds the name `stopwords` (imported above as the corpus reader)
# to a plain set, so calling `stopwords.words(...)` here would raise
# AttributeError the second time the cell is run. Going through `nltk.corpus`
# keeps the cell re-runnable while `clean_text` can still use `stopwords`.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [39]:
# inspect the stop-word set
stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [40]:
# one shared lemmatizer instance, used by clean_text below
lemmatizer = WordNetLemmatizer()

In [41]:

def clean_text(text):
    """
    Turn a raw sentence into a list of cleaned tokens.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized with NLTK, tokens found
    in the module-level `stopwords` set are dropped, and the remaining tokens
    are lemmatized with the module-level `lemmatizer`.

    args: text - string
    returns: list of lemmatized tokens (stop words removed)
    """
    normalized = re.sub(r'[^\w\s]', '', text.lower())
    tokens = []
    for token in nltk.word_tokenize(normalized):
        # the stop-word check is done on the raw token, before lemmatization
        if token in stopwords:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

In [42]:
# tokenize / clean every row of `text` and store the token lists in a new column
df['text_clean'] = df['text'].map(clean_text)

In [44]:
# check the new text_clean column next to the original text
df.head(5)

Unnamed: 0,text,emotion,text_clean
0,i didnt feel humiliated,sadness,"[didnt, feel, humiliated]"
1,i can go from feeling so hopeless to so damned...,sadness,"[go, feeling, hopeless, damned, hopeful, aroun..."
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grabbing, minute, post, feel, greedy, wrong]"
3,i am ever feeling nostalgic about the fireplac...,love,"[ever, feeling, nostalgic, fireplace, know, st..."
4,i am feeling grouchy,anger,"[feeling, grouchy]"
