## OBJECTIVE :

### Go through the different datasets and create train, validation, and test datasets

In [1]:
import warnings
warnings.simplefilter("ignore")

In [2]:
import os
import sys
import ipdb
import numpy as np
import pandas as pd
import spacy
import unicodedata
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.tokenize import sent_tokenize, word_tokenize

# Report the library versions this run was executed with.
for _label, _lib in (("Numpy Version : ", np), ("Pandas Version : ", pd)):
    print(_label, _lib.__version__)

# Seed NumPy's global random generator so random draws repeat across runs.
np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Default font sizes: axis labels at 14, tick labels (both axes) at 12.
mpl.rc("axes", labelsize=14)
mpl.rc(("xtick", "ytick"), labelsize=12)

print("Matplotlib Version : ", mpl.__version__)

Numpy Version :  1.18.1
Pandas Version :  1.0.1
Matplotlib Version :  3.1.3


In [3]:
# Load spaCy's `en_core_web_md` pipeline once; `nlp.Defaults.stop_words` is read
# from it when the stop-word set is built further down.
nlp = spacy.load('en_core_web_md')

In [4]:
# Top-level project folders, given as relative paths (one level above the
# current working directory). DATA_DIR is the root for every dataset path below.
# NOTE(review): OBJECT_DIR and LOG_DIR are not used in the cells shown here.
DATA_DIR = "../data/"
OBJECT_DIR = "../objects/"
LOG_DIR = "../logs/"

In [5]:
# Shared seed value, presumably for later random operations (usage not shown here).
# NOTE(review): np.random.seed in the import cell is called with a literal 42
# rather than this constant; keep the two in sync if the seed ever changes.
LUCKY_SEED = 42

In [6]:
# Raw inputs live under DATA_DIR/orig_data, split into the Kaggle "sa-emotions"
# files and the additional per-emotion files kept in "others".
RAW_DATA_DIR = os.path.join(DATA_DIR, "orig_data")
ORIG_DATA_DIR, OTHERS_DATA_DIR = (
    os.path.join(RAW_DATA_DIR, subdir) for subdir in ("sa-emotions", "others")
)

# Directories for the train / validation / test splits, directly under DATA_DIR.
TRAIN_DIR, VALID_DIR, TEST_DIR = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ("training_data", "validation_data", "testing_data")
)

In [7]:
# Words listed here are candidates to keep in the text even though they appear in
# standard stop-word lists.
# NOTE: KEEPWORDS is defined but not applied -- nothing in this cell removes these
# words from STOPWORDS (the removal loop was left disabled).
KEEPWORDS = [
    'up', 'down', 'no', 'not', 'nothing', 'none', 'neither', 'never', 'except',
    'below', 'bottom', 'without', 'serious', 'really', 'above', 'against',
    'cannot', 'least', 'less', 'more', 'most', 'must', 'top', 'well',
]

# Extra domain-specific stop words, grouped by theme.
DATE_STOPWORDS = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'today', 'tomorrow', 'yesterday', 'year', 'month', 'date', 'day',
]
NUM_STOPWORDS = [
    'hundred', 'hundreds', 'thousand', 'thousands',
    'million', 'millions', 'billion', 'billions',
]
REL_STOPWORDS = [
    'brother', 'sister', 'son', 'daughter', 'father', 'mother', 'uncle', 'aunt',
    'aunty', 'wife', 'husband', 'cousin', 'nephew', 'niece', 'child', 'dad', 'mom',
    'papa', 'mummy', 'bro', 'sis', 'grandfather', 'grandmother', 'granny',
    'grandpa', 'grandma', 'children', 'boy', 'boys', 'guy', 'guys', 'girl',
    'girls', 'man', 'men', 'woman', 'women',
]
DIR_STOPWORDS = ['north', 'south', 'east', 'west']

# Final stop-word set: spaCy defaults + the wordcloud STOPWORDS imported above
# + the themed lists. The name STOPWORDS is rebound from the wordcloud set to
# this combined set.
STOPWORDS = set(nlp.Defaults.stop_words).union(
    STOPWORDS, DATE_STOPWORDS, NUM_STOPWORDS, REL_STOPWORDS, DIR_STOPWORDS
)
print(len(STOPWORDS))

459


In [8]:
def _read_emotion_tsv(filename):
    """Load one headerless, tab-separated file from OTHERS_DATA_DIR as a DataFrame."""
    return pd.read_csv(os.path.join(OTHERS_DATA_DIR, filename), sep="\t", header=None)


# Train-val data
kaggle_data = pd.read_csv(os.path.join(ORIG_DATA_DIR, "train_data.csv"))
anger_0 = _read_emotion_tsv("anger-ratings-0to1.train.txt")
# NOTE(review): anger is the only emotion whose dev split is read from the
# ".dev.target" file; fear/joy/sadness read ".dev.gold". Confirm this is intended.
anger_1 = _read_emotion_tsv("anger-ratings-0to1.dev.target.txt")
fear_0 = _read_emotion_tsv("fear-ratings-0to1.train.txt")
fear_1 = _read_emotion_tsv("fear-ratings-0to1.dev.gold.txt")
joy_0 = _read_emotion_tsv("joy-ratings-0to1.train.txt")
joy_1 = _read_emotion_tsv("joy-ratings-0to1.dev.gold.txt")
sadness_0 = _read_emotion_tsv("sadness-ratings-0to1.train.txt")
sadness_1 = _read_emotion_tsv("sadness-ratings-0to1.dev.gold.txt")

# Test data
kaggle_test_data = pd.read_csv(os.path.join(ORIG_DATA_DIR, "test_data.csv"))
anger_test_data = _read_emotion_tsv("anger-ratings-0to1.test.target.txt")
fear_test_data = _read_emotion_tsv("fear-ratings-0to1.test.target.txt")
joy_test_data = _read_emotion_tsv("joy-ratings-0to1.test.target.txt")
sadness_test_data = _read_emotion_tsv("sadness-ratings-0to1.test.target.txt")

In [9]:
# Size report. The Kaggle entries print the full (rows, columns) shape; the
# per-emotion entries print row counts only, with train = train file + dev file.
_size_report = (
    ("Kaggle train data shape : ", kaggle_data.shape),
    ("Kaggle test data shape : ", kaggle_test_data.shape),
    ("Anger train data shape : ", len(anger_0) + len(anger_1)),
    ("Anger test data shape : ", len(anger_test_data)),
    ("Fear train data shape : ", len(fear_0) + len(fear_1)),
    ("Fear test data shape : ", len(fear_test_data)),
    ("Sadness train data shape : ", len(sadness_0) + len(sadness_1)),
    ("Sadness test data shape : ", len(sadness_test_data)),
    ("Joy train data shape : ", len(joy_0) + len(joy_1)),
    ("Joy test data shape : ", len(joy_test_data)),
)
for _label, _size in _size_report:
    print(_label, _size)

Kaggle train data shape :  (30000, 2)
Kaggle test data shape :  (10000, 2)
Anger train data shape :  941
Anger test data shape :  760
Fear train data shape :  1257
Fear test data shape :  995
Sadness train data shape :  860
Sadness test data shape :  673
Joy train data shape :  902
Joy test data shape :  714


In [10]:
# Preview the anger training file; it was read with header=None, so the columns
# are labelled 0-3.
anger_0.head()

Unnamed: 0,0,1,2,3
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896


In [11]:
# Preview the anger dev file (read from the ".dev.target" file); in the rows
# displayed, column 3 holds NONE instead of a numeric rating.
anger_1.head()

Unnamed: 0,0,1,2,3
0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,NONE
1,10858,@ArcticFantasy I would have almost took offens...,anger,NONE
2,10859,@IllinoisLoyalty that Rutgers game was an abom...,anger,NONE
3,10860,@CozanGaming that's what lisa asked before she...,anger,NONE
4,10861,Sometimes I get mad over something so minuscul...,anger,NONE


In [12]:
# Preview the sadness training file (columns labelled 0-3, no header row).
sadness_0.head()

Unnamed: 0,0,1,2,3
0,40000,Depression sucks! #depression,sadness,0.958
1,40001,Feeling worthless as always #depression,sadness,0.958
2,40002,Feeling worthless as always,sadness,0.958
3,40003,My #Fibromyalgia has been really bad lately wh...,sadness,0.946
4,40004,Im think ima lay in bed all day and sulk. Life...,sadness,0.934


In [13]:
# Preview the sadness dev file (read from the ".dev.gold" file).
sadness_1.head()

Unnamed: 0,0,1,2,3
0,40786,@1johndes ball watching &amp; Rojo'd header wa...,sadness,0.583
1,40787,"A pessimist is someone who, when opportunity k...",sadness,0.188
2,40788,A .500 season is all I'm looking for at this p...,sadness,0.688
3,40789,"Stars, when you shine,\nYou know how I feel.\n...",sadness,0.292
4,40790,All I want to do is watch some netflix but I a...,sadness,0.667


In [14]:
# Preview the fear training file (columns labelled 0-3, no header row).
fear_0.head()

Unnamed: 0,0,1,2,3
0,20000,I feel like I am drowning. #depression #anxiet...,fear,0.979
1,20001,I get so nervous even thinking about talking t...,fear,0.979
2,20002,I lost my blinders .... #panic,fear,0.975
3,20003,I feel like I am drowning. #depression #falur...,fear,0.938
4,20004,This is the scariest American Horror Story out...,fear,0.938


In [15]:
# Preview the fear dev file (read from the ".dev.gold" file).
fear_1.head()

Unnamed: 0,0,1,2,3
0,21147,I know this is going to be one of those nights...,fear,0.771
1,21148,This is #horrible: Lewis Dunk has begun networ...,fear,0.479
2,21149,"@JeffersonLake speaking of ex cobblers, saw Ri...",fear,0.417
3,21150,@1johndes ball watching &amp; Rojo'd header wa...,fear,0.475
4,21151,"Really.....#Jumanji 2....w/ The Rock, Jack Bla...",fear,0.542


In [16]:
# Preview the joy training file (columns labelled 0-3, no header row).
joy_0.head()

Unnamed: 0,0,1,2,3
0,30000,Just got back from seeing @GaryDelaney in Burs...,joy,0.98
1,30001,Oh dear an evening of absolute hilarity I don'...,joy,0.958
2,30002,Been waiting all week for this game ❤️❤️❤️ #ch...,joy,0.94
3,30003,"@gardiner_love : Thank you so much, Gloria! Yo...",joy,0.938
4,30004,I feel so blessed to work with the family that...,joy,0.938


In [17]:
# Preview the joy dev file (read from the ".dev.gold" file).
joy_1.head()

Unnamed: 0,0,1,2,3
0,30823,"@theclobra lol I thought maybe, couldn't decid...",joy,0.312
1,30824,Nawaz Sharif is getting more funnier than @kap...,joy,0.7
2,30825,Nawaz Sharif is getting more funnier than @kap...,joy,0.58
3,30826,@tomderivan73 😁...I'll just people watch and e...,joy,0.438
4,30827,I love my family so much #lucky #grateful #sma...,joy,0.936


In [18]:
# Preview the anger test file (".test.target"); in the rows displayed,
# column 3 holds NONE instead of a numeric rating.
anger_test_data.head()

Unnamed: 0,0,1,2,3
0,10941,At the point today where if someone says somet...,anger,NONE
1,10942,@CorningFootball IT'S GAME DAY!!!! T MIN...,anger,NONE
2,10943,This game has pissed me off more than any othe...,anger,NONE
3,10944,@spamvicious I've just found out it's Candice ...,anger,NONE
4,10945,@moocowward @mrsajhargreaves @Melly77 @GaryBar...,anger,NONE


In [19]:
# Preview the sadness test file (".test.target"); in the rows displayed,
# column 3 holds NONE instead of a numeric rating.
sadness_test_data.head()

Unnamed: 0,0,1,2,3
0,40860,My 2 teens sons just left in the car to get ha...,sadness,NONE
1,40861,My 2 teens sons just left in the car to get ha...,sadness,NONE
2,40862,HartRamsey'sUPLIFT If you're still discouraged...,sadness,NONE
3,40863,@AmontanaW I nearly dropped my phone into the ...,sadness,NONE
4,40864,Whenever I'm feeling sad I will listen to mons...,sadness,NONE


In [20]:
# Preview the joy test file (".test.target"); in the rows displayed,
# column 3 holds NONE instead of a numeric rating.
joy_test_data.head()

Unnamed: 0,0,1,2,3
0,30902,You must be knowing #blithe means (adj.) Happ...,joy,NONE
1,30903,Old saying 'A #smile shared is one gained for ...,joy,NONE
2,30904,Bridget Jones' Baby was bloody hilarious 😅 #Br...,joy,NONE
3,30905,@Elaminova sparkling water makes your life spa...,joy,NONE
4,30906,I'm tired of everybody telling me to chill out...,joy,NONE


In [21]:
# Preview the fear test file (".test.target"); in the rows displayed,
# column 3 holds NONE instead of a numeric rating.
fear_test_data.head()

Unnamed: 0,0,1,2,3
0,21257,#Matthew 25; 1-13\nCould somebody shoot a #vid...,fear,NONE
1,21258,@bkero @whispersystems Which really sucks beca...,fear,NONE
2,21259,Be #afraid of the #quiet ones they are the one...,fear,NONE
3,21260,@riinkanei he's a horrible person and now i ga...,fear,NONE
4,21261,What we fear doing most is usually what we mos...,fear,NONE


In [22]:
# Pool every file of each emotion (train, dev and test) into one frame with a
# fresh 0..n-1 index.
# NOTE(review): the test files are merged in here too, so any later split is
# drawn from the pooled data rather than from the original test partition.
anger_data, fear_data, joy_data, sadness_data = (
    pd.concat(parts, ignore_index=True)
    for parts in (
        [anger_0, anger_1, anger_test_data],
        [fear_0, fear_1, fear_test_data],
        [joy_0, joy_1, joy_test_data],
        [sadness_0, sadness_1, sadness_test_data],
    )
)

In [23]:
# Row count of each pooled per-emotion frame.
for _label, _frame in (
    ("Total Anger data size : ", anger_data),
    ("Total Fear data size : ", fear_data),
    ("Total Joy data size : ", joy_data),
    ("Total Sadness data size : ", sadness_data),
):
    print(_label, len(_frame))

Total Anger data size :  1701
Total Fear data size :  2252
Total Joy data size :  1616
Total Sadness data size :  1533


In [24]:
# Preview the Kaggle training data: a `sentiment` label column and a `content` text column.
kaggle_data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [25]:
# Preview the Kaggle test data: it has `id` and `content` columns and no
# `sentiment` column.
kaggle_test_data.head()

Unnamed: 0,id,content
0,1,is hangin with the love of my life. Tessa McCr...
1,2,I've Got An Urge To Make Music Like Massively....
2,3,@lacrossehawty rofl uh huh
3,4,"@fankri haha! thanks, Tiff it went well, but..."
4,5,@alyssaisntcool hahah i loveeee him though.


In [26]:
# Count rows per sentiment label in the Kaggle training data. The classes are
# heavily imbalanced (thousands of "worry" rows versus under a hundred "anger" rows).
kaggle_data["sentiment"].value_counts()

worry         7433
neutral       6340
sadness       4828
happiness     2986
love          2068
surprise      1613
hate          1187
fun           1088
relief        1021
empty          659
enthusiasm     522
boredom        157
anger           98
Name: sentiment, dtype: int64