In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import string as str
from sklearn.pipeline import Pipeline

In [2]:
# load data
train_data = pd.read_csv('train_emotion_dataset.csv')
test_data = pd.read_csv('test_emotion_dataset.csv')

# peek data
train_data.head()

Unnamed: 0,Text,Emotion
0,didnt want stay feeling loneliness emptiness p...,sadness
1,feel sorry seeing parents,sadness
2,cant help feel someones going end pissed,anger
3,done music movie production last four years fe...,joy
4,think feel passionate favorite workout gear lo...,love


In [3]:
# rejoin data

data = pd.DataFrame()
temp = [train_data, test_data]
data = pd.concat(temp)
data.head()

Unnamed: 0,Text,Emotion
0,didnt want stay feeling loneliness emptiness p...,sadness
1,feel sorry seeing parents,sadness
2,cant help feel someones going end pissed,anger
3,done music movie production last four years fe...,joy
4,think feel passionate favorite workout gear lo...,love


In [4]:
data.shape

(34000, 2)

In [5]:
data['Emotion'].value_counts()

joy         11419
sadness      9913
anger        4593
fear         4098
love         2767
surprise     1210
Name: Emotion, dtype: int64

In [6]:
# Features & Labels
Xfeatures = data['Text']
ylabels = data['Emotion']

In [7]:
#  Split Data
x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.2,random_state=42)

In [8]:
# LogisticRegression Pipeline
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])

In [9]:
# Train and Fit Data
pipe_lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
pipe_lr

In [11]:
# Check Accuracy
pipe_lr.score(x_test,y_test)

0.9551470588235295

In [12]:
# Make A Prediction
sample1 = "This chocholate was very sweet it made me happy"

In [13]:
pipe_lr.predict([sample1])

array(['joy'], dtype=object)

In [14]:
data['Predictions'] = ''
for i in data.index:
    data['Predictions'].iloc[i] = pipe_lr.predict([data['Text'].iloc[i]])

data.head(30)

Unnamed: 0,Text,Emotion,Predictions
0,didnt want stay feeling loneliness emptiness p...,sadness,[sadness]
1,feel sorry seeing parents,sadness,[sadness]
2,cant help feel someones going end pissed,anger,[anger]
3,done music movie production last four years fe...,joy,[joy]
4,think feel passionate favorite workout gear lo...,love,[love]
5,feel kind ashamed write things like sat couch ...,sadness,[sadness]
6,feel earth move tribute carole king karaoke mi...,sadness,[sadness]
7,try explaining feelings someone dismisses blin...,anger,[anger]
8,allowed feel really shitty feelings running en...,sadness,[sadness]
9,put one side focus following version shameful ...,sadness,[sadness]


In [15]:
# match column
data['Match'] = ''

for i in data.index:
    if (data['Predictions'].iloc[i] == data['Emotion'].iloc[i]):
        data['Match'].iloc[i] = True
    else:
        data['Match'].iloc[i] = False
        
data.head(20)

Unnamed: 0,Text,Emotion,Predictions,Match
0,didnt want stay feeling loneliness emptiness p...,sadness,[sadness],True
1,feel sorry seeing parents,sadness,[sadness],True
2,cant help feel someones going end pissed,anger,[anger],True
3,done music movie production last four years fe...,joy,[joy],True
4,think feel passionate favorite workout gear lo...,love,[love],True
5,feel kind ashamed write things like sat couch ...,sadness,[sadness],True
6,feel earth move tribute carole king karaoke mi...,sadness,[sadness],True
7,try explaining feelings someone dismisses blin...,anger,[anger],True
8,allowed feel really shitty feelings running en...,sadness,[sadness],True
9,put one side focus following version shameful ...,sadness,[sadness],True


In [16]:
# now that the model is trained on the emotion data set, lets apply it to our dataset

In [17]:
# load data
train_data = pd.read_csv('processed_trained_sample_gt.csv')
test_data = pd.read_csv('processed_test_sample_gt.csv')

# peek data
train_data.head()

Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÂ²),Density (P/KmÂ²),Ground Truth
0,d1d7ed938d,great site photos fantastic useful visit pemb...,positive,morning,46-60,Egypt,102334404,995450,103,happiness
1,be0a927a80,_n too want ooze speaking either why lame jok...,neutral,noon,60-70,Guinea-Bissau,1968001,28120,70,df
2,6b4eb2871c,thanks link ive voted ill send too,positive,noon,21-30,Bulgaria,6948445,108560,64,df
3,9c503adc87,also liptons sparkling green tea gross,neutral,noon,21-30,Somalia,15893222,627340,25,disgust
4,3e1f7acd3c,know lender was yea people definitely sucked b...,negative,night,70-100,Monaco,39242,1,26337,anger


In [18]:
# rejoin data

data2 = pd.DataFrame()
temp = [train_data, test_data]
data2 = pd.concat(temp)
data2.head()

Unnamed: 0,ID,Tweet,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÂ²),Density (P/KmÂ²),Ground Truth
0,d1d7ed938d,great site photos fantastic useful visit pemb...,positive,morning,46-60,Egypt,102334404,995450,103,happiness
1,be0a927a80,_n too want ooze speaking either why lame jok...,neutral,noon,60-70,Guinea-Bissau,1968001,28120,70,df
2,6b4eb2871c,thanks link ive voted ill send too,positive,noon,21-30,Bulgaria,6948445,108560,64,df
3,9c503adc87,also liptons sparkling green tea gross,neutral,noon,21-30,Somalia,15893222,627340,25,disgust
4,3e1f7acd3c,know lender was yea people definitely sucked b...,negative,night,70-100,Monaco,39242,1,26337,anger


In [19]:
data2 = data2.rename(columns = {'Tweet' : 'Text', 'Ground Truth' : 'Emotion'})

In [20]:
# drop nan
data2 = data2.dropna()

In [21]:
data2['Predictions'] = ''
for i in data2.index:
    data2['Predictions'].iloc[i] = pipe_lr.predict([data2['Text'].iloc[i]])

data2.head(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['Predictions'].iloc[i] = pipe_lr.predict([data2['Text'].iloc[i]])


Unnamed: 0,ID,Text,Sentiment,Time,Age,Country,Population -2020,Land Area (KmÂ²),Density (P/KmÂ²),Emotion,Predictions
0,d1d7ed938d,great site photos fantastic useful visit pemb...,positive,morning,46-60,Egypt,102334404,995450,103,happiness,[joy]
1,be0a927a80,_n too want ooze speaking either why lame jok...,neutral,noon,60-70,Guinea-Bissau,1968001,28120,70,df,[sadness]
2,6b4eb2871c,thanks link ive voted ill send too,positive,noon,21-30,Bulgaria,6948445,108560,64,df,[joy]
3,9c503adc87,also liptons sparkling green tea gross,neutral,noon,21-30,Somalia,15893222,627340,25,disgust,[joy]
4,3e1f7acd3c,know lender was yea people definitely sucked b...,negative,night,70-100,Monaco,39242,1,26337,anger,[joy]
5,c818fdf4b8,sun turned blancmange sucks pale,negative,night,70-100,"Eswatini (fmr. ""Swaziland"")",1160164,17200,67,sadness,[joy]
6,b01c96b216,you,neutral,noon,21-30,Paraguay,7132538,397300,18,df,[joy]
7,c5a47367cc,come join fun first pitch legends game may th ...,neutral,morning,0-20,Timor-Leste,1318445,14870,89,happiness,[joy]
8,a16aa7fcc4,happy mothers day gr day,positive,morning,0-20,Haiti,11402528,27560,414,happiness,[joy]
9,553beefa8b,ever since orchid quit drinking bottle eating ...,neutral,noon,60-70,Nicaragua,6624554,120340,55,surprise,[fear]


In [22]:
# Check Accuracy
pipe_lr.score(x_test,y_test)

0.9551470588235295