In [319]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer ,ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import re 

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression


# Merge The two datasets to get neutral emotion 

In [320]:
df1 = pd.read_csv('./First Data/training.csv')
df1.shape

(16000, 2)

In [276]:
df3 = pd.read_csv('./Third Data/tweet_emotions.csv')
df3.shape

(40000, 3)

In [278]:
df3 = df3[df3.sentiment == 'neutral'] #& 'boredom' & 'enthusiasm' & 'empty'

df3.shape

(8638, 3)

In [279]:
df3=df3[:3000]
df3.shape

(3000, 3)

In [280]:
df3.head()

Unnamed: 0,tweet_id,sentiment,content
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
10,1956969456,neutral,cant fall asleep
22,1956972116,neutral,No Topic Maps talks at the Balisage Markup Con...
31,1956975441,neutral,@cynthia_123 i cant sleep
32,1956975860,neutral,I missed the bl***y bus!!!!!!!!


In [281]:
df3.drop(['tweet_id'],axis=1,inplace = True)

In [282]:
df3.head()

Unnamed: 0,sentiment,content
4,neutral,@dannycastillo We want to trade with someone w...
10,neutral,cant fall asleep
22,neutral,No Topic Maps talks at the Balisage Markup Con...
31,neutral,@cynthia_123 i cant sleep
32,neutral,I missed the bl***y bus!!!!!!!!


In [283]:
df3.rename(
    columns={"sentiment":"label",
                "content":"text",
                   }
          ,inplace=True)
df3.head()

Unnamed: 0,label,text
4,neutral,@dannycastillo We want to trade with someone w...
10,neutral,cant fall asleep
22,neutral,No Topic Maps talks at the Balisage Markup Con...
31,neutral,@cynthia_123 i cant sleep
32,neutral,I missed the bl***y bus!!!!!!!!


In [284]:
df3.label = 6
df3.head()

Unnamed: 0,label,text
4,6,@dannycastillo We want to trade with someone w...
10,6,cant fall asleep
22,6,No Topic Maps talks at the Balisage Markup Con...
31,6,@cynthia_123 i cant sleep
32,6,I missed the bl***y bus!!!!!!!!


In [285]:
df = pd.concat([df1, df3], ignore_index = True, axis = 0)
df.shape

(19000, 2)

In [286]:
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


# Prepare data for model 

In [287]:
import re
def clean_html(text):
    
    clean = re.compile('<.*?>')
    return re.sub(clean, '',text)
    
df['text']=df['text'].apply(clean_html)
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [288]:
def convert_lower(text):
    return text.lower()

df['text']=df['text'].apply(convert_lower)
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [289]:
import re
def cleaning_tags(text):
    return ' '.join(re.sub("([@#][A-Za-z0-9_]+)|(\w+:\/\/\S+)"," ", text).split())

df['text'] = df['text'].apply(lambda x: cleaning_tags(x))
df['text'].head()

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
4                                 i am feeling grouchy
Name: text, dtype: object

In [290]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
df['text']= df['text'].apply(lambda x: cleaning_punctuations(x))
df['text'].head()

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
4                                 i am feeling grouchy
Name: text, dtype: object

In [291]:
def cleaning_repeating_char(text):
    return re.sub(r'([a-z])\1+', r'\1', text)
df['text'] = df['text'].apply(lambda x: cleaning_repeating_char(x))
df['text'].head()

0                               i didnt fel humiliated
1    i can go from feling so hopeles to so damned h...
2        im grabing a minute to post i fel gredy wrong
3    i am ever feling nostalgic about the fireplace...
4                                  i am feling grouchy
Name: text, dtype: object

In [292]:
def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
df['text'] = df['text'].apply(lambda x: cleaning_URLs(x))
df['text'].head()

0                               i didnt fel humiliated
1    i can go from feling so hopeles to so damned h...
2        im grabing a minute to post i fel gredy wrong
3    i am ever feling nostalgic about the fireplace...
4                                  i am feling grouchy
Name: text, dtype: object

In [293]:
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
df['text'] = df['text'].apply(lambda x: cleaning_numbers(x))
df['text'].head()

0                               i didnt fel humiliated
1    i can go from feling so hopeles to so damned h...
2        im grabing a minute to post i fel gredy wrong
3    i am ever feling nostalgic about the fireplace...
4                                  i am feling grouchy
Name: text, dtype: object

In [294]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

[nltk_data] Downloading package stopwords to /home/paula/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label
0,didnt fel humiliated,0
1,go feling hopeles damned hopeful around someon...,0
2,im grabing minute post fel gredy wrong,3
3,ever feling nostalgic fireplace wil know stil ...,2
4,feling grouchy,3


In [295]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()
y=[]

def stem_words(text):
    for i in text:
        y.append(ps.stem(i))
    z=y[:]
    y.clear()
    return z
df['text']=df['text'].apply(stem_words)
df.head()

Unnamed: 0,text,label
0,"[d, i, d, n, t, , f, e, l, , h, u, m, i, l, ...",0
1,"[g, o, , f, e, l, i, n, g, , h, o, p, e, l, ...",0
2,"[i, m, , g, r, a, b, i, n, g, , m, i, n, u, ...",3
3,"[e, v, e, r, , f, e, l, i, n, g, , n, o, s, ...",2
4,"[f, e, l, i, n, g, , g, r, o, u, c, h, y]",3


In [296]:
def joinback2(list_input):
    return "".join(list_input)
    


df['text']=df['text'].apply(joinback2)
df.head()



Unnamed: 0,text,label
0,didnt fel humiliated,0
1,go feling hopeles damned hopeful around someon...,0
2,im grabing minute post fel gredy wrong,3
3,ever feling nostalgic fireplace wil know stil ...,2
4,feling grouchy,3


In [297]:
X=df['text']
y=df.label
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.3) 

In [298]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1500)


In [299]:
X_train = cv.fit_transform(X_train).toarray()
X_test=cv.transform(X_test).toarray()

In [300]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(X_train,y_train)

PassiveAggressiveClassifier(max_iter=50)

In [301]:
y_pred=pac.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')


Accuracy: 84.89%


In [302]:
log_reg = LogisticRegression(max_iter=50).fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [303]:
y_predicted = log_reg.predict(X_test)
score=accuracy_score(y_test,y_predicted)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 88.05%


In [304]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100,learning_rate=0.2)

In [305]:
model.fit(X_train, y_train)


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [306]:
y_pred = model.predict(X_test)
score=accuracy_score(y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 88.61%
