In [202]:
# machine learning specialisation coursera 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time


import nltk
from nltk.tokenize import TweetTokenizer
from gensim.parsing.porter import PorterStemmer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline


In [203]:
tweets_df = pd.read_csv('data/sentiment.csv')
tweets_df.shape

(605, 18)

In [204]:
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions,textblob_polarity,nltk_compound,avg_sentiment,textblob_sentiment,nltk_sentiment
0,0,2,2,pspatilsbi,Bangalore,325,25,2022-11-08 22:08:44+00:00,2704,0,agenda great old god blees end,[],"[{'screen_name': 'INCIndia', 'name': 'Congress...",0.45,0.7351,1,1,1
1,1,3,3,ththegde,"Kandivali East, Mumbai",582,57,2022-11-08 22:00:49+00:00,1969,0,please allow citizen buy forex investment like...,[],"[{'screen_name': 'PMOIndia', 'name': 'PMO Indi...",0.0,0.802,1,0,1
2,2,5,5,rupz_boruah,"Chabua, India",14,33,2022-11-08 21:54:37+00:00,309,0,please take necessary action neurologist amc d...,"[{'text': 'Dr_Dhrubajyoti_Kurmi', 'indices': [...","[{'screen_name': 'MoHFW_INDIA', 'name': 'Minis...",0.033333,0.6369,1,1,1
3,3,8,8,lazizpizza99,"Jasola Vihar, New Delhi",23,1,2022-11-08 21:28:48+00:00,6,0,sleeping suddenly bed start shaking ignored ke...,"[{'text': 'peace', 'indices': [231, 237]}, {'t...","[{'screen_name': 'LtGovDelhi', 'name': 'LG Del...",0.198333,0.6597,1,1,1
4,4,11,11,UtkarshMishra_9,"Noida, India",707,1122,2022-11-08 21:14:55+00:00,5764,0,estimated magnitude earthquake affected countr...,"[{'text': 'earthquake', 'indices': [137, 148]}...","[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3...",0.0,-0.1531,-1,0,-1


In [205]:
# dropping user_id, username, location, following, followers, twt_created_at, total_tweets, retweet_count, hashtags, mentions, tweet_id_dup
tweets_df.drop(['Unnamed: 0', 'user_id', 'tweet_id', 'username', 'location', 'following', 'followers', 'twt_created_at', 'total_tweets', 'retweet_count', 'hashtags', 'mentions', 'textblob_polarity', 'nltk_compound', 'textblob_sentiment', 'nltk_sentiment'], axis=1, inplace=True)

In [206]:
tweets_df.columns

Index(['text', 'avg_sentiment'], dtype='object')

In [207]:
tweets_df.head()

Unnamed: 0,text,avg_sentiment
0,agenda great old god blees end,1
1,please allow citizen buy forex investment like...,1
2,please take necessary action neurologist amc d...,1
3,sleeping suddenly bed start shaking ignored ke...,1
4,estimated magnitude earthquake affected countr...,-1


In [208]:
tweets_df['avg_sentiment'].value_counts()

 1    380
-1    187
 0     38
Name: avg_sentiment, dtype: int64

In [209]:
def creating_tokens(df):
    tokens = list()
    tokenizer = TweetTokenizer()
    
    for tweets in df.loc[:, 'text']:
        # print(len(tokenizer.tokenize(tweets)))
        tokens.append(tokenizer.tokenize(tweets))
    
    df['tokens'] = tokens
    
    porter_stemmer = PorterStemmer()
    # Get the stemmed_tokens
    df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['tokens']]
    df['stemmed_tokens'].head(10)
    
    
    return df

In [210]:
tweets_df = creating_tokens(tweets_df)
tweets_df.head()

Unnamed: 0,text,avg_sentiment,tokens,stemmed_tokens
0,agenda great old god blees end,1,"[agenda, great, old, god, blees, end]","[agenda, great, old, god, blee, end]"
1,please allow citizen buy forex investment like...,1,"[please, allow, citizen, buy, forex, investmen...","[pleas, allow, citizen, bui, forex, invest, li..."
2,please take necessary action neurologist amc d...,1,"[please, take, necessary, action, neurologist,...","[pleas, take, necessari, action, neurologist, ..."
3,sleeping suddenly bed start shaking ignored ke...,1,"[sleeping, suddenly, bed, start, shaking, igno...","[sleep, suddenli, bed, start, shake, ignor, ke..."
4,estimated magnitude earthquake affected countr...,-1,"[estimated, magnitude, earthquake, affected, c...","[estim, magnitud, earthquak, affect, countri, ..."


In [211]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size):
    x_train, x_test, y_train, y_test = train_test_split(df['stemmed_tokens'], df['avg_sentiment'], test_size=test_size, random_state=42, stratify=df['avg_sentiment'])
    
    print(y_train.value_counts())
    print(y_test.value_counts())
    # print(type(x_train))
    # print(type(y_train))
    
    x_train = x_train.to_frame()
    x_train = x_train.reset_index()
    
    x_test = x_test.to_frame()
    x_test = x_test.reset_index()
    
    y_train = y_train.to_frame()
    y_train = y_train.reset_index()
    
    y_test = y_test.to_frame()
    y_test = y_test.reset_index()
    
    return x_train, x_test, y_train, y_test

In [212]:
x_train, x_test, y_train, y_test = split_data(tweets_df, 0.3)

 1    266
-1    131
 0     26
Name: avg_sentiment, dtype: int64
 1    114
-1     56
 0     12
Name: avg_sentiment, dtype: int64


In [213]:
print(x_train.head())
print(x_test.head())
print(y_train.head())
print(y_test.head())

   index                                     stemmed_tokens
0    564  [street, light, road, name, veer, banda, vaira...
1    441  [deserv, true, employe, award, honest, whichev...
2    447  [gujarat, stone, pelt, vand, bharat, express, ...
3    581  [street, light, instal, wall, street, pole, in...
4    551  [kab, theek, hoga, yeh, citizen, share, mcd, c...
   index                                     stemmed_tokens
0     90  [know, hindu, chang, reach, top, posit, know, ...
1    602  [open, fire, garbag, park, wast, caus, pollut,...
2     45  [proof, heart, honour, prime, minist, shri, na...
3     29  [theme, promot, adolesc, friendli, health, car...
4    117  [democraci, india, evm, tamper, medium, becam,...
   index  avg_sentiment
0    564              1
1    441              1
2    447             -1
3    581              1
4    551             -1
   index  avg_sentiment
0     90              1
1    602             -1
2     45              1
3     29              1
4    117        

In [214]:
max_len = -1
for tweets in tweets_df.loc[:, 'stemmed_tokens']:
    if(max_len < len(tweets)):
        max_len = len(tweets)
    

In [215]:
max_len

36

In [216]:
from gensim.models import Word2Vec
import time

OUTPUT_FOLDER = '/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/'

start_time = time.time()
tokens = pd.Series(tweets_df['stemmed_tokens']).values
# print(tokens)
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(200) + '.model'

w2v_model = Word2Vec(tokens, min_count=1, vector_size=200, window=5, workers=4, sg=2)
w2v_model.train(tokens, epochs=10, total_examples=len(tokens))
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)


Time taken to train word2vec model: 1.1380188465118408


In [217]:
def create_file(create_file, model_file, x):
    sg_w2v_model = Word2Vec.load(model_file)
    
    with open(create_file, 'w+') as word2vec_file:
        for index, row in x.iterrows():
            model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(200))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            
            if type(model_vector) is list:
                line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
            else:
                line1 = ",".join([str(0) for i in range(200)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')
    
    df = pd.read_csv(create_file)
    return df
        

In [218]:
word2vec_train_filename = OUTPUT_FOLDER + 'word2vec_train_' + str(200) + '.csv'
word2vec_train_df = create_file(word2vec_train_filename, word2vec_model_file, x_train)
print(word2vec_train_df.shape)
word2vec_train_df.head()

(423, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.020882,0.010146,0.074476,0.049863,0.122592,-0.127202,-0.025988,0.255989,-0.063943,0.106702,...,0.094013,-0.115644,0.007459,-0.069337,0.146214,0.070044,0.053953,-0.078277,-0.090076,-0.027871
1,0.030421,0.011001,0.073398,0.042254,0.128152,-0.121796,-0.026794,0.244542,-0.053698,0.107955,...,0.089291,-0.101986,-0.00742,-0.06888,0.131901,0.070164,0.055305,-0.077764,-0.083086,-0.025791
2,0.026502,0.008255,0.064975,0.039736,0.108406,-0.111629,-0.027741,0.211317,-0.047374,0.098843,...,0.076891,-0.087726,-0.007771,-0.060584,0.118688,0.06191,0.044924,-0.067158,-0.073745,-0.022742
3,0.018516,0.010517,0.077214,0.051298,0.128104,-0.131632,-0.02771,0.269085,-0.064869,0.111912,...,0.097766,-0.11711,-0.000688,-0.074179,0.143129,0.072559,0.055359,-0.080812,-0.093793,-0.030234
4,0.013198,-0.014307,0.086592,0.056287,0.138616,-0.117392,-0.037586,0.271665,-0.035025,0.126754,...,0.121644,-0.113244,-0.050501,-0.093834,0.085799,0.076067,0.057251,-0.081211,-0.082183,-0.032743


In [219]:
word2vec_test_filename = OUTPUT_FOLDER + 'word2vec_test_' + str(200) + '.csv'
word2vec_test_df = create_file(word2vec_test_filename, word2vec_model_file, x_test)
print(word2vec_test_df.shape)
word2vec_test_df.head()

(182, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.026803,0.010217,0.076619,0.044262,0.128775,-0.130777,-0.033468,0.244803,-0.051639,0.109792,...,0.093495,-0.10699,-0.006401,-0.063361,0.131392,0.068798,0.047431,-0.075939,-0.08787,-0.026534
1,0.019942,0.005275,0.078289,0.05452,0.124541,-0.122664,-0.029338,0.251487,-0.050012,0.114454,...,0.099729,-0.11412,-0.012928,-0.073292,0.122156,0.071235,0.052237,-0.07949,-0.083768,-0.028576
2,0.054947,0.055499,0.053978,0.021544,0.139769,-0.160743,-0.030389,0.266557,-0.057817,0.081011,...,0.072207,-0.071787,0.000779,-0.065455,0.173246,0.077635,0.075101,-0.088403,-0.089713,-0.022631
3,0.049917,0.028666,0.056204,0.027117,0.129673,-0.141489,-0.0221,0.249026,-0.049515,0.08803,...,0.081327,-0.079331,-0.008764,-0.065936,0.153854,0.073913,0.069424,-0.080221,-0.085031,-0.02976
4,0.032248,0.017993,0.060072,0.031912,0.1138,-0.122264,-0.025206,0.222788,-0.049385,0.087883,...,0.075618,-0.084216,-0.003909,-0.061549,0.132866,0.063816,0.050377,-0.071298,-0.07621,-0.024081


In [220]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()

word2vec_train_scaled_df = pd.DataFrame(mm.fit_transform(word2vec_train_df))
word2vec_test_scaled_df = pd.DataFrame(mm.fit_transform(word2vec_test_df))

word2vec_train_scaled_df.head()

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_train_scaled = le.fit_transform(y_train['avg_sentiment'])
y_test_scaled = le.fit_transform(y_test['avg_sentiment'])

# Decision Tree

In [221]:
# Initialize the model
clf_decision_word2vec = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)

start_time = time.time()
# Fit the model
clf_decision_word2vec.fit(word2vec_train_df, y_train['avg_sentiment'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.05437898635864258


In [222]:
from sklearn.metrics import classification_report
        
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
print(classification_report(y_test['avg_sentiment'], y_pred_word2vec))

              precision    recall  f1-score   support

          -1       0.45      0.09      0.15        56
           0       0.00      0.00      0.00        12
           1       0.64      0.95      0.76       114

    accuracy                           0.62       182
   macro avg       0.36      0.35      0.30       182
weighted avg       0.54      0.62      0.52       182



In [230]:
dt_pipe = Pipeline([('mms', MinMaxScaler()),
                    ('dt', DecisionTreeClassifier())])
params = [{
    'dt__criterion':['gini', 'entropy'],
    'dt__max_depth':[3, 5, 7, 9, 11],
    'dt__random_state':[42]
}]

dt_pipe.get_params().keys()
gs_dt = GridSearchCV(dt_pipe,
                    param_grid=params,
                    scoring='accuracy',
                    cv=5)
gs_dt.fit(word2vec_train_df, y_train['avg_sentiment'])
gs_dt.best_params_
gs_dt.best_score_

0.6052941176470588

# Random Forest Classifier

In [224]:
clf_random_word2vec = RandomForestClassifier(n_estimators=300, criterion='entropy', max_depth=9, random_state=42)

start_time = time.time()
clf_random_word2vec.fit(word2vec_train_df, y_train['avg_sentiment'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))


Time taken to fit the model with word2vec vectors: 1.9543437957763672


In [225]:
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
print(classification_report(y_test['avg_sentiment'], y_pred_word2vec))

              precision    recall  f1-score   support

          -1       0.45      0.09      0.15        56
           0       0.00      0.00      0.00        12
           1       0.64      0.95      0.76       114

    accuracy                           0.62       182
   macro avg       0.36      0.35      0.30       182
weighted avg       0.54      0.62      0.52       182



In [226]:
rfc_pipe = Pipeline([('mms', MinMaxScaler()),
                    ('rfc', RandomForestClassifier())])
params = [{
    'rfc__n_estimators':[50, 75, 100, 200, 300],
    'rfc__criterion':['gini', 'entropy'],
    'rfc__max_depth':[3, 5, 7, 9, 11],
    'rfc__random_state':[42]
}]

rfc_pipe.get_params().keys()
gs_rfc = GridSearchCV(rfc_pipe,
                    param_grid=params,
                    scoring='accuracy',
                    cv=5)
gs_rfc.fit(word2vec_train_df, y_train['avg_sentiment'])
gs_rfc.best_params_
gs_rfc.best_score_

0.6312605042016807

# Multinomial Naive Bayes

In [227]:
clf_multinomial_nb_word2vec = MultinomialNB()

start_time = time.time()
clf_multinomial_nb_word2vec.fit(word2vec_train_scaled_df, y_train['avg_sentiment'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.012979984283447266


In [228]:
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
print(classification_report(y_test['avg_sentiment'], y_pred_word2vec))

              precision    recall  f1-score   support

          -1       0.45      0.09      0.15        56
           0       0.00      0.00      0.00        12
           1       0.64      0.95      0.76       114

    accuracy                           0.62       182
   macro avg       0.36      0.35      0.30       182
weighted avg       0.54      0.62      0.52       182



In [229]:
mnb_pipe = Pipeline([('mms', MinMaxScaler()),
                    ('mnb', MultinomialNB())])
params = [{
    'mnb__fit_prior':[True, False]
}]

mnb_pipe.get_params().keys()
gs_mnb = GridSearchCV(mnb_pipe,
                    param_grid=params,
                    scoring='accuracy',
                    cv=5)
gs_mnb.fit(word2vec_train_df, y_train['avg_sentiment'])
gs_mnb.best_params_
gs_mnb.best_score_

0.6004761904761905