In [91]:
# machine learning specialisation coursera 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time


import nltk
from nltk.tokenize import TweetTokenizer
from gensim.parsing.porter import PorterStemmer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [92]:
# Load the sentiment CSV from a path relative to the notebook's working directory.
tweets_df = pd.read_csv('data/sentiment.csv')
# Show (rows, columns) as the cell output.
tweets_df.shape

(651, 18)

In [93]:
# Preview the first rows of the raw frame.
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions,textblob_polarity,nltk_compound,avg_sentiment,textblob_sentiment,nltk_sentiment
0,0,2,2,pspatilsbi,Bangalore,325,25,2022-11-08 22:08:44+00:00,2704,0,agenda great old god blees end,[],"[{'screen_name': 'INCIndia', 'name': 'Congress...",0.45,0.7351,1,1,1
1,1,3,3,ththegde,"Kandivali East, Mumbai",582,57,2022-11-08 22:00:49+00:00,1969,0,please allow citizen buy forex investment like...,[],"[{'screen_name': 'PMOIndia', 'name': 'PMO Indi...",0.0,0.802,1,0,1
2,2,5,5,rupz_boruah,"Chabua, India",14,33,2022-11-08 21:54:37+00:00,309,0,please take necessary action neurologist amc d...,"[{'text': 'Dr_Dhrubajyoti_Kurmi', 'indices': [...","[{'screen_name': 'MoHFW_INDIA', 'name': 'Minis...",0.033333,0.6369,1,1,1
3,3,8,8,lazizpizza99,"Jasola Vihar, New Delhi",23,1,2022-11-08 21:28:48+00:00,6,0,sleeping suddenly bed start shaking ignored ke...,"[{'text': 'peace', 'indices': [231, 237]}, {'t...","[{'screen_name': 'LtGovDelhi', 'name': 'LG Del...",0.198333,0.6597,1,1,1
4,4,11,11,UtkarshMishra_9,"Noida, India",707,1122,2022-11-08 21:14:55+00:00,5764,0,estimated magnitude earthquake affected countr...,"[{'text': 'earthquake', 'indices': [137, 148]}...","[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3...",0.0,-0.1531,-1,0,-1


In [94]:
# Keep only the model input (`text`) and the label (`avg_sentiment`).
# Selecting the two wanted columns, instead of dropping the other 16 in place,
# makes this cell safe to re-run: an in-place drop raises KeyError on a second
# execution because the columns are already gone. `.copy()` yields an
# independent frame, so columns added later are not written to a slice of the
# original.
tweets_df = tweets_df[['text', 'avg_sentiment']].copy()

In [95]:
# Confirm which columns remain.
tweets_df.columns

Index(['text', 'avg_sentiment'], dtype='object')

In [96]:
# Preview the first rows of the reduced frame.
tweets_df.head()

Unnamed: 0,text,avg_sentiment
0,agenda great old god blees end,1
1,please allow citizen buy forex investment like...,1
2,please take necessary action neurologist amc d...,1
3,sleeping suddenly bed start shaking ignored ke...,1
4,estimated magnitude earthquake affected countr...,-1


In [97]:
# Count how many rows fall into each `avg_sentiment` value.
tweets_df['avg_sentiment'].value_counts()

 1    418
-1    195
 0     38
Name: avg_sentiment, dtype: int64

In [98]:
def creating_tokens(df):
    """Add `tokens` and `stemmed_tokens` columns to `df` and return it.

    Every entry of the `text` column is split with `TweetTokenizer`, and each
    resulting token is then passed through `PorterStemmer.stem`. Both new
    columns hold one list per row. `df` is modified in place and the same
    object is returned.
    """
    tweet_tokenizer = TweetTokenizer()
    stemmer = PorterStemmer()

    # One token list per tweet, in row order.
    df['tokens'] = [tweet_tokenizer.tokenize(text) for text in df.loc[:, 'text']]

    # Stem token by token, keeping the per-tweet list structure.
    df['stemmed_tokens'] = [
        [stemmer.stem(token) for token in row_tokens]
        for row_tokens in df['tokens']
    ]

    return df

In [99]:
# Tokenize and stem every tweet; this adds the `tokens` and `stemmed_tokens` columns.
tweets_df = creating_tokens(tweets_df)
tweets_df.head()

Unnamed: 0,text,avg_sentiment,tokens,stemmed_tokens
0,agenda great old god blees end,1,"[agenda, great, old, god, blees, end]","[agenda, great, old, god, blee, end]"
1,please allow citizen buy forex investment like...,1,"[please, allow, citizen, buy, forex, investmen...","[pleas, allow, citizen, bui, forex, invest, li..."
2,please take necessary action neurologist amc d...,1,"[please, take, necessary, action, neurologist,...","[pleas, take, necessari, action, neurologist, ..."
3,sleeping suddenly bed start shaking ignored ke...,1,"[sleeping, suddenly, bed, start, shaking, igno...","[sleep, suddenli, bed, start, shake, ignor, ke..."
4,estimated magnitude earthquake affected countr...,-1,"[estimated, magnitude, earthquake, affected, c...","[estim, magnitud, earthquak, affect, countri, ..."


In [100]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size):
    """Stratified train/test split of stemmed tokens and sentiment labels.

    `df['stemmed_tokens']` is the feature series and `df['avg_sentiment']` the
    target. The split is stratified on the target and uses a fixed
    `random_state=42`. The value counts of the train and test targets are
    printed.

    Returns `(x_train, x_test, y_train, y_test)`. Each is a DataFrame with a
    fresh 0..n-1 row index; the original row labels are kept in an `index`
    column.
    """
    labels = df['avg_sentiment']
    partitions = train_test_split(
        df['stemmed_tokens'],
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    # Positions 2 and 3 hold the train and test targets.
    for target_part in partitions[2:]:
        print(target_part.value_counts())

    # Series -> single-column frame, moving the old row labels into `index`.
    x_train, x_test, y_train, y_test = (
        part.to_frame().reset_index() for part in partitions
    )
    return x_train, x_test, y_train, y_test

In [101]:
# Hold out 30% of the rows as the test partition.
x_train, x_test, y_train, y_test = split_data(tweets_df, 0.3)

 1    292
-1    136
 0     27
Name: avg_sentiment, dtype: int64
 1    126
-1     59
 0     11
Name: avg_sentiment, dtype: int64


In [102]:
# Print the first rows of each partition, in the order x_train, x_test, y_train, y_test.
for partition in (x_train, x_test, y_train, y_test):
    print(partition.head())

   index                                     stemmed_tokens
0    106  [narendra, modi, launch, logo, said, logo, mes...
1     64  [could, prevent, mani, scam, dynast, massiv, d...
2     32  [journei, toward, chang, initi, good, report, ...
3    118  [democraci, india, evm, tamper, medium, becam,...
4    167  [free, medic, camp, organis, jan, aushadhi, op...
   index                                     stemmed_tokens
0    548  [gujarat, give, love, sweet, nashta, food, cha...
1    532  [rememb, great, person, let, gener, judg, extr...
2    209  [vasudaiva, kutumbakam, signatur, india, compa...
3     28  [claim, submit, onlin, portal, yahi, statu, sh...
4    173  [delhi, terribl, vouch, hvng, experienc, last,...
   index  avg_sentiment
0    106              1
1     64              1
2     32              1
3    118              0
4    167              1
   index  avg_sentiment
0    548              1
1    532              1
2    209              1
3     28              0
4    173        

In [103]:
# Length of the longest stemmed-token list; stays at -1 when the frame has no rows.
max_len = max(
    (len(token_list) for token_list in tweets_df.loc[:, 'stemmed_tokens']),
    default=-1,
)
    

In [104]:
# Show the longest token count found above.
max_len

36

In [105]:
from gensim.models import Word2Vec
import time

# NOTE(review): absolute, machine-specific path — this cell only runs where the
# directory exists. Consider a project-relative output directory instead.
OUTPUT_FOLDER = '/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/'
# Embedding dimensionality; also embedded in the saved model's file name.
VECTOR_SIZE = 200

start_time = time.time()
# Training corpus: one list of stemmed tokens per tweet.
tokens = tweets_df['stemmed_tokens'].values
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(VECTOR_SIZE) + '.model'

# sg=1 selects skip-gram. The earlier sg=2 is outside the values gensim
# documents for this parameter (0 = CBOW, 1 = skip-gram); skip-gram matches the
# `sg_w2v_model` name the loaded model is given downstream.
w2v_model = Word2Vec(tokens, min_count=1, vector_size=VECTOR_SIZE, window=5, workers=4, sg=1)
# NOTE(review): per the gensim docs, passing a corpus to the constructor
# already builds the vocabulary and trains the model, so this call adds 10
# further epochs on top — confirm that is intended.
w2v_model.train(tokens, epochs=10, total_examples=len(tokens))
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)


Time taken to train word2vec model: 1.063798189163208


In [106]:
def create_file(create_file, model_file, x):
    """Write one averaged Word2Vec vector per row of `x` to a CSV and load it back.

    Parameters
    ----------
    create_file : str
        Path of the CSV file to (over)write. Inside this function the name
        refers to this path, not to the function itself.
    model_file : str
        Path of a saved Word2Vec model, loaded with `Word2Vec.load`.
    x : pandas.DataFrame
        Must have a `stemmed_tokens` column holding one list of tokens per row.

    Returns
    -------
    pandas.DataFrame
        The CSV read back with `pd.read_csv`: one row per row of `x` and one
        column per embedding dimension, with header names "0", "1", ...

    Notes
    -----
    * The header is always written first, regardless of the row labels of `x`.
      It used to be emitted only for a row labelled 0, which produced a
      header-less file for any frame whose index did not start at 0.
    * The number of columns comes from the loaded model, not a fixed 200.
    * Tokens missing from the model vocabulary are skipped. A row with no known
      tokens (including an empty list) is written as all zeros.
    """
    sg_w2v_model = Word2Vec.load(model_file)
    keyed_vectors = sg_w2v_model.wv
    vector_size = keyed_vectors.vector_size

    with open(create_file, 'w') as word2vec_file:
        header = ",".join(str(ele) for ele in range(vector_size))
        word2vec_file.write(header + "\n")

        for token_list in x['stemmed_tokens']:
            known_vectors = [keyed_vectors[token] for token in token_list if token in keyed_vectors]
            if known_vectors:
                # Element-wise mean over the tweet's token vectors.
                model_vector = np.mean(known_vectors, axis=0).tolist()
            else:
                # Nothing to average: fall back to a zero vector.
                model_vector = [0] * vector_size
            line1 = ",".join(str(vector_element) for vector_element in model_vector)
            word2vec_file.write(line1 + "\n")

    return pd.read_csv(create_file)
        

In [107]:
# Build the per-tweet averaged Word2Vec features for the training rows and save them as CSV.
word2vec_train_filename = f"{OUTPUT_FOLDER}word2vec_train_{200}.csv"
word2vec_train_df = create_file(word2vec_train_filename, word2vec_model_file, x_train)
print(word2vec_train_df.shape)
word2vec_train_df.head()

(455, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.018076,0.011372,0.038198,0.057504,0.056289,-0.104743,-0.077448,0.227835,-0.069157,0.112864,...,0.144789,-0.136987,-0.011252,-0.10648,0.085528,0.035771,0.096498,-0.182156,-0.09149,-0.025682
1,0.029739,0.009849,0.021209,0.04107,0.041798,-0.079154,-0.032831,0.209595,-0.040423,0.094141,...,0.122804,-0.112102,-0.048178,-0.096391,0.092991,0.04701,0.075095,-0.136985,-0.066338,-0.012703
2,0.019496,0.008106,0.032317,0.053849,0.062072,-0.079141,-0.055643,0.210309,-0.053082,0.107435,...,0.142372,-0.122349,-0.026313,-0.082683,0.081486,0.056014,0.084179,-0.14358,-0.086266,-0.016943
3,0.010925,0.005897,0.034394,0.052326,0.050722,-0.078212,-0.054323,0.195932,-0.053481,0.097929,...,0.130955,-0.116084,-0.021956,-0.078397,0.072493,0.049883,0.083022,-0.135542,-0.078814,-0.015669
4,0.014852,0.006526,0.029912,0.041831,0.033212,-0.066289,-0.052569,0.176798,-0.036338,0.083058,...,0.110183,-0.103352,-0.028663,-0.081328,0.071953,0.039065,0.068604,-0.121026,-0.057256,-0.009746


In [108]:
# Build the per-tweet averaged Word2Vec features for the test rows and save them as CSV.
word2vec_test_filename = f"{OUTPUT_FOLDER}word2vec_test_{200}.csv"
word2vec_test_df = create_file(word2vec_test_filename, word2vec_model_file, x_test)
print(word2vec_test_df.shape)
word2vec_test_df.head()

(196, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.037974,0.003105,0.013955,0.045686,0.067164,-0.074759,-0.034472,0.197542,-0.038955,0.09114,...,0.110857,-0.112666,-0.037386,-0.073576,0.072823,0.03376,0.069813,-0.129657,-0.069322,-0.022718
1,0.021099,0.009306,0.03422,0.050636,0.06021,-0.085057,-0.050388,0.219376,-0.052754,0.107717,...,0.137475,-0.128444,-0.034326,-0.092264,0.089104,0.050955,0.08763,-0.149854,-0.086597,-0.017834
2,-0.026014,0.002494,0.038307,0.058227,0.045578,-0.085853,-0.088061,0.195048,-0.070608,0.100574,...,0.143563,-0.113067,0.007735,-0.087587,0.072766,0.051799,0.086476,-0.159611,-0.097626,-0.01196
3,0.046584,0.01331,0.019839,0.048586,0.074956,-0.083865,-0.021533,0.242438,-0.051148,0.106349,...,0.143391,-0.139552,-0.065461,-0.09251,0.112321,0.045782,0.084568,-0.153077,-0.073667,-0.02109
4,0.011807,0.003984,0.027028,0.041721,0.037679,-0.064722,-0.048218,0.173453,-0.033045,0.087069,...,0.110211,-0.100096,-0.029775,-0.081044,0.071949,0.042363,0.069896,-0.121527,-0.058047,-0.014548


# Decision Tree

In [109]:
# Initialize the model. A fixed random_state makes the fitted tree reproducible
# from run to run; 42 matches the seed used for the train/test split.
clf_decision_word2vec = DecisionTreeClassifier(random_state=42)

start_time = time.time()
# Fit the model on the averaged Word2Vec features against the sentiment labels.
clf_decision_word2vec.fit(word2vec_train_df, y_train['avg_sentiment'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.07840299606323242


In [110]:
from sklearn.metrics import classification_report
        
# Predict on the held-out features, then print per-class precision, recall and F1.
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
print(
    classification_report(
        y_true=y_test['avg_sentiment'],
        y_pred=y_pred_word2vec,
    )
)

              precision    recall  f1-score   support

          -1       0.41      0.44      0.43        59
           0       0.06      0.09      0.07        11
           1       0.72      0.66      0.69       126

    accuracy                           0.56       196
   macro avg       0.40      0.40      0.39       196
weighted avg       0.59      0.56      0.57       196

