In [173]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import time
from nltk.tokenize import TweetTokenizer
from gensim.parsing.porter import PorterStemmer

from sklearn.tree import DecisionTreeClassifier


In [174]:
# Load the tweet sentiment dataset; the path is relative to the notebook's working directory.
# The bare `.shape` on the last line displays (rows, columns) as the cell output.
tweets_df = pd.read_csv('data/sentiment.csv')
tweets_df.shape

(1574, 19)

In [175]:
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions,tweet_id_dup,textblob_polarity,nltk_compound,avg_sentiment,textblob_sentiment,nltk_sentiment
0,0,1.11337e+18,1.58927e+18,LifestyleVishnu,"Ambikapur, India",304,286,2022-11-06 15:04:46+00:00,5764,0,india demanding arrest deepak chaurasia pocso ...,"[{'text': 'NoActionOnArrestWarrant', 'indices'...","[{'screen_name': 'AjayHimatlal', 'name': 'Ajay...",1.589273e+18,-0.05,-0.875,-1,-1,-1
1,1,2249900000.0,1.58927e+18,prkgarg,GLOBE,709,203,2022-11-06 15:04:40+00:00,24799,0,get vote gujarat soon entire gang jail looting...,[],"[{'screen_name': 'ArvindKejriwal', 'name': 'Ar...",1.589273e+18,-0.05,0.0,-1,-1,0
2,2,7.80817e+17,1.58927e+18,SunilBhatM,"New Delhi, India",930,2284,2022-11-06 15:04:38+00:00,8235,0,magnificent visited pradhan manthri sangrahala...,[],"[{'screen_name': 'narendramodi', 'name': 'Nare...",1.589273e+18,0.386667,0.8807,1,1,1
3,3,1.38846e+18,1.58927e+18,veerappavenkap1,"Bengaluru, India",4924,5453,2022-11-06 15:03:55+00:00,53735,0,almost year bjp ruled gujarat chacha still tal...,[],"[{'screen_name': 'PTI_News', 'name': 'Press Tr...",1.589272e+18,0.0,0.0,0,0,0
4,4,1.46932e+18,1.58927e+18,MukeshS68108786,,173,903,2022-11-06 15:00:20+00:00,14241,2,time gurugram police come action money minded ...,"[{'text': 'NoActionOnArrestWarrant', 'indices'...","[{'screen_name': 'AJAYGUP69169747', 'name': 'A...",1.589271e+18,0.183333,-0.6486,-1,1,-1


In [176]:
# Keep only `text`, `tweet_id_dup` and the `avg_sentiment` label; every other
# metadata column and intermediate sentiment score is dropped.
# NOTE(review): `tweet_id_dup` is retained (it is `tweet_id` that gets dropped) but no
# later cell shown here reads it -- confirm whether it is still needed.
COLUMNS_TO_DROP = [
    'Unnamed: 0', 'user_id', 'tweet_id', 'username', 'location', 'following',
    'followers', 'twt_created_at', 'total_tweets', 'retweet_count', 'hashtags',
    'mentions', 'textblob_polarity', 'nltk_compound', 'textblob_sentiment',
    'nltk_sentiment',
]
# Reassigning instead of `inplace=True`, together with errors='ignore', keeps the
# cell re-runnable: a second execution no longer fails on already-removed columns.
tweets_df = tweets_df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [177]:
tweets_df.columns

Index(['text', 'tweet_id_dup', 'avg_sentiment'], dtype='object')

In [178]:
tweets_df.head()

Unnamed: 0,text,tweet_id_dup,avg_sentiment
0,india demanding arrest deepak chaurasia pocso ...,1.589273e+18,-1
1,get vote gujarat soon entire gang jail looting...,1.589273e+18,-1
2,magnificent visited pradhan manthri sangrahala...,1.589273e+18,1
3,almost year bjp ruled gujarat chacha still tal...,1.589272e+18,0
4,time gurugram police come action money minded ...,1.589271e+18,-1


In [179]:
tweets_df['avg_sentiment'].value_counts()

 1    861
-1    580
 0    133
Name: avg_sentiment, dtype: int64

In [180]:
def creating_tokens(df):
    """Add tokenised and stemmed versions of every tweet to ``df``.

    Two columns are written onto ``df`` itself (the frame is modified in place):

    * ``tokens`` -- the ``TweetTokenizer`` output for the ``text`` column.
    * ``stemmed_tokens`` -- each token passed through ``PorterStemmer.stem``.

    A ``text`` value that is not a string (for example NaN from an empty CSV
    field) yields an empty token list instead of being handed to the tokenizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added.
    """
    tokenizer = TweetTokenizer()
    porter_stemmer = PorterStemmer()

    df['tokens'] = [
        tokenizer.tokenize(tweet) if isinstance(tweet, str) else []
        for tweet in df['text']
    ]

    # Stem token by token so each row stays a list of strings.
    df['stemmed_tokens'] = [
        [porter_stemmer.stem(word) for word in tweet_tokens]
        for tweet_tokens in df['tokens']
    ]

    return df

In [181]:
tweets_df = creating_tokens(tweets_df)
tweets_df.head()

Unnamed: 0,text,tweet_id_dup,avg_sentiment,tokens,stemmed_tokens
0,india demanding arrest deepak chaurasia pocso ...,1.589273e+18,-1,"[india, demanding, arrest, deepak, chaurasia, ...","[india, demand, arrest, deepak, chaurasia, poc..."
1,get vote gujarat soon entire gang jail looting...,1.589273e+18,-1,"[get, vote, gujarat, soon, entire, gang, jail,...","[get, vote, gujarat, soon, entir, gang, jail, ..."
2,magnificent visited pradhan manthri sangrahala...,1.589273e+18,1,"[magnificent, visited, pradhan, manthri, sangr...","[magnific, visit, pradhan, manthri, sangrahala..."
3,almost year bjp ruled gujarat chacha still tal...,1.589272e+18,0,"[almost, year, bjp, ruled, gujarat, chacha, st...","[almost, year, bjp, rule, gujarat, chacha, sti..."
4,time gurugram police come action money minded ...,1.589271e+18,-1,"[time, gurugram, police, come, action, money, ...","[time, gurugram, polic, come, action, monei, m..."


In [182]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size, random_state=42):
    """Split the stemmed tweets and their labels into stratified train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``stemmed_tokens`` and ``avg_sentiment`` columns.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``; defaults to 42, the value that
        was previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train, x_test, y_train, y_test)``. Each frame has a fresh
        0..n-1 index, and the row label it had in ``df`` is kept in an
        ``index`` column.
    """
    labels = df['avg_sentiment']
    x_train, x_test, y_train, y_test = train_test_split(
        df['stemmed_tokens'],
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    # Print the class balance of both splits so the stratification can be checked.
    print(y_train.value_counts())
    print(y_test.value_counts())

    # reset_index() without drop=True turns the old row labels into an
    # 'index' column and renumbers the rows from 0.
    x_train, x_test, y_train, y_test = (
        part.to_frame().reset_index()
        for part in (x_train, x_test, y_train, y_test)
    )

    return x_train, x_test, y_train, y_test

In [183]:
x_train, x_test, y_train, y_test = split_data(tweets_df, 0.3)

 1    602
-1    406
 0     93
Name: avg_sentiment, dtype: int64
 1    259
-1    174
 0     40
Name: avg_sentiment, dtype: int64


In [184]:
# Preview the four split frames. The 'index' column holds each row's original
# label in tweets_df, kept by reset_index() inside split_data.
print(x_train.head())
print(x_test.head())
print(y_train.head())
print(y_test.head())

   index                                     stemmed_tokens
0   1176  [light, hamilton, road, gurugram, last, month,...
1    205  [piror, matter, home, first, seat, first, piro...
2   1226  [citizen, well, awar, ill, benefit, work, gras...
3    721  [dai, left, mega, event, venu, central, univer...
4    675  [newli, dedic, shri, mahak, lok, shri, offer, ...
   index                                     stemmed_tokens
0   1126  [sometim, water, secur, hour, boundari, stp, w...
1   1142  [todai, indian, express, see, educ, model, eno...
2    348  [gave, good, railwai, minist, suresh, prabhu, ...
3   1337  [instead, delhi, could, develop, central, citi...
4   1568  [guess, lot, improv, done, seat, need, outsmar...
   index  avg_sentiment
0   1176              1
1    205              1
2   1226              1
3    721              0
4    675              1
   index  avg_sentiment
0   1126             -1
1   1142              0
2    348             -1
3   1337             -1
4   1568        

In [185]:
from gensim.models import Word2Vec
import time

# NOTE(review): machine-specific absolute path -- this cell fails on any other
# machine. Later cells build file names with `OUTPUT_FOLDER + ...`, so it must
# stay a string ending in '/'.
OUTPUT_FOLDER = '/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/'

# Embedding dimensionality; it is also embedded in the saved model's file name.
VECTOR_SIZE = 5000

start_time = time.time()
# One list of stemmed tokens per tweet.
tokens = tweets_df['stemmed_tokens'].values
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(VECTOR_SIZE) + '.model'

# gensim documents `sg` as 0 (CBOW) or 1 (skip-gram); the previous value 2 is
# outside that set, so skip-gram is now requested explicitly.
w2v_model = Word2Vec(tokens, min_count=1, vector_size=VECTOR_SIZE, window=5, workers=4, sg=1)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)


Time taken to train word2vec model: 5.136754035949707


In [186]:
def create_file(create_file, model_file, x):
    """Write one averaged Word2Vec vector per tweet to a CSV and load it back.

    Each row of ``x`` is represented by the element-wise mean of the vectors
    of its tokens. Tokens missing from the model's vocabulary are skipped, and
    a row with no usable tokens is written as a row of zeros.

    Parameters
    ----------
    create_file : str
        Path of the CSV file to (over)write. The name shadows this function
        inside the body; it is kept unchanged for existing callers.
    model_file : str
        Path of a saved gensim ``Word2Vec`` model.
    x : pandas.DataFrame
        Must contain a ``stemmed_tokens`` column of token lists.

    Returns
    -------
    pandas.DataFrame
        The CSV read back with ``pd.read_csv``: one row per row of ``x`` and
        one column per embedding dimension, named ``'0'``, ``'1'``, ...
    """
    sg_w2v_model = Word2Vec.load(model_file)
    word_vectors = sg_w2v_model.wv
    # Take the width from the model instead of assuming a fixed 5000.
    dim = word_vectors.vector_size

    with open(create_file, 'w') as word2vec_file:
        # Write the header unconditionally; it used to depend on a row whose
        # index label was 0, so it was skipped for frames not indexed from 0.
        word2vec_file.write(",".join(str(col) for col in range(dim)))
        word2vec_file.write("\n")

        for row_tokens in x['stemmed_tokens']:
            known_vectors = [word_vectors[token] for token in row_tokens if token in word_vectors]
            if known_vectors:
                model_vector = np.mean(known_vectors, axis=0).tolist()
            else:
                # Nothing to average: fall back to an all-zero row.
                model_vector = [0] * dim
            word2vec_file.write(",".join(str(vector_element) for vector_element in model_vector))
            word2vec_file.write("\n")

    return pd.read_csv(create_file)
        

In [187]:
# Build the averaged-embedding feature matrix for the training tweets.
# The CSV is written under OUTPUT_FOLDER and read back as a DataFrame.
word2vec_train_filename = OUTPUT_FOLDER + 'word2vec_train_' + str(5000) + '.csv'
word2vec_train_df = create_file(word2vec_train_filename, word2vec_model_file, x_train)
print(word2vec_train_df.shape)
word2vec_train_df.head()

(1101, 5000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.020854,0.01881,0.015521,-0.010192,-0.004975,-0.015644,0.016676,0.006706,0.010173,0.020653,...,-0.000926,-0.047451,-0.021683,-0.006292,0.017268,0.03374,-0.020861,-0.016758,-0.025478,-0.011899
1,0.015238,0.013708,0.01132,-0.007324,-0.00362,-0.011353,0.012229,0.004828,0.007363,0.014947,...,-0.000583,-0.034631,-0.015766,-0.004721,0.012495,0.024428,-0.015217,-0.012071,-0.018645,-0.008696
2,0.024836,0.022093,0.01827,-0.012116,-0.005836,-0.018641,0.019734,0.007889,0.012159,0.024413,...,-0.000956,-0.056088,-0.025878,-0.007447,0.020325,0.040023,-0.024704,-0.019872,-0.030141,-0.014099
3,0.013617,0.013752,0.010346,-0.004657,-0.004618,-0.008596,0.010451,0.00408,0.005624,0.01427,...,-0.001358,-0.030844,-0.01278,-0.005003,0.012245,0.020455,-0.014675,-0.011344,-0.017423,-0.008194
4,0.012724,0.01147,0.00943,-0.006169,-0.002989,-0.009467,0.010323,0.004108,0.006103,0.012436,...,-0.000534,-0.028951,-0.013141,-0.003973,0.010413,0.020345,-0.012704,-0.010054,-0.015533,-0.007209


In [188]:
# Same as the training cell, applied to the test tweets with the same saved model.
word2vec_test_filename = OUTPUT_FOLDER + 'word2vec_test_' + str(5000) + '.csv'
word2vec_test_df = create_file(word2vec_test_filename, word2vec_model_file, x_test)
print(word2vec_test_df.shape)
word2vec_test_df.head()

(473, 5000)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.016199,0.014561,0.01222,-0.007963,-0.003597,-0.011951,0.013637,0.005241,0.007505,0.015465,...,-0.000668,-0.037562,-0.016491,-0.005059,0.013056,0.0258,-0.015963,-0.012042,-0.019934,-0.009071
1,0.023004,0.020816,0.016935,-0.010886,-0.00566,-0.016898,0.018264,0.007344,0.011194,0.022824,...,-0.001055,-0.051947,-0.023798,-0.007079,0.019,0.036897,-0.023072,-0.018507,-0.028133,-0.013115
2,0.019047,0.017101,0.014114,-0.009247,-0.004601,-0.014295,0.015192,0.006182,0.009275,0.018743,...,-0.000722,-0.043251,-0.019941,-0.00577,0.015626,0.030842,-0.01899,-0.015257,-0.023224,-0.010795
3,0.014967,0.013758,0.011148,-0.006791,-0.003859,-0.010802,0.011882,0.00474,0.006986,0.014852,...,-0.000845,-0.033879,-0.015204,-0.004718,0.012505,0.023677,-0.015154,-0.012036,-0.018426,-0.008568
4,0.018456,0.016626,0.013695,-0.008992,-0.004398,-0.013847,0.014819,0.00594,0.009029,0.018088,...,-0.000671,-0.041918,-0.01925,-0.0056,0.015066,0.02969,-0.018451,-0.014573,-0.022513,-0.01053


In [189]:
# Check that the label frames have as many rows as the feature matrices.
# They have two columns: 'index' (original row label) and 'avg_sentiment'.
print(y_train.shape)
print(y_test.shape)

(1101, 2)
(473, 2)


# Decision Tree

In [190]:
# Initialize the model. A fixed random_state makes the fitted tree, and the
# metrics reported below, repeatable across runs.
clf_decision_word2vec = DecisionTreeClassifier(random_state=42)

start_time = time.time()
# Fit the model on the averaged Word2Vec features of the training tweets
clf_decision_word2vec.fit(word2vec_train_df, y_train['avg_sentiment'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 7.080389738082886


In [191]:
from sklearn.metrics import classification_report
        
# Predict on the test features and print per-class precision, recall and F1.
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
print(classification_report(y_test['avg_sentiment'], y_pred_word2vec))

              precision    recall  f1-score   support

          -1       0.51      0.48      0.49       174
           0       0.21      0.20      0.21        40
           1       0.63      0.66      0.65       259

    accuracy                           0.56       473
   macro avg       0.45      0.45      0.45       473
weighted avg       0.55      0.56      0.55       473



Reference: [A simple Word2Vec tutorial](https://medium.com/@zafaralibagh6/a-simple-word2vec-tutorial-61e64e38a6a1)