In [392]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import TweetTokenizer
from gensim.parsing.porter import PorterStemmer


In [393]:
tweets_df = pd.read_csv('data/sentiment.csv')
tweets_df.shape

(1574, 19)

In [394]:
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions,tweet_id_dup,textblob_polarity,nltk_compound,avg_sentiment,textblob_sentiment,nltk_sentiment
0,0,1.11337e+18,1.58927e+18,LifestyleVishnu,"Ambikapur, India",304,286,2022-11-06 15:04:46+00:00,5764,0,india demanding arrest deepak chaurasia pocso ...,"[{'text': 'NoActionOnArrestWarrant', 'indices'...","[{'screen_name': 'AjayHimatlal', 'name': 'Ajay...",1.589273e+18,-0.05,-0.875,-1,-1,-1
1,1,2249900000.0,1.58927e+18,prkgarg,GLOBE,709,203,2022-11-06 15:04:40+00:00,24799,0,get vote gujarat soon entire gang jail looting...,[],"[{'screen_name': 'ArvindKejriwal', 'name': 'Ar...",1.589273e+18,-0.05,0.0,-1,-1,0
2,2,7.80817e+17,1.58927e+18,SunilBhatM,"New Delhi, India",930,2284,2022-11-06 15:04:38+00:00,8235,0,magnificent visited pradhan manthri sangrahala...,[],"[{'screen_name': 'narendramodi', 'name': 'Nare...",1.589273e+18,0.386667,0.8807,1,1,1
3,3,1.38846e+18,1.58927e+18,veerappavenkap1,"Bengaluru, India",4924,5453,2022-11-06 15:03:55+00:00,53735,0,almost year bjp ruled gujarat chacha still tal...,[],"[{'screen_name': 'PTI_News', 'name': 'Press Tr...",1.589272e+18,0.0,0.0,0,0,0
4,4,1.46932e+18,1.58927e+18,MukeshS68108786,,173,903,2022-11-06 15:00:20+00:00,14241,2,time gurugram police come action money minded ...,"[{'text': 'NoActionOnArrestWarrant', 'indices'...","[{'screen_name': 'AJAYGUP69169747', 'name': 'A...",1.589271e+18,0.183333,-0.6486,-1,1,-1


In [395]:
# dropping user_id, username, location, following, followers, twt_created_at, total_tweets, retweet_count, hashtags, mentions, tweet_id_dup
tweets_df.drop(['Unnamed: 0', 'user_id', 'tweet_id', 'username', 'location', 'following', 'followers', 'twt_created_at', 'total_tweets', 'retweet_count', 'hashtags', 'mentions', 'textblob_polarity', 'nltk_compound', 'textblob_sentiment', 'nltk_sentiment'], axis=1, inplace=True)

In [396]:
tweets_df.columns

Index(['text', 'tweet_id_dup', 'avg_sentiment'], dtype='object')

In [397]:
tweets_df.head()

Unnamed: 0,text,tweet_id_dup,avg_sentiment
0,india demanding arrest deepak chaurasia pocso ...,1.589273e+18,-1
1,get vote gujarat soon entire gang jail looting...,1.589273e+18,-1
2,magnificent visited pradhan manthri sangrahala...,1.589273e+18,1
3,almost year bjp ruled gujarat chacha still tal...,1.589272e+18,0
4,time gurugram police come action money minded ...,1.589271e+18,-1


In [398]:
tweets_df['avg_sentiment'].value_counts()

 1    861
-1    580
 0    133
Name: avg_sentiment, dtype: int64

In [399]:
def creating_tokens(df):
    tokens = list()
    tokenizer = TweetTokenizer()
    
    for tweets in df.loc[:, 'text']:
        # print(len(tokenizer.tokenize(tweets)))
        tokens.append(tokenizer.tokenize(tweets))
    
    df['tokens'] = tokens
    
    porter_stemmer = PorterStemmer()
    # Get the stemmed_tokens
    df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['tokens']]
    df['stemmed_tokens'].head(10)
    
    
    return df

In [400]:
tweets_df = creating_tokens(tweets_df)
tweets_df.head()

Unnamed: 0,text,tweet_id_dup,avg_sentiment,tokens,stemmed_tokens
0,india demanding arrest deepak chaurasia pocso ...,1.589273e+18,-1,"[india, demanding, arrest, deepak, chaurasia, ...","[india, demand, arrest, deepak, chaurasia, poc..."
1,get vote gujarat soon entire gang jail looting...,1.589273e+18,-1,"[get, vote, gujarat, soon, entire, gang, jail,...","[get, vote, gujarat, soon, entir, gang, jail, ..."
2,magnificent visited pradhan manthri sangrahala...,1.589273e+18,1,"[magnificent, visited, pradhan, manthri, sangr...","[magnific, visit, pradhan, manthri, sangrahala..."
3,almost year bjp ruled gujarat chacha still tal...,1.589272e+18,0,"[almost, year, bjp, ruled, gujarat, chacha, st...","[almost, year, bjp, rule, gujarat, chacha, sti..."
4,time gurugram police come action money minded ...,1.589271e+18,-1,"[time, gurugram, police, come, action, money, ...","[time, gurugram, polic, come, action, monei, m..."


In [401]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size):
    x_train, x_test, y_train, y_test = train_test_split(df['stemmed_tokens'], df['avg_sentiment'], test_size=test_size, random_state=42, stratify=df['avg_sentiment'])
    
    print(y_train.value_counts())
    print(y_test.value_counts())
    # print(type(x_train))
    # print(type(y_train))
    
    x_train = x_train.to_frame()
    x_train = x_train.reset_index()
    
    x_test = x_test.to_frame()
    x_test = x_test.reset_index()
    
    y_train = y_train.to_frame()
    y_train = y_train.reset_index()
    
    y_test = y_test.to_frame()
    y_test = y_test.reset_index()
    
    return x_train, x_test, y_train, y_test

In [402]:
x_train, x_test, y_train, y_test = split_data(tweets_df, 0.3)

 1    602
-1    406
 0     93
Name: avg_sentiment, dtype: int64
 1    259
-1    174
 0     40
Name: avg_sentiment, dtype: int64


https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwitu6WthJ77AhX_TmwGHYzTBjMQFnoECBcQAQ&url=https%3A%2F%2Fmedium.com%2F%40zafaralibagh6%2Fa-simple-word2vec-tutorial-61e64e38a6a1&usg=AOvVaw3tHKEk24OxG_LwAiMr2wZs

In [403]:
print(x_train.head())
print(x_test.head())
print(y_train.head())
print(y_test.head())

   index                                     stemmed_tokens
0   1176  [light, hamilton, road, gurugram, last, month,...
1    205  [piror, matter, home, first, seat, first, piro...
2   1226  [citizen, well, awar, ill, benefit, work, gras...
3    721  [dai, left, mega, event, venu, central, univer...
4    675  [newli, dedic, shri, mahak, lok, shri, offer, ...
   index                                     stemmed_tokens
0   1126  [sometim, water, secur, hour, boundari, stp, w...
1   1142  [todai, indian, express, see, educ, model, eno...
2    348  [gave, good, railwai, minist, suresh, prabhu, ...
3   1337  [instead, delhi, could, develop, central, citi...
4   1568  [guess, lot, improv, done, seat, need, outsmar...
   index  avg_sentiment
0   1176              1
1    205              1
2   1226              1
3    721              0
4    675              1
   index  avg_sentiment
0   1126             -1
1   1142              0
2    348             -1
3   1337             -1
4   1568        

In [404]:
from gensim.models import Word2Vec
import time

OUTPUT_FOLDER = '/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/'

start_time = time.time()
tokens = pd.Series(tweets_df['stemmed_tokens']).values
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(100) + '.model'

w2v_model = Word2Vec(tokens, min_count=1, vector_size=100, window=5, workers=4, sg=1)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)


Time taken to train word2vec model: 0.47362303733825684


In [405]:
sg_w2v_model = Word2Vec.load(word2vec_model_file)

print("Index of the word 'action':")
print(sg_w2v_model.wv.key_to_index["action"])
# Total number of the words 
print(len(sg_w2v_model.wv))
# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
print(len(sg_w2v_model.wv['action']))
# Get the mean for the vectors for an example review
print("Print the length after taking average of all word vectors in a sentence:")
print(len(np.mean([sg_w2v_model.wv[token] for token in tweets_df['stemmed_tokens'][0]], axis=0)))
print(len(tweets_df['stemmed_tokens'][0]))



Index of the word 'action':
6
5227
Length of the vector generated for a word
100
Print the length after taking average of all word vectors in a sentence:
100
25


In [406]:
word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in x_train.iterrows():
        model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(100))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(100)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

In [408]:
import time
from sklearn.tree import DecisionTreeClassifier
word2vec_df = pd.read_csv(word2vec_filename)
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

start_time = time.time()
# Fit the model
clf_decision_word2vec.fit(word2vec_df, y_train['avg_sentiment'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.16178512573242188


In [411]:
from sklearn.metrics import classification_report
test_features_word2vec = []
for index, row in x_test.iterrows():
    model_vector = np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)
    if type(model_vector) is list:
        test_features_word2vec.append(model_vector)
    else:
        test_features_word2vec.append(np.array([0 for i in range(100)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)
print(classification_report(y_test['avg_sentiment'],test_predictions_word2vec))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       174
           0       0.08      1.00      0.16        40
           1       0.00      0.00      0.00       259

    accuracy                           0.08       473
   macro avg       0.03      0.33      0.05       473
weighted avg       0.01      0.08      0.01       473



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
