In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import time
from nltk.tokenize import TweetTokenizer
from gensim.parsing.porter import PorterStemmer

from sklearn.tree import DecisionTreeClassifier


In [15]:
tweets_df = pd.read_csv('/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/data/final_manual_priority.csv')
tweets_df.shape

(198, 19)

In [16]:
tweets_df.head()

Unnamed: 0,S.no.,user_id,tweet_id,username,location,following,followers,twt_created_at,total_tweets,retweet_count,text,hashtags,mentions,textblob_polarity,nltk_compound,avg_sentiment,textblob_sentiment,nltk_sentiment,priority
0,4,11,11,UtkarshMishra_9,"Noida, India",707,1122,2022-11-08 21:14:55+00:00,5764,0,estimated magnitude earthquake affected countr...,"[{'text': 'earthquake', 'indices': [137, 148]}...","[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3...",0.0,-0.1531,-1,0,-1,0
1,8,18,18,GirjeshKPatel,"‚Ä°¬ß‚â§‚Ä°¬ß√±‚Ä°¬ß¬Æ‚Ä°¬ß√§, ‚Ä°¬ß‚â†‚Ä°¬ß√¶...",164,988,2022-11-08 20:54:48+00:00,522,0,heavy roorke uttrakhand second horrible moment,"[{'text': 'earthquake', 'indices': [6, 17]}]","[{'screen_name': 'ZeeNews', 'name': 'Zee News'...",-0.4,-0.5423,-1,-1,-1,1
2,10,25,25,TheAnantpandit,"New Delhi, India",409,19,2022-11-08 20:46:16+00:00,152,0,earthquake magnitude occurred ist lat long dep...,[],"[{'screen_name': 'Indiametdept', 'name': 'Indi...",-0.05,0.0,-1,-1,0,0
3,11,31,31,kanhagupta21,Allahabad,182,13,2022-11-08 20:38:44+00:00,73,0,horrible ended running outside home safe,[],"[{'screen_name': 'ANI', 'name': 'ANI', 'id': 3...",-0.166667,-0.1531,-1,-1,-1,1
4,13,42,42,Kunalgupta_voi,,21,1,2022-11-08 20:10:38+00:00,1,0,choking wakeup antismog gun installed watering...,"[{'text': 'DelhiPollution', 'indices': [253, 2...","[{'screen_name': 'ArvindKejriwal', 'name': 'Ar...",0.0,-0.6597,-1,0,-1,1


In [17]:
# dropping user_id, username, location, following, followers, twt_created_at, total_tweets, retweet_count, hashtags, mentions, tweet_id_dup
tweets_df.drop(['S.no.', 'user_id', 'username', 'location', 'following', 'followers', 'twt_created_at', 'total_tweets', 'retweet_count', 'hashtags', 'mentions', 'textblob_polarity', 'nltk_compound', 'textblob_sentiment', 'nltk_sentiment', 'avg_sentiment'], axis=1, inplace=True)

In [18]:
tweets_df.columns

Index(['tweet_id', 'text', 'priority'], dtype='object')

In [19]:
tweets_df.head()

Unnamed: 0,tweet_id,text,priority
0,11,estimated magnitude earthquake affected countr...,0
1,18,heavy roorke uttrakhand second horrible moment,1
2,25,earthquake magnitude occurred ist lat long dep...,0
3,31,horrible ended running outside home safe,1
4,42,choking wakeup antismog gun installed watering...,1


In [20]:
tweets_df['priority'].value_counts()

1    109
0     89
Name: priority, dtype: int64

In [21]:
def creating_tokens(df):
    tokens = list()
    tokenizer = TweetTokenizer()
    
    for tweets in df.loc[:, 'text']:
        # print(len(tokenizer.tokenize(tweets)))
        tokens.append(tokenizer.tokenize(tweets))
    
    df['tokens'] = tokens
    
    porter_stemmer = PorterStemmer()
    # Get the stemmed_tokens
    df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['tokens']]
    df['stemmed_tokens'].head(10)
    
    
    return df

In [22]:
tweets_df = creating_tokens(tweets_df)
tweets_df.head()

Unnamed: 0,tweet_id,text,priority,tokens,stemmed_tokens
0,11,estimated magnitude earthquake affected countr...,0,"[estimated, magnitude, earthquake, affected, c...","[estim, magnitud, earthquak, affect, countri, ..."
1,18,heavy roorke uttrakhand second horrible moment,1,"[heavy, roorke, uttrakhand, second, horrible, ...","[heavi, roork, uttrakhand, second, horribl, mo..."
2,25,earthquake magnitude occurred ist lat long dep...,0,"[earthquake, magnitude, occurred, ist, lat, lo...","[earthquak, magnitud, occur, ist, lat, long, d..."
3,31,horrible ended running outside home safe,1,"[horrible, ended, running, outside, home, safe]","[horribl, end, run, outsid, home, safe]"
4,42,choking wakeup antismog gun installed watering...,1,"[choking, wakeup, antismog, gun, installed, wa...","[choke, wakeup, antismog, gun, instal, water, ..."


In [23]:
from sklearn.model_selection import train_test_split

def split_data(df, test_size):
    x_train, x_test, y_train, y_test = train_test_split(df['stemmed_tokens'], df['priority'], test_size=test_size, random_state=42, stratify=df['priority'])
    
    print(y_train.value_counts())
    print(y_test.value_counts())
    # print(type(x_train))
    # print(type(y_train))
    
    x_train = x_train.to_frame()
    x_train = x_train.reset_index()
    
    x_test = x_test.to_frame()
    x_test = x_test.reset_index()
    
    y_train = y_train.to_frame()
    y_train = y_train.reset_index()
    
    y_test = y_test.to_frame()
    y_test = y_test.reset_index()
    
    return x_train, x_test, y_train, y_test

In [24]:
x_train, x_test, y_train, y_test = split_data(tweets_df, 0.3)

1    76
0    62
Name: priority, dtype: int64
1    33
0    27
Name: priority, dtype: int64


In [25]:
print(x_train.head())
print(x_test.head())
print(y_train.head())
print(y_test.head())

   index                                     stemmed_tokens
0    164  [forgiv, rapist, murder, cruel, enter, mandir,...
1     36  [socha, puch, illeg, resort, demolish, still, ...
2     92  [todai, whole, year, complet, dai, written, pa...
3     42  [see, ground, realiti, statu, sardar, pathet, ...
4     50  [dirti, monei, aap, need, investig, lie, detec...
   index                                     stemmed_tokens
0    188  [hai, chor, sath, deta, hai, pich, leta, hai, ...
1     23  [travel, pass, markundi, toll, acp, toll, pvt,...
2    174  [final, todai, yet, mcd, remov, broken, bench,...
3     73  [crore, public, monei, invest, project, clean,...
4    152  [histor, judgement, suprem, court, put, stamp,...
   index  priority
0    164         0
1     36         0
2     92         0
3     42         1
4     50         0
   index  priority
0    188         1
1     23         1
2    174         0
3     73         1
4    152         0


In [26]:
from gensim.models import Word2Vec
import time

OUTPUT_FOLDER = '/Users/nitanshjain/Documents/Thapar 4th Sem/Machine Learing/Machine_Learning_Project/'

start_time = time.time()
tokens = pd.Series(tweets_df['stemmed_tokens']).values
# print(tokens)
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_priority' + str(200) + '.model'

w2v_model = Word2Vec(tokens, min_count=1, vector_size=200, window=8, workers=7, sg=3)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)


Time taken to train word2vec model: 0.20366311073303223


In [27]:
def create_file(create_file, model_file, x):
    sg_w2v_model = Word2Vec.load(model_file)
    
    with open(create_file, 'w+') as word2vec_file:
        for index, row in x.iterrows():
            model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens']], axis=0)).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(200))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            
            if type(model_vector) is list:
                line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
            else:
                line1 = ",".join([str(0) for i in range(200)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')
    
    df = pd.read_csv(create_file)
    return df
        

In [28]:
word2vec_train_filename = OUTPUT_FOLDER + 'word2vec_train_priority' + str(200) + '.csv'
word2vec_train_df = create_file(word2vec_train_filename, word2vec_model_file, x_train)
print(word2vec_train_df.shape)
word2vec_train_df.head()

(138, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.000638,0.000772,0.000872,0.002585,0.001369,-0.000683,-0.000171,0.001276,-0.000407,0.001717,...,0.001726,-0.001953,-0.000707,-0.001829,0.001028,0.002074,0.000339,-0.002011,6.7e-05,-4.9e-05
1,0.000644,0.000996,8.2e-05,0.002669,0.001502,-0.001288,-0.00196,0.003513,-0.001415,0.004125,...,0.0026,-0.001403,-0.001382,-0.001717,0.002783,0.002223,0.000329,-0.002829,0.00052,-0.000732
2,0.000347,-0.000391,0.001353,0.002739,0.003886,-0.003617,0.000627,0.004398,-0.002802,0.001932,...,0.002224,-0.003431,-0.001322,-0.002669,0.003696,0.000559,0.000296,-0.003966,-0.000804,-0.000751
3,-0.000556,-0.000983,0.000412,0.002801,0.003366,-0.00248,-0.001213,0.002929,-0.001465,0.002784,...,0.001999,-0.003171,5.3e-05,-0.002314,0.003037,0.002208,0.000873,-0.003116,-0.000339,0.001023
4,0.000443,0.000362,0.001485,0.005043,0.004761,-0.004497,-0.000259,0.006412,-0.002305,0.002547,...,0.003758,-0.004256,-0.000566,-0.003826,0.002803,0.004777,0.00054,-0.00509,-0.002272,-0.000429


In [29]:
word2vec_test_filename = OUTPUT_FOLDER + 'word2vec_test_priority' + str(200) + '.csv'
word2vec_test_df = create_file(word2vec_test_filename, word2vec_model_file, x_test)
print(word2vec_test_df.shape)
word2vec_test_df.head()

(60, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.000391,-0.000852,0.00112,0.004581,0.003083,-0.003476,0.000676,0.004705,-0.002794,0.002152,...,0.001359,-0.004149,-0.001546,-0.00479,0.002363,0.002791,0.000607,-0.004625,-0.001515,0.000891
1,0.000784,0.000684,0.001435,0.002856,0.002304,-0.001798,-0.000542,0.002502,-0.001741,0.001353,...,0.001797,-0.002888,-0.001447,-0.001975,0.001432,0.001059,0.000344,-0.002106,-0.000862,0.001031
2,-0.000996,0.001554,0.00085,0.003801,0.002613,-0.002178,-0.000713,0.003097,-0.000857,0.00322,...,0.001431,-0.002543,-0.000391,-0.0022,0.002228,0.002687,0.000906,-0.002576,3.1e-05,0.000341
3,-6.3e-05,-0.000919,0.000612,0.003401,0.004513,-0.001472,-0.000966,0.004367,-0.00072,0.002222,...,0.001283,-0.003026,-0.000391,-0.002425,0.002589,0.00101,0.000417,-0.004035,-0.000447,-0.000616
4,0.00027,0.000614,0.001371,0.000918,0.003644,-0.002987,-0.000252,0.003833,-0.002494,0.002543,...,0.000773,-0.002993,0.001154,-0.001238,0.002974,0.001893,-2.3e-05,-0.001819,-0.000207,-0.000664


In [30]:
print(y_train.shape)
print(y_test.shape)

(138, 2)
(60, 2)


# Models

*Decision Tree*

In [31]:
# Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

start_time = time.time()
# Fit the model
clf_decision_word2vec.fit(word2vec_train_df, y_train['priority'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.023982763290405273


In [32]:
from sklearn.metrics import classification_report
        
y_pred_word2vec = clf_decision_word2vec.predict(word2vec_test_df)
print(classification_report(y_test['priority'], y_pred_word2vec))

              precision    recall  f1-score   support

           0       0.56      0.52      0.54        27
           1       0.63      0.67      0.65        33

    accuracy                           0.60        60
   macro avg       0.59      0.59      0.59        60
weighted avg       0.60      0.60      0.60        60



*Random Forest Classifier*


In [33]:
from sklearn.ensemble import RandomForestClassifier

clf_Random_Forest_Classifier = RandomForestClassifier(n_estimators = 100) 
 
clf_Random_Forest_Classifier.fit(word2vec_train_df, y_train['priority'])
 
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.5915660858154297


In [34]:
from sklearn.metrics import classification_report

y_pred_randomforest = clf_Random_Forest_Classifier.predict(word2vec_test_df)

print(classification_report(y_test['priority'], y_pred_randomforest))

              precision    recall  f1-score   support

           0       0.68      0.48      0.57        27
           1       0.66      0.82      0.73        33

    accuracy                           0.67        60
   macro avg       0.67      0.65      0.65        60
weighted avg       0.67      0.67      0.66        60



*Multinomial Gaussian Naive Bayes*

In [35]:
from sklearn.naive_bayes import GaussianNB

clf_GNB = GaussianNB()
clf_GNB.fit(word2vec_train_df, y_train['priority'])
print("Time taken to fit the model with word2vec vectors: " + str(time.time() - start_time))

Time taken to fit the model with word2vec vectors: 0.7547149658203125


In [36]:
from sklearn.metrics import classification_report

y_pred_GNB = clf_GNB.predict(word2vec_test_df)

print(classification_report(y_test['priority'], y_pred_GNB))

              precision    recall  f1-score   support

           0       0.57      0.74      0.65        27
           1       0.72      0.55      0.62        33

    accuracy                           0.63        60
   macro avg       0.65      0.64      0.63        60
weighted avg       0.65      0.63      0.63        60

