###Forecasting of Retweets for Tweets during Demonetization in India - Program and steps

In [0]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import re
import nltk
import keras
from keras.layers import Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras import Sequential
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import multilabel_confusion_matrix
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Input, Lambda
from keras.models import Model

In [0]:
#Reading the csv file
df = pd.read_csv('demonetization-tweets.csv', encoding = 'unicode-escape')

# creating a new dataframe with the relevant columns
data = df[['text', 'Time', 'retweetCount' ]]

In [65]:
data.head()

Unnamed: 0,text,Time,retweetCount
0,RT @rssurjewala: Critical question: Was PayTM ...,6:40:30 PM,331
1,RT @Hemant_80: Did you vote on #Demonetization...,6:40:29 PM,66
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",6:40:03 PM,12
3,RT @ANI_news: Gurugram (Haryana): Post office ...,6:39:59 PM,338
4,RT @satishacharya: Reddy Wedding! @mail_today ...,6:39:39 PM,120


**K Means Clustering**

The complete Dataset has been divided into four groups based on the number of retweets.
All those tweets whose retweets fall in a certain range, will be grouped together. 
This was done using K-Means. The 'Retweets' attribute is given an input to K-Means function which then groups the number of retweets into four groups.  

In [66]:
#KMeans clustering
X = data.retweetCount.values.reshape(-1,1)
kMeans = KMeans(n_clusters=4, random_state = 0).fit(X)

#creating a new group with kMeans.labels_
data['group'] = kMeans.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


K-Means brings together tweets which are somewhat correlated to each other. This is because the tweets that produce similar number of retweets should have some relationship betwen them. The 4 groups with their respective range of retweets (min, max) are generated below-

In [67]:
data.groupby(['group']).describe()

Unnamed: 0_level_0,retweetCount,retweetCount,retweetCount,retweetCount,retweetCount,retweetCount,retweetCount,retweetCount
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,11561.0,40.684456,54.761859,0.0,0.0,13.0,71.0,230.0
1,1722.0,1233.387921,186.720006,862.0,960.0,1333.0,1333.0,2507.0
2,1648.0,428.760316,160.371473,237.0,275.0,331.0,637.0,762.0
3,9.0,4698.0,708.0,3754.0,3754.0,5170.0,5170.0,5170.0


### Text Preprocessing

In [68]:
#function to remove a certain pattern in the text
def remove_pattern(input_txt, pattern):  
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt
  
data['text'] = np.vectorize(remove_pattern)(data['text'], "@[\w]*")
data['text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0    RT : Critical question: Was PayTM informed abo...
1    RT : Did you vote on #Demonetization on Modi s...
2    RT : Former FinSec, RBI Dy Governor, CBDT Chai...
3    RT : Gurugram (Haryana): Post office employees...
4    RT : Reddy Wedding!  cartoon #demonetization #...
Name: text, dtype: object

In [69]:
#Removing all characters beside alphabets and #
data['text'] = data['text'].str.replace("[^a-zA-Z#]", " ") 
data['text'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0    RT   Critical question  Was PayTM informed abo...
1    RT   Did you vote on #Demonetization on Modi s...
2    RT   Former FinSec  RBI Dy Governor  CBDT Chai...
3    RT   Gurugram  Haryana   Post office employees...
4    RT   Reddy Wedding   cartoon #demonetization #...
Name: text, dtype: object

In [0]:
#removing all words of size less than 3 like 'the', 'and' which do not represent any sentiment and do not play a role in predicting the outcome
data['text'] = data['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [71]:
#tokenization - creating tokens for words

tokenized_tweet = data['text'].apply(lambda x: x.split())
tokenized_tweet.head()

0    [Critical, question, PayTM, informed, about, #...
1                [vote, #Demonetization, Modi, survey]
2    [Former, FinSec, Governor, CBDT, Chair, Harvar...
3    [Gurugram, Haryana, Post, office, employees, p...
4    [Reddy, Wedding, cartoon, #demonetization, #Re...
Name: text, dtype: object

In [72]:
# Stemmming - to have a single word for words having the same meaning but used in different forms like play, playing, played etc. 

from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()



0    [critic, question, paytm, inform, about, #demo...
1                       [vote, #demonet, modi, survey]
2    [former, finsec, governor, cbdt, chair, harvar...
3    [gurugram, haryana, post, offic, employe, prov...
4    [reddi, wed, cartoon, #demonet, #reddywed, htt...
Name: text, dtype: object

In [0]:
#clubbing the words/ tokens back to sentences
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])

In [0]:
#using Keras preprocessing functions for preparing the input dataset

#tokenizing the processed text again
tk = Tokenizer(lower = True)
tk.fit_on_texts(tokenized_tweet)               

#representing the word by a number equal to the frequency of its occcurence.
X_seq = tk.texts_to_sequences(tokenized_tweet)   

#limiting the length of each sentence to 100. Padding with 0 if sentence is short
X_pad = pad_sequences(X_seq, maxlen=100, padding='post')

In [0]:
#Binarizing the output label i.e. representing each label in binary format

from sklearn import preprocessing
y = data['group']
lb = preprocessing.LabelBinarizer()
lb.fit(y)
m = lb.transform(y)

In [0]:
# Preparing train and test datasets

from sklearn.utils import shuffle
X_pad, m, y  = shuffle(X_pad, m, y, random_state = 0)
X_train = X_pad[:-4000]
X_test  = X_pad[-4000:]
y_train = m[:-4000]
y_test = m[-4000:]


## Evaluation
### 1. Classification accuracy

In [0]:
def create_model():
  vocabulary_size = len(tk.word_counts.keys())+1
  max_words = 100
  embedding_size = 32
  model = Sequential()
  model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
  model.add(Dropout(0.2))
  model.add(Conv1D(100, kernel_size=8, activation='relu'))
  model.add(MaxPooling1D(pool_size=4))
  model.add(LSTM(200,return_sequences=True))
  model.add(LSTM(200))
  model.add(Dense(4, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [0]:
 estimator = KerasClassifier(build_fn=create_model, epochs=10, batch_size=100, verbose=20)

In [0]:
kfold = KFold(n_splits=3, shuffle=True, random_state=0)

In [81]:
results = cross_val_score(estimator, X_pad, y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Baseline: 98.76% (0.18%)


Classification Accuracy = **98.76%**

###2.  Precision and recall using confusion matrix

In [82]:
# using confusion matrix
model = create_model()

clf = model.fit(X_train, y_train, epochs = 15, batch_size = 500)

out = model.predict(X_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


The predicted output will be floating point values and must be rounded to the nearest integers

In [0]:
#The predicted output will be floating point values and must be rounded to the nearest integers

out1 = np.round(out)
k = out1.astype(int)

In [84]:
from sklearn.metrics import multilabel_confusion_matrix

multilabel_confusion_matrix(y_test, k, labels = [0,1,2,3])

array([[[ 882,    8],
        [  68, 3042]],

       [[3487,   58],
        [   3,  452]],

       [[3557,   10],
        [  20,  413]],

       [[3998,    0],
        [   2,    0]]])

Precision = True Positives / (True Positives + False Positives);
Recall  = True Positives / (True Positives + Falsev Negatives);

True positives = 3042 + 452 + 413 + 0 = 3907;
False Negatives = 68 + 3 + 20 + 2 = 93;
False Positives = 8 + 58 + 10 + 0 = 76; 

 Precision = **98**;
 Recall = **97.6;**