In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

In [25]:
data = pd.read_csv('Twitter_Data.csv')
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [26]:
data.isnull().sum()

clean_text    4
category      7
dtype: int64

In [27]:
data.shape


(162980, 2)

In [28]:
data.category.nunique(), data.category.unique()

(3, array([-1.,  0.,  1., nan]))

In [29]:
data = data.dropna()
data.isnull().sum()

clean_text    0
category      0
dtype: int64

In [30]:
data = data.sample(frac = 1).reset_index(drop = True)
data.head(5)

Unnamed: 0,clean_text,category
0,need investigate whether most the electoral bo...,1.0
1,watch full prime minister narendra modis first...,1.0
2,yes great human being with enormous knowledge ...,1.0
3,last checked modi from gujarat but hes the fro...,0.0
4,always feel that you are sensible person would...,0.0


In [32]:
labels = pd.get_dummies(data.category)
labels.columns = ["negative", "neutral", "positive"]
labels.head(5)

Unnamed: 0,negative,neutral,positive
0,False,False,True
1,False,False,True
2,False,False,True
3,False,True,False
4,False,True,False


In [33]:
data = data.drop(columns = "category")
data.head(3)

Unnamed: 0,clean_text
0,need investigate whether most the electoral bo...
1,watch full prime minister narendra modis first...
2,yes great human being with enormous knowledge ...


In [34]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = 8150, lower = True, split = " ", oov_token = "~")
tokenizer.fit_on_texts(data["clean_text"])

In [35]:
word_index = tokenizer.word_index
len(word_index)

113679

In [36]:
print(list(word_index.keys())[:100])

['~', 'modi', 'the', 'and', 'for', 'you', 'not', 'will', 'this', 'india', 'are', 'that', 'with', 'has', 'have', 'but', 'all', 'from', 'was', 'his', 'they', 'bjp', 'who', 'people', 'what', 'congress', 'like', 'your', 'only', 'now', 'narendra', 'can', 'why', 'our', 'its', 'govt', 'about', 'one', 'vote', 'dont', 'him', 'just', 'when', 'their', 'modis', 'more', 'country', 'how', 'election', 'years', 'should', 'time', 'rahul', 'did', 'indian', 'any', 'nation', 'against', 'even', 'after', 'then', 'also', 'which', 'there', 'out', 'know', 'government', 'again', 'them', 'get', 'power', 'want', 'had', 'gandhi', 'because', 'than', 'minister', 'good', 'give', 'these', 'been', 'would', 'space', 'money', 'said', 'says', 'under', 'see', 'hai', 'done', 'credit', 'party', 'sir', 'pakistan', 'make', 'were', 'prime', 'every', 'think', 'say']


In [37]:
data["clean_text"] = tokenizer.texts_to_sequences(data["clean_text"])

In [38]:
data.head(3)

Unnamed: 0,clean_text
0,"[121, 4611, 661, 174, 3, 1098, 2154, 356, 63, ..."
1,"[194, 304, 97, 77, 31, 45, 140, 108, 354, 335,..."
2,"[179, 127, 1148, 138, 13, 7165, 887, 4, 3272, ..."


In [39]:
len(data.clean_text[0]), len(data.clean_text[1]), len(data.clean_text[2])

(31, 17, 24)

In [40]:
tweets = pad_sequences(data["clean_text"])

In [41]:
tweets[0].shape, tweets[1].shape, tweets[2].shape

((52,), (52,), (52,))

In [42]:
tweets.shape

(162969, 52)

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size = 0.15)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((138523, 52), (138523, 3), (24446, 52), (24446, 3))

In [44]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2)
print(f"""
Training set: tweets = {X_train.shape}, labels = {y_train.shape},
Validation set: tweets = {X_valid.shape}, labels = {y_valid.shape},
Test set: tweets = {X_test.shape}, labels = {y_test.shape}
""")


Training set: tweets = (110818, 52), labels = (110818, 3),
Validation set: tweets = (27705, 52), labels = (27705, 3),
Test set: tweets = (24446, 52), labels = (24446, 3)



In [45]:
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim = 8150, output_dim = 32),
    keras.layers.GRU(128),
    keras.layers.Dense(128, activation = "leaky_relu", kernel_initializer = "he_normal", kernel_regularizer = "l1"),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(3, activation = "softmax", kernel_initializer = "glorot_normal")
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          260800    
                                                                 
 gru (GRU)                   (None, 128)               62208     
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 3)                 387       
                                                                 
Total params: 339907 (1.30 MB)
Trainable params: 339907 (1.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [46]:
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [48]:
history = model.fit(
    X_train, y_train, 
    epochs = 1, validation_data = (X_valid, y_valid),
    callbacks = [keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True)]
)



In [49]:
model.evaluate(X_test, y_test)



[0.3002081513404846, 0.9538983702659607]