#                    TCS iON: Rio 45- Automate detection of different sentiments from                                                       textual comments and feedback


#                    PIYUSH TRIVEDI
#                    IIT KANPUR

# Importing Modules

In [1]:
import tensorflow as tf
import numpy as np
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Checking the Tensorflow Version

In [2]:
tf.__version__

'2.9.1'

## Reading in the Twitter Data obtained from Kaggle

### Data Link: Charan Gowda, Anirudh, Akshay Pai, &amp; Chaithanya kumar A. (2019). <i>Twitter and Reddit Sentimental analysis Dataset</i> [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DS/429085

## Here I am performing sentiment analysis only on the twitter data.

In [3]:
df = pd.read_csv('Twitter_Data.csv')

### Checking the Head of the Data

In [4]:
df.head(10)

Unnamed: 0,Clean_Text,Category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


## Code : Sentiment 
##  1  : Positive ¶
##  0  : Neutral
##  -1 : Negative

In [5]:
df.shape

(162980, 2)

### Reviewing the Columns in the Data

In [6]:
df.columns

Index(['Clean_Text', 'Category'], dtype='object')

In [7]:
df.index

RangeIndex(start=0, stop=162980, step=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Clean_Text  162976 non-null  object 
 1   Category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


### Checking the NaN values in the Clean_Text Column

In [9]:
df[df['Clean_Text'].isnull()]

Unnamed: 0,Clean_Text,Category
148,,0.0
158694,,-1.0
159443,,0.0
160560,,1.0


### Checking the NaN values in the Category Column

In [10]:
df[df['Category'].isnull()]

Unnamed: 0,Clean_Text,Category
130448,the foundation stone northeast gas grid inaugu...,
155642,dear terrorists you can run but you cant hide ...,
155698,offense the best defence with mission shakti m...,
155770,have always heard politicians backing out thei...,
158693,modi government plans felicitate the faceless ...,
159442,chidambaram gives praises modinomics,
160559,the reason why modi contested from seats 2014 ...,


In [11]:
df.isnull().sum()

Clean_Text    4
Category      7
dtype: int64

### Dropping the NaN Values 

In [12]:
df = df.dropna()

In [13]:
df

Unnamed: 0,Clean_Text,Category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [14]:
df['Clean_Text'] = df['Clean_Text'].astype('str')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162969 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Clean_Text  162969 non-null  object 
 1   Category    162969 non-null  float64
dtypes: float64(1), object(1)
memory usage: 3.7+ MB


### Dataframe information after dropping the NaN values

In [16]:
df.isnull().any()

Clean_Text    False
Category      False
dtype: bool

In [17]:
X = df['Clean_Text']
y = df['Category']

In [18]:
# Changing the Datatype to float64
y = y.apply(pd.to_numeric)

In [19]:
X

0         when modi promised “minimum government maximum...
1         talk all the nonsense and continue all the dra...
2         what did just say vote for modi  welcome bjp t...
3         asking his supporters prefix chowkidar their n...
4         answer who among these the most powerful world...
                                ...                        
162975    why these 456 crores paid neerav modi not reco...
162976    dear rss terrorist payal gawar what about modi...
162977    did you cover her interaction forum where she ...
162978    there big project came into india modi dream p...
162979    have you ever listen about like gurukul where ...
Name: Clean_Text, Length: 162969, dtype: object

In [20]:
y

0        -1.0
1         0.0
2         1.0
3         1.0
4         1.0
         ... 
162975   -1.0
162976   -1.0
162977    0.0
162978    0.0
162979    1.0
Name: Category, Length: 162969, dtype: float64

## Importing nltk and stopwords

In [21]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


## Importing modules

In [22]:
import random
import string   
from string import punctuation
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import twitter_samples
from tensorflow.keras.preprocessing.text import Tokenizer                        
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PIYUSH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Cleaning and Preprocessing

## Removing stopwords and punctuation

In [23]:
#Importing the english stop words list from NLTK
stopwords_english = stopwords.words('english') 

print('Stop words\n')
print(stopwords_english)

print('\nPunctuation\n')
print(string.punctuation)


Stop words

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [24]:
df=df.dropna(axis=0)

In [25]:
df.isnull().sum()

Clean_Text    0
Category      0
dtype: int64

In [26]:
# Removing the stopwords from clean texts
df['Clean_Text'] = df['Clean_Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_english)]))

In [27]:
df['Clean_Text'][0]

'modi promised “minimum government maximum governance” expected begin difficult job reforming state take years get justice state business exit psus temples'

In [28]:
# Removing the punctuations from tweets
df['Clean_Text'] = df['Clean_Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [29]:
df['Clean_Text'][0]

'modi promised minimum government maximum governance expected begin difficult job reforming state take years get justice state business exit psus temples'

## Stemming the words using Porter Stemmer

In [30]:
PS = PorterStemmer() 
def stemming(word):
    Corpus=[]
    for i in word.split():
        list1.append(PS.stem(i))
    return ' '.join(Corpus)
    
df['Clean_Text'] = df['Clean_Text'].apply(lambda x:stemming(x))

In [31]:
df['Category'] = [2 if x == -1 else x for x in df['Category']]

In [56]:
df_1=df.copy()

In [57]:
df_1

Unnamed: 0,Clean_Text,Category
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
...,...,...
162975,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
162976,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
162977,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
162978,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0


## Importing the necessary modules

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer                        
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

## Tokenizing the words using Tokenizer

In [34]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.Clean_Text)
word_index = tokenizer.word_index

In [35]:
vocab_size = len(word_index)+1

## Padding the Tweets

In [36]:
# Padding the tokenized sequences to the same length

max_length = 200
lines = pad_sequences(tokenizer.texts_to_sequences(df.Clean_Text),
                        maxlen = max_length)

In [37]:
df.Clean_Text = lines.tolist()

In [38]:
df.Category.value_counts()

1.0    72249
0.0    55211
2.0    35509
Name: Category, dtype: int64

In [39]:
df.head(10)

Unnamed: 0,Clean_Text,Category
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2.0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0
9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1.0


In [40]:
df.isnull().sum()

Clean_Text    0
Category      0
dtype: int64

## Splitting the Data

In [41]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['Clean_Text'],df['Category'],test_size=0.2,random_state=101)
X_train = np.vstack(X_train.values)
y_train = np.vstack(y_train.values)
X_val = np.vstack(X_test.values)
y_val = np.vstack(y_test.values)


## Doing the Sequential Modelling 

In [42]:
model = tf.keras.Sequential()

# Adding the Input layer
model.add(Input(shape=(None,)))

# Adding the Embedding layer
model.add(Embedding(input_dim=vocab_size,output_dim=200,trainable=True))

# Adding the LSTM layer
model.add(LSTM(64, activation='relu'))

# Adding the Fully connected layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

# Adding the Output layer
model.add(Dense(3, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 200)         17711000  
                                                                 
 lstm (LSTM)                 (None, 64)                67840     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 195       
                                                                 
Total params: 17,783,195
Trainable params: 17,783,195
Non-trainable params: 0
_________________________________________________________________


## Adding the optimizer and the loss function

## Training the model on train data

In [43]:
model.compile(optimizer='adam', loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=5, batch_size=200, verbose=1, shuffle=True, validation_data=(X_val,y_val))

Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Accuracy and loss

In [49]:
predictions = model.predict(X_val)
print(history.history)

{'loss': [0.5787506103515625, 0.3435381054878235, 0.2642001211643219, 0.5425295829772949, 0.23189449310302734], 'accuracy': [0.7716280221939087, 0.8802991509437561, 0.9085254073143005, 0.8704276084899902, 0.9211735129356384], 'val_loss': [0.4068510830402374, 0.40860816836357117, 0.43343231081962585, 0.5205487012863159, 0.5245438814163208], 'val_accuracy': [0.8566914200782776, 0.8561698198318481, 0.8451248407363892, 0.8213781714439392, 0.8288028240203857]}


In [50]:
predictions

array([[0.3799397 , 0.80422616, 0.15706287],
       [0.01334224, 0.9460631 , 0.41554096],
       [0.16208653, 0.97524476, 0.03513708],
       ...,
       [0.04793998, 0.9369156 , 0.17117883],
       [0.92011046, 0.50704247, 0.07800508],
       [0.821929  , 0.46599087, 0.18397337]], dtype=float32)

In [51]:
m = len(predictions)

In [52]:
m

32594

## Evaluation of the Model on the test data

In [53]:
print("Evaluating on Test data")
results = model.evaluate(X_val, y_val, batch_size=128)
print("Test Loss, Test Accuracy:", results)

Evaluating on Test data
Test Loss, Test Accuracy: [0.5245440602302551, 0.8288028240203857]


###  Sow we can see that we are able to attain the accuracy of 82.88 %.