# 205229118
# Mahalakshmi S

## Lab17. Text Classification using Simple RNN
Objectives: In this lab, you will implement custom sentiment analysis model using Simple RNN. 
## Steps
### Import the libraries

In [1]:
import csv
import tensorflow as tf
import numpy as np
import pandas as pd
import nltk 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding,SimpleRNN
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
from sklearn.model_selection import train_test_split

import matplotlib
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MAHALAKSHMI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
com = pd.read_csv("comments.csv", encoding= 'unicode_escape')
com.head()

Unnamed: 0,label,movie review
0,0,"When a new employee joins a team, the initial ..."
1,0,home comes with its own distractions.
2,0,Avoid work calls post-work hours unless absolu...
3,0,Risk of feeling burnt out
4,0,real challenge for employers to monitor perfor...


In [3]:
X=[]
y = com['label']
for review in com['movie review']:
    filtered_sentence = [w.lower() for w in review.split() if not w in STOPWORDS]
    X.append(filtered_sentence)
X = pd.Series(X)  

## Dataset Preparation

In [4]:
X_train,X_val,y_train,y_val = train_test_split(X,y,train_size=0.7)

In [5]:
print(X_train.shape)
print(y_train.shape)

(14,)
(14,)


In [6]:
print(X_val.shape)
print(y_val.shape)

(6,)
(6,)


## Pre-processing the Text

In [7]:
train_token = Tokenizer(num_words=500,oov_token='<oov>')
train_token.fit_on_texts(X_train)
word_index = train_token.word_index
train_sequence = train_token.texts_to_sequences(X_train)

In [8]:
dict(list(word_index.items())[0:10])

{'<oov>': 1,
 'working': 2,
 'when': 3,
 'new': 4,
 'employee': 5,
 'joins': 6,
 'team,': 7,
 'initial': 8,
 'week': 9,
 'two': 10}

In [9]:
vocab_si = len(train_token.word_index) + 1
vocab_si

95

In [10]:
train_sequence[5]

[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [11]:
print(len(train_sequence[0]))

15


In [12]:
print(len(train_sequence[10]))

3


In [13]:
train_padded = pad_sequences(train_sequence,maxlen=100,padding='post')

In [14]:
train_padded[5]

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [15]:
val_token = Tokenizer(num_words=100,oov_token='<oov>')
val_token.fit_on_texts(X_val)
val_index = val_token.word_index
val_sequence = val_token.texts_to_sequences(X_val)

In [16]:
val_sequence[5]

[4, 3, 44, 45, 46, 47, 48, 49, 50, 51, 52, 2, 5]

In [17]:
val_padded = pad_sequences(val_sequence,maxlen=100,padding='post')

In [18]:
val_padded[5]

array([ 4,  3, 44, 45, 46, 47, 48, 49, 50, 51, 52,  2,  5,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [19]:
print(len(val_sequence[0]))

8


In [20]:
print(len(val_sequence[4]))

5


## Model Creation

In [21]:
model_rnn = Sequential()
model_rnn.add(Embedding(5000,8,input_length=150))
model_rnn.add(SimpleRNN(32,activation='relu'))
model_rnn.add(Embedding(5000,64,input_length=150))
model_rnn.add(SimpleRNN(128,activation='tanh' ))
model_rnn.add(Dense('1',activation='sigmoid'))

In [22]:
model_rnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 8)            40000     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                1312      
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 64)            320000    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               24704     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 386,145
Trainable params: 386,145
Non-trainable params: 0
_________________________________________________________________


In [23]:
model_rnn.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [24]:
history = model_rnn.fit(train_padded,y_train,epochs=15,verbose=2,batch_size=25)

Epoch 1/15
1/1 - 2s - loss: 0.7048 - accuracy: 0.4286
Epoch 2/15
1/1 - 0s - loss: 0.7044 - accuracy: 0.5714
Epoch 3/15
1/1 - 0s - loss: 0.6868 - accuracy: 0.5714
Epoch 4/15
1/1 - 0s - loss: 0.6830 - accuracy: 0.5714
Epoch 5/15
1/1 - 0s - loss: 0.6850 - accuracy: 0.5714
Epoch 6/15
1/1 - 0s - loss: 0.6854 - accuracy: 0.5714
Epoch 7/15
1/1 - 0s - loss: 0.6845 - accuracy: 0.5714
Epoch 8/15
1/1 - 0s - loss: 0.6835 - accuracy: 0.5714
Epoch 9/15
1/1 - 0s - loss: 0.6829 - accuracy: 0.5714
Epoch 10/15
1/1 - 0s - loss: 0.6830 - accuracy: 0.5714
Epoch 11/15
1/1 - 0s - loss: 0.6833 - accuracy: 0.5714
Epoch 12/15
1/1 - 0s - loss: 0.6835 - accuracy: 0.5714
Epoch 13/15
1/1 - 0s - loss: 0.6834 - accuracy: 0.5714
Epoch 14/15
1/1 - 0s - loss: 0.6831 - accuracy: 0.5714
Epoch 15/15
1/1 - 0s - loss: 0.6829 - accuracy: 0.5714


In [25]:
model_rnn.evaluate(val_padded,y_val)



[0.7489405274391174, 0.3333333432674408]

In [26]:
model_rnn3 = Sequential()
model_rnn3.add(Embedding(5000,512,input_length=150))
model_rnn3.add(SimpleRNN(32,activation='relu'))
model_rnn3.add(Embedding(5000,64,input_length=150))
model_rnn3.add(SimpleRNN(32,activation='relu'))
model_rnn3.add(Embedding(5000,128,input_length=150))
model_rnn3.add(SimpleRNN(32,activation='relu'))
model_rnn3.add(Dense('1',activation='sigmoid'))

In [27]:
model_rnn3.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 512)          2560000   
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 32)                17440     
_________________________________________________________________
embedding_3 (Embedding)      (None, 32, 64)            320000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 32)                3104      
_________________________________________________________________
embedding_4 (Embedding)      (None, 32, 128)           640000    
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 32)                5152      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [28]:
model_rnn3.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [29]:
history3=model_rnn3.fit(train_padded,y_train,epochs=10,verbose=2,batch_size=25)

Epoch 1/10
1/1 - 2s - loss: 0.6859 - accuracy: 0.5714
Epoch 2/10
1/1 - 0s - loss: 0.6843 - accuracy: 0.5714
Epoch 3/10
1/1 - 0s - loss: 0.6832 - accuracy: 0.5714
Epoch 4/10
1/1 - 0s - loss: 0.6829 - accuracy: 0.5714
Epoch 5/10
1/1 - 0s - loss: 0.6833 - accuracy: 0.5714
Epoch 6/10
1/1 - 0s - loss: 0.6837 - accuracy: 0.5714
Epoch 7/10
1/1 - 0s - loss: 0.6837 - accuracy: 0.5714
Epoch 8/10
1/1 - 0s - loss: 0.6834 - accuracy: 0.5714
Epoch 9/10
1/1 - 0s - loss: 0.6831 - accuracy: 0.5714
Epoch 10/10
1/1 - 0s - loss: 0.6830 - accuracy: 0.5714


In [30]:
model_rnn3.evaluate(val_padded,y_val)



[0.7507752776145935, 0.3333333432674408]

In [31]:
model_rnn5 = Sequential()
model_rnn5.add(Embedding(5000,128,input_length=150))
model_rnn5.add(SimpleRNN(16,activation='relu'))
model_rnn5.add(Embedding(5000,64,input_length=150))
model_rnn5.add(SimpleRNN(8,activation='relu'))
model_rnn5.add(Embedding(5000,512,input_length=150))
model_rnn5.add(SimpleRNN(32,activation='relu'))
model_rnn5.add(Embedding(5000,2056,input_length=150))
model_rnn5.add(SimpleRNN(16,activation='relu'))
model_rnn5.add(Dense('1',activation='sigmoid'))

In [32]:
model_rnn5.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 150, 128)          640000    
_________________________________________________________________
simple_rnn_5 (SimpleRNN)     (None, 16)                2320      
_________________________________________________________________
embedding_6 (Embedding)      (None, 16, 64)            320000    
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 8)                 584       
_________________________________________________________________
embedding_7 (Embedding)      (None, 8, 512)            2560000   
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 32)                17440     
_________________________________________________________________
embedding_8 (Embedding)      (None, 32, 2056)         

In [33]:
model_rnn5.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [34]:
history5=model_rnn5.fit(train_padded,y_train,epochs=10,verbose=2,batch_size=25)

Epoch 1/10
1/1 - 5s - loss: 0.6908 - accuracy: 0.5714
Epoch 2/10
1/1 - 0s - loss: 0.6830 - accuracy: 0.5714
Epoch 3/10
1/1 - 0s - loss: 0.6865 - accuracy: 0.5714
Epoch 4/10
1/1 - 0s - loss: 0.6852 - accuracy: 0.5714
Epoch 5/10
1/1 - 0s - loss: 0.6838 - accuracy: 0.5714
Epoch 6/10
1/1 - 0s - loss: 0.6830 - accuracy: 0.5714
Epoch 7/10
1/1 - 0s - loss: 0.6838 - accuracy: 0.5714
Epoch 8/10
1/1 - 0s - loss: 0.6835 - accuracy: 0.5714
Epoch 9/10
1/1 - 0s - loss: 0.6829 - accuracy: 0.5714
Epoch 10/10
1/1 - 0s - loss: 0.6832 - accuracy: 0.5714


In [35]:
model_rnn5.evaluate(val_padded,y_val)



[0.7628531455993652, 0.3333333432674408]

In [37]:
r5 = model_rnn5.predict(trail_padded)
label = ['positive','negative']
print(r,label[np.argmax(trail_padedd)>50])

NameError: name 'trail_padded' is not defined