<a href="https://colab.research.google.com/github/oyyarko/deeplearning_rnn/blob/master/sentiment_analysis_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%tensorflow_version 2.x
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SpatialDropout1D, Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import re

TensorFlow 2.x selected.


In [0]:
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sentiment.csv')
#keep necessary columns
data = data[['text', 'sentiment']]

In [4]:
data = data[data.sentiment != "Neutral"]
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-Z0-9\s]', '', x)))

print(data[data['sentiment'] == 'Positive'].size)
print(data[data['sentiment'] == 'Negative'].size)

4472
16986


In [0]:
#Replacing RT from the text
for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

In [0]:
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

In [0]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
#model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.3, return_sequences=True))
#model.add(LSTM(364, dropout=0.2, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))

In [18]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 28, 128)           256000    
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 28, 128)           0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 196)               254800    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [20]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print(X_train.shape, ' ', y_train.shape)
print(X_test.shape, ' ', y_test.shape)

(7188, 28)   (7188, 2)
(3541, 28)   (3541, 2)


In [21]:
batch_size = 32
model.fit(X_train, y_train, epochs=20, batch_size=batch_size, verbose=2)

Train on 7188 samples
Epoch 1/20
7188/7188 - 38s - loss: 0.4400 - accuracy: 0.8137
Epoch 2/20
7188/7188 - 36s - loss: 0.3248 - accuracy: 0.8621
Epoch 3/20
7188/7188 - 38s - loss: 0.2803 - accuracy: 0.8834
Epoch 4/20
7188/7188 - 36s - loss: 0.2528 - accuracy: 0.8955
Epoch 5/20
7188/7188 - 36s - loss: 0.2277 - accuracy: 0.9122
Epoch 6/20
7188/7188 - 35s - loss: 0.2043 - accuracy: 0.9190
Epoch 7/20
7188/7188 - 36s - loss: 0.1823 - accuracy: 0.9295
Epoch 8/20
7188/7188 - 37s - loss: 0.1708 - accuracy: 0.9316
Epoch 9/20
7188/7188 - 37s - loss: 0.1505 - accuracy: 0.9410
Epoch 10/20
7188/7188 - 36s - loss: 0.1390 - accuracy: 0.9437
Epoch 11/20
7188/7188 - 37s - loss: 0.1343 - accuracy: 0.9481
Epoch 12/20
7188/7188 - 36s - loss: 0.1236 - accuracy: 0.9514
Epoch 13/20
7188/7188 - 36s - loss: 0.1154 - accuracy: 0.9521
Epoch 14/20
7188/7188 - 36s - loss: 0.1097 - accuracy: 0.9551
Epoch 15/20
7188/7188 - 36s - loss: 0.1045 - accuracy: 0.9569
Epoch 16/20
7188/7188 - 36s - loss: 0.1010 - accuracy: 0.

<tensorflow.python.keras.callbacks.History at 0x7f89b53a0780>

In [22]:
validation_size=  1500
X_validate = X_test[-validation_size:]
y_validate = y_test[-validation_size:]
score, acc = model.evaluate(X_test, y_test, verbose=2, batch_size=batch_size)

print("score: %.2f"%(score))
print("acc: %.2f"%(acc))

3541/3541 - 2s - loss: 0.8435 - accuracy: 0.8145
score: 0.84
acc: 0.81


In [26]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    result = model.predict(X_validate[x].reshape(1, X_test.shape[1]), batch_size=1, verbose=2)
    if np.argmax(result) == np.argmax(y_validate[x]):
        if np.argmax(y_validate[x] == 0):
            neg_correct += 1
        else:
            pos_correct += 1
    
    if np.argmax(y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1

print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1/1 - 0s
1

In [33]:
twt = input('Enter Tweet: ')
twt = tokenizer.texts_to_sequences(twt)

twt = pad_sequences(twt, maxlen=28, dtype='int32', value=0)
#print(twt)
sentiment = model.predict(twt, batch_size=1, verbose=2)[0]
if(np.argmax(sentiment)==0):
    print("Negative")
elif(np.argmax(sentiment)==1):
    print("Positive")

Enter Tweet: After huge time we will kill this coronavirus
45/45 - 0s
Negative
