In [2]:
# Usual data representation and manipulation libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Evaluation libraries
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Libraries for feature engineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow import keras
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation, Embedding
from tensorflow.keras.layers import Dropout, Activation, SpatialDropout1D
#from tensorflow.keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import sequence,text
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from matplotlib import pyplot

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
normalized_food_reviews = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/normalized_food_review.csv")

max_fet = 2000
tokenizer = Tokenizer(num_words=max_fet, split=' ')
tokenizer.fit_on_texts(normalized_food_reviews['review'].iloc[:15000].values)
X = tokenizer.texts_to_sequences(normalized_food_reviews['review'].iloc[:15000].values)
#tokenizer.fit_on_texts(normalized_food_reviews['review'].values)
#X = tokenizer.texts_to_sequences(normalized_food_reviews['review'].values)
X = pad_sequences(X)

y = pd.get_dummies(normalized_food_reviews["sentiment"].iloc[:15000]).values


# extract data for model evaluation the training and test is split into 70% and 30%
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3)

print("Shape X_train: {}, Shape of X_test: {}".format(X_train.shape, X_test.shape))
print("Shape Y_train: {}, Shape of Y_test: {}".format(Y_train.shape, Y_test.shape))

Shape X_train: (10500, 697), Shape of X_test: (4500, 697)
Shape Y_train: (10500, 2), Shape of Y_test: (4500, 2)


In [5]:
embed_dim = 128
lstm_out = 100
model = Sequential()
model.add(Embedding(max_fet, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 697, 128)          256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 697, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 347,802
Trainable params: 347,802
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, verbose = 2)

Epoch 1/10
329/329 - 548s - loss: 0.4958 - accuracy: 0.7615 - 548s/epoch - 2s/step
Epoch 2/10
329/329 - 497s - loss: 0.3646 - accuracy: 0.8439 - 497s/epoch - 2s/step
Epoch 3/10
329/329 - 499s - loss: 0.3103 - accuracy: 0.8741 - 499s/epoch - 2s/step
Epoch 4/10
329/329 - 502s - loss: 0.2731 - accuracy: 0.8898 - 502s/epoch - 2s/step
Epoch 5/10
329/329 - 496s - loss: 0.2411 - accuracy: 0.9054 - 496s/epoch - 2s/step
Epoch 6/10
329/329 - 496s - loss: 0.2139 - accuracy: 0.9159 - 496s/epoch - 2s/step
Epoch 7/10
329/329 - 498s - loss: 0.1950 - accuracy: 0.9213 - 498s/epoch - 2s/step
Epoch 8/10
329/329 - 490s - loss: 0.1721 - accuracy: 0.9327 - 490s/epoch - 1s/step
Epoch 9/10
329/329 - 499s - loss: 0.1635 - accuracy: 0.9359 - 499s/epoch - 2s/step
Epoch 10/10
329/329 - 486s - loss: 0.1430 - accuracy: 0.9454 - 486s/epoch - 1s/step


<keras.callbacks.History at 0x7fb65ebb2550>

In [7]:
y_pred_lstm = model.predict(X_test)


In [8]:
print(model.history)

<keras.callbacks.History object at 0x7fb65a7e4390>


In [9]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

141/141 - 25s - loss: 0.6364 - accuracy: 0.8024 - 25s/epoch - 176ms/step
score: 0.64
acc: 0.80


In [10]:
y_test_arg=np.argmax(Y_test,axis=1)
Y_pred_lstm = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
cf_matrix=confusion_matrix(y_test_arg, Y_pred_lstm)
print(cf_matrix)

f1_score_calc = cf_matrix[0][0] / (cf_matrix[0][0] + 0.5 * (cf_matrix[0][1] + cf_matrix[1][0]))
print('F1-score: %.3f' % f1_score_calc)


Confusion Matrix
[[1725  515]
 [ 374 1886]]
F1-score: 0.795


In [11]:
print("LSTM - Amazon review sentiment analysis")
print("The model accuracy score is: {}".format(accuracy_score(y_test_arg, Y_pred_lstm)))
print("The model precision score is: {}".format(precision_score(y_test_arg, Y_pred_lstm, average="weighted")))
print("The model recall score is: {}".format(recall_score(y_test_arg, Y_pred_lstm, average="weighted")))
print("The model F1-score is: {}".format(f1_score(y_test_arg, Y_pred_lstm, average="weighted")))

print(classification_report(y_test_arg, Y_pred_lstm))

LSTM - Amazon review sentiment analysis
The model accuracy score is: 0.8024444444444444
The model precision score is: 0.8035822791974062
The model recall score is: 0.8024444444444444
The model F1-score is: 0.8022226934976316
              precision    recall  f1-score   support

           0       0.82      0.77      0.80      2240
           1       0.79      0.83      0.81      2260

    accuracy                           0.80      4500
   macro avg       0.80      0.80      0.80      4500
weighted avg       0.80      0.80      0.80      4500



# Deployment