In [1]:
# Usual data representation and manipulation libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Evaluation libraries
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Libraries for feature engineering
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow import keras
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation, Embedding
from tensorflow.keras.layers import Dropout, Activation, SpatialDropout1D
#from tensorflow.keras.layers.embeddings import Embedding
from tensorflow.keras.preprocessing import sequence,text
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from matplotlib import pyplot

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
normalized_food_reviews = pd.read_csv("/content/drive/MyDrive/Colab_Notebooks/normalized_food_review.csv")

max_fet = 2000
tokenizer = Tokenizer(num_words=max_fet, split=' ')
#tokenizer.fit_on_texts(normalized_food_reviews['review'].iloc[:15000].values)
#X = tokenizer.texts_to_sequences(normalized_food_reviews['review'].iloc[:15000].values)
tokenizer.fit_on_texts(normalized_food_reviews['review'].values)
X = tokenizer.texts_to_sequences(normalized_food_reviews['review'].values)
X = pad_sequences(X)

#y = pd.get_dummies(normalized_food_reviews["sentiment"].iloc[:15000]).values
y = pd.get_dummies(normalized_food_reviews["sentiment"]).values


# extract data for model evaluation the training and test is split into 70% and 30%
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3)

print("Shape X_train: {}, Shape of X_test: {}".format(X_train.shape, X_test.shape))
print("Shape Y_train: {}, Shape of Y_test: {}".format(Y_train.shape, Y_test.shape))

Shape X_train: (35000, 692), Shape of X_test: (15000, 692)
Shape Y_train: (35000, 2), Shape of Y_test: (15000, 2)


In [5]:
embed_dim = 128
lstm_out = 100
model = Sequential()
model.add(Embedding(max_fet, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 692, 128)          256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 692, 128)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 2)                 202       
                                                                 
Total params: 347,802
Trainable params: 347,802
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
batch_size = 100
model.history = model.fit(X_train, Y_train, epochs = 2, batch_size=batch_size, verbose = 2)

Epoch 1/2
350/350 - 938s - loss: 0.4372 - accuracy: 0.7968 - 938s/epoch - 3s/step
Epoch 2/2
350/350 - 922s - loss: 0.3559 - accuracy: 0.8466 - 922s/epoch - 3s/step


In [7]:
y_pred_lstm = model.predict(X_test)


In [8]:
print(model.history)

<keras.callbacks.History object at 0x7f38b32f6cd0>


Evaluation Metrics

In [9]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

150/150 - 63s - loss: 0.3554 - accuracy: 0.8463 - 63s/epoch - 420ms/step
score: 0.36
acc: 0.85


In [10]:
y_test_arg=np.argmax(Y_test,axis=1)
Y_pred_lstm = np.argmax(model.predict(X_test),axis=1)
print('Confusion Matrix')
cf_matrix=confusion_matrix(y_test_arg, Y_pred_lstm)
print(cf_matrix)

f1_score_calc = cf_matrix[0][0] / (cf_matrix[0][0] + 0.5 * (cf_matrix[0][1] + cf_matrix[1][0]))
print('F1-score: %.3f' % f1_score_calc)


Confusion Matrix
[[6667  905]
 [1401 6027]]
F1-score: 0.853


In [11]:
print("LSTM - Amazon review sentiment analysis")
print("The model accuracy score is: {}".format(accuracy_score(y_test_arg, Y_pred_lstm)))
print("The model precision score is: {}".format(precision_score(y_test_arg, Y_pred_lstm, average="weighted")))
print("The model recall score is: {}".format(recall_score(y_test_arg, Y_pred_lstm, average="weighted")))
print("The model F1-score is: {}".format(f1_score(y_test_arg, Y_pred_lstm, average="weighted")))

print(classification_report(y_test_arg, Y_pred_lstm))

LSTM - Amazon review sentiment analysis
The model accuracy score is: 0.8462666666666666
The model precision score is: 0.8476916756902738
The model recall score is: 0.8462666666666666
The model F1-score is: 0.8460493772841581
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      7572
           1       0.87      0.81      0.84      7428

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



# Deployment

In [15]:
import pickle

pickle.dump(model, open('lstmModel.pkl', 'wb'))
 
model1 = pickle.load(open('lstmModel.pkl', 'rb'))
 
#model.predict(patient1)[0]

INFO:tensorflow:Assets written to: ram://d2bab468-dd6a-4119-bef9-de32888adfef/assets


INFO:tensorflow:Assets written to: ram://d2bab468-dd6a-4119-bef9-de32888adfef/assets


In [None]:
/content/lstmModel.pkl

In [19]:
import sqlite3
from sqlalchemy import create_engine
engine = create_engine('sqlite:///'+ "AmaxonFoodReview.db")

normalized_food_reviews = normalized_food_reviews.drop(columns =["Review"])
normalized_food_reviews.to_sql('AmaxonFoodReview', engine, if_exists='replace', index=False)

In [14]:
import joblib
 
joblib.dump(model, 'model_save2')
 
model2 = joblib.load('model_save2')
 
#model2.predict(1)

INFO:tensorflow:Assets written to: ram://04ad5944-07bd-4fc0-8b2e-a0afdaf2c8c0/assets


INFO:tensorflow:Assets written to: ram://04ad5944-07bd-4fc0-8b2e-a0afdaf2c8c0/assets
