In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping

# nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# regex
import re

%matplotlib inline

LSTM model from https://www.kaggle.com/code/kredy10/simple-lstm-for-text-classification

The bidirectional LSTM model is based on https://keras.io/examples/nlp/bidirectional_lstm_imdb/ and https://machinelearningmastery.com/develop-bidirectional-lstm-sequence-classification-python-keras/

In [None]:
# Load the train and test splits. Both CSVs must provide a 'review' column
# (text) and a 'rating' column (label).
# NOTE(review): the models below use a sigmoid output with binary
# cross-entropy, so 'rating' is presumably 0/1 -- confirm against the data.
train_data = pd.read_csv('final_data/train.csv')
test_data = pd.read_csv('final_data/test.csv')

# Hold out part of the training data for validation. The validation set
# drives early stopping, so it must NOT be the test set: reusing test.csv
# for validation leaks test information into training and makes the final
# test metrics optimistic.
VAL_FRACTION = 0.2  # share of train.csv reserved for validation
SPLIT_SEED = 42     # fixed seed so the split is identical on every run
val_split = train_data.sample(frac=VAL_FRACTION, random_state=SPLIT_SEED)
# Everything not sampled for validation is used for fitting; train_data
# itself is left untouched so it still holds the full training file.
train_split = train_data.drop(val_split.index).reset_index(drop=True)
val_split = val_split.reset_index(drop=True)

# split the training portion to X_train and y_train
X_train = train_split['review']
y_train = train_split['rating']
# split the held-out validation portion to X_val and y_val
X_val = val_split['review']
y_val = val_split['rating']
# split test_data to X_test and y_test
X_test = test_data['review']
y_test = test_data['rating']

In [None]:
# remove stopwords and perform stemming
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
def preprocess(text):
    """Normalise one review for tokenization.

    Lowercases the text, replaces every non-letter character with a space,
    drops English stopwords and Porter-stems the remaining words.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV field) is treated as
            an empty review instead of raising ``AttributeError``.

    Returns:
        The surviving stems joined by single spaces; ``''`` when nothing
        survives or the input is not a string.
    """
    if not isinstance(text, str):
        return ''
    # Lowercase first so the stopword lookup matches regardless of case.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    return ' '.join(
        ps.stem(word) for word in letters_only.split() if word not in stop_words
    )

In [None]:
# Run the text normalisation (preprocess) over every review in the
# training and validation inputs, replacing the raw text in place.
X_train = X_train.map(preprocess)
X_val = X_val.map(preprocess)

In [None]:
max_words = 5000  # vocabulary cap passed to the Tokenizer (num_words)
max_len = 100     # every review is padded/truncated to exactly max_len tokens

# Build the vocabulary from the training corpus ONLY. Calling fit_on_texts
# again on other data updates the word counts and rebuilds word_index, so
# ids produced afterwards would no longer match the ids X_train was encoded
# with -- and the held-out text would leak into the vocabulary.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Encode the training reviews. pad_sequences pads and truncates at the
# front by default, so reviews longer than max_len keep their LAST max_len
# tokens.
sequences = tok.texts_to_sequences(X_train)
X_train = pad_sequences(sequences, maxlen=max_len)

# Encode the validation reviews with the same, already-fitted tokenizer.
sequences = tok.texts_to_sequences(X_val)
X_val = pad_sequences(sequences, maxlen=max_len)

In [None]:
# Ask TensorFlow to grow its GPU memory allocation on demand, using the
# v1-compat session API.
# NOTE(review): the session is created here but nothing in this notebook
# passes it to Keras -- confirm it takes effect with the installed TF version.
import tensorflow as tf
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
session = tf.compat.v1.Session(config=config)

In [None]:
# bidirectional LSTM
def biLSTM(vocab_size=None, embedding_dim=128, lstm_units=64):
    """Build an uncompiled two-layer bidirectional LSTM classifier.

    Args:
        vocab_size: Input dimension of the embedding layer. ``None`` (the
            default) uses the notebook-level ``max_words`` at call time.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Units per direction in each of the two LSTM layers.

    Returns:
        A Keras ``Model`` that takes variable-length int32 token-id
        sequences and produces a single sigmoid output per sequence.
    """
    if vocab_size is None:
        vocab_size = max_words
    # Input for variable-length sequences of integers
    inputs = Input(shape=(None,), dtype="int32")
    # Embed each integer in an embedding_dim-dimensional vector
    x = Embedding(vocab_size, embedding_dim)(inputs)
    # Two stacked bidirectional LSTMs; the first returns the full sequence
    # so the second has a sequence to consume.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Bidirectional(LSTM(lstm_units))(x)
    # Single-unit sigmoid classifier head
    outputs = Dense(1, activation="sigmoid")(x)
    return Model(inputs, outputs)

biLSTM_model = biLSTM()
biLSTM_model.summary()

In [27]:
# Compile and train the bidirectional LSTM. Training runs for at most 100
# epochs; the EarlyStopping callback monitors val_loss with patience=3 and
# min_delta=0.0001.
biLSTM_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
bilstm_early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
biLSTM_model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=[bilstm_early_stopping],
)



KeyboardInterrupt: 

In [None]:
# simple RNN structure for LSTM
def RNN(vocab_size=None, seq_len=None, embedding_dim=50, lstm_units=64,
        dense_units=256, dropout_rate=0.5):
    """Build an uncompiled single-layer LSTM classifier.

    Args:
        vocab_size: Input dimension of the embedding layer. ``None`` (the
            default) uses the notebook-level ``max_words`` at call time.
        seq_len: Fixed length of the input sequences. ``None`` (the
            default) uses the notebook-level ``max_len`` at call time.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Width of the fully connected layer 'FC1'.
        dropout_rate: Dropout rate applied after 'FC1'.

    Returns:
        A Keras ``Model`` mapping fixed-length token-id sequences to a
        single sigmoid output per sequence.
    """
    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len
    inputs = Input(name='inputs', shape=[seq_len])
    layer = Embedding(vocab_size, embedding_dim, input_length=seq_len)(inputs)
    layer = LSTM(lstm_units)(layer)
    # Fully connected block: Dense -> ReLU -> Dropout
    layer = Dense(dense_units, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(dropout_rate)(layer)
    # Single-unit output squashed by a sigmoid
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

In [None]:
# Instantiate the LSTM classifier, print its layer summary, then compile it
# with binary cross-entropy loss, a default-configured RMSprop optimizer and
# accuracy as the reported metric.
model = RNN()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the LSTM for up to 10 epochs, with 20% of the training arrays held
# out by Keras for validation. EarlyStopping monitors val_loss; patience is
# not set, so the Keras default applies.
lstm_early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[lstm_early_stopping],
)

In [None]:
# test the model
# The training text went through preprocess (stopword removal + stemming)
# before tokenization, so the test reviews need the same normalisation;
# otherwise their raw word forms are looked up in a vocabulary of stems.
# A separate name keeps X_test raw, so this cell can be re-run safely.
X_test_clean = X_test.apply(preprocess)
test_sequences = tok.texts_to_sequences(X_test_clean)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len)
accr = model.evaluate(test_sequences_matrix, y_test)

In [None]:
# Evaluate the bidirectional LSTM on the padded test sequences; the result
# replaces whatever accr held before.
accr = biLSTM_model.evaluate(x=test_sequences_matrix, y=y_test)

In [None]:
# Evaluate the bidirectional LSTM on the padded test sequences.
# NOTE(review): this repeats the same evaluate call as the preceding cell.
accr = biLSTM_model.evaluate(x=test_sequences_matrix, y=y_test)

In [None]:
# Report the first two entries of the most recent evaluate() result held in
# accr, labelled as loss and accuracy.
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))