In [1]:
import gensim.downloader
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from keras.layers import Dense, Flatten, LSTM, GRU, Bidirectional
from keras.models import Sequential
from keras.preprocessing import sequence, text
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, classification_report
from sklearn.model_selection import train_test_split
from spacy.tokenizer import Tokenizer

In [2]:
data = pd.read_csv('data\preprocessed.csv').dropna()

In [3]:
nlp = spacy.load("en_core_web_sm")
#glove = gensim.downloader.load('glove-twitter-200')
#glove.save("glove.model")
glove = KeyedVectors.load("glove.model")

In [4]:
def get_vector(sent):
    for word in sent.lower().split(' '):
        if word.isalpha() == True:
            try:
                return glove[word].tolist()
            except KeyError:
                return glove['unk'].tolist()
        else:
            return glove['unk'].tolist()

In [5]:
def pad_trunc(data, maxlen):
    new_data = []
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
        
    return new_data

In [6]:
data['vectors'] =  data['Reviews_cleaned'].apply(lambda x: [get_vector(x)])

In [7]:
X = data['vectors']
y = data["Category"]
X = np.array(pad_trunc(X, 10))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
X_train.shape, X_test.shape, X.shape

((32419, 10, 200), (8105, 10, 200), (40524, 10, 200))

In [10]:
model = Sequential()
model.add(Bidirectional(LSTM(32, return_sequences=True, input_shape=(10,200))))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [11]:
history = model.fit(X_train, y_train,
                    epochs=12,
                    batch_size=128,
                    validation_split=0.2)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [12]:
y_pred = np.round(model.predict(X_test))
cohen_kappa_score(y_test, y_pred)

0.3933655482261611

In [13]:
f1_score(y_test, y_pred)

0.8436441727925993