In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, naive_bayes, svm
import numpy as np

Dataset for positive and negative Amazon reviews  
Downloaded from Github: https://gist.github.com/kunalj101/ad1d9c58d338e20d09ff26bcc06c4235  
The dataset contains text with reviews for products boughts, as well as a label for this review.  
__label__2 represents positive reviews, while __label__1 represents negative reviews  

In [2]:
data = open('C:/Users/anna/Downloads/amazon/amazon_data/corpus', encoding="utf8").read()
labels, texts = [], []

In [3]:
for i, line in enumerate(data.split("\n")):
    l = line.split()
    labels.append(l[0])
    texts.append(" ".join(l[1:]))

In [4]:
# create a datafrme to store text and label
data = pd.DataFrame()
data['text'] = texts
data['label'] = labels

In [5]:
data.head(10)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2
5,an absolute masterpiece: I am quite sure any o...,__label__2
6,"Buyer beware: This is a self-published book, a...",__label__1
7,Glorious story: I loved Whisper of the wicked ...,__label__2
8,A FIVE STAR BOOK: I just finished reading Whis...,__label__2
9,Whispers of the Wicked Saints: This was a easy...,__label__2


In [6]:
test_size = 0.2  # we define 80% train - 20% test
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=test_size, random_state=42)

In [7]:
# As the output gets 2 values, we hot-encode those values
y_train = preprocessing.LabelEncoder().fit_transform(y_train)
y_test = preprocessing.LabelEncoder().fit_transform(y_test)

# Feature extraction methods: 
Bags of words, Tf-Idf, n-grams

## Bags-of-words
The simplest and most intuitive NLP method , is the “bags-of-words”.
It ignores sentence structure and simply counts how often each word occurs.   
CountVectorizer allows us to use the bags-of-words approach, by converting a collection of text documents into a matrix of token counts.
We instantiate the CountVectorizer and fit it to our training data, converting our collection of text documents into a matrix of token counts.

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
# extracting words (at least 2 letters or numbers), then converts everything to lowercase 
# and builds a vocabulary using these tokens
vect1 = CountVectorizer().fit(X_train)
# get the built vocabularies
features = vect1.get_feature_names()

In [9]:
# size of the built vocabulary
num_of_feat = len(features)
print(num_of_feat)

28031


In [10]:
# transform the documents of X_train to a document term matrix, which gives us the bags-of-word representation of X_train
X_train_vectorized1 = vect1.transform(X_train)
# The entries in this matrix are the number of times each word appears in each document. 
# Because the number of words in the vocabulary is so much larger than the number of words that might appear in a single text, 
# most entries of this matrix are zero.
X_train_vectorized1.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Tf-idf
Tf-idf allows us to weight terms based on how important they are to a document.  
For example, in a large text corpus, some words will be present very often but will carry very little meaningful information about the actual contents of the document. Those can be words such as 'the', 'a', 'I', 'is', 'are' etc.
So, we will instantiate the tf–idf vectorizer and fit it to our training data.

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# minimum document frequency min_df
# keeps words that only appear in this number of documents we define
min_df = 4  
vect2 = TfidfVectorizer(min_df = min_df).fit(X_train)

# now we have a new number of features
feat2 = vect2.get_feature_names()
num_of_feat2 = len(feat2)
print(num_of_feat2)

8092


In [12]:
X_train_vectorized2 = vect2.transform(X_train)
X_train_vectorized2.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## n-grams
One way to prevent misclassification is to add n-grams. 
For example, bigrams (2-grams) count pairs of adjacent words, and could give us features such as 'bad' versus 'not bad'.   
Thus, we are refitting our training set (specifying a minimum document frequency), and extracting 1-grams and 2-grams.

In [13]:
vect3 = CountVectorizer(min_df = min_df, ngram_range = (1,2)).fit(X_train)
feat3 = vect3.get_feature_names()
num_of_feat3 = len(feat3)
print(num_of_feat3)

X_train_vectorized3 = vect3.transform(X_train)

28637


# Classification Algorithms
The following algorithms were used:  
KNN, Logistic regression, Naive Bayes, Support Vector Machine (SVM), Feedforward neural network (1, 3 and 5 layers), LSTM, Bidirectional LSTM, GRU

## K-nearest Neighbour

In [14]:
from sklearn.neighbors import KNeighborsClassifier

def KNN_classifier(X_train, y_train, X_test, y_test, vect):  
    neigh = KNeighborsClassifier(n_neighbors=15)
    neigh.fit(X_train, y_train)
    neigh_pred = neigh.predict(vect.transform(X_test))
    neigh_acc = accuracy_score(y_test, neigh_pred)
    return neigh, neigh_pred, neigh_acc

## Logistic regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def logistic_regr(X_train, y_train, X_test, y_test, vect):
    # make an instance of the logistic regression curve
    model = LogisticRegression(max_iter = 1000) # max_iter prevents it from crashing, as logistic regression is computationally heavy
    
    # fit the train data on the logistic model
    model.fit(X_train, y_train)
    
    # predict by inserting test data to the model
    predictions = model.predict(vect.transform(X_test))
    
    # test the accuracy of the predictions
    acc = accuracy_score(y_test, predictions)
    
    return model, predictions, acc

## Naive Bayes

In [16]:
def naive_bayes_classifier(X_train, y_train, X_test, y_test, vect):
    bayes = naive_bayes.MultinomialNB()
    bayes.fit(X_train, y_train)
    bayes_pred = bayes.predict(vect.transform(X_test))
    bayes_acc = accuracy_score(y_test, bayes_pred)
    return bayes, bayes_pred, bayes_acc

## Support vector machine

In [17]:
# SVM classifies data by separating the different classes with the help of a hyperplane between the classes.
# The optimal hyperplane should be such as to maximize its distance as much as possible from all the data points from 
# any of the classes

def SVM_classifier(X_train, y_train, X_test, y_test, vect):  
    sv_model = svm.SVC()
    sv_model.fit(X_train, y_train)
    sv_pred = sv_model.predict(vect.transform(X_test))
    sv_acc = accuracy_score(y_test, sv_pred)
    return sv_model, sv_pred, sv_acc

## Simple feedforward neural network

In [18]:
# Feedforward neural network with one hidden layer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model

def neural_net(X_train, y_train, X_test, y_test, vect):
    n_epochs = 5
    
    # the size of the train test defines the number of input neurons
    input_size = X_train.shape[1]
    
    model = tf.keras.Sequential()
    
    # hidden layer of 32 neurons
    model.add(layers.Dense(16, input_dim = input_size))
    model.add(layers.Activation('relu'))
    
    # output layer of 1 neuron
    model.add(layers.Dense(1, activation='sigmoid'))
    model.summary()
    
    model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=n_epochs)
    
    # predict the labels on test dataset
    nn_pred = model.predict(vect.transform(X_test))
    nn_pred = nn_pred.argmax(axis=-1)
    
    # evaluate gives the loss and the accuracy of the model
    results = model.evaluate(vect.transform(X_test), y_test)
    
    return model, nn_pred, results


## 3-layer neural network

In [19]:
# Feedforward neural network with one hidden layer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model

def neural_deep_net(X_train, y_train, X_test, y_test, vect):
    n_epochs = 5
    
    # the size of the train test defines the number of input neurons
    input_size = X_train.shape[1]
    
    model = tf.keras.Sequential()
    
    # hidden layer 1
    model.add(layers.Dense(16, input_dim = input_size))
    model.add(layers.Activation('relu'))
    
    # hidden layer 2
    model.add(layers.Dense(32))
    model.add(layers.Activation('relu'))
    
    # hidden layer 3
    model.add(layers.Dense(16))
    model.add(layers.Activation('relu'))
    
    # output layer of 1 neuron
    model.add(layers.Dense(1, activation='sigmoid'))
    model.summary()
    
    model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=n_epochs)
    
    # predict the labels on test dataset
    nn_pred = model.predict(vect.transform(X_test))
    nn_pred = nn_pred.argmax(axis=-1)
    
    # evaluate gives the loss and the accuracy of the model
    results = model.evaluate(vect.transform(X_test), y_test)
    
    return model, nn_pred, results


## 5-layer neural network

In [20]:
# Feedforward neural network with one hidden layer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model

def neural_deep_net2(X_train, y_train, X_test, y_test, vect):
    n_epochs = 5
    
    # the size of the train test defines the number of input neurons
    input_size = X_train.shape[1]
    
    model = tf.keras.Sequential()
    
    # hidden layer 1
    model.add(layers.Dense(16, input_dim = input_size))
    model.add(layers.Activation('relu'))
    
    # hidden layer 2
    model.add(layers.Dense(32))
    model.add(layers.Activation('relu'))
    
    #hidden layer 3
    model.add(layers.Dense(32))
    model.add(layers.Activation('relu'))
    
    #hidden layer 4
    model.add(layers.Dense(32))
    model.add(layers.Activation('relu'))
    
    # hidden layer 5
    model.add(layers.Dense(16))
    model.add(layers.Activation('relu'))
    
    # output layer of 1 neuron
    model.add(layers.Dense(1, activation='sigmoid'))
    model.summary()
    
    model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(0.01), metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=n_epochs)
    
    # predict the labels on test dataset
    nn_pred = model.predict(vect.transform(X_test))
    nn_pred = nn_pred.argmax(axis=-1)
    
    # evaluate gives the loss and the accuracy of the model
    results = model.evaluate(vect.transform(X_test), y_test)
    
    return model, nn_pred, results


Running the algorithms of KNN, Logistic regression, Naive Bayes, Support Vector Machine (SVM), Feedforward neural network (1, 3 and 5 layers) on Bag of Words

In [21]:
# Bag of words
print('Results for Bag of Words')
neigh2, neigh_pred2, neigh_acc2 = KNN_classifier(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('KNN accuracy:', neigh_acc2)

Results for Bag of Words
KNN accuracy: 0.7725


In [22]:
model1, predictions1, acc1 = logistic_regr(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('Logistic regression accuracy:', acc1)

Logistic regression accuracy: 0.8615


In [23]:
bayes1, bayes_pred1, bayes_acc1 = naive_bayes_classifier(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('Naive Bayes accuracy:', bayes_acc1)

Naive Bayes accuracy: 0.8315


In [24]:
sv_model1, sv_pred1, sv_acc1 = SVM_classifier(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('Support vector machine accuracy:', sv_acc1)

Support vector machine accuracy: 0.845


In [25]:
nn_classifier1, nn_pred1, result1 = neural_net(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('\nFeedforward neural network accuracy:', result1[1])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                448512    
_________________________________________________________________
activation (Activation)      (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 448,529
Trainable params: 448,529
Non-trainable params: 0
_________________________________________________________________
Train on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Feedforward neural network accuracy: 0.8545


In [26]:
deepmodel1, deep_pred1, deep_results1 = neural_deep_net(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('\n3-layer neural network accuracy:', deep_results1[1])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 16)                448512    
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                544       
_________________________________________________________________
activation_2 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

In [27]:
deepmodel11, deep_pred11, deep_results11 = neural_deep_net2(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('\n5-layer neural network accuracy:', deep_results11[1])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 16)                448512    
_________________________________________________________________
activation_4 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                544       
_________________________________________________________________
activation_5 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                1056      
_________________________________________________________________
activation_6 (Activation)    (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 32)               

We can observe that the results of the Machine Learning algorithms and the simple neural networks were very satisfying.   
The accuracy in most cases was more than 80%:  
KNN 77.25%  
Logistic regression 86.15%  
Naive Bayes 83.15%  
Support vector machine 84.5%  
Feedforward neural network 85.45%  
3-layer neural network 83.85%  
5-layer neural network 84.3%  

The best performance for bag of words was given from Logistic regression: 86.15%.

Running the algorithms of KNN, Logistic regression, Naive Bayes, Support Vector Machine (SVM), Feedforward neural network (1, 3 and 5 layers) on tf - idf

In [28]:
# tf - idf
print('Results for tf - idf')
neigh2, neigh_pred2, neigh_acc2 = KNN_classifier(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('KNN accuracy:', neigh_acc2)

Results for tf - idf
KNN accuracy: 0.7725


In [29]:
model2, predictions2, acc2 = logistic_regr(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('Logistic regression accuracy:', acc2)

Logistic regression accuracy: 0.8665


In [30]:
bayes2, bayes_pred2, bayes_acc2 = naive_bayes_classifier(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('Naive Bayes accuracy:', bayes_acc2)

Naive Bayes accuracy: 0.833


In [31]:
sv_model2, sv_pred2, sv_acc2 = SVM_classifier(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('Support vector machine accuracy:', sv_acc2)

Support vector machine accuracy: 0.877


In [32]:
nn_classifier2, nn_pred2, result2 = neural_net(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('\nFeedforward neural network accuracy:', result2[1])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 16)                129488    
_________________________________________________________________
activation_9 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 17        
Total params: 129,505
Trainable params: 129,505
Non-trainable params: 0
_________________________________________________________________
Train on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Feedforward neural network accuracy: 0.8415


In [33]:
deepmodel2, deep_pred2, deep_results2 = neural_deep_net(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('\n3-layer neural network accuracy:', deep_results2[1])

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 16)                129488    
_________________________________________________________________
activation_10 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 32)                544       
_________________________________________________________________
activation_11 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 16)                528       
_________________________________________________________________
activation_12 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                

In [34]:
deepmodel22, deep_pred22, deep_results22 = neural_deep_net2(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('\n5-layer neural network accuracy:', deep_results22[1])

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 16)                129488    
_________________________________________________________________
activation_13 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 32)                544       
_________________________________________________________________
activation_14 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 32)                1056      
_________________________________________________________________
activation_15 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 32)               

The accuracy in most cases was again more than 80%:   
KNN 77.25%  
Logistic regression 86.65%  
Naive Bayes 83.3%   
Support vector machine 87.7%  
Feedforward neural network 84.9%  
3-layer neural network 83.9%  
5-layer neural network 83.65%

The best performance for tf - idf was given from support vector machine: 87.7%.

Running the algorithms of KNN, Logistic regression, Naive Bayes, Support Vector Machine (SVM), Feedforward neural network (1, 3 and 5 layers) on n-grams

In [35]:
# ngrams
print('Results for n-grams')
neigh3, neigh_pred3, neigh_acc3 = KNN_classifier(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('KNN accuracy:', neigh_acc3)

Results for n-grams
KNN accuracy: 0.6045


In [36]:
model3, predictions3, acc3 = logistic_regr(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('Accuracy:', acc3)

Accuracy: 0.879


In [37]:
bayes3, bayes_pred3, bayes_acc3 = naive_bayes_classifier(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('Accuracy:', bayes_acc3)

Accuracy: 0.871


In [38]:
sv_model3, sv_pred3, sv_acc3 = SVM_classifier(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('Accuracy:', sv_acc3)

Accuracy: 0.855


In [39]:
nn_classifier3, nn_pred3, result3 = neural_net(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('\nAccuracy:', result3[1])

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 16)                458208    
_________________________________________________________________
activation_18 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 17        
Total params: 458,225
Trainable params: 458,225
Non-trainable params: 0
_________________________________________________________________
Train on 8000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Accuracy: 0.8795


In [40]:
deepmodel3, deep_pred3, deep_results3 = neural_deep_net(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('\n3-layer neural network accuracy:', deep_results3[1])

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_26 (Dense)             (None, 16)                458208    
_________________________________________________________________
activation_19 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 32)                544       
_________________________________________________________________
activation_20 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_28 (Dense)             (None, 16)                528       
_________________________________________________________________
activation_21 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 1)                

In [41]:
deepmodel33, deep_pred33, deep_results33 = neural_deep_net2(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('\n5-layer neural network accuracy:', deep_results33[1])

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 16)                458208    
_________________________________________________________________
activation_22 (Activation)   (None, 16)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 32)                544       
_________________________________________________________________
activation_23 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 32)                1056      
_________________________________________________________________
activation_24 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 32)               

The accuracy in most cases was again more than 80%:   
KNN 60.45%  
Logistic regression 87.9%  
Naive Bayes 87.1%  
Support vector machine 85.5%  
Feedforward neural network 87%    
3-layer neural network 87.6%  
5-layer neural network 86.2%

The best performance for n-grams was given from logistic regression: 87.9%.

We can see the most frequently used negative and positive words according to each model

In [42]:
def frequent_words(features, model):
    feature_names = np.array(features)
    # sort based on positive or negative weight for each word
    # smaller coefficient - more negative
    # larger coefficient - more positive
    sorted_coef_index = model.coef_[0].argsort()
    print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
    print('Largest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))
    return 

Bag of words

In [43]:
# Here we can see the most frequent negative and positive words for each model 
print('Logistic regression most frequent words on bag of words')
frequent_words(features, model1)

Logistic regression most frequent words on bag of words
Smallest Coefs: 
['boring' 'poor' 'worst' 'waste' 'disappointing' 'disappointed'
 'disappointment' 'stopped' 'horrible' 'terrible']

Largest Coefs: 
['excellent' 'perfect' 'loves' 'fantastic' 'wonderful' 'amazing' 'awesome'
 'great' 'extra' 'pleased']



tf - idf

In [44]:
print('Logistic regression most frequent words on tf - idf')
frequent_words(feat2, model2)

Logistic regression most frequent words on tf - idf
Smallest Coefs: 
['not' 'boring' 'waste' 'worst' 'poor' 'bad' 'disappointed' 'money' 'don'
 'disappointing']

Largest Coefs: 
['great' 'excellent' 'love' 'best' 'good' 'perfect' 'well' 'and' 'easy'
 'wonderful']



n - grams

In [45]:
print('Logistic regression most frequent words on n-grams')
frequent_words(feat3, model3)

Logistic regression most frequent words on n-grams
Smallest Coefs: 
['boring' 'poor' 'disappointed' 'disappointing' 'worst' 'waste'
 'the worst' 'not good' 'not worth' 'disappointment']

Largest Coefs: 
['excellent' 'perfect' 'great' 'love' 'amazing' 'loves' 'better than'
 'awesome' 'wonderful' 'not bad']



### LSTM & GRU with Embeddings results
We add them separately, as they take more time to run than the rest of the methods

## LSTM

In [None]:
# This implementation is very slow to train

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense


def LSTM_net(X_train, y_train, X_test, y_test, vect):
    input_size = X_train.shape[1]
    embed_dim = 10
    lstm_dim = 10
    batch_size = 32
    n_epochs = 3

    model = Sequential()
    
    # Add the word embedding Layer
    model.add(Embedding(10, embed_dim, input_length = input_size))
    
    # Add the LSTM Layer
    model.add(LSTM(lstm_dim))
    
    # Output layer
    model.add(Dense(1,activation='softmax'))
    
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    model.fit(X_train, y_train, epochs = n_epochs)
    
    # predict the labels on test dataset
    lstm_pred = model.predict(vect.transform(X_test))
    lstm_pred = nn_pred.argmax(axis=-1)
    
    # evaluate gives the loss and the accuracy of the model
    lstm_results = model.evaluate(vect.transform(X_test), y_test)
    
    return model, lstm_pred, lstm_results

## Bidirectional LSTM

In [None]:
# This implementation is very slow to train

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional

def BiLSTM_net(X_train, y_train, X_test, y_test, vect):
    input_size = X_train.shape[1]
    embed_dim = 10
    lstm_dim = 10
    batch_size = 32
    n_epochs = 3

    model = Sequential()
    
    # Add the word embedding Layer
    model.add(Embedding(1000, embed_dim, input_length = input_size))
    
    # Add the LSTM Layer
    model.add(Bidirectional(LSTM(lstm_dim)))
    
    # Output layer
    model.add(Dense(1,activation='softmax'))
    
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    model.fit(X_train, y_train, epochs = n_epochs)
    
    # predict the labels on test dataset
    BiLSTM_pred = model.predict(vect.transform(X_test))
    BiLSTM_pred = nn_pred.argmax(axis=-1)
    
    # evaluate gives the loss and the accuracy of the model
    BiLSTM_results = model.evaluate(vect.transform(X_test), y_test)
    
    return model, BiLSTM_pred, BiLSTM_results


## GRU

In [None]:
# This implementation is very slow to train

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense


def GRU_net(X_train, y_train, X_test, y_test, vect):
    input_size = X_train.shape[1]
    embed_dim = 10
    gru_dim = 10
    batch_size = 32
    n_epochs = 3

    model = Sequential()
    
    # Add the word embedding Layer
    model.add(Embedding(1000, embed_dim, input_length = input_size))
    
    # Add the LSTM Layer
    model.add(GRU(gru_dim))
    
    # Output layer
    model.add(Dense(1,activation='softmax'))
    
    model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
    model.fit(X_train, y_train, epochs = n_epochs)
    
    # predict the labels on test dataset
    gru_pred = model.predict(vect.transform(X_test))
    gru_pred = nn_pred.argmax(axis=-1)
    
    # evaluate gives the loss and the accuracy of the model
    gru_results = model.evaluate(vect.transform(X_test), y_test)
    
    return model, gru_pred, gru_results

LSTM on bag of words

In [None]:
lstm_model1, lstm_pred1, lstm_results1 = LSTM_net(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('LSTM for bag of words')
print('Accuracy:', lstm_results1[1])

LSTM on Tf-idf

In [None]:
lstm_model2, lstm_pred2, lstm_results2 = LSTM_net(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('LSTM for Tf-idf')
print('Accuracy:', lstm_results2[1])

LSTM on bag of n-grams

In [None]:
lstm_model3, lstm_pred3, lstm_results3 = LSTM_net(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('LSTM for n-grams')
print('Accuracy:', lstm_results3[1])

BiLSTM on bag of words

In [None]:
BiLSTM_model1, BiLSTM_pred1, BiLSTM_results1 = BiLSTM_net(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('Bidirectional LSTM for bag of words')
print('Accuracy:', BiLSTM_results1[1])

BiLSTM on Tf-idf

In [None]:
BiLSTM_model2, BiLSTM_pred2, BiLSTM_results2 = BiLSTM_net(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('Bidirectional LSTM for Tf-idf')
print('Accuracy:', BiLSTM_results2[1])

BiLSTM on n-grams

In [None]:
BiLSTM_model3, BiLSTM_pred3, BiLSTM_results3 = BiLSTM_net(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('Bidirectional LSTM for n-grams')
print('Accuracy:', BiLSTM_results3[1])

GRU on bag of words

In [None]:
gru_model1, gru_pred1, gru_results1 = GRU_net(X_train_vectorized1, y_train, X_test, y_test, vect1)
print('GRU for bag of words')
print('Accuracy:', gru_results1[1])

GRU on bag of Tf-idf

In [None]:
gru_model2, gru_pred2, gru_results2 = GRU_net(X_train_vectorized2, y_train, X_test, y_test, vect2)
print('GRU for Tf-idf')
print('Accuracy:', gru_results2[1])

GRU on bag of n-grams

In [None]:
gru_model3, gru_pred3, gru_results3 = GRU_net(X_train_vectorized3, y_train, X_test, y_test, vect3)
print('GRU for n-grams')
print('Accuracy:', gru_results3[1])