# Sentiment Analysis on IMDB

In [86]:
# Import the dependencies
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, Dropout, Conv1D, MaxPooling1D, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [9]:
# Vocabulary size passed to the loader as num_words
max_words = 5000

# Load the IMDB reviews, then unpack each (features, labels) pair
train_split, test_split = imdb.load_data(num_words=max_words)
X_train, y_train = train_split
X_test, y_test = test_split

print("Created test and training data.")

Created test and training data.


In [10]:
# Every review is brought to exactly this many tokens
max_review_length = 500

# Apply the same padding to both splits so all sequences share one length
X_train, X_test = [
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
]

print("Padded the input sequences with 0's to all be the same length.")

Padded the input sequences with 0's to all be the same length.


In [11]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only and reuse the learned per-column
# min/max for the test split. Calling fit_transform on the test data would
# rescale it with its own statistics, so train and test features would not be
# on the same scale and test information would leak into preprocessing.
# clip=True keeps the transformed test values inside [0, 1]; MultinomialNB and
# ComplementNB (used further down) reject negative feature values.
scaler = MinMaxScaler(clip=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

We will use scaled data for faster convergence and better accuracy

## Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

# Grid to compare: optimisation solver x inverse regularisation strength C
solvers = ['newton-cg', 'liblinear', 'lbfgs' , 'sag', 'saga']
cs = [0.1, 1, 10]

# Fit one model per (solver, C) pair on the scaled training data and report
# its accuracy on the scaled test data.
for solver in solvers:
    for c in cs:
        logisticRegr = LogisticRegression(C=c, solver=solver)
        # Fixed: the original line read `X_train_scaled.,` which is a syntax error
        logisticRegr.fit(X_train_scaled, y_train)
        print(f"Solver: {solver}, C: {c}, Accuracy: {logisticRegr.score(X_test_scaled, y_test)}")
        

Solver: newton-cg, C: 0.1, Accuracy: 0.51048
Solver: newton-cg, C: 1, Accuracy: 0.50912
Solver: newton-cg, C: 10, Accuracy: 0.50888
Solver: liblinear, C: 0.1, Accuracy: 0.51016
Solver: liblinear, C: 1, Accuracy: 0.50916
Solver: liblinear, C: 10, Accuracy: 0.50892
Solver: lbfgs, C: 0.1, Accuracy: 0.51052
Solver: lbfgs, C: 1, Accuracy: 0.50896
Solver: lbfgs, C: 10, Accuracy: 0.50896
Solver: sag, C: 0.1, Accuracy: 0.51048
Solver: sag, C: 1, Accuracy: 0.50908
Solver: sag, C: 10, Accuracy: 0.50884
Solver: saga, C: 0.1, Accuracy: 0.51048
Solver: saga, C: 1, Accuracy: 0.50912
Solver: saga, C: 10, Accuracy: 0.50892


Bad accuracy, almost the same as a random unbiased predictor!

## Naive Bayes

In [36]:
import pandas as pd
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB

# Naive Bayes variants to compare on the scaled features
classifiers = [
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
    ComplementNB(),
]

# Logging for Visual Comparison
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log_rows = []  # one [name, accuracy in %, log loss] record per classifier

for clf in classifiers:
    clf.fit(X_train_scaled, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # These are predictions on the held-out test split (not the training split)
    test_predictions = clf.predict(X_test_scaled)
    acc = accuracy_score(y_test, test_predictions)
    print("Accuracy: {:.4%}".format(acc))

    # Actual log loss from the predicted class probabilities
    # (the column previously held a hard-coded placeholder value of 11)
    test_log_loss = log_loss(y_test, clf.predict_proba(X_test_scaled))
    log_rows.append([name, acc*100, test_log_loss])

    print("="*30)

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and growing a frame row by row is quadratic anyway.
log = pd.DataFrame(log_rows, columns=log_cols)

GaussianNB
****Results****
Accuracy: 50.4240%
MultinomialNB
****Results****
Accuracy: 49.9720%
BernoulliNB
****Results****
Accuracy: 50.9400%
ComplementNB
****Results****
Accuracy: 49.9720%


Identical performance to a random unbiased predictor!

## Decision Tree (Random Forest, AdaBoost)

In [8]:
from sklearn.ensemble import RandomForestClassifier

max_depths = [2, 5, 7, 10]
max_features = ["sqrt", "log2"]

# Decision trees do not need scaled data to perform better, and the same holds
# for ensembles built on decision trees, so the unscaled splits are used here.
for max_depth, n_feat in [(depth, feat) for depth in max_depths for feat in max_features]:
    forest_params = {"max_depth": max_depth, "max_features": n_feat}
    rfc = RandomForestClassifier(**forest_params).fit(X_train, y_train)
    print(f"Max Depth: {max_depth}, Max features: {n_feat}, Accuracy: {rfc.score(X_test, y_test)}")

Max Depth: 2, Max features: sqrt, Accuracy: 0.5264
Max Depth: 2, Max features: log2, Accuracy: 0.52172
Max Depth: 5, Max features: sqrt, Accuracy: 0.5306
Max Depth: 5, Max features: log2, Accuracy: 0.52704
Max Depth: 7, Max features: sqrt, Accuracy: 0.53644
Max Depth: 7, Max features: log2, Accuracy: 0.531
Max Depth: 10, Max features: sqrt, Accuracy: 0.53788
Max Depth: 10, Max features: log2, Accuracy: 0.53324


Slightly better than a random unbiased predictor.

In [10]:
from sklearn.ensemble import AdaBoostClassifier

n_estimators_array = [10, 50, 100]
learning_rates = [0.1, 0.5, 1]

# No base estimator is passed, so AdaBoost uses its default decision-tree learner.
# Sweep every (number of estimators, learning rate) combination on the unscaled data.
for n_estimators, lr in [(count, rate) for count in n_estimators_array for rate in learning_rates]:
    adc = AdaBoostClassifier(learning_rate=lr, n_estimators=n_estimators)
    adc = adc.fit(X_train, y_train)
    print(f"Number of estimators: {n_estimators}, Learning rate: {lr}, Accuracy: {adc.score(X_test, y_test)}")

Number of estimators: 10, Learning rate: 0.1, Accuracy: 0.52096
Number of estimators: 10, Learning rate: 0.5, Accuracy: 0.52648
Number of estimators: 10, Learning rate: 1, Accuracy: 0.52624
Number of estimators: 50, Learning rate: 0.1, Accuracy: 0.53084
Number of estimators: 50, Learning rate: 0.5, Accuracy: 0.54184
Number of estimators: 50, Learning rate: 1, Accuracy: 0.5416
Number of estimators: 100, Learning rate: 0.1, Accuracy: 0.5388
Number of estimators: 100, Learning rate: 0.5, Accuracy: 0.54984
Number of estimators: 100, Learning rate: 1, Accuracy: 0.54508


Not much improvement

## SVM

In [12]:
from sklearn.svm import SVC

kernels = ['rbf', 'poly', 'linear', 'sigmoid']
cs = [0.01, 1, 10]

# The SVMs are trained and scored on the min-max scaled features.
# max_iter=500 caps the solver iterations of every fit - presumably to keep the
# sweep fast; TODO confirm the models actually converge within that budget.
for kernel, c in [(kern, penalty) for kern in kernels for penalty in cs]:
    svc = SVC(C=c, kernel=kernel, max_iter=500).fit(X_train_scaled, y_train)
    print(f"Kernel: {kernel}, C: {c}, Accuracy: {svc.score(X_test_scaled, y_test)}")



Kernel: rbf, C: 0.01, Accuracy: 0.50432




Kernel: rbf, C: 1, Accuracy: 0.50008




Kernel: rbf, C: 10, Accuracy: 0.50604




Kernel: poly, C: 0.01, Accuracy: 0.49928




Kernel: poly, C: 1, Accuracy: 0.5




Kernel: poly, C: 10, Accuracy: 0.49352




Kernel: linear, C: 0.01, Accuracy: 0.48932




Kernel: linear, C: 1, Accuracy: 0.49668




Kernel: linear, C: 10, Accuracy: 0.50188




Kernel: sigmoid, C: 0.01, Accuracy: 0.50176




Kernel: sigmoid, C: 1, Accuracy: 0.49036




Kernel: sigmoid, C: 10, Accuracy: 0.488


Almost as bad as a random estimator, if not worse

## Fully Connected NN without Embedding layer

In [101]:
# Fully connected baseline fed directly with the padded word-index sequences.
# The input width is tied to max_review_length instead of a hard-coded 500 so
# the model stays in sync with the padding length chosen earlier.
model = Sequential()
model.add(Dense(20, activation='relu', input_dim=max_review_length))
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.2))
# Single sigmoid unit: binary (positive / negative) sentiment output
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [95]:
# Print the layer-by-layer output shapes and parameter counts of the current model
model.summary()

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_37 (Dense)            (None, 20)                10020     
                                                                 
 dropout_9 (Dropout)         (None, 20)                0         
                                                                 
 dense_38 (Dense)            (None, 30)                630       
                                                                 
 dropout_10 (Dropout)        (None, 30)                0         
                                                                 
 dense_39 (Dense)            (None, 1)                 31        
                                                                 
Total params: 10,681
Trainable params: 10,681
Non-trainable params: 0
_________________________________________________________________


In [102]:
# Train the dense baseline on the raw (unscaled) padded sequences
model.fit(x=X_train, y=y_train, batch_size=64, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x168845599a0>

In [103]:
# Score the trained model on the test split; index 1 of the result is the accuracy metric
model_scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = model_scores[1]

# Report the test accuracy as a percentage
print("Model accuracy on the test dataset: {0:.2f}%".format(test_accuracy*100))

Model accuracy on the test dataset: 50.01%


It is unable to learn. Let's try with scaled input

In [104]:
# Same dense architecture as the previous experiment, this time trained and
# evaluated on the min-max scaled inputs.
# The input width is tied to max_review_length instead of a hard-coded 500 so
# the model stays in sync with the padding length chosen earlier.
model = Sequential()
model.add(Dense(20, activation='relu', input_dim=max_review_length))
model.add(Dropout(0.3))
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_scaled, y_train, epochs=20, batch_size=64)

# Index 1 of the evaluate() result is the accuracy metric
model_scores = model.evaluate(X_test_scaled, y_test, verbose=0)

# Print out the accuracy of the model on the test set
print("Model accuracy on the test dataset: {0:.2f}%".format(model_scores[1]*100))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model accuracy on the test dataset: 50.64%


Performs better on the training set, but it is still as bad as before on the test set. We can conclude that the NN does not learn the dependencies between words; therefore, it is biased towards the training set.

# Using Embedding Layer

## Fully Connected NN 

In [45]:

# Define how long the embedding vector will be
embedding_vector_length = 32

# Embedding -> flatten -> small dense classifier head
model = Sequential([
    Embedding(max_words, embedding_vector_length, input_length=max_review_length),
    Flatten(),
    Dense(30, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [46]:
# Print the layer-by-layer output shapes and parameter counts of the embedding + dense model
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 500, 32)           160000    
                                                                 
 flatten_7 (Flatten)         (None, 16000)             0         
                                                                 
 dense_20 (Dense)            (None, 30)                480030    
                                                                 
 dropout_2 (Dropout)         (None, 30)                0         
                                                                 
 dense_21 (Dense)            (None, 1)                 31        
                                                                 
Total params: 640,061
Trainable params: 640,061
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Train on the padded word-index sequences; the Embedding layer consumes the integer indices
model.fit(x=X_train, y=y_train, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x168c63ea790>

In [49]:
# Score the trained model on the test split; index 1 of the result is the accuracy metric
model_scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = model_scores[1]

# Report the test accuracy as a percentage
print("Model accuracy on the test dataset: {0:.2f}%".format(test_accuracy*100))

Model accuracy on the test dataset: 86.90%


## CNN

In [61]:
# Define how long the embedding vector will be
embedding_vector_length = 32

# Embedding -> 1-D convolution + max pooling -> flatten -> dense classifier head
model = Sequential([
    Embedding(max_words, embedding_vector_length, input_length=max_review_length),
    Conv1D(30, kernel_size=3, activation='relu'),
    MaxPooling1D(2),
    Flatten(),
    Dense(30, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [62]:
# Print the layer-by-layer output shapes and parameter counts of the CNN
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 500, 32)           160000    
                                                                 
 conv1d_4 (Conv1D)           (None, 498, 30)           2910      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 249, 30)          0         
 )                                                               
                                                                 
 flatten_13 (Flatten)        (None, 7470)              0         
                                                                 
 dense_26 (Dense)            (None, 30)                224130    
                                                                 
 dropout_5 (Dropout)         (None, 30)                0         
                                                     

In [63]:
# Train the CNN on the padded word-index sequences
model.fit(x=X_train, y=y_train, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x168c8a46e20>

In [64]:
# Score the trained model on the test split; index 1 of the result is the accuracy metric
model_scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = model_scores[1]

# Report the test accuracy as a percentage
print("Model accuracy on the test dataset: {0:.2f}%".format(test_accuracy*100))

Model accuracy on the test dataset: 88.04%


## RNN (using LSTM)

In [68]:
# Dimensionality of the vector learned for each word index
embedding_vector_length = 32

# Embedding -> single LSTM layer -> sigmoid output
model = Sequential([
    Embedding(max_words, embedding_vector_length, input_length=max_review_length),
    LSTM(20),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [69]:
# Print the layer-by-layer output shapes and parameter counts of the LSTM model
model.summary()

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 500, 32)           160000    
                                                                 
 lstm_1 (LSTM)               (None, 20)                4240      
                                                                 
 dense_29 (Dense)            (None, 1)                 21        
                                                                 
Total params: 164,261
Trainable params: 164,261
Non-trainable params: 0
_________________________________________________________________


In [70]:
# Train the LSTM model on the padded word-index sequences
model.fit(x=X_train, y=y_train, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x168c9155ac0>

In [71]:
# Score the trained model on the test split; index 1 of the result is the accuracy metric
model_scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = model_scores[1]

# Report the test accuracy as a percentage
print("Model accuracy on the test dataset: {0:.2f}%".format(test_accuracy*100))

Model accuracy on the test dataset: 86.76%


## CNN-RNN(using GRU) Hybrid

In [82]:
# Define how long the embedding vector will be
embedding_vector_length = 32

# Embedding -> 1-D convolution + max pooling -> GRU -> dense classifier head
model = Sequential([
    Embedding(max_words, embedding_vector_length, input_length=max_review_length),
    Conv1D(30, kernel_size=3, activation='relu'),
    MaxPooling1D(2),
    GRU(20),
    Dense(20, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
# Print the layer-by-layer output shapes and parameter counts of the CNN-GRU hybrid
model.summary()

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_21 (Embedding)    (None, 500, 32)           160000    
                                                                 
 conv1d_8 (Conv1D)           (None, 498, 30)           2910      
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 249, 30)          0         
 1D)                                                             
                                                                 
 gru (GRU)                   (None, 20)                3120      
                                                                 
 dense_33 (Dense)            (None, 20)                420       
                                                                 
 dropout_7 (Dropout)         (None, 20)                0         
                                                     

In [84]:
# Train the CNN-GRU hybrid on the padded word-index sequences
model.fit(x=X_train, y=y_train, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x168d128d880>

In [85]:
# Score the trained model on the test split; index 1 of the result is the accuracy metric
model_scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = model_scores[1]

# Report the test accuracy as a percentage
print("Model accuracy on the test dataset: {0:.2f}%".format(test_accuracy*100))

Model accuracy on the test dataset: 88.48%


## Bidirectional RNN with 2 stacked recurrent layers

In [90]:
# Define how long the embedding vector will be.
# Set explicitly here, as in the other model cells, so this cell does not rely
# on a value left over from a previously executed cell.
embedding_vector_length = 32

# Embedding -> two stacked bidirectional LSTM layers -> sigmoid output
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_review_length))
# return_sequences=True so the second recurrent layer receives the full sequence
model.add(Bidirectional(LSTM(5, return_sequences = True)))
model.add(Bidirectional(LSTM(5)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [91]:
# Print the layer-by-layer output shapes and parameter counts of the bidirectional model
model.summary()

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 500, 32)           160000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 500, 10)          1520      
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 10)               640       
 nal)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 10)                0         
                                                                 
 dense_36 (Dense)            (None, 1)                 11        
                                                                 
Total params: 162,171
Trainable params: 162,171
Non-t

In [92]:
# Train the stacked bidirectional LSTM on the padded word-index sequences
model.fit(x=X_train, y=y_train, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x168d408f9d0>

In [93]:
# Score the trained model on the test split; index 1 of the result is the accuracy metric
model_scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = model_scores[1]

# Report the test accuracy as a percentage
print("Model accuracy on the test dataset: {0:.2f}%".format(test_accuracy*100))

Model accuracy on the test dataset: 85.18%


We can conclude that every NN with an embedding layer performed much better than the previous models. The FNN is the fastest at training and evaluating, while the hybrid CNN-RNN has the best accuracy and is moderately fast compared to the RNN or the bidirectional RNN.