<a href="https://colab.research.google.com/github/pihlnikl/Data-analysis/blob/master/sentiment_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMDb sentiment classifier
Comparing results of using CountVectorizer vs TfidfVectorizer



In [1]:
# Download the data
!wget -nc https://github.com/TurkuNLP/intro-to-nlp/raw/master/Data/imdb_train.json
!wget -nc http://dl.turkunlp.org/intro-to-nlp.tar.gz

--2021-12-20 15:31:08--  https://github.com/TurkuNLP/intro-to-nlp/raw/master/Data/imdb_train.json
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/TurkuNLP/intro-to-nlp/master/Data/imdb_train.json [following]
--2021-12-20 15:31:08--  https://raw.githubusercontent.com/TurkuNLP/intro-to-nlp/master/Data/imdb_train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 33944099 (32M) [text/plain]
Saving to: ‘imdb_train.json’


2021-12-20 15:31:09 (144 MB/s) - ‘imdb_train.json’ saved [33944099/33944099]

--2021-12-20 15:31:09--  http://dl.turkunlp.org/intro-to-nlp.tar.gz
Resolving dl.turkunlp.org (dl.tu

In [2]:
# Import packages
import json
import random
with open("imdb_train.json") as f:
    data = json.load(f)
random.shuffle(data)

# Split data into labels and text
texts = [one_example["text"] for one_example in data]
labels = [one_example["class"] for one_example in data]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Import TfidVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split into 2 different train & test sets for comparison
train_texts, dev_texts, train_labels, dev_labels = train_test_split(texts,labels,test_size=0.2)
train_texts_T, dev_texts_T, train_labels_T, dev_labels_T = train_test_split(texts,labels,test_size=0.2)

# Define vectorizer using CountVectorizer
vectorizer = CountVectorizer(max_features=100000,binary=True,ngram_range=(1,2))
feature_matrix_train = vectorizer.fit_transform(train_texts)
feature_matrix_dev = vectorizer.transform(dev_texts)

# Lets compare with TfidVectorizer
vectorizer_T = TfidfVectorizer(max_features=100000,binary=True,ngram_range=(1,1))
# Same with the new vectorizer
feature_matrix_train_T = vectorizer_T.fit_transform(train_texts_T)
feature_matrix_dev_T = vectorizer_T.transform(dev_texts_T)

print("shape =",feature_matrix_train.shape)
print("shape =", feature_matrix_train_T.shape)

shape = (20000, 100000)
shape = (20000, 68320)




*   We can see that changing the lenght of n-grams already effects the shape of the data, which was previously (20000, 68390)



In [4]:
import tensorflow as tf
import numpy as np

# Function for converting matrix to tensor
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

# Convert basic matrix into tensor
feature_matrix_train_tf = convert_sparse_matrix_to_sparse_tensor(feature_matrix_train)
feature_matrix_dev_tf = convert_sparse_matrix_to_sparse_tensor(feature_matrix_dev)

# Same with TfidVectorizer
feature_matrix_train_tf_T = convert_sparse_matrix_to_sparse_tensor(feature_matrix_train_T)
feature_matrix_dev_tf_T = convert_sparse_matrix_to_sparse_tensor(feature_matrix_dev_T)

Now we have the feature matrix done! Next thing we need is the class labels:

In [5]:
from sklearn.preprocessing import LabelEncoder

# Define the labels for both datasets
label_encoder = LabelEncoder()
class_numbers_train = label_encoder.fit_transform(train_labels)
class_numbers_dev = label_encoder.transform(dev_labels)

# Same procedure with Tfid
class_numbers_train_T = label_encoder.fit_transform(train_labels_T)
class_numbers_dev_T = label_encoder.transform(dev_labels_T)

# Review can be either negative (neg) or positive (pos)
print("class_numbers shape =",class_numbers_train.shape)
print("class labels",label_encoder.classes_)

class_numbers shape = (20000,)
class labels ['neg' 'pos']


In [6]:
from keras.models import Model
from keras.layers import Input, Dense

example_count, feature_count = feature_matrix_train.shape
example_count2 = class_numbers_train.shape[0]
assert example_count == example_count2
class_count = len(label_encoder.classes_)

#Build the network:
inp = Input(shape=(feature_count,))
hidden = Dense(200,activation="tanh")(inp)
outp = Dense(class_count,activation="softmax")(hidden)
model = Model(inputs=[inp], outputs=[outp])

# The same procedures for Tfid
example_count_T, feature_count_T = feature_matrix_train_T.shape
example_count2_T = class_numbers_train_T.shape[0]
assert example_count_T == example_count2_T
class_count_T = len(label_encoder.classes_)

inp_T = Input(shape=(feature_count_T,))
hidden_T = Dense(200,activation="tanh")(inp_T)
outp_T = Dense(class_count,activation="softmax")(hidden_T)
model_T = Model(inputs=[inp_T], outputs=[outp_T])

In [7]:
model.compile(optimizer="sgd",loss="sparse_categorical_crossentropy",metrics=['accuracy'])
model_T.compile(optimizer="sgd",loss="sparse_categorical_crossentropy",metrics=['accuracy'])

A compiled model can be fitted on data:

In [8]:
hist=model.fit(feature_matrix_train_tf,class_numbers_train,\
               validation_data=(feature_matrix_dev_tf,class_numbers_dev),\
               batch_size=100,verbose=1,epochs=5)


Epoch 1/5


  "shape. This may consume a large amount of memory." % value)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [9]:
hist_T=model_T.fit(feature_matrix_train_tf_T,class_numbers_train_T,\
               validation_data=(feature_matrix_dev_tf_T,class_numbers_dev_T),\
               batch_size=100,verbose=1,epochs=5)

Epoch 1/5


  "shape. This may consume a large amount of memory." % value)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




*   Increasing the ngram_range seems to have hurt the accuracy score

*   Accuracy with the original range was in the high 0.9x, where as the increased gives a score in the low 0.9x






In [10]:
print(hist.history["val_accuracy"])
print(hist_T.history["val_accuracy"])

[0.8284000158309937, 0.8529999852180481, 0.8610000014305115, 0.8672000169754028, 0.8709999918937683]
[0.5437999963760376, 0.6326000094413757, 0.703000009059906, 0.7771999835968018, 0.7906000018119812]



*   As we can see, there are some small, almost minimal differences in the results thus far between CountVectorizer and TfidVectorizer.

*   The largest differences seem to be in the loss categories




In [12]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import optimizers
import tensorflow as tf
import os
import pickle

def save_model(file_name,model,label_encoder,vectorizer):
    """Saves model structure and vocabularies"""
    model_json = model.to_json()
    with open(file_name+".model.json", "w") as f:
        print(model_json,file=f)
    with open(file_name+".encoders.pickle","wb") as f:
        pickle.dump((label_encoder,vectorizer),f)
            
# Let's try a different optimizer!
opt = tf.optimizers.Adam()
model.compile(optimizer = opt, loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])
model_T.compile(optimizer = opt, loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])

# Save model and vocabularies, can be done before training
os.makedirs("models", exist_ok = True)
save_model("models/imdb_bow", model, label_encoder, vectorizer)
# Same for Tfid
os.makedirs("models_T",exist_ok = True)
save_model("models_T/imdb_bow", model_T, label_encoder, vectorizer_T)

# Callback function to save weights during training, if validation loss goes down
save_cb = ModelCheckpoint(filepath="models/imdb_bow.weights.h5", monitor='val_loss',\
                        verbose=1, save_best_only=True, mode='auto')
stop_cb = EarlyStopping(patience=2,verbose=1,restore_best_weights=True)

# Same for Tfid
save_cb_T = ModelCheckpoint(filepath="models_T/imdb_bow.weights.h5", monitor='val_loss',\
                        verbose=1, save_best_only=True, mode='auto')
stop_cb_T = EarlyStopping(patience=2,verbose=1,restore_best_weights=True)

hist = model.fit(feature_matrix_train_tf, class_numbers_train,\
               validation_data = (feature_matrix_dev_tf,class_numbers_dev),\
               batch_size = 100,verbose = 1,epochs = 20,\
               callbacks = [save_cb, stop_cb])

hist_T = model_T.fit(feature_matrix_train_tf_T, class_numbers_train_T,\
               validation_data = (feature_matrix_dev_tf_T, class_numbers_dev_T),\
               batch_size = 200, verbose = 1, epochs = 20,\
               callbacks = [save_cb_T, stop_cb_T])

Epoch 1/20


  "shape. This may consume a large amount of memory." % value)


Epoch 00001: val_loss improved from inf to 0.25592, saving model to models/imdb_bow.weights.h5
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.25592
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.25592
Restoring model weights from the end of the best epoch: 1.
Epoch 00003: early stopping
Epoch 1/20


  "shape. This may consume a large amount of memory." % value)


Epoch 00001: val_loss improved from inf to 0.26551, saving model to models_T/imdb_bow.weights.h5
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.26551
Epoch 3/20
Epoch 00003: val_loss did not improve from 0.26551
Restoring model weights from the end of the best epoch: 1.
Epoch 00003: early stopping




*   Again, we can see that there is some difference in the results, although not large.





In [13]:
import numpy
from sklearn.metrics import classification_report, confusion_matrix

#Validation data used during training:
val_instances, val_labels = feature_matrix_dev_tf, class_numbers_dev

# Same for Tfid
val_instances_T, val_labels_T = feature_matrix_dev_tf_T, class_numbers_dev_T

print("Network output=",model.predict(val_instances))
predictions=numpy.argmax(model.predict(val_instances),axis=1)
print("Maximum class for each example=",predictions)
gold=val_labels
gold_T=val_labels_T
conf_matrix=confusion_matrix(list(gold),list(predictions))
print("Confusion matrix=\n",conf_matrix)

predictions_T = numpy.argmax(model_T.predict(val_instances_T),axis=1)
conf_matrix_T = confusion_matrix(list(gold_T),list(predictions_T))
print("Confusion matrix Tfid=\n", conf_matrix_T)

gold_labels=label_encoder.inverse_transform(list(gold))
gold_labels_T=label_encoder.inverse_transform(list(gold_T))

predicted_labels=label_encoder.inverse_transform(list(predictions))
predicted_labels_T=label_encoder.inverse_transform(list(predictions_T))

print(classification_report(gold_labels,predicted_labels))
print(classification_report(gold_labels_T,predicted_labels_T))


Network output= [[7.4333906e-01 2.5666091e-01]
 [1.6530928e-03 9.9834692e-01]
 [9.0480202e-01 9.5197946e-02]
 ...
 [5.8537900e-01 4.1462100e-01]
 [7.1745858e-02 9.2825419e-01]
 [9.9900621e-01 9.9386845e-04]]
Maximum class for each example= [0 1 0 ... 0 1 0]
Confusion matrix=
 [[2275  248]
 [ 287 2190]]
Confusion matrix Tfid=
 [[2172  258]
 [ 309 2261]]
              precision    recall  f1-score   support

         neg       0.89      0.90      0.89      2523
         pos       0.90      0.88      0.89      2477

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

              precision    recall  f1-score   support

         neg       0.88      0.89      0.88      2430
         pos       0.90      0.88      0.89      2570

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89   



*   The differences between CountVectorizer and Tfid are at this point so small that the classification reports look almost identical


