# **Sentiment Classification - Deep Learning - Basic Preprocessing**

# **Prerequisites**

**Install Required Packages**

In [1]:
# install the 'datasets' library 
!pip install datasets -q

# install the 'spacy' library
!pip install spacy -q


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# downloads the small English model (en_core_web_sm) for the Spacy library
!python -m spacy download en_core_web_sm -q

[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


**Load Dataset**

In [3]:
# import the 'load_and_prepare_imdb_dataset' function from the 'imdb_data_loader'
from imdb_data_loader import load_and_prepare_imdb_dataset

# call the 'load_and_prepare_imdb_dataset' function to import the IMDB dataset
trainData, testData = load_and_prepare_imdb_dataset()

# **Dataset Analysis**

**Data Checks**

In [4]:
# call the 'head()' method on the 'trainData' DataFrame to inspect the first five rows
trainData.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [5]:
# get the shape of the 'trainData' DataFrame
trainData.shape

(25000, 2)

In [6]:
# call the 'head()' method on the 'testData' DataFrame to inspect the first five rows
testData.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [7]:
# get the shape of the 'testData' DataFrame
testData.shape

(25000, 2)

**Remove Duplicates**

In [8]:
# calculate the number of duplicated entries in the 'text' column of the 'trainData' DataFrame
trainDataDuplicates = trainData['text'].duplicated().sum()
trainDataDuplicates

96

In [9]:
# remove duplicate rows from 'trainData' based on the 'text' column
noTrainDataDuplicates = trainData.drop_duplicates(subset='text')
noTrainDataDuplicatesShape = noTrainDataDuplicates.shape

noTrainDataDuplicatesShape

(24904, 2)

In [10]:
# calculate the number of duplicated entries in the 'text' column of the 'testData' DataFrame
testDataDuplicates = testData['text'].duplicated().sum()
testDataDuplicates

199

In [11]:
# remove duplicate rows from 'testData' based on the 'text' column
noTestDataDuplicates = testData.drop_duplicates(subset='text')
noTestDataDuplicatesShape = noTestDataDuplicates.shape

noTestDataDuplicatesShape

(24801, 2)

# **Basic Preprocessing**

In [12]:
# import the preprocessing function 'preprocess_basic' from 'basic_preprocessing'

# Convert text to lowercase.
# Remove HTML tags using BeautifulSoup.
# Handle contractions.
# Expand acronyms.
# Tokenize the text using SpaCy.
# Remove punctuation tokens.
# Remove non-alphabetic characters.

from basic_preprocessing import preprocess_basic

In [13]:
# create copies of the noTrainDataDuplicates and noTestDataDuplicates DataFrames
rawtrainData = noTrainDataDuplicates.copy()
rawtestData = noTestDataDuplicates.copy()

In [14]:
# apply the 'preprocess_basic' function to each text entry in the 'rawtrainData' DataFrame
trainDataBasic = rawtrainData['text'].apply(preprocess_basic)

  text = BeautifulSoup(text, "html.parser").get_text()


In [15]:
# print the first three preprocessed text entries (train data) to verify the preprocessing step
for index, value in trainDataBasic.items():
    print(f"Index {index}: {value}")
    if index == 2:
        break

Index 0: i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in i also heard that at first it was seized by customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married kills me about i am curious yellow is that years ago this was considered pornographic really the sex and nudity scenes are few and far between even then it has it is not shot like some cheaply made porno whi

In [16]:
# apply the 'preprocess_basic' function to each text entry in the 'rawtestData' DataFrame

testDataBasic = rawtestData['text'].apply(preprocess_basic)

In [17]:
# print the first three preprocessed text entries (test data) to verify the preprocessing step
for index, value in testDataBasic.items():
    print(f"Index {index}: {value}")
    if index == 2:
        break

Index 0: i love sci fi and am willing to put up with a lot sci fi movies tv are usually underfunded under appreciated and misunderstood i tried to like this i really did but it is to good tv sci fi as babylon is to star trek the original silly prosthetics cheap cardboard sets stilted dialogues cg that does not match the background and painfully one dimensional characters can not be overcome with a sci fi setting i sure there are those of you out there who think babylon is good sci fi tv it has it is not it has it is clichéd and uninspiring while us viewers might like emotion and character development sci fi is a genre that does not take itself seriously cf star trek it may treat important issues yet not as a serious philosophy it has it is really difficult to care about the characters here as they are not simply foolish just missing a spark of life their actions and reactions are wooden and predictable often painful to watch the makers of earth know it has it is rubbish as they have to

# **Feature Extraction**

In [18]:
from gensim_model_downloader import download_and_save_model

**Word2Vec**

In [19]:
from gensim_model_api import load_gensim_model


In [21]:
# download_and_save_model('word2vec-google-news-300', 'word2vec_vector.kv')
word2vec_wv = load_gensim_model('word2vec_vector.kv')

**Glove**

In [22]:
from gensim_model_api import load_gensim_model

In [24]:
# download_and_save_model('glove-wiki-gigaword-300', 'glove_vector.kv')
glove_wv = load_gensim_model('glove_vector.kv')

# **Tokenisation and Padding**

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np

In [26]:
# nitialize and fit tokenizer on training data, then convert train and test data into integer sequences for model input
tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainDataBasic)

trainToken = tokenizer.texts_to_sequences(trainDataBasic)
testToken = tokenizer.texts_to_sequences(testDataBasic)

In [27]:
# calculate the 90th percentile of train sequence lengths to set max length, then pad train and test sequences to this length
sequence_lengths = [len(x) for x in trainToken]
percentile_90 = np.percentile(sequence_lengths, 90)
max_length = int(percentile_90)

trainPadded = pad_sequences(trainToken, maxlen=max_length, padding='post')
testPadded = pad_sequences(testToken, maxlen=max_length, padding='post')

In [28]:
embedding_dim = 300

In [29]:
# create an embedding matrix by mapping each word in the tokenizer's word index to its corresponding word2vec vector
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_wv:
        embedding_vector = word2vec_wv[word]
        embedding_matrix[i] = embedding_vector

In [30]:
# initialize a non-trainable Embedding layer using the precomputed word2vec embedding matrix with specified dimensions
embedding_word2vec = Embedding(len(tokenizer.word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [31]:
# build an embedding matrix for the tokenizer's vocabulary using GloVe word vectors
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_wv:
        embedding_vector = glove_wv[word]
        embedding_matrix[i] = embedding_vector

In [32]:
# define a non-trainable GloVe-based Embedding layer for the model, using the pre-built embedding matrix
embedding_glove = Embedding(len(tokenizer.word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

# **Classification**

In [33]:
# custom F1 Score metric class for TensorFlow, combining precision and recall calculations
import tensorflow as tf

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()

**Convolution Neural Network**

In [34]:
# construct a CNN model with Word2Vec embeddings, convolution, max pooling, dense layers, and compile with a custom F1 Score metric
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout

model_cnn = Sequential()
model_cnn.add(embedding_word2vec)
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(10, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [35]:
# train the CNN model on padded training data and labels for 5 epochs with a 20% validation split

model_cnn.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5

  m.reset_state()


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x26b37218150>

In [37]:
# evaluate the CNN model on test data and print the F1 Score for the Word2Vec-based model
loss, f1_cnn_word2vec = model_cnn.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score CNN - Word2Vec: {f1_cnn_word2vec}")

Test F1 Score CNN - Word2Vec: 0.8839179277420044


In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout

model_cnn = Sequential()
model_cnn.add(embedding_glove)
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(10, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [39]:
model_cnn.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x26b714f4ed0>

In [40]:
loss, f1_cnn_glove = model_cnn.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score CNN - Glove: {f1_cnn_glove}")

Test F1 Score CNN - Glove: 0.8742826581001282


**Convolution Neural Network - Long Short-Term Memory**

In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM

model_cnn_lstm = Sequential()
model_cnn_lstm.add(embedding_word2vec)
model_cnn_lstm.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn_lstm.add(MaxPooling1D(pool_size=4))
model_cnn_lstm.add(LSTM(128))
model_cnn_lstm.add(Dense(10, activation='relu'))
model_cnn_lstm.add(Dropout(0.5))
model_cnn_lstm.add(Dense(1, activation='sigmoid'))

model_cnn_lstm.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [42]:
model_cnn_lstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x26b717adb90>

In [43]:
loss, f1_cnn_lstm_word2vec = model_cnn_lstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - Word2Vec: {f1_cnn_lstm_word2vec}")

Test F1 Score - Word2Vec: 0.8866626024246216


In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D

model_cnn_lstm = Sequential()
model_cnn_lstm.add(embedding_glove)
model_cnn_lstm.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn_lstm.add(MaxPooling1D(pool_size=4))
model_cnn_lstm.add(LSTM(128))
model_cnn_lstm.add(Dense(10, activation='relu'))

model_cnn_lstm.add(Dense(1, activation='sigmoid'))

model_cnn_lstm.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [52]:
model_cnn_lstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x26ca3308e50>

In [53]:
loss, f1_cnn_lstm_glove = model_cnn_lstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - Glove: {f1_cnn_lstm_glove}")

Test F1 Score - Glove: 0.8586170673370361


**Bidirectional Long Short-Term Memory**

In [56]:
from tensorflow.keras.layers import Bidirectional

model_bilstm = Sequential()
model_bilstm.add(embedding_word2vec)
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(10, activation='relu'))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.compile(optimizer='adam', 
                     loss='binary_crossentropy', 
                     metrics=[F1Score()])

In [57]:
model_bilstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x26ca6a83990>

In [58]:
loss, f1_bilstm_word2vec = model_bilstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - BiLSTM Word2Vec: {f1_bilstm_word2vec}")

Test F1 Score - BiLSTM Word2Vec: 0.8668519258499146


In [61]:
from tensorflow.keras.layers import Bidirectional

model_bilstm = Sequential()
model_bilstm.add(embedding_glove)
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(10, activation='relu'))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.compile(optimizer='adam', 
                     loss='binary_crossentropy', 
                     metrics=[F1Score()])

In [62]:
model_bilstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x26c7c607f90>

In [63]:
loss, f1_bilstm_glove = model_bilstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - BiLSTM Word2Vec: {f1_bilstm_glove}")

Test F1 Score - BiLSTM Word2Vec: 0.8870954513549805


# **Results**

In [65]:
from model_results import generate_dl_results_df

In [66]:
resultsDF = generate_dl_results_df(f1_cnn_word2vec, f1_cnn_glove, f1_bilstm_word2vec, f1_bilstm_glove, f1_cnn_lstm_word2vec, f1_cnn_lstm_glove)
print(resultsDF)

      Model Embedding  F1-score
0       CNN  Word2Vec  0.883918
1       CNN     GloVe  0.874283
2    BILSTM  Word2Vec  0.866852
3    BILSTM     GloVe  0.887095
4  CNN-LSTM  Word2Vec  0.886663
5  CNN-LSTM     GloVe  0.858617


In [67]:
resultsDF.to_csv("final_dl_models_basic_prep.csv", index=False)