# **Sentiment Classification - Deep Learning - Stopwords Preprocessing**

# **Prerequisites**

**Install Required Packages**

In [1]:
!pip install datasets -q
!pip install spacy -q


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!python -m spacy download en_core_web_sm -q

[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


**Load Dataset**

In [3]:
from imdb_data_loader import load_and_prepare_imdb_dataset

trainData, testData = load_and_prepare_imdb_dataset()

# **Dataset Analysis**

**Data Checks**

In [4]:
trainData.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [5]:
trainData.shape

(25000, 2)

In [6]:
testData.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [7]:
testData.shape

(25000, 2)

**Remove Duplicates**

In [8]:
trainDataDuplicates = trainData['text'].duplicated().sum()

trainDataDuplicates

96

In [9]:
noTrainDataDuplicates = trainData.drop_duplicates(subset='text')

noTrainDataDuplicatesShape = noTrainDataDuplicates.shape

noTrainDataDuplicatesShape

(24904, 2)

In [10]:
testDataDuplicates = testData['text'].duplicated().sum()

testDataDuplicates

199

In [11]:
noTestDataDuplicates = testData.drop_duplicates(subset='text')

noTestDataDuplicatesShape = noTestDataDuplicates.shape

noTestDataDuplicatesShape

(24801, 2)

# **Stopwords Preprocessing**

In [12]:
from stopwords_preprocessing import preprocess_sw

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\momo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
rawtrainData = noTrainDataDuplicates.copy()
rawtestData = noTestDataDuplicates.copy()

In [14]:
trainDataSw = rawtrainData['text'].apply(preprocess_sw)

  text = BeautifulSoup(text, "html.parser").get_text()


In [15]:
for index, value in trainDataSw.items():
    print(f"Index {index}: {value}")
    if index == 2:
        break

Index 0: rented curious yellow video store controversy surrounded first released also heard first seized customs ever tried enter country therefore fan films considered controversial really see plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married kills curious yellow years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornographic theaters america curious yellow good film anyone wanting study meat potatoes pun intended swedi

In [16]:
testDataSw = rawtestData['text'].apply(preprocess_sw)

In [17]:
for index, value in testDataSw.items():
    print(f"Index {index}: {value}")
    if index == 2:
        break

Index 0: love sci fi willing put lot sci fi movies tv usually underfunded appreciated misunderstood tried like really good tv sci fi babylon star trek original silly prosthetics cheap cardboard sets stilted dialogues cg match background painfully one dimensional characters overcome sci fi setting sure think babylon good sci fi tv clichéd uninspiring us viewers might like emotion character development sci fi genre take seriously cf star trek may treat important issues yet serious philosophy really difficult care characters simply foolish missing spark life actions reactions wooden predictable often painful watch makers earth know rubbish always say gene roddenberry earth otherwise people would continue watching roddenberry ashes must turning orbit dull cheap poorly edited watching without advert breaks really brings home trudging trabant show lumbers space spoiler kill main character bring back another actor jeeez dallas
Index 1: worth entertainment value rental especially like action m

# **Feature Extraction**

In [18]:
from gensim_model_downloader import download_and_save_model

**Word2Vec**

In [19]:
from gensim_model_api import load_gensim_model


In [20]:

# download_and_save_model('word2vec-google-news-300', 'word2vec_vector.kv')

In [21]:
word2vec_wv = load_gensim_model('word2vec_vector.kv')

**Glove**

In [22]:
from gensim_model_api import load_gensim_model

In [23]:
# download_and_save_model('glove-wiki-gigaword-300', 'glove_vector.kv')


In [24]:
glove_wv = load_gensim_model('glove_vector.kv')

# **Tokenisation and Padding**

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np

In [26]:


tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainDataSw)

trainToken = tokenizer.texts_to_sequences(trainDataSw)
testToken = tokenizer.texts_to_sequences(testDataSw)

In [27]:
sequence_lengths = [len(x) for x in trainToken]
percentile_90 = np.percentile(sequence_lengths, 90)
max_length = int(percentile_90)

trainPadded = pad_sequences(trainToken, maxlen=max_length, padding='post')
testPadded = pad_sequences(testToken, maxlen=max_length, padding='post')

In [28]:
embedding_dim = 300

In [29]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_wv:
        embedding_vector = word2vec_wv[word]
        embedding_matrix[i] = embedding_vector

In [30]:
embedding_word2vec = Embedding(len(tokenizer.word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [31]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_wv:
        embedding_vector = glove_wv[word]
        embedding_matrix[i] = embedding_vector

In [32]:
embedding_glove = Embedding(len(tokenizer.word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

# **Classification**

In [33]:
import tensorflow as tf

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()

**Convolution Neural Network**

In [85]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout

model_cnn = Sequential()
model_cnn.add(embedding_word2vec)
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(10, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [86]:
model_cnn.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x25878779190>

In [119]:
model_cnn.save('model_cnn_word2vec.keras')

In [87]:
loss, f1_cnn_word2vec = model_cnn.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score CNN - Word2Vec: {f1_cnn_word2vec}")

Test F1 Score CNN - Word2Vec: 0.873186469078064


In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout

model_cnn = Sequential()
model_cnn.add(embedding_glove)
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(10, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [38]:
model_cnn.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x256f1df2890>

In [118]:
model_cnn.save('model_cnn_glove.keras')

In [39]:
loss, f1_cnn_glove = model_cnn.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score CNN - Glove: {f1_cnn_glove}")

Test F1 Score CNN - Glove: 0.8552526831626892


**Convolution Neural Network - Long Short-Term Memory**

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM

model_cnn_lstm = Sequential()
model_cnn_lstm.add(embedding_word2vec)
model_cnn_lstm.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn_lstm.add(MaxPooling1D(pool_size=4))
model_cnn_lstm.add(LSTM(128))
model_cnn_lstm.add(Dense(10, activation='relu'))
model_cnn_lstm.add(Dropout(0.5))
model_cnn_lstm.add(Dense(1, activation='sigmoid'))

model_cnn_lstm.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [43]:
model_cnn_lstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x256f1fe1b10>

In [117]:
model_cnn_lstm.save('model_cnn_lstm_word2vec.keras')

In [44]:
loss, f1_cnn_lstm_word2vec = model_cnn_lstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - Word2Vec: {f1_cnn_lstm_word2vec}")

Test F1 Score - Word2Vec: 0.8343249559402466


In [75]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM

model_cnn_lstm = Sequential()
model_cnn_lstm.add(embedding_glove)

# Convolutional Layer
model_cnn_lstm.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn_lstm.add(MaxPooling1D(pool_size=4))

# LSTM Layer
model_cnn_lstm.add(LSTM(128))

# Fully Connected Layers
model_cnn_lstm.add(Dense(10, activation='relu'))

model_cnn_lstm.add(Dense(1, activation='sigmoid'))

model_cnn_lstm.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [76]:
model_cnn_lstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x25873d8c050>

In [116]:
model_cnn_lstm.save('model_cnn_lstm_glove.keras')

In [77]:
loss, f1_cnn_lstm_glove = model_cnn_lstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - Glove: {f1_cnn_lstm_glove}")

Test F1 Score - Glove: 0.8495842814445496


**Bidirectional Long Short-Term Memory**

In [68]:
from tensorflow.keras.layers import Bidirectional

model_bilstm = Sequential()
model_bilstm.add(embedding_word2vec)
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(10, activation='relu'))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.compile(optimizer='adam', 
                     loss='binary_crossentropy', 
                     metrics=[F1Score()])


In [69]:
model_bilstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x2582d09b3d0>

In [115]:
model_bilstm.save('model_bilstm_word2vec.keras')

In [72]:
loss, _, f1_bilstm_word2vec = model_bilstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - BiLSTM Word2Vec: {f1_bilstm_word2vec}")

Test F1 Score - BiLSTM Word2Vec: 0.8778058290481567


In [78]:
from tensorflow.keras.layers import Bidirectional

model_bilstm = Sequential()
model_bilstm.add(embedding_word2vec)
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(10, activation='relu'))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.compile(optimizer='adam', 
                     loss='binary_crossentropy', 
                     metrics=[F1Score()])

In [79]:
model_bilstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x25875e9a190>

In [114]:
model_bilstm.save('model_bilstm_glove.keras')

In [80]:
loss, f1_bilstm_glove = model_bilstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - BiLSTM Glove: {f1_bilstm_glove}")

Test F1 Score - BiLSTM Glove: 0.8777210116386414


# **Results**

In [107]:
from model_results import generate_dl_results_df

In [112]:
f1_cnn_word2vec, f1_cnn_glove, f1_bilstm_word2vec, f1_bilstm_glove, f1_cnn_lstm_word2vec, f1_cnn_lstm_glove

(0.873186469078064,
 0.8552526831626892,
 0.8778058290481567,
 0.8777210116386414,
 0.8343249559402466,
 0.8495842814445496)

In [None]:
resultsDF = generate_dl_results_df(f1_cnn_word2vec, f1_cnn_glove, f1_bilstm_word2vec, f1_bilstm_glove, f1_cnn_lstm_word2vec, f1_cnn_lstm_glove)
print(resultsDF)

In [None]:
resultsDF.to_csv("final_dl_models_stopwords_prep.csv", index=False)