# **Sentiment Classification - Deep Learning - Lemma Preprocessing**

# **Prerequisites**

**Install Required Packages**

In [1]:
!pip install datasets -q
!pip install spacy -q


[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!python -m spacy download en_core_web_sm -q

[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


**Load Dataset**

In [3]:
from imdb_data_loader import load_and_prepare_imdb_dataset

trainData, testData = load_and_prepare_imdb_dataset()

# **Dataset Analysis**

**Data Checks**

In [4]:
trainData.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [5]:
trainData.shape

(25000, 2)

In [6]:
testData.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [7]:
testData.shape

(25000, 2)

**Remove Duplicates**

In [8]:
trainDataDuplicates = trainData['text'].duplicated().sum()

trainDataDuplicates

96

In [9]:
noTrainDataDuplicates = trainData.drop_duplicates(subset='text')

noTrainDataDuplicatesShape = noTrainDataDuplicates.shape

noTrainDataDuplicatesShape

(24904, 2)

In [10]:
testDataDuplicates = testData['text'].duplicated().sum()

testDataDuplicates

199

In [11]:
noTestDataDuplicates = testData.drop_duplicates(subset='text')

noTestDataDuplicatesShape = noTestDataDuplicates.shape

noTestDataDuplicatesShape

(24801, 2)

# **Lemma Preprocessing**

In [12]:
from lemma_preprocessing import preprocess_lemma

In [13]:
rawtrainData = noTrainDataDuplicates.copy()
rawtestData = noTestDataDuplicates.copy()

In [14]:
trainDataLemma = rawtrainData['text'].apply(preprocess_lemma)

  text = BeautifulSoup(text, "html.parser").get_text()


In [15]:
for index, value in trainDataLemma.items():
    print(f"Index {index}: {value}")
    if index == 2:
        break

Index 0: I rent I be curious yellow from my video store because of all the controversy that surround it when it be first release in 1967 I also hear that at first it be seize by you.s customs if it ever try to enter this country therefore be a fan of film consider controversial I really have to see this for myself.the plot be center around a young swedish drama student name lena who want to learn everything she can about life in particular she want to focus her attention to make some sort of documentary on what the average swede think about certain political issue such as the vietnam war and race issue in the united states in between ask politician and ordinary denizen of stockholm about their opinion on politic she have sex with her drama teacher classmate and marry men.what kill I about I be curious yellow be that 40 year ago this be consider pornographic really the sex and nudity scene be few and far between even then it have it be not shoot like some cheaply make porno while my cou

In [16]:
testDataLemma = rawtestData['text'].apply(preprocess_lemma)

In [17]:
for index, value in testDataLemma.items():
    print(f"Index {index}: {value}")
    if index == 2:
        break

Index 0: I love sci fi and be willing to put up with a lot sci fi movie tv be usually underfunded under appreciated and misunderstood I try to like this I really do but it be to good tv sci fi as babylon 5 be to star trek the original silly prosthetic cheap cardboard set stilte dialogue cg that do not match the background and painfully one dimensional character can not be overcome with a sci fi setting I be sure there be those of you out there who think babylon 5 be good sci fi tv it have it be not it have it be clichéd and uninspire while we viewer might like emotion and character development sci fi be a genre that do not take itself seriously cf star trek it may treat important issue yet not as a serious philosophy it have it be really difficult to care about the character here as they be not simply foolish just miss a spark of life their action and reaction be wooden and predictable often painful to watch the maker of earth know it have it be rubbish as they have to always say gene 

# **Feature Extraction**

In [18]:
from gensim_model_downloader import download_and_save_model

**Word2Vec**

In [19]:
from gensim_model_api import load_gensim_model


In [20]:

# download_and_save_model('word2vec-google-news-300', 'word2vec_vector.kv')

In [21]:
word2vec_wv = load_gensim_model('word2vec_vector.kv')

**Glove**

In [22]:
from gensim_model_api import load_gensim_model

In [23]:
# download_and_save_model('glove-wiki-gigaword-300', 'glove_vector.kv')


In [24]:
glove_wv = load_gensim_model('glove_vector.kv')

# **Tokenisation and Padding**

In [25]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np

In [26]:


tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainDataLemma)

trainToken = tokenizer.texts_to_sequences(trainDataLemma)
testToken = tokenizer.texts_to_sequences(testDataLemma)

In [27]:
sequence_lengths = [len(x) for x in trainToken]
percentile_90 = np.percentile(sequence_lengths, 90)
max_length = int(percentile_90)

trainPadded = pad_sequences(trainToken, maxlen=max_length, padding='post')
testPadded = pad_sequences(testToken, maxlen=max_length, padding='post')

In [28]:
embedding_dim = 300

In [29]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec_wv:
        embedding_vector = word2vec_wv[word]
        embedding_matrix[i] = embedding_vector

In [30]:
embedding_word2vec = Embedding(len(tokenizer.word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [31]:
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in glove_wv:
        embedding_vector = glove_wv[word]
        embedding_matrix[i] = embedding_vector

In [32]:
embedding_glove = Embedding(len(tokenizer.word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

# **Classification**

In [33]:
import tensorflow as tf

class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        self.precision.update_state(y_true, y_pred, sample_weight)
        self.recall.update_state(y_true, y_pred, sample_weight)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + tf.keras.backend.epsilon()))

    def reset_states(self):
        self.precision.reset_states()
        self.recall.reset_states()

**Convolution Neural Network**

In [60]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout

model_cnn = Sequential()
model_cnn.add(embedding_word2vec)
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(10, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [61]:
model_cnn.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1ee9050d710>

In [62]:
loss, f1_cnn_word2vec = model_cnn.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score CNN - Word2Vec: {f1_cnn_word2vec}")

Test F1 Score CNN - Word2Vec: 0.8838421106338501


In [65]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout

model_cnn = Sequential()
model_cnn.add(embedding_glove)
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(10, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [66]:
model_cnn.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1ee9e019b10>

In [67]:
loss, f1_cnn_glove = model_cnn.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score CNN - Glove: {f1_cnn_glove}")

Test F1 Score CNN - Glove: 0.8101916909217834


**Convolution Neural Network - Long Short-Term Memory**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM

# Assuming 'embedding_layer' is your pre-initialized Embedding layer
model_cnn_lstm = Sequential()
model_cnn_lstm.add(embedding_word2vec)

# Convolutional Layer
model_cnn_lstm.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn_lstm.add(MaxPooling1D(pool_size=4))

# LSTM Layer
model_cnn_lstm.add(LSTM(128))

# Fully Connected Layers
model_cnn_lstm.add(Dense(10, activation='relu'))

model_cnn_lstm.add(Dense(1, activation='sigmoid'))  # Use 'softmax' if you have multiple classes

model_cnn_lstm.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])

In [71]:
model_cnn_lstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1eeaaac05d0>

In [72]:
loss, f1_cnn_lstm_word2vec = model_cnn_lstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - Word2Vec: {f1_cnn_lstm_word2vec}")

Test F1 Score - Word2Vec: 0.889437198638916


In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM

# Assuming 'embedding_layer' is your pre-initialized Embedding layer
model_cnn_lstm = Sequential()
model_cnn_lstm.add(embedding_glove)

# Convolutional Layer
model_cnn_lstm.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model_cnn_lstm.add(MaxPooling1D(pool_size=4))

# LSTM Layer
model_cnn_lstm.add(LSTM(128))

# Fully Connected Layers
model_cnn_lstm.add(Dense(10, activation='relu'))

model_cnn_lstm.add(Dense(1, activation='sigmoid'))  # Use 'softmax' if you have multiple classes

model_cnn_lstm.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=[F1Score()])


In [74]:
model_cnn_lstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1eecb0ab6d0>

In [75]:
loss, f1_cnn_lstm_glove = model_cnn_lstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - Glove: {f1_cnn_lstm_glove}")

Test F1 Score - Glove: 0.8511236906051636


**Bidirectional Long Short-Term Memory**

In [76]:
from tensorflow.keras.layers import Bidirectional

model_bilstm = Sequential()
model_bilstm.add(embedding_word2vec)
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(10, activation='relu'))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.compile(optimizer='adam', 
                     loss='binary_crossentropy', 
                     metrics=[F1Score()])

In [77]:
model_bilstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1eeaab06d50>

In [78]:
loss, f1_bilstm_word2vec = model_bilstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - BiLSTM Word2Vec: {f1_bilstm_word2vec}")

Test F1 Score - BiLSTM Word2Vec: 0.8508748412132263


In [79]:
from tensorflow.keras.layers import Bidirectional

model_bilstm = Sequential()
model_bilstm.add(embedding_glove)
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(10, activation='relu'))
model_bilstm.add(Dense(1, activation='sigmoid'))

model_bilstm.compile(optimizer='adam', 
                     loss='binary_crossentropy', 
                     metrics=[F1Score()])

In [80]:
model_bilstm.fit(trainPadded, rawtrainData['label'], epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1eec4c1b710>

In [81]:
loss, f1_bilstm_glove = model_bilstm.evaluate(testPadded, rawtestData['label'])
print(f"Test F1 Score - BiLSTM Glove: {f1_bilstm_glove}")

Test F1 Score - BiLSTM Glove: 0.870068371295929


# **Results**

In [82]:
from model_results import generate_dl_results_df

In [83]:
resultsDF = generate_dl_results_df(f1_cnn_word2vec, f1_cnn_glove, f1_bilstm_word2vec, f1_bilstm_glove, f1_cnn_lstm_word2vec, f1_cnn_lstm_glove)
print(resultsDF)

      Model Embedding  F1-score
0       CNN  Word2Vec  0.883842
1       CNN     GloVe  0.810192
2    BILSTM  Word2Vec  0.850875
3    BILSTM     GloVe  0.870068
4  CNN-LSTM  Word2Vec  0.889437
5  CNN-LSTM     GloVe  0.851124


In [84]:
resultsDF.to_csv("final_dl_models_lemma_prep.csv", index=False)