<a href="https://colab.research.google.com/github/mazenmagdii/IMDB-Sentiment-Classification/blob/main/IMDB_Keras_with_pretrained_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,GRU
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten,BatchNormalization

In [2]:
from gensim.models import Word2Vec
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=20000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [3]:
X=np.concatenate((x_train,x_test))
Y=np.concatenate((y_train,y_test))
x_train,x_temp,y_train,y_temp=train_test_split(X,Y,test_size=0.3,shuffle=True,random_state=42)
x_val,x_test,y_val,y_test=train_test_split(x_temp,y_temp,test_size=0.5,shuffle=True,random_state=42)

In [4]:
word_i= imdb.get_word_index()
reverse_word_i=dict([(value,key) for (key,value) in word_i.items()])
def decode_review(encoded_review):
  return ' '.join([reverse_word_i.get(i-3,'?') for i in encoded_review])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


In [5]:
train_texts = [decode_review(review) for review in x_train]
val_texts = [decode_review(review) for review in x_val]
test_texts = [decode_review(review) for review in x_test]

In [6]:
tok=Tokenizer(num_words=20000)
tok.fit_on_texts(train_texts)

In [7]:
X_train_seq = tok.texts_to_sequences(train_texts)
X_val_seq = tok.texts_to_sequences(val_texts)
X_test_seq = tok.texts_to_sequences(test_texts)

In [8]:
max_length = 500
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

In [9]:
train_sentences = [review.split() for review in train_texts]
word2vec_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)

In [10]:
embedding_matrix = np.zeros((20000, 100))
for word, i in tok.word_index.items():
    if i < 10000:
        try:
            embedding_vector = word2vec_model.wv[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            continue

LSTM

In [13]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=False, mask_zero=True),
    SpatialDropout1D(0.2),
    LSTM(110, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')

])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])



In [21]:
model.save('best_model.keras')

In [15]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=True, patience=10)
mc = ModelCheckpoint('best_weights.keras', monitor='val_accuracy', mode='max', verbose=True, save_best_only=True)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.25, patience=3, verbose=True,min_lr=0.0000001 )

In [17]:
history = model.fit(X_train_pad, y_train, epochs=30, batch_size=64, validation_data=(X_val_pad, y_val), verbose=True, callbacks=[es,mc,rlr])

Epoch 1/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 925ms/step - accuracy: 0.7576 - loss: 0.4905
Epoch 1: val_accuracy improved from -inf to 0.87400, saving model to best_weights.keras
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m545s[0m 992ms/step - accuracy: 0.7577 - loss: 0.4903 - val_accuracy: 0.8740 - val_loss: 0.3069 - learning_rate: 0.0100
Epoch 2/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 928ms/step - accuracy: 0.8527 - loss: 0.3463
Epoch 2: val_accuracy improved from 0.87400 to 0.87733, saving model to best_weights.keras
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m569s[0m 1s/step - accuracy: 0.8527 - loss: 0.3463 - val_accuracy: 0.8773 - val_loss: 0.2965 - learning_rate: 0.0100
Epoch 3/30
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 934ms/step - accuracy: 0.8624 - loss: 0.3224
Epoch 3: val_accuracy improved from 0.87733 to 0.88200, saving model to best_weights.keras
[1m547/5

In [18]:
# Evaluate the model on the test set
x_test_seq = tok.texts_to_sequences([' '.join([reverse_word_i.get(i - 3, '?') for i in review]) for review in x_test])
x_test_padded = pad_sequences(x_test_seq, maxlen=max_length)
test_loss, test_accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 139ms/step - accuracy: 0.8991 - loss: 0.2541
Test Accuracy: 0.8955


In [20]:
test_review = ["The movie wasn't good, i didn't enjoy watching it.It was a horrible one."]
review_sequence = tok.texts_to_sequences(test_review)
padded_review = pad_sequences(review_sequence, maxlen=max_length)

prediction = model.predict(padded_review)

print(f"Prediction: {prediction[0]}")

if prediction >= 0.5:
    print("Positive review!")
else:
    print("Negative review!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Prediction: [0.01208363]
Negative review!


GRU+CNN

In [None]:

model1 = Sequential([
      Embedding(input_dim=20000, output_dim=100, weights=[embedding_matrix], input_length=max_length, trainable=False, mask_zero=True),
      SpatialDropout1D(0.35),
      Conv1D(filters=70, kernel_size=3, activation='relu',padding='valid',strides=1),
      MaxPooling1D(pool_size=4),
      GRU(100, recurrent_dropout=0.1),
      BatchNormalization(),
      Dense(1, activation='sigmoid')
])
model1.summary()
# Compile the model
model1.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001,clipnorm=1.0), metrics=['accuracy'])

In [None]:
model1.load_weights('best_model1.keras')

In [None]:
mc1 = ModelCheckpoint('best_model1.keras', monitor='val_accuracy', mode='max', verbose=True, save_best_only=True)


In [None]:
history1 = model1.fit(X_train_pad, y_train, epochs=50, batch_size=32, validation_data=(X_val_pad, y_val), verbose=True, callbacks=[es,mc1,rlr])

Epoch 1/50




[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step - accuracy: 0.5634 - loss: 0.7291
Epoch 1: val_accuracy improved from -inf to 0.50747, saving model to best_model1.keras
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 167ms/step - accuracy: 0.5635 - loss: 0.7290 - val_accuracy: 0.5075 - val_loss: 0.6899 - learning_rate: 0.0010
Epoch 2/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step - accuracy: 0.7124 - loss: 0.5775
Epoch 2: val_accuracy improved from 0.50747 to 0.70293, saving model to best_model1.keras
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 167ms/step - accuracy: 0.7124 - loss: 0.5775 - val_accuracy: 0.7029 - val_loss: 0.6868 - learning_rate: 0.0010
Epoch 3/50
[1m1094/1094[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - accuracy: 0.7575 - loss: 0.5201
Epoch 3: val_accuracy improved from 0.70293 to 0.77693, saving model to best_model1.keras
[1m1094/1

In [None]:
x_test_seq = tok.texts_to_sequences([' '.join([reverse_word_i.get(i - 3, '?') for i in review]) for review in x_test])
x_test_padded = pad_sequences(x_test_seq, maxlen=max_length)
test_loss, test_accuracy = model1.evaluate(x_test_padded, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 51ms/step - accuracy: 0.8169 - loss: 0.5209
Test Accuracy: 0.8156
