In [None]:
# Legacy Colab switch that requests TensorFlow 2.x for the runtime. According
# to the message this cell prints, the magic no longer has any effect on Colab.
try:
  %tensorflow_version 2;
except Exception:
  # Best-effort only: any failure of the magic is deliberately ignored
  # (presumably so the cell also runs outside Colab — TODO confirm).
  pass
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
# --- Text preprocessing ---------------------------------------------------
vocab_size = 30000      # cap on the tokenizer vocabulary (passed as num_words)
oov_tok = '<OOV>'       # placeholder token for words outside the vocabulary
max_length = 220        # every review is padded/truncated to this many tokens
padding_type = 'post'   # forwarded to pad_sequences(padding=...)
trunc_type = 'post'     # forwarded to pad_sequences(truncating=...)

# --- Model ----------------------------------------------------------------
embedding_dim = 16      # length of the learned vector for each word

# --- Dataset split --------------------------------------------------------
training_size = 45000   # number of samples used for training
test_size = 5000        # number of samples held out for testing

In [None]:
import pandas as pd

# Read the IMDB reviews CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/IMDBDataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Encode the sentiment column as integers: 'positive' -> 1, 'negative' -> 0.
df['sentiment'] = df['sentiment'].replace(
    to_replace={'positive': 1, 'negative': 0},
)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Pull the review texts and their labels out of the DataFrame as plain lists.
sentences, output = (df[column].tolist() for column in ('review', 'sentiment'))

In [None]:
# Sequential (unshuffled) split: the first `training_size` rows train the
# model, every remaining row is held out for testing.
training_sentences, test_sentences = sentences[:training_size], sentences[training_size:]
training_output, test_output = output[:training_size], output[training_size:]

In [None]:
# Fit the vocabulary on the training reviews only, then encode both splits as
# fixed-length sequences of integer token ids.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)

# Dictionary mapping each word seen during fitting to its integer index.
word_index = tokenizer.word_index

# Training split: words -> indices, then pad/truncate each review to max_length.
sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Test split: encoded with the tokenizer that was fitted on the training split.
testing_sequences = tokenizer.texts_to_sequences(test_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
import numpy as np

# Convert features and labels to NumPy arrays before handing them to model.fit.
training_padded, testing_padded = np.array(training_padded), np.array(testing_padded)
training_output, test_output = np.array(training_output), np.array(test_output)


In [None]:
# Bag-of-embeddings sentiment classifier:
#   token ids -> Embedding -> average over the sequence -> Dense(6) -> sigmoid
model = tf.keras.Sequential([
    # Declare the input shape explicitly so the model is built immediately and
    # model.summary() can report shapes and parameter counts. This replaces
    # Embedding's `input_length` argument, which is deprecated in Keras 3.
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # Collapse the sequence axis by averaging the word vectors of a review.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    # Single sigmoid unit: output is the predicted probability of label 1.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
# Binary cross-entropy pairs with the single sigmoid output and 0/1 labels;
# accuracy is tracked as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 220, 16)           480000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 6)                 102       
                                                                 
 dense_1 (Dense)             (None, 1)                 7         
                                                                 
Total params: 480109 (1.83 MB)
Trainable params: 480109 (1.83 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
num_epochs = 30  # upper bound on epochs; early stopping normally ends sooner

# The recorded training log shows val_loss reaching its minimum around epoch 3
# and climbing afterwards while training loss keeps falling (overfitting).
# Stop once val_loss has not improved for 3 epochs and roll the model back to
# the weights of its best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Train the model. NOTE: the held-out test split doubles as the validation set
# here, so the reported val_* metrics are not a fully independent estimate.
history = model.fit(
    training_padded,
    training_output,
    epochs=num_epochs,
    validation_data=(testing_padded, test_output),
    callbacks=[early_stopping],
    verbose=2,  # print one summary line per epoch
)

Epoch 1/30
1407/1407 - 15s - loss: 0.4562 - accuracy: 0.8107 - val_loss: 0.3061 - val_accuracy: 0.8754 - 15s/epoch - 11ms/step
Epoch 2/30
1407/1407 - 13s - loss: 0.2487 - accuracy: 0.9035 - val_loss: 0.2695 - val_accuracy: 0.8898 - 13s/epoch - 9ms/step
Epoch 3/30
1407/1407 - 14s - loss: 0.1947 - accuracy: 0.9285 - val_loss: 0.2683 - val_accuracy: 0.8934 - 14s/epoch - 10ms/step
Epoch 4/30
1407/1407 - 14s - loss: 0.1588 - accuracy: 0.9441 - val_loss: 0.2831 - val_accuracy: 0.8898 - 14s/epoch - 10ms/step
Epoch 5/30
1407/1407 - 13s - loss: 0.1298 - accuracy: 0.9571 - val_loss: 0.3019 - val_accuracy: 0.8900 - 13s/epoch - 10ms/step
Epoch 6/30
1407/1407 - 14s - loss: 0.1061 - accuracy: 0.9658 - val_loss: 0.3338 - val_accuracy: 0.8846 - 14s/epoch - 10ms/step
Epoch 7/30
1407/1407 - 13s - loss: 0.0869 - accuracy: 0.9734 - val_loss: 0.3594 - val_accuracy: 0.8794 - 13s/epoch - 10ms/step
Epoch 8/30
1407/1407 - 13s - loss: 0.0699 - accuracy: 0.9793 - val_loss: 0.3978 - val_accuracy: 0.8760 - 13s/epo

In [None]:
# Sample (clearly negative) review used as a quick sanity check of the model.
# A stray pasted prompt fragment that trailed the review text was removed so
# that only the review itself is fed to the model.
test_review = ["I had high expectations for this movie given the hype around it, but I was left feeling disappointed. The plot was thin and predictable, and the characters lacked depth. The acting was subpar, with the lead actor delivering a particularly wooden performance. The special effects were decent, but they couldn't save the overall experience. The movie also dragged in several places, making it a tedious watch. Overall, I wouldn't recommend this film to others. It fails to live up to its potential and left me feeling like I wasted my time."]

# Preprocess the review exactly like the training data: same fitted tokenizer,
# same padding/truncation settings.
test_sequences = tokenizer.texts_to_sequences(test_review)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Predict with the model. The result has one row per review and one column
# (the sigmoid output), so take the scalar explicitly instead of relying on
# the truth value of a length-1 array.
prediction = model.predict(test_padded)
print(prediction)
positive_probability = float(prediction[0][0])
if positive_probability > 0.5:
    print("Positive Review")
else:
    print("Negative Review")

[[4.252836e-30]]
Negative Review
