In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# Read the pre-cleaned IMDB reviews into a DataFrame
df = pd.read_csv("/content/IMDB_cleaned.csv")

# Encode the sentiment strings as integer class ids. LabelEncoder numbers the
# classes in sorted order, so 'negative' becomes 0 and 'positive' becomes 1.
label_encoder = LabelEncoder().fit(df['sentiment'])
df['label'] = label_encoder.transform(df['sentiment'])

# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'],
    df['label'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Tokenizer / padding settings
vocab_size = 10000  # vocabulary cap shared by the Tokenizer and the Embedding layer
max_length = 200    # every review is padded or truncated to this many tokens
oov_tok = "<OOV>"   # placeholder token for out-of-vocabulary words

# Learn the vocabulary from the training reviews only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

# Map every review to its list of integer word ids
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad at the end / truncate at the end so all inputs have length max_length
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_length, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)



In [None]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All" (GPU kernels may still
# introduce small non-deterministic differences)
tf.keras.utils.set_random_seed(42)

# Define model: embedding -> average over the sequence axis -> small dense head
model = Sequential([
    # No input_length argument here; the sequence length is supplied through
    # model.build() below
    Embedding(input_dim=vocab_size, output_dim=16),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # single probability output for binary sentiment
])

# Build model with expected input shape (batch size is None, input length is
# max_length) so the summary below can be produced before training
model.build(input_shape=(None, max_length))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once validation loss has failed to improve for 2 epochs in a
# row, then roll the model back to the best weights seen
early_stop = EarlyStopping(
    monitor='val_loss',        # what to monitor
    patience=2,                # how many epochs to wait before stopping
    restore_best_weights=True  # restore best model weights
)

# Show model summary
model.summary()


In [None]:
# Train the model, validating on the last 10% of the *training* data.
# EarlyStopping (with restore_best_weights) picks the final weights using the
# validation loss, so the test set must not be used for validation: doing so
# would tune the model on the very data it is scored on below and make the
# reported test metrics optimistic.
history = model.fit(
    X_train_pad,
    np.asarray(y_train),   # plain NumPy array so Keras can slice off the validation part
    epochs=10,
    validation_split=0.1,  # taken from the end of the arrays, before shuffling
    verbose=2,
    callbacks=[early_stop],
)

# Predict sentiment probabilities on the untouched test set
y_pred_prob = model.predict(X_test_pad)

# Convert probabilities to binary predictions; ravel() flattens the (n, 1)
# model output into the 1-D label vector the report expects
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Display performance metrics
print(classification_report(y_test, y_pred))



Epoch 1/10
1250/1250 - 8s - 7ms/step - accuracy: 0.7788 - loss: 0.4778 - val_accuracy: 0.8617 - val_loss: 0.3345
Epoch 2/10
1250/1250 - 6s - 4ms/step - accuracy: 0.8783 - loss: 0.2962 - val_accuracy: 0.8699 - val_loss: 0.3076
Epoch 3/10
1250/1250 - 11s - 8ms/step - accuracy: 0.8975 - loss: 0.2551 - val_accuracy: 0.8815 - val_loss: 0.2939
Epoch 4/10
1250/1250 - 11s - 9ms/step - accuracy: 0.9073 - loss: 0.2323 - val_accuracy: 0.8627 - val_loss: 0.3284
Epoch 5/10
1250/1250 - 6s - 5ms/step - accuracy: 0.9148 - loss: 0.2168 - val_accuracy: 0.8779 - val_loss: 0.3063
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      4940
           1       0.89      0.88      0.88      5060

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [None]:
# Define a function to predict sentiment of custom input
def predict_sentiment(text: str, threshold: float = 0.5) -> tuple[str, float]:
    """Classify a single raw review with the trained model.

    The text is converted with the fitted ``tokenizer`` and padded/truncated
    with the same ``max_length`` and 'post' settings used for the training
    data before it is scored by ``model``.

    Args:
        text: Review text to classify.
        threshold: Probability above which the review is labelled
            "positive". Defaults to 0.5, the cut-off used when evaluating
            the test set.

    Returns:
        A ``(label, prob)`` tuple where ``label`` is "positive" or
        "negative" and ``prob`` is the model's output probability as a
        plain Python float.
    """
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')
    # verbose=0 keeps the Keras progress bar out of the output for one-off
    # calls; float() turns the NumPy scalar into a plain float so the result
    # prints and serialises cleanly
    prob = float(model.predict(padded, verbose=0)[0][0])
    label = "positive" if prob > threshold else "negative"
    return label, prob

# Try the helper on a hand-written review
example_review = "I really enjoyed this movie, it was brilliant and emotional."
print(predict_sentiment(example_review))



1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 187ms/step
('positive', np.float32(0.73475164))
