In [15]:

!pip install tensorflow scikit-learn nltk

# -------------------------------
# Import Libraries
# -------------------------------
import numpy as np
import nltk
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download NLTK data for tokenization/lemmatization
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# -------------------------------
#  Load IMDb Dataset
# -------------------------------
num_words = 10000  # Top 10,000 frequent words
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)
print("Number of training reviews:", len(x_train))
print("Number of test reviews:", len(x_test))

# Optional: decode first review for understanding
word_index = imdb.get_word_index()
index_word = {v:k for k,v in word_index.items()}

decoded_review = ' '.join([index_word.get(i-3, '?') for i in x_train[0]])
print("\nSample decoded review:\n", decoded_review[:500], "...")  # print first 500 chars

# -------------------------------
#  Pad Sequences
# -------------------------------
max_len = 500  # max words per review
x_train_pad = pad_sequences(x_train, maxlen=max_len)
x_test_pad = pad_sequences(x_test, maxlen=max_len)

# -------------------------------
#  Logistic Regression Approach
# -------------------------------
# Convert sequences back to text
def decode_reviews(sequences):
    return [' '.join([index_word.get(i-3, '?') for i in seq]) for seq in sequences]

x_train_text = decode_reviews(x_train_pad)
x_test_text = decode_reviews(x_test_pad)

# Vectorize text using CountVectorizer
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(x_train_text)
X_test = vectorizer.transform(x_test_text)

# Train Logistic Regression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Evaluate on test set
y_pred = clf.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred)
print("\n✅ Logistic Regression Test Accuracy:", round(accuracy_lr*100, 2), "%")

# -------------------------------
#  LSTM Model Approach
# -------------------------------
model = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_len),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(x_train_pad, y_train, epochs=3, batch_size=64, validation_data=(x_test_pad, y_test))

# Evaluate LSTM
loss, accuracy_lstm = model.evaluate(x_test_pad, y_test)
print("\n✅ LSTM Test Accuracy:", round(accuracy_lstm*100, 2), "%")

# -------------------------------
#  Make Predictions
# -------------------------------

# Logistic Regression Prediction
sample_review_lr = "The movie was amazing and I loved it"
sample_vector_lr = vectorizer.transform([sample_review_lr.lower()])
prediction_lr = clf.predict(sample_vector_lr)
print("\nLogistic Regression Prediction for sample review:")
print("Positive" if prediction_lr[0]==1 else "Negative")

# LSTM Prediction
from tensorflow.keras.preprocessing.text import text_to_word_sequence

def encode_review(text, word_index, num_words=10000):
    sequence = []
    for word in text_to_word_sequence(text):
        index = word_index.get(word, 0)  # unknown words get 0
        if index < num_words:
            sequence.append(index+3)  # +3 reserved
    return sequence

sample_review_lstm = "The movie was amazing and I loved it"
sample_seq = encode_review(sample_review_lstm, word_index)
sample_seq_padded = pad_sequences([sample_seq], maxlen=max_len)

prediction_lstm = model.predict(sample_seq_padded)
print("\nLSTM Prediction for sample review:")
print("Positive" if prediction_lstm[0][0] > 0.5 else "Negative")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Number of training reviews: 25000
Number of test reviews: 25000

Sample decoded review:
 ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to ever ...

✅ Logistic Regression Test Accuracy: 85.63 %
Epoch 1/3


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 289ms/step - accuracy: 0.6902 - loss: 0.5696 - val_accuracy: 0.8339 - val_loss: 0.3851
Epoch 2/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 333ms/step - accuracy: 0.8880 - loss: 0.2828 - val_accuracy: 0.8723 - val_loss: 0.3042
Epoch 3/3
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 331ms/step - accuracy: 0.9278 - loss: 0.1987 - val_accuracy: 0.8718 - val_loss: 0.3220
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - accuracy: 0.8711 - loss: 0.3251

✅ LSTM Test Accuracy: 87.18 %

Logistic Regression Prediction for sample review:
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step

LSTM Prediction for sample review:
Positive
