In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [3]:
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
with open('reviews.pkl', 'rb') as f:
    df1 = pickle.load(f)

In [5]:
df2 = pd.read_csv('/content/IMDB Dataset.csv', engine='python', on_bad_lines='skip')

In [6]:
def prepare_sequences(X, y, vocab_size=10000, max_len=200):
    tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer.fit_on_texts(X)
    sequences = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    return padded, y, tokenizer

In [7]:
def build_lstm(vocab_size=10000, embedding_dim=64, input_length=200):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [8]:
def train_and_evaluate(X_train, y_train, X_test, y_test, vocab_size=10000, max_len=200, epochs=10, batch_size=64):
    model = build_lstm(vocab_size=vocab_size, input_length=max_len)
    model.summary()
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size, verbose=1)

    y_pred_prob = model.predict(X_test)
    y_pred = (y_pred_prob >= 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)

    print("Accuracy:", acc)
    print("False Positive Rate:", fpr)
    print("False Negative Rate:", fnr)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return model

# pre-processed data

In [9]:
X1 = df1['review'].values
y1 = df1['sentiment'].map({'negative': 0, 'positive': 1}).values

In [10]:
X1_pad, y1_pad, tok1 = prepare_sequences(X1, y1)

In [11]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1_pad, y1_pad, test_size=0.2, random_state=42)

In [12]:
model1 = train_and_evaluate(X_train1, y_train1, X_test1, y_test1)



Epoch 1/10
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.5026 - loss: 0.6936 - val_accuracy: 0.5079 - val_loss: 0.6919
Epoch 2/10
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.5511 - loss: 0.6828 - val_accuracy: 0.5057 - val_loss: 0.6850
Epoch 3/10
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5761 - loss: 0.6637 - val_accuracy: 0.6770 - val_loss: 0.6492
Epoch 4/10
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.6766 - loss: 0.6038 - val_accuracy: 0.5241 - val_loss: 0.6978
Epoch 5/10
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.5471 - loss: 0.6521 - val_accuracy: 0.5340 - val_loss: 0.6972
Epoch 6/10
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.6539 - loss: 0.5903 - val_accuracy: 0.7442 - val_loss: 0.6033
Epoch 7/10
[1m386/38

# raw, unprocessed data

In [13]:
X2 = df2['review'].values
y2 = df2['sentiment'].map({'negative': 0, 'positive': 1}).values

In [14]:
X2_pad, y2_pad, tok2 = prepare_sequences(X2, y2)

In [15]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2_pad, y2_pad, test_size=0.2, random_state=42)

In [16]:
model_raw = train_and_evaluate(X_train2, y_train2, X_test2, y_test2)



Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.5337 - loss: 0.6881 - val_accuracy: 0.6163 - val_loss: 0.6618
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.6351 - loss: 0.6462 - val_accuracy: 0.5953 - val_loss: 0.6647
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.5616 - loss: 0.6756 - val_accuracy: 0.5890 - val_loss: 0.6481
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.6034 - loss: 0.6380 - val_accuracy: 0.7596 - val_loss: 0.5861
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.7221 - loss: 0.5521 - val_accuracy: 0.8390 - val_loss: 0.3993
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.8625 - loss: 0.3584 - val_accuracy: 0.8696 - val_loss: 0.3152
Epoch 7/10
[1m625/6