In [2]:
import re
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle

In [3]:
data_path = "IMDB Dataset.csv"
df = pd.read_csv(data_path)

In [4]:
def clean_text(text):
    text = text.lower()  
    text = re.sub(r"[^a-zA-Z0-9().,!?]", " ", text)  
    text = re.sub(r"\s+", " ", text)  
    return text

In [5]:
df["review"] = df["review"].apply(clean_text)

In [6]:
# Split data into training and testing sets
X = df["review"].to_numpy()
y = df["sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [8]:
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [9]:
max_length = 200
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length)

In [10]:
model = Sequential([
    Embedding(50000, 128),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')
])




In [11]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [14]:
model.fit(X_train_padded, y_train_encoded, epochs=15, batch_size=64, validation_split=0.2)



<keras.src.callbacks.History at 0x24522e21820>

In [15]:
model.save("sentiment_analysis_model.h5")

  saving_api.save_model(


In [16]:
pickle.dump(tokenizer, open("tokenizer.pkl", "wb"))

In [17]:
loss, acc = model.evaluate(X_test_padded, y_test_encoded)
print('Accuracy:', acc * 100)

Accuracy: 84.64999794960022
