In [16]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Correct path do (IMDB dataset ko apni drive main rakho)
import pandas as pd

# yahan tum apna actual path daalo, example:
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")

# Step 3: Check data
print(df.head())
print(df.isnull().sum())

# Step 4: Handle missing values
df = df.dropna()

# Step 5: Split data
from sklearn.model_selection import train_test_split

X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Step 7: Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train_tfidf, y_train)

# Step 8: Accuracy check
y_pred = log_model.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

# Step 9: Save model + vectorizer
import joblib
import os

# Drive me folder ka path (apna folder path change kar sakte ho)
drive_model_path = "/content/drive/MyDrive/MovieSentModels"
os.makedirs(drive_model_path, exist_ok=True)

# Save models aur vectorizer
joblib.dump(log_model, os.path.join(drive_model_path, "log_model.pkl"))
joblib.dump(tfidf, os.path.join(drive_model_path, "tfidf.pkl"))
joblib.dump(X_train, os.path.join(drive_model_path, "X_train.pkl"))
joblib.dump(y_train, os.path.join(drive_model_path, "y_train.pkl"))

print(f"✅ Models aur vectorizer successfully save ho gaye Google Drive me: {drive_model_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
review       0
sentiment    0
dtype: int64
Logistic Regression Accuracy: 0.8951
✅ Models aur vectorizer successfully save ho gaye Google Drive me: /content/drive/MyDrive/MovieSentModels


In [17]:
# 1️⃣ Required Libraries
import pandas as pd
import numpy as np
import re
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# 2️⃣ Ensure models folder exists
if not os.path.exists("models"):
    os.makedirs("models")

# 3️⃣ Load dataset
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")  # Update path if needed
df = df.dropna()
df = df.drop_duplicates()
df['review'] = df['review'].astype(str)
df['sentiment'] = df['sentiment'].str.lower()

# 4️⃣ Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])  # positive=1, negative=0
num_classes = len(le.classes_)

# 5️⃣ Text Cleaning Function
def clean_text(text):
    text = re.sub(r"<.*?>", "", text)  # remove HTML tags
    text = re.sub(r"[^a-zA-Z ]", "", text)  # remove special chars & numbers
    text = text.lower()
    return text

df['clean_review'] = df['review'].apply(clean_text)

# 6️⃣ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'], df['label'], test_size=0.3, stratify=df['label'], random_state=42
)

# 7️⃣ Tokenization & Padding for LSTM
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# 8️⃣ Save Tokenizer
with open("models/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# 9️⃣ LSTM Model
embedding_dim = 100

lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
lstm_model.add(LSTM(128, return_sequences=False))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(num_classes, activation='softmax'))

lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 10️⃣ Train LSTM
history = lstm_model.fit(
    X_train_pad, y_train,
    validation_data=(X_test_pad, y_test),
    epochs=10,  # Colab fast test; increase to 10-15 for better results
    batch_size=64
)
# LSTM model save karna
lstm_model.save(os.path.join(drive_model_path, "lstm_model.h5"))

# Tokenizer save karna
import pickle

with open(os.path.join(drive_model_path, "tokenizer.pkl"), "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print(f"✅ LSTM model aur tokenizer bhi successfully save ho gaye Google Drive me: {drive_model_path}")


Epoch 1/10




[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 491ms/step - accuracy: 0.5180 - loss: 0.6930 - val_accuracy: 0.5059 - val_loss: 0.6856
Epoch 2/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m331s[0m 509ms/step - accuracy: 0.6079 - loss: 0.6573 - val_accuracy: 0.6153 - val_loss: 0.6436
Epoch 3/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 506ms/step - accuracy: 0.7076 - loss: 0.5902 - val_accuracy: 0.5776 - val_loss: 0.6700
Epoch 4/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 493ms/step - accuracy: 0.7159 - loss: 0.5459 - val_accuracy: 0.8615 - val_loss: 0.3309
Epoch 5/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 492ms/step - accuracy: 0.8999 - loss: 0.2604 - val_accuracy: 0.8690 - val_loss: 0.3191
Epoch 6/10
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 509ms/step - accuracy: 0.9325 - loss: 0.1915 - val_accuracy: 0.8633 - val_loss: 0.3451
Epoch 7/10
[1m



✅ LSTM model aur tokenizer bhi successfully save ho gaye Google Drive me: /content/drive/MyDrive/MovieSentModels
