In [None]:
import pandas as pd
import re
import nltk
import joblib

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# ---------------- NLTK RESOURCES ----------------
# The stop-word list and the WordNet data must be present before the
# corpus reader and the lemmatizer below are used.
nltk.download("stopwords")
nltk.download("wordnet")

# ---------------- LOAD DATA ----------------
# Expects the CSV in the current working directory; the "review" and
# "sentiment" columns are the ones read later in this script.
df = pd.read_csv("IMDB Dataset.csv")

# ---------------- TEXT CLEANING ----------------
lemmatizer = WordNetLemmatizer()

# English stop words, minus the negations: filtering out "not", "no", "nor"
# or "never" would discard the very words that flip a review's sentiment.
stop_words = {
    word
    for word in stopwords.words("english")
    if word not in ("not", "no", "nor", "never")
}

def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: lowercase the text, strip HTML tags, drop every
    character that is not an ASCII letter or whitespace, split on
    whitespace, discard tokens found in ``stop_words`` and lemmatize the
    remaining tokens with ``lemmatizer``.

    Args:
        text: Raw review text. A non-string value (for example the NaN that
            pandas produces for an empty CSV cell) is treated as an empty
            review instead of raising ``AttributeError``.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when no token
        survives the filtering.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Substitute a space, not "", for each tag: a tag sitting directly between
    # two words (e.g. "great.<br /><br />however") would otherwise fuse them
    # into a single bogus token once the punctuation is removed below.
    text = re.sub(r"<.*?>", " ", text)
    # Punctuation and digits are deleted outright, so "don't" becomes "dont".
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Clean every review once and store the result alongside the raw text.
df["cleaned_review"] = df["review"].map(preprocess_text)

# ---------------- TRAIN TEST SPLIT ----------------
# 80/20 split, stratified on the label, with a fixed seed so the split is
# reproducible from run to run.
X, y = df["cleaned_review"], df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# ---------------- TF-IDF ----------------
# The vocabulary (capped at 5000 terms) is fitted on the training split only;
# the test split is merely transformed with it.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ---------------- MODEL ----------------
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# ---------------- EVALUATION ----------------
# Report per-class metrics and the confusion matrix on the held-out split.
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# ---------------- SAVE MODEL ----------------
# The classifier and the fitted vectorizer are saved as separate files.
joblib.dump(model, "lr3_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model & Vectorizer saved successfully")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mohit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

    negative       0.89      0.88      0.88      5000
    positive       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

[[4378  622]
 [ 518 4482]]
✅ Model & Vectorizer saved successfully
