# Baseline Model 2: Logistic Regression with TF-IDF

## Objective
Train and evaluate a Logistic Regression classifier on IMDb reviews using TF-IDF features, so that its performance can be compared with the Naive Bayes baseline.


In [None]:
# Step 1: Import libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Step 2: Load preprocessed dataset

# Reviews that are empty (or NA-like) in the CSV are read back by pandas as NaN,
# and TfidfVectorizer refuses NaN documents. Rows missing either the text or the
# label are therefore dropped right after loading; when nothing is missing this
# is a no-op and the split below is unchanged.
df = pd.read_csv("data/cleaned_imdb_reviews.csv").dropna(
    subset=["cleaned_review", "label"]
)

X = df["cleaned_review"]
Y = df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


In [None]:
# Step 3: TF-IDF encoding

# Vectorizer settings: cap the vocabulary at 5000 features and filter out
# English stop words.
tfidf_params = {"stop_words": "english", "max_features": 5000}
tfidf = TfidfVectorizer(**tfidf_params)

# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
# Step 4: Train Logistic Regression

# Construct the classifier and fit it on the training TF-IDF matrix in a single
# chained call (fit returns the estimator itself).
lr_model = LogisticRegression(
    solver="liblinear",
    max_iter=200,
).fit(X_train_tfidf, Y_train)

# Predicted labels for the held-out reviews, used by the evaluation step.
Y_pred = lr_model.predict(X_test_tfidf)


In [None]:
# Step 5: Evaluation
print("Accuracy:", round(accuracy_score(Y_test, Y_pred), 4))

# The report is printed by its own print() call: passing it as a second argument
# makes print() insert a separator space in front of the report's header row,
# which shifts the column titles out of alignment with the rows below them.
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

# Confusion matrix
# NOTE(review): the tick names assume the negative class sorts before the
# positive class in the label column - confirm against the dataset's encoding.
cm = confusion_matrix(Y_test, Y_pred)

# Draw on an explicitly created axes instead of relying on pyplot's implicit
# "current axes" state.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Purples",
            xticklabels=["Negative", "Positive"],
            yticklabels=["Negative", "Positive"],
            ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()
