# Baseline Model 1: Naive Bayes with TF-IDF

## Objective
Train and evaluate a Naive Bayes classifier on IMDb reviews using TF-IDF encoding.

In [None]:
# Step 1: Import all libraries

import pandas as pd # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Step 2: Load pre-processed data
#
# Reads the cleaned reviews CSV into `df`, prints its (rows, columns) shape,
# and shows the first rows as the cell output.
# NOTE(review): the file name is spelled "imbd" rather than "imdb" — confirm
# it matches the file actually written by the preprocessing step.

df = pd.read_csv("data/cleaned_imbd_reviews.csv")
print(df.shape)
df.head()

In [None]:
# Step 3: Train-test split
#
# Features are the `cleaned_review` texts; targets are the `label` column.
#
# A review that was cleaned down to an empty string is written to CSV as an
# empty field, which `pd.read_csv` reads back as NaN by default.
# TfidfVectorizer rejects NaN documents, so restore such entries to empty
# strings here. `fillna` keeps every row and the original index, so the
# seeded split below selects exactly the same rows as before.

X = df["cleaned_review"].fillna("")
Y = df["label"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])

In [None]:
# Step 4: TF-IDF encoding
#
# The vectorizer is fitted on the training reviews only and then reused,
# already fitted, to encode the test reviews, so the test set contributes
# nothing to the vocabulary or the IDF weights.
tfidif = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
)
X_train_tfidf = tfidif.fit_transform(X_train)
X_test_tfidf = tfidif.transform(X_test)

print("TF-IDF Train shape:", X_train_tfidf.shape)

In [None]:
# Step 5: Train Naive Bayes model
#
# `fit` returns the estimator itself, so construction and training are
# chained; the fitted model then labels the held-out TF-IDF matrix.
nb_model = MultinomialNB().fit(X_train_tfidf, Y_train)
Y_pred = nb_model.predict(X_test_tfidf)

In [None]:
# Step 6: Evaluate the model
#
# Print the overall accuracy and the per-class report for the test
# predictions.
print("Accuracy:", accuracy_score(Y_test, Y_pred))
print("\nClassification Report:\n", classification_report(Y_test, Y_pred))

# Confusion matrix, drawn as an annotated heatmap with integer cell counts.
# NOTE(review): the hard-coded tick labels assume the sorted label order is
# negative first, positive second — confirm against the values in `label`.
cm = confusion_matrix(Y_test, Y_pred)
tick_labels = ["Neg", "Pos"]
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
ax.set(xlabel="Predicted", ylabel="True", title="Confusion Matrix - Naive Bayes")
plt.show()