
# Baseline Model 3: Support Vector Machine (SVM) with TF-IDF

## Objective
Train a Support Vector Machine classifier using TF-IDF features on the IMDb dataset and compare its performance with Naive Bayes and Logistic Regression.

In [None]:
# Step 1: Import libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os


In [None]:
# Step 2: Load preprocessed dataset

# Locate the data directory relative to this notebook.
# `__file__` only exists when the code runs as a script; inside a Jupyter
# kernel it is undefined and raises NameError. In that case fall back to the
# current working directory (normally the notebook's own folder -- confirm if
# the kernel is launched from elsewhere).
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_path = os.path.join(parent_dir, "data", "cleaned_imdb_reviews.csv")

df = pd.read_csv(data_path)

# Features are the cleaned review text; targets are the sentiment labels.
# pandas reads empty CSV fields back as NaN, which TfidfVectorizer rejects,
# so any missing review is treated as an empty document. This is a no-op when
# the column has no missing values.
X = df["cleaned_review"].fillna("")
Y = df["label"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
# Step 3: TF-IDF vectorization

# Vocabulary is capped at 5000 terms and English stop words are removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split, so the test split plays no part
# in fitting.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [None]:
# Step 4: Train SVM model

# Linear-kernel SVM trained on the TF-IDF matrix. LinearSVC's solver uses a
# pseudo-random number generator, so the seed is fixed (same value as the
# train/test split) to make the training run reproducible.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, Y_train)

# Predicted labels for the held-out test split, used for evaluation below.
Y_pred = svm_model.predict(X_test_tfidf)

In [None]:
# Step 5: Evaluate the model

# Overall accuracy on the test split, rounded to four decimals for display.
test_accuracy = round(accuracy_score(Y_test, Y_pred), 4)
print("Accuracy:", test_accuracy)

# Text report of per-class metrics.
report_text = classification_report(Y_test, Y_pred)
print("\nClassification Report:\n", report_text)

# Confusion Matrix
# NOTE(review): the tick labels assume the sorted label values put the
# negative class first -- confirm against the dataset's label encoding.
class_names = ['Negative', 'Positive']
cm = confusion_matrix(Y_test, Y_pred)
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)

# Label the axes on the Axes object that seaborn returned.
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix - SVM Model with TF-IDF')
plt.show()