In [6]:
# Step 1: Imports
import pandas as pd
import numpy as np
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# Step 2: Load preprocessed data
df = pd.read_csv("../data/preprocessed/cleaned.csv")

# Step 3: Features and Labels
X = df['clean_text'].fillna('')
y = df['label']

# Step 4: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# Step 6: Train SVM Model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Step 7: Evaluation
y_pred = svm_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Step 8: Save Model and Vectorizer
joblib.dump(svm_model, "../models/svm_model.joblib")
joblib.dump(vectorizer, "../models/tfidf_vectorizer.joblib")

print("✅ SVM model and vectorizer saved!")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      4650
           1       0.99      1.00      1.00      4330

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Confusion Matrix:
 [[4624   26]
 [  17 4313]]
✅ SVM model and vectorizer saved!
