# SVM model

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

## Load the data

In [None]:
# Load the pre-split train and test sets. Each CSV keeps its input text in
# an `X_*` column and its target label in a `y_*` column.
train_df = pd.read_csv('../data/train_data.csv')
test_df = pd.read_csv('../data/test_data.csv')

# Replace missing text with an empty string *before* casting to str.
# A bare `.astype(str)` turns NaN into the literal string "nan", which the
# TF-IDF step would then count as a real word.
X_train = train_df['X_train'].fillna('').astype(str)
y_train = train_df['y_train']
X_test = test_df['X_test'].fillna('').astype(str)
y_test = test_df['y_test']

## TF-IDF Vectorization

In [None]:
# TF-IDF settings: unigrams and bigrams with English stop words removed;
# a term must occur in at least 2 documents and in at most 95% of them;
# the vocabulary is capped at 10,000 features.
tfidf_options = dict(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=10000,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training text only, then apply the same fitted vectorizer to
# the test text so both matrices share one vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Train model with hyperparameter tuning

In [None]:
# Hyperparameter search space. `gamma` only affects the RBF kernel, so the
# grid is split per kernel: crossing it with 'linear' would refit identical
# linear models once per gamma value (12 candidates instead of 9).
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# probability=True enables predict_proba on the fitted model. The calibration
# behind it is randomized, so random_state is pinned to make runs repeatable.
# class_weight='balanced' weights classes inversely to their frequency.
svm = SVC(probability=True, class_weight='balanced', random_state=42)

# 3-fold cross-validated search over the grid, using all available CPU cores.
grid_search = GridSearchCV(svm, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# best_estimator_ is the winning configuration refit on the full training set.
# NOTE: for a linear winner, best_params_ no longer lists a (meaningless) gamma.
best_svm = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

## Evaluation

In [None]:
# Predict labels for the held-out test set with the tuned model.
y_pred = best_svm.predict(X_test_tfidf)

# Overall fraction of correct predictions.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision, recall and F1 summary.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

## Confusion Matrix

In [None]:
# Confusion matrix of the test predictions (rows: actual, columns: predicted).
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 6))
# annot + fmt='d' prints the raw integer count inside every cell.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('SVM Confusion Matrix')

# Create the output directory if needed: savefig raises when it is missing,
# which would otherwise abort the run after the expensive grid search.
os.makedirs('../models', exist_ok=True)
plt.savefig('../models/svm_confusion_matrix.png')
plt.show()

## Save model

In [None]:
# Persist the tuned classifier together with the fitted vectorizer; new text
# must go through this same vectorizer before it is passed to the model.
# Create the target directory first so a missing folder cannot cost us the
# trained model.
os.makedirs('../models', exist_ok=True)
dump(best_svm, '../models/svm_model.joblib')
dump(vectorizer, '../models/tfidf_vectorizer.joblib')