In [5]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import joblib

import sys
import os

# Make the project's src/ directory importable; a later cell imports
# `scripts.data_processing`, which presumably lives under it (confirm layout).
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = os.path.abspath('../../../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [6]:
from scripts.data_processing import load_and_preprocess_data

# Relative path to the raw dataset.
data_path = '../../../data/raw/winequalityN.csv'

# Columns excluded before modelling (passed through as `features_to_drop`).
dropped_features = ['density']

# `load_and_preprocess_data` is a project helper not shown here; it is expected
# to return four objects in this order (train/test features, train/test targets).
splits = load_and_preprocess_data(data_path, features_to_drop=dropped_features)
X_train, X_test, y_train, y_test = splits

In [7]:
# Goal: define a function that computes metrics for an SVM model, so we can apply it to datasets
# with certain features removed based on the collinearity we identified in EDA.

# We also add the option of a grid search over the C and gamma hyperparameters.
# We selected the radial basis function (RBF) kernel based on our literature review and our data's non-linear nature.

from sklearn.pipeline import Pipeline

def get_SVM_Metrics(X_train, X_test, y_train, y_test, param_grid=None, save_model=False,
                    model_path='../../../models/svm.joblib', random_state=None):
    """
    Train an RBF-kernel SVM inside a scaling pipeline and report test-set metrics.

    The pipeline is ``StandardScaler`` followed by ``SVC(kernel='rbf', probability=True)``.
    Because the scaler is part of the pipeline, it is re-fit on each training fold
    during grid search rather than on the full training set.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by scikit-learn estimators
        (presumably the objects returned by ``load_and_preprocess_data`` -- confirm).
    y_train, y_test : target labels for the corresponding feature matrices.
    param_grid : dict or None, default None
        SVC hyperparameter candidates keyed by bare parameter name (e.g. ``'C'``,
        ``'gamma'``). Keys are prefixed with ``'svm__'`` so they address the SVC
        step of the pipeline. When falsy, the pipeline is fit once with SVC defaults.
    save_model : bool, default False
        When True, the fitted pipeline is written to ``model_path`` with joblib.
        This now applies whether or not a grid search was run.
    model_path : str, default '../../../models/svm.joblib'
        Destination file for the saved model; its parent directory is created
        if it does not exist.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``SVC``; used for the probability estimates that
        ``probability=True`` enables. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(fitted_pipeline, accuracy, precision, recall, f1)`` where precision,
        recall and F1 are weighted averages computed on the test set.
    """
    # Scaling and classification are combined so both are fit and applied together.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', svm.SVC(kernel='rbf', probability=True, random_state=random_state))
    ])

    if param_grid:
        # GridSearchCV addresses pipeline steps as '<step>__<param>'.
        pipeline_param_grid = {f'svm__{key}': value for key, value in param_grid.items()}

        grid_search = GridSearchCV(pipeline, pipeline_param_grid, cv=5, verbose=1, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        print(f"Best parameters found: {grid_search.best_params_}")

        # best_estimator_ is the pipeline refit on the whole training set
        # with the winning hyperparameters.
        pipeline = grid_search.best_estimator_
    else:
        pipeline.fit(X_train, y_train)

    # Persist the fitted model on both paths; previously a save request was
    # silently ignored when no param_grid was supplied.
    if save_model:
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)
        joblib.dump(pipeline, model_path)
        print("Model saved successfully!")

    # Evaluate on the held-out test set.
    y_pred = pipeline.predict(X_test)

    # NOTE(review): zero_division=1 assigns a score of 1 wherever the metric is
    # undefined (e.g. precision for a class that is never predicted), which can
    # inflate the weighted averages -- confirm this is intended.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return pipeline, accuracy, precision, recall, f1


In [8]:
# Define parameter grid
# Hyperparameter candidates for the grid search: 5 values of C x 4 values of gamma.
# NOTE(review): the recorded run selected C=7, the smallest candidate in this
# grid -- a wider range of C may be worth exploring.
param_grid = {
    'C': [7, 8, 9, 10, 11],
    'gamma': ['scale', 'auto', 0.1, 1]
}

# Run the grid search, save the winning pipeline, and collect the test metrics.
svm_run = get_SVM_Metrics(
    X_train, X_test, y_train, y_test, param_grid=param_grid, save_model=True
)
best_pipeline, accuracy, precision, recall, f1 = svm_run

# get_SVM_Metrics already prints these values; they are repeated here as a
# labelled summary of the final model.
print(
    "\nFinal model metrics:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'svm__C': 7, 'svm__gamma': 1}
Model saved successfully!
Accuracy: 0.9459
Precision: 0.9335
Recall: 0.9459
F1 Score: 0.9349

Final model metrics:
Accuracy: 0.9459
Precision: 0.9335
Recall: 0.9459
F1 Score: 0.9349
