In [None]:
pip install pandas scikit-learn

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Load the dataset
# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
file_path = '/users/riyakoduru/Downloads/avg_top10_bootstrapped.csv'
data = pd.read_csv(file_path)

# Assuming the last column is the target variable
X = data.iloc[:, :-1]  # All columns except the last one are features
y = data.iloc[:, -1]   # The last column is the target

# Split dataset into training (70%) and testing (30%) sets; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: statistics are learned on the training split only and
# then re-applied, unchanged, to the test split.
# NOTE(review): the scaler is fit *before* cross-validation, so every CV validation
# fold has already contributed to the scaling statistics. Wrapping the scaler and
# SVC in a sklearn Pipeline would remove that (small) leak.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate class weights.
# compute_class_weight returns one weight per entry of `classes`, in that order,
# so the weights must be keyed by the actual labels. Keying by position
# (dict(enumerate(...))) only works when the labels happen to be 0, 1, 2, ...
class_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
# .tolist() converts numpy scalars to plain Python values for clean dict keys/printing
class_weights_dict = dict(zip(class_labels.tolist(), class_weights.tolist()))

# Define the SVM model with GridSearchCV for hyperparameter tuning.
# `gamma` is only used by the RBF kernel, so it is searched only there; crossing it
# with the linear kernel would fit every linear candidate twice for identical results.
shared_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'class_weight': [class_weights_dict]  # Handling imbalanced data
}
parameters = [
    {**shared_grid, 'kernel': ['rbf'], 'gamma': ['scale', 'auto']},  # Kernel coefficient applies to RBF
    {**shared_grid, 'kernel': ['linear']},  # Linear kernel ignores gamma
]

svm_clf = GridSearchCV(SVC(), parameters, cv=5, scoring='accuracy')
svm_clf.fit(X_train_scaled, y_train)

# Display best parameters
print("Best parameters found: ", svm_clf.best_params_)

# Nested cross-validation on the training set: the estimator passed here is the
# whole GridSearchCV, so the full grid search is re-run inside each of the 5 outer
# folds. This is slow, but the resulting scores account for the tuning step.
cv_scores = cross_val_score(svm_clf, X_train_scaled, y_train, cv=5)
print(f"Cross-validation scores on training set: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean()}")

# Check performance on the training set (predict() uses the refit best estimator)
y_train_pred = svm_clf.predict(X_train_scaled)
print(f"Training Set Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Training Set Classification Report:\n{classification_report(y_train, y_train_pred)}")

# Predict on the test set using the best-found parameters
y_test_pred = svm_clf.predict(X_test_scaled)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Test Set Classification Report:\n{classification_report(y_test, y_test_pred)}")


Best parameters found:  {'C': 100, 'class_weight': {0: 1.2792022792022792, 1: 0.8208409506398537}, 'gamma': 'scale', 'kernel': 'rbf'}
Cross-validation scores on training set: [0.98333333 0.90807799 0.91364903 0.89693593 0.86908078]
Mean CV score: 0.9142154131847725
Training Set Accuracy: 0.9320712694877505
Training Set Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92       702
         1.0       1.00      0.89      0.94      1094

    accuracy                           0.93      1796
   macro avg       0.93      0.94      0.93      1796
weighted avg       0.94      0.93      0.93      1796

Test Set Accuracy: 0.9377431906614786
Test Set Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      1.00      0.93       306
         1.0       1.00      0.90      0.95       465

    accuracy                           0.94       771
   macro avg       0.93      0.95      0.