In [2]:
# IMPORTS
# data manipulation and analysis
import pandas as pd
import numpy as np
# visualization
import matplotlib.pyplot as plt

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# model
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc
)
# handle warnings
import warnings
warnings.filterwarnings('ignore')

# set random seed for reproducibility
np.random.seed(42)

### 1. Loading in Data

In [3]:
# loading the train set
train_df = pd.read_csv('../Data/breast_cancer_trainset.csv')

X_train = train_df.drop(columns=['diagnosis'])
y_train = train_df['diagnosis']

# Data dimensions
print("Data Dimensions:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

# Target distribution
print("-"*60)
print("Target Distribution:")
print(f"Benign (0): {(y_train==0).sum()} samples ({(y_train==0).sum()/len(y_train)*100:.1f}%)")
print(f"Malignant (1): {(y_train==1).sum()} samples ({(y_train==1).sum()/len(y_train)*100:.1f}%)")
print(f"Total: {len(y_train)} samples")
print("-"*60)
print("-"*60)

Data Dimensions:
X_train: (455, 30)
y_train: (455,)
------------------------------------------------------------
Target Distribution:
Benign (0): 285 samples (62.6%)
Malignant (1): 170 samples (37.4%)
Total: 455 samples
------------------------------------------------------------
------------------------------------------------------------


### 2. Building a Pipeline

We used a pipeline to avoid any data leakage during the preprocessing steps. This ensures stadard scaler is called on the training set in each cross validation fold. 

In [5]:
svm_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC(probability=True))
    ]
)

### 3. Hypertuning with GridSearchCV

We used GridSearchCV with 5-fold cross validation to find the most optimal hyperparameters for our SVM model. GridSearchCV automatically splits the training data into folds, trains the model on different hyperparameter combinations, and evaluates performance using cross-validation. 

In the range of parameters, we included svm__C, which is a regularization parameter that controls the trade off between a smooth decision boundary and classifying correctly. We also included svm__kernel to test the linear, rbf, and polynomial kernels. svm__gamma is the kernel coefficient for rbf and polynomial, and controls the influence of a single training example. Lastly, svm__degree is the degree of the polynomial kernel function, which we set to test values of 2 and 3. 

We chose to optimize for recall as the scoring metric to minimize false negatives. In this use case, false negatives would mean missing a cancer diagnosis when the patient has cancer, which is critical in medical applications. 

In [6]:
# parameters for hyperparameter tuning
param_grid = {
    "svm__C": [0.1, 1, 10, 50], 
    "svm__kernel": ["linear", "rbf", "poly"],
    "svm__gamma": ["scale", "auto"],
    "svm__degree": [2, 3], # only for poly kernel
}

# perform gridsearch
grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid = param_grid, 
    cv = 5, 
    scoring = 'recall', 
    n_jobs = -1,
    verbose = 2
)
# fit model
grid.fit(X_train, y_train)
# best model
print("Best CV accuracy:", grid.best_score_)
print("Best parameters:", grid.best_params_)

best_model = grid.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best CV accuracy: 0.9588235294117649
Best parameters: {'svm__C': 1, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}


### 4. Evaluating Model Performance


After performing gridsearch, we found the best model, based on highest average recall score across all folds, used the radial basis function kernel with C=1, gamma = scale, and degree of 2. This gave us a CV recall score of 0.9588. 

In [9]:
svm_best_model = grid.best_estimator_

# Display best hyperparameters
print("=" * 70)
print("BEST MODEL FROM GRIDSEARCHCV")
print("=" * 70)
print("\nBest Hyperparameters:")
for param, value in grid.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest Cross-Validation Recall Score: {grid.best_score_:.4f}")
print("\n" + "=" * 70)

# Display CV results summary
cv_results_df = pd.DataFrame(grid.cv_results_)
top_5_models = cv_results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]

print("\nTop 5 Hyperparameter Combinations (by Recall):")
print("-" * 70)
for idx, row in top_5_models.iterrows():
    print(f"\nRank {top_5_models.index.get_loc(idx) + 1}:")
    print(f"  Recall: {row['mean_test_score']:.4f} (±{row['std_test_score']:.4f})")
    print(f"  Params: {row['params']}")

print("\n" + "=" * 70)
print("Model is ready for final test set evaluation.")
print("=" * 70)

BEST MODEL FROM GRIDSEARCHCV

Best Hyperparameters:
  svm__C: 1
  svm__degree: 2
  svm__gamma: scale
  svm__kernel: rbf

Best Cross-Validation Recall Score: 0.9588


Top 5 Hyperparameter Combinations (by Recall):
----------------------------------------------------------------------

Rank 1:
  Recall: 0.9588 (±0.0300)
  Params: {'svm__C': 1, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}

Rank 2:
  Recall: 0.9588 (±0.0300)
  Params: {'svm__C': 1, 'svm__degree': 2, 'svm__gamma': 'auto', 'svm__kernel': 'rbf'}

Rank 3:
  Recall: 0.9588 (±0.0300)
  Params: {'svm__C': 1, 'svm__degree': 3, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}

Rank 4:
  Recall: 0.9588 (±0.0300)
  Params: {'svm__C': 1, 'svm__degree': 3, 'svm__gamma': 'auto', 'svm__kernel': 'rbf'}

Rank 5:
  Recall: 0.9588 (±0.0399)
  Params: {'svm__C': 10, 'svm__degree': 2, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}

Model is ready for final test set evaluation.
