In [64]:
# --- Import Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [65]:
# --- Load Dataset ---
try :
    df = pd.read_csv("breast-cancer.csv")
    print("-- dataset successfully loaded --")
except FileNotFoundError:
    print("-- file not found, enter the correct filepath --")
    

-- dataset successfully loaded --


In [66]:
# --- Preview Datset ---
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [67]:
# --- Drop Unnecessary Columns --- Target Encoding --- Train Test Split --- Scaling ---
df.drop('id', axis = 1, inplace = True)
df['diagnosis'] = df['diagnosis'].map({'M' : 1, 'B' : 0,})

X = df.drop('diagnosis', axis = 1)
y = df['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42, test_size =0.2)
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [68]:
# --- Linear and RBF Comparison ---

svm_linear = SVC(kernel = 'linear', random_state = 42)
svm_linear.fit(X_train_scaled, y_train)
y_pred_lin = svm_linear.predict(X_test_scaled)
print(f"-- linear svm qccuracy : {accuracy_score(y_test, y_pred_lin)}")

svm_rbf = SVC(kernel = 'rbf', random_state = 42)
svm_rbf.fit(X_train_scaled, y_train)
y_pred_rbf = svm_rbf.predict(X_test_scaled)
print(f"-- rbf svm accuracy : {accuracy_score(y_test, y_pred_rbf)}")

-- linear svm qccuracy : 0.956140350877193
-- rbf svm accuracy : 0.9824561403508771


In [69]:
# --- Hyperparameter Tuning ---
param_grid = {
    'C' : [0.1, 1, 10, 100],
    'gamma' : [1, 0.1, 0.01, 0.001],
    'kernel' : ['rbf']
}

grid = GridSearchCV(SVC(random_state = 42), param_grid, refit = True, cv = 5, verbose = 2)
grid.fit(X_train_scaled, y_train)
y_pred_final = grid.predict(X_test_scaled)
final_accuracy = accuracy_score(y_test, y_pred_final)

print(f"--- final accuracy : {final_accuracy}")
print(f"--- best parameters : {grid.best_params_}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01

In [70]:
# --- Final Tuned Modwl Metrics ---
print("--- Final tuned model metrics ---")
print(f"--- classification report : {classification_report(y_test, y_pred_final)}")
print(f"--- confusion matrix : {confusion_matrix(y_test, y_pred_final)}")


--- Final tuned model metrics ---
--- classification report :               precision    recall  f1-score   support

           0       0.97      1.00      0.99        71
           1       1.00      0.95      0.98        43

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

--- confusion matrix : [[71  0]
 [ 2 41]]
