In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from tqdm import tqdm

In [16]:
# Read the Wisconsin breast-cancer CSV; its first column becomes the row index.
wisconsin = pd.read_csv(
    "Cases/Wisconsin/BreastCancer.csv",
    index_col=0,
)
wisconsin

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign
...,...,...,...,...,...,...,...,...,...,...
1369821,10,10,10,10,5,10,10,10,7,Malignant
1371026,5,10,10,10,4,10,5,6,3,Malignant
1371920,5,1,1,1,2,1,3,2,1,Benign
8233704,4,1,1,1,1,1,2,1,1,Benign


In [17]:
# Separate the predictors from the target column ('Class').
X = wisconsin.drop(columns='Class')
y = wisconsin['Class']

# Hold out 30% of the rows for testing. `stratify=y` splits in a stratified
# fashion using the class labels, and the fixed `random_state` makes the
# shuffle repeatable. (`train_test_split` is imported in the top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25, stratify=y
)

Without scaling

In [18]:
# Baseline: linear-kernel SVM fitted directly on the unscaled training features.
# `fit` returns the estimator itself, so `svm` is the fitted model.
svm = SVC(kernel="linear").fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = svm.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9714285714285714


With Scaling

In [19]:
# Same comparison with standardisation applied first. The pipeline reuses the
# `svm` estimator object created in the previous cell and fits it again.
std_scaler = StandardScaler()
pipe = Pipeline(
    steps=[
        ('scaler', std_scaler),
        ('SVM', svm),
    ]
)
pipe.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = pipe.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9666666666666667


Working on HR dataset

In [20]:
# Load the HR data set from CSV (default integer row index).
hr = pd.read_csv(filepath_or_buffer="Datasets/HR_comma_sep.csv")

In [21]:
# Target is the 'left' column; every other column is a predictor.
y = hr['left']
X = hr.drop(columns='left')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25, stratify=y
)

# One-hot encode the object-dtype columns (dropping the first level of each)
# into a dense pandas DataFrame; all other columns pass through untouched.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.set_output(transform='pandas')  # configures `ohe` in place
col_trnf = ColumnTransformer(
    transformers=[('OHE', ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# encode -> standardise -> SVM (`svm` is the estimator defined in an earlier cell)
std_scaler = StandardScaler()
pipe = Pipeline(
    steps=[
        ('TRNSF', col_trnf),
        ('SCL', std_scaler),
        ('SVM', svm),
    ]
)
pipe.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = pipe.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7819515447877307


Without Scaling

In [22]:
# Same encoder + SVM as before, with the scaling step left out for comparison.
# `Pipeline.fit` returns the pipeline itself, so `pipe` is the fitted pipeline.
pipe = Pipeline(steps=[('TRNSF', col_trnf), ('SVM', svm)]).fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = pipe.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.7803956434763281


There is a hyperparameter called C. Let's do the hyperparameter tuning.

In [23]:
# Sweep 15 evenly spaced values of C in [0.01, 5] for the linear kernel and
# record the test accuracy obtained with each one.
# NOTE(review): each candidate is scored on the test split, so the best score
# is an optimistic estimate; consider cross-validating on the training split.
Cs = np.linspace(start=0.01, stop=5, num=15)
scores = []
for c in Cs:
    svm = SVC(C=c, kernel='linear')
    pipe = Pipeline(
        steps=[
            ('TRNSF', col_trnf),
            ('scaler', std_scaler),
            ('SVM', svm),
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    scores.append([c, accuracy_score(y_true=y_test, y_pred=y_pred)])

# One row per C value, best-scoring first.
df_scores = pd.DataFrame(data=scores, columns=['C', 'score'])
df_scores.sort_values(by='score', ascending=False)

Unnamed: 0,C,score
0,0.01,0.786397
1,0.366429,0.781952
2,0.722857,0.781952
3,1.079286,0.781952
4,1.435714,0.781952
5,1.792143,0.781952
6,2.148571,0.781952
7,2.505,0.781952
8,2.861429,0.781952
9,3.217857,0.781952


Polynomial Kernel

In [25]:

# Polynomial-kernel SVM (all other SVC settings left at their defaults),
# trained behind the same encode -> standardise preprocessing.
svm = SVC(kernel='poly')
pipe = Pipeline(
    steps=[
        ('TRNSF', col_trnf),
        ('scaler', std_scaler),
        ('SVM', svm),
    ]
).fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = pipe.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9415425650144477


#### Radial kernel

In [26]:
# RBF-kernel SVM (all other SVC settings left at their defaults),
# trained behind the same encode -> standardise preprocessing.
svm = SVC(kernel='rbf')
preprocess_then_svm = [
    ('TRNSF', col_trnf),
    ('scaler', std_scaler),
    ('SVM', svm),
]
pipe = Pipeline(steps=preprocess_then_svm)
pipe.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = pipe.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))


0.9524338741942654


Let's do hyperparameter tuning for the polynomial and radial kernels.

In [30]:
# Grid over polynomial degree (2, 3, 4) and 15 evenly spaced C values in
# [0.01, 5]; each combination is fitted once and scored on the test split.
# NOTE(review): selecting hyperparameters on the test split gives an
# optimistic score; consider cross-validating on the training split.
Cs = np.linspace(start=0.01, stop=5, num=15)
deg = [2, 3, 4]
scores = []
for d in tqdm(deg):  # progress bar advances once per degree
    for c in Cs:
        svm = SVC(C=c, degree=d, kernel='poly')
        pipe = Pipeline(
            steps=[
                ('TRNSF', col_trnf),
                ('scaler', std_scaler),
                ('SVM', svm),
            ]
        )
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        scores.append([c, d, accuracy_score(y_true=y_test, y_pred=y_pred)])

# One row per (C, degree) pair, best-scoring first.
df_scores = pd.DataFrame(data=scores, columns=['C', 'degree', 'score'])
df_scores.sort_values(by='score', ascending=False)

100%|██████████| 3/3 [01:14<00:00, 24.88s/it]


Unnamed: 0,C,degree,score
41,3.930714,4,0.956879
40,3.574286,4,0.956657
39,3.217857,4,0.956435
42,4.287143,4,0.956212
44,5.0,4,0.95599
38,2.861429,4,0.955768
43,4.643571,4,0.955768
37,2.505,4,0.955101
36,2.148571,4,0.953768
35,1.792143,4,0.953545


In [None]:
# Grid over the RBF kernel's gamma and C: 15 evenly spaced values of each in
# [0.01, 5], i.e. 225 fits. Each combination is scored on the test split.
# NOTE(review): selecting hyperparameters on the test split gives an
# optimistic score; consider cross-validating on the training split.
Cs = np.linspace(0.01, 5, 15)
gammas = np.linspace(0.01, 5, 15)
scores = []
for g in tqdm(gammas):  # progress bar advances once per gamma value
    for c in Cs:
        svm = SVC(kernel='rbf', C=c, gamma=g)
        pipe = Pipeline(steps=[('TRNSF', col_trnf), ('scaler', std_scaler), ('SVM', svm)])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        # Record the gamma used for this fit. The earlier version stored `d`,
        # a variable left over from the polynomial-degree cell, so every row
        # carried the same stale value (or a NameError on a fresh kernel).
        scores.append([c, g, accuracy_score(y_test, y_pred)])

# One row per (C, gamma) pair, best-scoring first.
df_scores = pd.DataFrame(scores, columns=['C', 'gamma', 'score'])
df_scores.sort_values('score', ascending=False)

 67%|██████▋   | 10/15 [21:35<15:12, 182.46s/it]