In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import os

In [18]:
# Make the Wisconsin case folder the working directory so that relative CSV
# file names resolve against it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
wisconsin_dir = "D:\\meridianthe4\\PML\\Cases\\Wisconsin"
os.chdir(wisconsin_dir)

In [19]:
# Read the breast-cancer table; the file name is relative to the current
# working directory.
data = pd.read_csv(filepath_or_buffer="BreastCancer.csv")

In [20]:
# Features are every column except `Code` and `Class`; `Class` is the target.
X = data.drop(columns=["Code", "Class"])

# Encode the class labels as integers.
le = LabelEncoder()
y = le.fit_transform(data["Class"])

# Stratified hold-out split (30% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=25,
    stratify=y,
)

In [21]:
# Show the training features as this cell's output.
X_train

Unnamed: 0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses
683,5,4,6,8,4,1,8,10,1
615,4,8,7,10,4,10,7,5,1
86,1,1,1,1,2,1,3,1,1
605,2,5,7,6,4,10,7,6,1
474,1,1,1,1,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
596,3,1,1,1,2,1,2,1,1
154,1,1,1,1,2,1,3,1,1
156,2,1,1,1,2,1,1,1,1
363,4,2,1,1,2,2,3,1,1


## Without scaling

In [22]:
# Baseline: linear-kernel SVM trained on X_train as it currently stands
# (no scaling applied in this cell).
svm = SVC(kernel='linear')
y_pred = svm.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.9714


## with standard scaling

In [23]:
# Standardise inside a Pipeline instead of reassigning X_train / X_test.
# Overwriting the shared split replaced the raw features with scaled arrays,
# so any later cell that reads X_train / X_test would silently receive
# standardised data. The Pipeline fits the scaler on the training split only
# and applies the same transform at predict time, leaving the split untouched.
scaler = StandardScaler()
svm = SVC(kernel='linear')
std_pipe = Pipeline([("SCL", scaler), ("SVM", svm)])
std_pipe.fit(X_train, y_train)
y_pred = std_pipe.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.9667


## with MinMax scaling

In [25]:
# Min-max scale inside a Pipeline rather than reassigning X_train / X_test,
# so running (or re-running) this cell never stacks another transform onto
# the shared split variables. The scaler is fitted on the training split only
# and the same transform is applied to the test split at predict time.
scaler = MinMaxScaler()
svm = SVC(kernel='linear')
minmax_pipe = Pipeline([("SCL", scaler), ("SVM", svm)])
minmax_pipe.fit(X_train, y_train)
y_pred = minmax_pipe.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.9714


## HR Dataset

In [30]:
# Move to the parent "Cases" folder and load the HR data set.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("D:\\meridianthe4\\PML\\Cases")
hr = pd.read_csv("HR_comma_sep.csv")

# `left` is the target; every other column is a feature.
y = hr['left']
X = hr.drop(columns='left')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25, stratify=y
)

# One-hot encode the object-dtype columns (first level dropped) and pass the
# remaining columns through unchanged; keep pandas output at every step.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform="pandas")
col_transformer = ColumnTransformer(
    [("OHE", ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")
std_scaler = StandardScaler().set_output(transform="pandas")

# Encode -> standardise -> linear-kernel SVM.
svm = SVC(kernel='linear')
pipe = Pipeline(steps=[
    ("TRNSF", col_transformer),
    ("SCL", std_scaler),
    ("SVM", svm),
])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Accuracy: 0.7820


In [28]:
# pipe = Pipeline([("TRNSF", col_transformer), ("SVM", svm)])
# pipe.fit(X_train, y_train)
# y_pred = pipe.predict(X_test)
# print(f"Accuracy without scaling: {accuracy_score(y_test, y_pred):.4f}")
# Recorded output: Accuracy without scaling: 0.7804
# Commented out because this fit takes too long to run.

In [31]:
# Reload the HR data and rebuild the same stratified split used for the
# single-model run.
hr = pd.read_csv("HR_comma_sep.csv")
y = hr['left']
X = hr.drop(columns='left')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25, stratify=y
)

# Fit a linear-kernel SVM for each of 15 evenly spaced C values and record
# the test-set accuracy of each fit.
Cs = np.linspace(start=0.01, stop=5, num=15)
scores = []
for c in Cs:
    svm = SVC(kernel='linear', C=c)
    pipe = Pipeline(steps=[("TRNSF", col_transformer), ("SCL", std_scaler), ("SVM", svm)])
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    scores.append([c, accuracy_score(y_test, y_pred)])

# One row per C value, highest accuracy first.
scores_df = pd.DataFrame(data=scores, columns=['C', 'score'])
scores_df.sort_values(by='score', ascending=False)

Unnamed: 0,C,score
0,0.01,0.786397
1,0.366429,0.781952
2,0.722857,0.781952
3,1.079286,0.781952
4,1.435714,0.781952
5,1.792143,0.781952
6,2.148571,0.781952
7,2.505,0.781952
8,2.861429,0.781952
9,3.217857,0.781952


## Polynomial Kernel

In [34]:
# Same preprocessing steps, with a polynomial-kernel SVM (no other SVC
# arguments are set).
svm = SVC(kernel="poly")
pipe = Pipeline(steps=[
    ("TRNSF", col_transformer),
    ("SCL", std_scaler),
    ("SVM", svm),
])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(f"Accuracy with polynomial kernel: {accuracy_score(y_test, y_pred):.4f}")

Accuracy with polynomial kernel: 0.9415


## Radial Kernel

In [36]:
# Same preprocessing steps, with an RBF-kernel SVM (no other SVC arguments
# are set).
svm = SVC(kernel="rbf")
pipe = Pipeline(steps=[
    ("TRNSF", col_transformer),
    ("SCL", std_scaler),
    ("SVM", svm),
])
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(f"Accuracy with Radial kernel: {accuracy_score(y_test, y_pred):.4f}")

Accuracy with Radial kernel: 0.9524


In [38]:
from tqdm import tqdm

In [39]:
# Sweep the polynomial kernel: 15 evenly spaced C values for each degree,
# recording the test-set accuracy of every fit.
Cs = np.linspace(start=0.01, stop=5, num=15)
deg = [2, 3, 4]
scores = []
for d in tqdm(deg):  # progress bar advances once per degree
    for c in Cs:
        svm = SVC(kernel='poly', C=c, degree=d)
        pipe = Pipeline(steps=[("TRNSF", col_transformer), ("SCL", std_scaler), ("SVM", svm)])
        y_pred = pipe.fit(X_train, y_train).predict(X_test)
        scores.append([c, d, accuracy_score(y_test, y_pred)])

# One row per (C, degree) pair, highest accuracy first.
scores_df = pd.DataFrame(data=scores, columns=['C', 'degree', 'score'])
scores_df.sort_values(by='score', ascending=False)

100%|██████████| 3/3 [00:54<00:00, 18.05s/it]


Unnamed: 0,C,degree,score
41,3.930714,4,0.956879
40,3.574286,4,0.956657
39,3.217857,4,0.956435
42,4.287143,4,0.956212
44,5.0,4,0.95599
38,2.861429,4,0.955768
43,4.643571,4,0.955768
37,2.505,4,0.955101
36,2.148571,4,0.953768
35,1.792143,4,0.953545


In [43]:
# Sweep the RBF kernel over a 10 x 10 grid of C and gamma values, recording
# the test-set accuracy of every fit.
Cs = np.linspace(start=0.01, stop=5, num=10)
gammas = np.linspace(start=0.01, stop=5, num=10)
scores = []
for g in tqdm(gammas):  # progress bar advances once per gamma
    for c in Cs:
        svm = SVC(kernel='rbf', C=c, gamma=g)
        pipe = Pipeline(steps=[("TRNSF", col_transformer), ("SCL", std_scaler), ("SVM", svm)])
        y_pred = pipe.fit(X_train, y_train).predict(X_test)
        scores.append([c, g, accuracy_score(y_test, y_pred)])

# One row per (C, gamma) pair, highest accuracy first.
scores_df = pd.DataFrame(data=scores, columns=['C', 'gamma', 'score'])
scores_df.sort_values(by='score', ascending=False)

100%|██████████| 10/10 [11:25<00:00, 68.56s/it]


Unnamed: 0,C,gamma,score
23,1.673333,1.118889,0.982885
24,2.227778,1.118889,0.981107
37,3.891111,1.673333,0.981107
36,3.336667,1.673333,0.981107
38,4.445556,1.673333,0.981107
...,...,...,...
60,0.010000,3.336667,0.777062
70,0.010000,3.891111,0.773950
80,0.010000,4.445556,0.771060
90,0.010000,5.000000,0.769060
