### Her is our Support Vector Machine

### Let's organize some data

In [186]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [187]:
df = pd.read_csv('breast-cancer.csv')
df.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Turn the diagnosis into 0 and 1
### Sample the data
### Maintain balance between zeros and ones in training data to avoide overfitting with any of them

In [188]:
df['diagnosis'].unique()

array(['M', 'B'], dtype=object)

In [189]:
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)
df['diagnosis'].unique()

array([1, 0])

In [190]:
cols = ['diagnosis', 'id']
x = df.drop(cols, axis=1)
y = df['diagnosis']

In [191]:
x.dtypes

radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst      

In [192]:
y = np.where(y == 0, -1, 1)

In [193]:
np.unique(y)

array([-1,  1])

In [194]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.25, random_state=2108602)

In [195]:
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
# y_train = y_train.to_numpy()
# y_test = y_test.to_numpy()

In [196]:
def scale_dataset(X, y, oversample=False):
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    
    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)

    data = np.hstack((X, np.reshape(y, (len(y), 1))))
    
    return data, X, y

In [197]:
train, x_train, y_train = scale_dataset(x_train, y_train, oversample= True)
test, x_test, y_test = scale_dataset(x_test, y_test)

In [198]:
print(len(y_train == 1)) 
print(len(y_train == 0)) 

536
536


In [199]:
# Hyperparameters
lmbda = 0.01
lr = 1e-03
itr = 100

### Trial Number 1

In [200]:
def init_W(x):
    m, n = x.shape
    np.random.seed(2108602)
    w = np.random.random(n)
    b = np.random.random(1)
    return w, b


def Loss(x, y, w, b, lmbda):
    return lmbda * np.linalg.norm(w) + max(0, 1 - np.dot(y, (np.dot(w, x.T) - b) ) )


def grad(x, y, w, b, lr, lmbda): 
    for i in range(len(y)):
        out = np.dot(y[i], (np.dot(w, x[i].T) - b) ) 
        if out >= 1:
            w -= lr*(2*lmbda*w)
            b = b
        else:
            w -= lr*(2*lmbda - y[i] * x[i])
            b -= lr*y[i]

    return w, b


def SVM(x, y, lmbda, lr, itr): 
    k = 0
    l_avg = np.zeros(itr)
    w, b = init_W(x) 
    while k < itr:
        l = np.zeros(len(y))
        for i in range(len(y)):
            w, b = grad(x, y, w, b, lr, lmbda)
            l[i] = Loss(x, y, w, b, lmbda) 

        l_avg[k] = np.average(l) 
        k += 1
        
    w_final = w
    b_final = b
    return w_final, b_final, l_avg


def pred(x, w_final, b_final):
    y_hat = np.dot(w_final, x.T) - b_final
    return np.sign(y_hat)


In [201]:
w_f, b_f, l_avg = SVM(x_train, y_train, lmbda, lr, itr)
y_h = pred(x_test, w_f, b_f)

In [202]:
y_train.shape

(536,)

In [207]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

print("SVM classification accuracy", accuracy(y_test, y_h)*100)

SVM classification accuracy 95.8041958041958
