# 1. Support Vector Machine (linear) from scratch
## 1.1 dataset link: https://www.kaggle.com/ronitf/heart-disease-uci/download
## 1.2 Sklearn test is involved for comparing with same parameter setup
## 1.3 Linear SVM can score 25%-30%

In [3]:
import numpy as np
import pandas as pd
import statistics as st
import matplotlib.pyplot as plt
import os
import math
import random

# 2. Data process functions

In [4]:
## load & split data into train & test set
def load_data(name):
    path = os.getcwd()
    data_path = os.path.join(path, name)
    data_raw = pd.read_csv(data_path)
    data_index = data_raw.keys()

    #print(data_index)
    return (data_raw, data_index)


## plot histogram of each attribute
def plot_hist(data_refine, data_refine_index):
    data_zip = list(zip(*data_refine))

    for index in range(len(data_zip)):
        each_attr = data_zip[index]
        low_b = math.floor(min(each_attr))
        upp_b = math.ceil(max(each_attr))
        plt.hist(each_attr, range=[low_b, upp_b])
        plt.title(data_refine_index[index], loc='center')
        plt.grid()
        plt.show()


## split data_test from data_train, split_ration=0.7, 70% data for training, 30% of data for testing
def split_train_test_data(data_refine, data_refine_target, split_ratio, rand=True):
    data_train = []
    data_train_target = []
    data_test=[]
    data_test_target=[]
    train_length = int(len(data_refine) * split_ratio)

    if rand:
        for index in range(train_length):
            pos = random.randint(0, len(data_refine) - 1)
            data_train.append(data_refine.pop(pos))
            data_train_target.append(data_refine_target.pop(pos))

        data_test = data_refine
        data_test_target = data_refine_target

    else:

        data_refine_dic={}

        for index in range(len(data_refine_target)):
            if data_refine_target[index] not in data_refine_dic:
                data_refine_dic[data_refine_target[index]]=[]
                data_refine_dic[data_refine_target[index]].append(data_refine[index])
            else:
                data_refine_dic[data_refine_target[index]].append(data_refine[index])

        for key in list(data_refine_dic.keys()):

            train_length = int(len(data_refine_dic[key]) * split_ratio)

            for index in range(train_length-1):
                data_train.append(data_refine_dic[key].pop(0))
                data_train_target.append(data_refine_target.pop(0))


            for item in data_refine_dic[key]:
                data_test.append(item)
            data_test_target = data_refine_target

    return (data_train, data_train_target, data_test, data_test_target)


# shuffle the categarized data
def shuffle_data_ca(data_ca, data_ca_target):
    data_shf = []
    data_shf_target = []
    for i in range(len(data_ca)):
        loc = random.randint(0, len(data_ca) - 1)
        data_shf.append(data_ca.pop(loc))
        data_shf_target.append(data_ca_target.pop(loc))

    return (data_shf, data_shf_target)

# scale all data into [0,1]
def scale_data(dt_train):
    col_max_min = [(np.min(col), np.max(col)) for col in list(zip(*dt_train)) ]
    for row_index in range(len(dt_train)):
        for col_index in range(len(dt_train[row_index])):
            col_min = col_max_min[col_index][0]
            col_max = col_max_min[col_index][1]
            dt_train[row_index][col_index] = (dt_train[row_index][col_index]-col_min)/(col_max-col_min)
    return dt_train


 # 3. SVM functions

In [5]:
## generate random weight w0,w1,w2,w3.....
def generate_weight(num_attr, start, end):
    wt_array=[]
    for i in range(num_attr):
        wt_array.append(random.uniform(start, end))
    return wt_array

## SVM linear func
def SVM( dt_train, dt_train_target, wt_arr, kernal='linear', C=1, stp=1, epoch_limit=1000, stp_limit=1e-300, show_bias=True, stp_show=False):

    if kernal == 'linear':
        epoch = 0
        hinge_loss_all = 0
        reg_val = 1 / C
        while epoch <= epoch_limit:
            hinge_loss = []
            hinge_loss_last = hinge_loss_all


            for index in range(len(dt_train)):

                r = np.dot(dt_train[index], wt_arr) * dt_train_target[index]

                if r >= 1:
                    hinge_loss.append(0)
                    wt_arr = wt_arr - stp * reg_val * wt_arr
                else:
                    hinge_loss.append(1 - r)
                    wt_arr = wt_arr + stp * (dt_train_target[index] * np.array(dt_train[index]) - reg_val * wt_arr)

                hinge_loss_all = sum(hinge_loss)

            if stp_show:
                print('epoch:', epoch, ',hinge_loss:', round(hinge_loss_all, 5), ',step: ', stp)

            if abs(hinge_loss_last - hinge_loss_all) <= 0.1:
                stp = stp * 0.01

            if stp <= stp_limit:
                break

            epoch += 1

        if show_bias:
            error = 0
            if stp_show:
                print('vector length: ')
            for index in range(len(dt_train)):
                r = np.dot(dt_train[index], wt_arr) * dt_train_target[index]
                if r < 0:
                    error += 1
                if stp_show:
                    print(r)
            bias = error / len(dt_train) * 100
            print('data train length: ', len(dt_train))
            print('bias: ', bias, '%\n')

        return (wt_arr, bias/100, C)

## SVM test func
def SVM_test(dt_test, dt_test_target, wt_arr, C):
    error = 0
    reg_val = 1 / C
    for index in range(len(dt_test)):

        r = np.dot(dt_test[index], wt_arr) * dt_test_target[index]
        if r < 0:
            error += 1
    # print('test data length: ', len(dt_test))
    # print('variance: ', error / len(dt_test) * 100, '%\n')

    score = 1- error / len(dt_test)

    return (score, len(dt_test))


# 4. Test- Load & Modify Dataset

In [6]:
# load raw data and show correlation with target column
data_raw, data_raw_index=load_data('heart_disease_data.csv')
data_raw.corr(method='pearson')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,1.0,-0.098447,-0.068653,0.279351,0.213678,0.121308,-0.116211,-0.398522,0.096801,0.210013,-0.168814,0.276326,0.068001,-0.225439
sex,-0.098447,1.0,-0.049353,-0.056769,-0.197912,0.045032,-0.058196,-0.04402,0.141664,0.096093,-0.030711,0.118261,0.210041,-0.280937
cp,-0.068653,-0.049353,1.0,0.047608,-0.076904,0.094444,0.044421,0.295762,-0.39428,-0.14923,0.119717,-0.181053,-0.161736,0.433798
trestbps,0.279351,-0.056769,0.047608,1.0,0.123174,0.177531,-0.114103,-0.046698,0.067616,0.193216,-0.121475,0.101389,0.06221,-0.144931
chol,0.213678,-0.197912,-0.076904,0.123174,1.0,0.013294,-0.15104,-0.00994,0.067023,0.053952,-0.004038,0.070511,0.098803,-0.085239
fbs,0.121308,0.045032,0.094444,0.177531,0.013294,1.0,-0.084189,-0.008567,0.025665,0.005747,-0.059894,0.137979,-0.032019,-0.028046
restecg,-0.116211,-0.058196,0.044421,-0.114103,-0.15104,-0.084189,1.0,0.044123,-0.070733,-0.05877,0.093045,-0.072042,-0.011981,0.13723
thalach,-0.398522,-0.04402,0.295762,-0.046698,-0.00994,-0.008567,0.044123,1.0,-0.378812,-0.344187,0.386784,-0.213177,-0.096439,0.421741
exang,0.096801,0.141664,-0.39428,0.067616,0.067023,0.025665,-0.070733,-0.378812,1.0,0.288223,-0.257748,0.115739,0.206754,-0.436757
oldpeak,0.210013,0.096093,-0.14923,0.193216,0.053952,0.005747,-0.05877,-0.344187,0.288223,1.0,-0.577537,0.222682,0.210244,-0.430696


In [7]:
# drop 'chol', 'fbs' du to low correlation to the target 
data_raw = data_raw.drop(columns=['chol', 'fbs'])
data_raw_index = data_raw.keys()
data_raw.head()

Unnamed: 0,age,sex,cp,trestbps,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,0,150,0,2.3,0,0,1,1
1,37,1,2,130,1,187,0,3.5,0,0,2,1
2,41,0,1,130,0,172,0,1.4,2,0,2,1
3,56,1,1,120,1,178,0,0.8,2,0,2,1
4,57,0,0,120,1,163,1,0.6,2,0,2,1


In [47]:
# covert from dataset dataframe to list
data_lst = data_raw.values.tolist()

# refine the dataset by removing the target columns 
data_rf = [dt[:-1] for dt in data_lst]
data_rf_target = list(list(zip(*data_lst))[-1])

# normalize the every columns value to [0,1]
data_rf = scale_data(data_rf)

# add one more column with value '1' into dataset for bias
for dt in data_rf:
    dt.append(1)

# modify dataset target to '1' or '-1'
for index in range(len(data_rf_target)):
    if data_rf_target[index] == 0.0:
        data_rf_target[index] = -1

# convert data_raw_index to a list so as to match the dataset columns
data_index = data_raw_index.values.tolist()
data_index = data_index[:-1]

# split the data_rf into train & test sets
# split ratio is 0.8
dt_train, dt_train_target, dt_test, dt_test_target = \
    split_train_test_data(data_rf, data_rf_target, split_ratio=0.8, rand=False)

# shuffle all training data
dt_train, dt_train_target = shuffle_data_ca(dt_train, dt_train_target);

In [52]:
a=[[1,2,3]]
a[-1].insert(0,1)
a

[[1, 1, 2, 3]]

In [53]:
 abs(121.53696368015022 -121.34117243999961)

0.1957912401506121

# 5. Linear SVM from Scratch

In [35]:
# generate random initial weight in [0,1]
#wt_lst = generate_weight(len(data_index)+1, 0, 1)

In [36]:
### weight is genrated by[95]
wt_lst=[
0.20717319464152695,
0.6206984801546556,
0.43006365979581385,
0.4580060620231049,
0.33883164044002245,
0.36739188157541647,
0.5832498372259184,
0.9209212798422121,
0.6737554996556483,
0.5508969453938781,
0.21220708634760788,
0.43266411739534283,]
wt_arr = np.array(wt_lst)

### calculate the error rate with initial weight
error=0;
for index in range(len(dt_train)):
    r = np.dot(dt_train[index], wt_arr) * dt_train_target[index]
    if r<0:
        error+=1
print('error with initial randome weight: ', error/len(dt_train)*100, '%\n')


# SVM linear algorithm
C=1000
stp=2
epoch_limit =1000
epoch=0
hinge_loss_all=0
while epoch <= epoch_limit:
    hinge_loss=[]
    hinge_loss_last = hinge_loss_all
    reg_val = 1 / C

    for index in range(len(dt_train)):

        r = np.dot(dt_train[index], wt_arr)*dt_train_target[index]

        if r>=1:
            hinge_loss.append(0)
            wt_arr = wt_arr - stp*reg_val*wt_arr
        else:
            hinge_loss.append(1-r)
            wt_arr = wt_arr + stp*(dt_train_target[index]*np.array(dt_train[index])-reg_val*wt_arr)

        hinge_loss_all = sum(hinge_loss)

    if abs(hinge_loss_last - hinge_loss_all)<=0.1:
        stp = stp * 0.01

    if stp<=1e-300:
        break

    epoch+=1

### print the final weight 
print('final weight:', wt_arr,'\n')

### calculate the bias
error=0;
for index in range(len(dt_train)):
    reg_val = 1 / C
    r = np.dot(dt_train[index], wt_arr) * dt_train_target[index]
    if r<0:
        error+=1
print('train data length: ', len(dt_train))
print('bias: ', error/len(dt_train)*100, '%\n')


### calculate the cross-validation & final accuracy
error=0;
for index in range(len(dt_test)):
    reg_val = 1 / C
    r = np.dot(dt_test[index], wt_arr) * dt_test_target[index]
    if r<0:
        error+=1
print('test data length: ',len(dt_test))
print('variance: ', error/len(dt_test)*100, '%\n')
print('score:', (1-error/len(dt_test))*100, '%\n')

error with initial randome weight:  31.25 %

final weight: [ 0.09582597 -0.99526074  1.39254867 -1.78049152 -0.1505056   3.84062239
 -0.66776045 -1.08132111 -0.66188109 -2.89313275 -1.90334204  2.21859269] 

train data length:  240
bias:  20.0 %

test data length:  63
variance:  69.84126984126983 %

score: 30.15873015873016 %



# 5. Sklearn Test

In [20]:
from sklearn.svm import SVC

data_raw, data_raw_index=load_data('heart_disease_data.csv')
data_raw = data_raw.drop(columns=['chol', 'fbs'])
data_raw_index = data_raw.keys()

# covert from dataset dataframe to list
data_lst = data_raw.values.tolist()

# refine the dataset by removing the target columns 
data_rf = [dt[:-1] for dt in data_lst]
data_rf_target = list(list(zip(*data_lst))[-1])

# normalize the every columns value to [0,1]
data_rf = scale_data(data_rf)

# add one more column with value '1' into dataset for bias
# for dt in data_rf:
#     dt.append(1)

# modify dataset target to '1' or '-1'
# for index in range(len(data_rf_target)):
#     if data_rf_target[index] == 0.0:
#         data_rf_target[index] = -1

# convert data_raw_index to a list so as to match the dataset columns
# data_index = data_raw_index.values.tolist()
# data_index = data_index[:-1]

# split the data_rf into train & test sets
# split ratio is 0.8
dt_train, dt_train_target, dt_test, dt_test_target = \
    split_train_test_data(data_rf, data_rf_target, split_ratio=0.8, rand=False)

# shuffle all training data
dt_train, dt_train_target = shuffle_data_ca(dt_train, dt_train_target);



model = SVC(C=100,gamma='scale', kernel='linear')
model.fit(dt_train, dt_train_target)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
score = model.score(dt_test, dt_test_target)
print('score:',score*100, '%\n')

score: 25.396825396825395 %

