In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

## Load dataset

In [2]:
# load dataset
df = pd.read_csv('Hepatitis.csv')
df.head()

Unnamed: 0,age,sex,steriod,antivirals,fatigue,mamaise,anorexia,liver big,liver firm,spleen palpable,spiders,ascites,varices,bilirubin,alk phosphate,sgot,albumin,protime,histology,class
0,39,1,1,1,1,1,2,2,1,2,2,2,2,2.3,280,98,3.8,40,1,1
1,59,1,1,2,1,1,2,2,1,1,1,2,2,1.5,107,157,3.6,38,2,1
2,47,1,2,2,2,2,2,2,2,2,1,2,1,2.0,84,23,4.2,66,2,1
3,48,1,1,2,1,1,2,2,1,2,1,1,1,4.8,123,157,2.7,31,2,1
4,47,1,2,2,1,1,2,2,1,2,2,1,1,1.7,86,20,2.1,46,2,1


In [3]:
# 독립변수/종속변수 분리
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [4]:
X.head()

Unnamed: 0,age,sex,steriod,antivirals,fatigue,mamaise,anorexia,liver big,liver firm,spleen palpable,spiders,ascites,varices,bilirubin,alk phosphate,sgot,albumin,protime,histology
0,39,1,1,1,1,1,2,2,1,2,2,2,2,2.3,280,98,3.8,40,1
1,59,1,1,2,1,1,2,2,1,1,1,2,2,1.5,107,157,3.6,38,2
2,47,1,2,2,2,2,2,2,2,2,1,2,1,2.0,84,23,4.2,66,2
3,48,1,1,2,1,1,2,2,1,2,1,1,1,4.8,123,157,2.7,31,2
4,47,1,2,2,1,1,2,2,1,2,2,1,1,1.7,86,20,2.1,46,2


In [5]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: class, dtype: int64

## Linear SVM 

In [8]:
?SVC

## 주요 파라미터

### C

### kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}

#### - degree (if poly)

#### - gamma (if poly, rbf, and sigmoid)

#### - coef0 (if poly and sigmoid)

#### - decision_function_shape: {'ovo', 'ovr'}

In [9]:
# 모형 선언 후 학습
lin_svc = SVC(kernel='linear')  # default C=1.0
lin_svc.fit(X, y)

SVC(kernel='linear')

In [10]:
# 각 클래스의 서포트 벡터 개수
lin_svc.n_support_

array([10, 13], dtype=int32)

In [11]:
# 서포트 벡터에 해당하는 관측치 인덱스
lin_svc.support_

array([ 0,  1,  2,  4,  5,  6,  7,  9, 11, 12, 48, 49, 52, 60, 64, 65, 69,
       70, 72, 73, 74, 76, 79], dtype=int32)

In [12]:
lin_svc.support_vectors_

array([[ 39. ,   1. ,   1. ,   1. ,   1. ,   1. ,   2. ,   2. ,   1. ,
          2. ,   2. ,   2. ,   2. ,   2.3, 280. ,  98. ,   3.8,  40. ,
          1. ],
       [ 59. ,   1. ,   1. ,   2. ,   1. ,   1. ,   2. ,   2. ,   1. ,
          1. ,   1. ,   2. ,   2. ,   1.5, 107. , 157. ,   3.6,  38. ,
          2. ],
       [ 47. ,   1. ,   2. ,   2. ,   2. ,   2. ,   2. ,   2. ,   2. ,
          2. ,   1. ,   2. ,   1. ,   2. ,  84. ,  23. ,   4.2,  66. ,
          2. ],
       [ 47. ,   1. ,   2. ,   2. ,   1. ,   1. ,   2. ,   2. ,   1. ,
          2. ,   2. ,   1. ,   1. ,   1.7,  86. ,  20. ,   2.1,  46. ,
          2. ],
       [ 33. ,   1. ,   1. ,   2. ,   1. ,   1. ,   2. ,   2. ,   2. ,
          2. ,   2. ,   1. ,   2. ,   0.7,  63. ,  80. ,   3. ,  31. ,
          2. ],
       [ 42. ,   1. ,   1. ,   1. ,   1. ,   1. ,   2. ,   2. ,   2. ,
          2. ,   1. ,   2. ,   2. ,   0.5,  62. ,  68. ,   3.8,  29. ,
          2. ],
       [ 50. ,   1. ,   2. ,   2. ,   1. ,   2. ,   

In [13]:
# 서포트 벡터에 해당하는 관측치 값
pd.DataFrame(lin_svc.support_vectors_, 
             columns=X.columns)

Unnamed: 0,age,sex,steriod,antivirals,fatigue,mamaise,anorexia,liver big,liver firm,spleen palpable,spiders,ascites,varices,bilirubin,alk phosphate,sgot,albumin,protime,histology
0,39.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.3,280.0,98.0,3.8,40.0,1.0
1,59.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,1.5,107.0,157.0,3.6,38.0,2.0
2,47.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,84.0,23.0,4.2,66.0,2.0
3,47.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.7,86.0,20.0,2.1,46.0,2.0
4,33.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,0.7,63.0,80.0,3.0,31.0,2.0
5,42.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,0.5,62.0,68.0,3.8,29.0,2.0
6,50.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,2.8,155.0,75.0,2.4,32.0,2.0
7,38.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.4,243.0,49.0,3.8,90.0,2.0
8,49.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.4,85.0,70.0,3.5,35.0,2.0
9,43.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1.2,100.0,19.0,3.1,42.0,2.0


In [14]:
# weights and intercept
print(lin_svc.coef_)
print(lin_svc.intercept_)

[[ 2.81414369e-02  1.00000000e+00  7.33947784e-01  4.89590777e-01
  -1.27572636e-01  5.25818167e-01 -1.71019278e+00 -6.74006695e-01
   5.80576764e-01  3.39972927e-01  5.24695752e-01 -2.29612682e-02
  -2.03857168e-01 -6.31711760e-01 -1.22925777e-03  1.28422681e-02
   4.39427050e-01  2.39730756e-02 -1.37336355e+00]]
[-1.837366]


In [15]:
y_pred = lin_svc.predict(X)

In [16]:
# 예측된 label
y_pred

array([1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [17]:
con_mat = confusion_matrix(y, y_pred)
acc = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred)
print(con_mat)
print('accuracy:', acc, 'f1 score:', f1)


[[11  2]
 [ 0 67]]
accuracy: 0.975 f1 score: 0.9166666666666666


### Regularization parameter 바꿔보기

In [18]:
lin_svc = SVC(kernel='linear', C=5)
lin_svc.fit(X, y)

SVC(C=5, kernel='linear')

In [19]:
y_pred = lin_svc.predict(X)
con_mat_2 = confusion_matrix(y, y_pred)
acc_2 = accuracy_score(y, y_pred)
f1_2 = f1_score(y, y_pred)
print(con_mat_2)
print('accuracy:', acc_2, 'f1 score:', f1_2)


[[12  1]
 [ 0 67]]
accuracy: 0.9875 f1 score: 0.9600000000000001


### 여러 regularization parameter 에 대해서 모형 학습 해보기

In [20]:
from sklearn.model_selection import train_test_split
n_instances, n_features = X.shape
idx = np.arange(n_instances)
train_index, test_index = train_test_split(idx,
                                           train_size=0.8,
                                           test_size=0.2)
X_train, X_test = X.iloc[train_index,:], X.iloc[test_index, :]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [21]:
print(X_train.shape)
print(X_test.shape)

(64, 19)
(16, 19)


In [22]:
c_range = [1, 5, 10, 100, 1000]

f1_train = np.zeros(shape=(5,))
acc_train = np.zeros(shape=(5,))
f1_test = np.zeros(shape=(5,))
acc_test = np.zeros(shape=(5,)) 

In [23]:
svc_list = []
for i, c in enumerate(c_range):
    svc = SVC(kernel='linear', C=c).fit(X_train, y_train)
    svc_list.append(svc)
    y_train_pred =svc.predict(X_train)
    y_test_pred = svc.predict(X_test)
    
    acc_train[i] = accuracy_score(y_train, y_train_pred)
    f1_train[i] = f1_score(y_train, y_train_pred)
    
    acc_test[i] = accuracy_score(y_test, y_test_pred)
    f1_test[i] = f1_score(y_test, y_test_pred)
    
    print('='*30)
    print('Regularization parameter:', c)
    print('train acc:', acc_train[i])
    print('train f1 score:', f1_train[i])
    print('test acc:', acc_test[i])
    print('test f1 score:',f1_test[i])

Regularization parameter: 1
train acc: 0.96875
train f1 score: 0.9230769230769231
test acc: 0.8125
test f1 score: 0.0
Regularization parameter: 5
train acc: 1.0
train f1 score: 1.0
test acc: 0.6875
test f1 score: 0.0
Regularization parameter: 10
train acc: 1.0
train f1 score: 1.0
test acc: 0.6875
test f1 score: 0.0
Regularization parameter: 100
train acc: 1.0
train f1 score: 1.0
test acc: 0.6875
test f1 score: 0.0
Regularization parameter: 1000
train acc: 1.0
train f1 score: 1.0
test acc: 0.6875
test f1 score: 0.0


## Non-linear kernel

In [24]:
svc_rbf = SVC(kernel='rbf', max_iter=1000, C=1).fit(X_train,y_train)
svc_rbf2 = SVC(kernel='rbf', max_iter=1000, C=100).fit(X_train,y_train)
svc_poly = SVC(kernel='poly', degree=2, max_iter=1000, C=1).fit(X_train,y_train)
svc_poly2 = SVC(kernel='poly', degree=2, max_iter=1000, C=10).fit(X_train,y_train)

In [25]:
svc_list = [svc_rbf, svc_rbf2, svc_poly, svc_poly2]

In [26]:
for svc in svc_list:
    y_train_pred= svc.predict(X_train)
    y_test_pred = svc.predict(X_test)
    con_mat_train = confusion_matrix(y_train, y_train_pred)
    con_mat_test = confusion_matrix(y_test, y_test_pred)
    print('='*30)
    print(svc)
    print('train set confusion mat')
    print(con_mat_train)
    print('test set confusion mat')
    print(con_mat_test)

SVC(C=1, max_iter=1000)
train set confusion mat
[[ 0 13]
 [ 0 51]]
test set confusion mat
[[16]]
SVC(C=100, max_iter=1000)
train set confusion mat
[[11  2]
 [ 1 50]]
test set confusion mat
[[ 0  0]
 [ 2 14]]
SVC(C=1, degree=2, kernel='poly', max_iter=1000)
train set confusion mat
[[ 0 13]
 [ 0 51]]
test set confusion mat
[[16]]
SVC(C=10, degree=2, kernel='poly', max_iter=1000)
train set confusion mat
[[ 1 12]
 [ 1 50]]
test set confusion mat
[[ 0  0]
 [ 1 15]]


### Standard scaler

In [27]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_train_std = X_scaler.transform(X_train)
X_test_std = X_scaler.transform(X_test)


In [31]:
svc_rbf_std = SVC(kernel='rbf', max_iter=1000, C=1).fit(X_train_std, y_train)
svc_rbf2_std = SVC(kernel='rbf', max_iter=1000, C=100).fit(X_train_std, y_train)
svc_poly_std = SVC(kernel='poly', degree=2, max_iter=1000, C=1).fit(X_train_std, y_train)
svc_poly2_std = SVC(kernel='poly', degree=2, max_iter=1000, C=10).fit(X_train_std, y_train)


In [32]:
svc_list_std= [svc_rbf_std, svc_rbf2_std, svc_poly_std, svc_poly2_std]

In [33]:
for svc in svc_list_std:
    y_train_pred = svc.predict(X_train_std)
    y_test_pred = svc.predict(X_test_std)
    con_mat_train = confusion_matrix(y_train, y_train_pred)
    con_mat_test = confusion_matrix(y_test, y_test_pred)
    print('='*30)
    print(svc)
    print('Standard scaler & train set confusion mat')
    print(con_mat_train)
    print('Standard scaler & test set confusion mat')
    print(con_mat_test)

SVC(C=1, max_iter=1000)
Standard scaler & train set confusion mat
[[11  2]
 [ 0 51]]
Standard scaler & test set confusion mat
[[ 0  0]
 [ 1 15]]
SVC(C=100, max_iter=1000)
Standard scaler & train set confusion mat
[[13  0]
 [ 0 51]]
Standard scaler & test set confusion mat
[[ 0  0]
 [ 2 14]]
SVC(C=1, degree=2, kernel='poly', max_iter=1000)
Standard scaler & train set confusion mat
[[ 9  4]
 [ 0 51]]
Standard scaler & test set confusion mat
[[16]]
SVC(C=10, degree=2, kernel='poly', max_iter=1000)
Standard scaler & train set confusion mat
[[13  0]
 [ 0 51]]
Standard scaler & test set confusion mat
[[ 0  0]
 [ 1 15]]


# 실습

## Dataset: Hepatitis

## Hyper-parameter

### - kernel

### - C

## 5-fold cross validation을 통해 각 모형 하이퍼 파라미터에 대한 평균 train, test 성능(acc, f1)을 계산하기.

In [None]:
svc_rbf_std = SVC(kernel='rbf', max_iter=1000, C=1)
svc_rbf2_std = SVC(kernel='rbf', max_iter=1000, C=100)
svc_poly_std = SVC(kernel='poly', degree=2, max_iter=1000, C=1)
svc_poly2_std = SVC(kernel='poly', degree=2, max_iter=1000, C=10)

In [36]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=3920)

In [39]:
train_acc1 = np.zeros(shape=(5,))
train_acc2 = np.zeros(shape=(5,))
train_acc3 = np.zeros(shape=(5,))
train_acc4 = np.zeros(shape=(5,))
test_acc1 = np.zeros(shape=(5,))
test_acc2 = np.zeros(shape=(5,))
test_acc3 = np.zeros(shape=(5,))
test_acc4 = np.zeros(shape=(5,))

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)
    
    svc_rbf_std = SVC(kernel='rbf', max_iter=1000, C=1)
    train_acc1[i], test_acc1[i] = svc_acc(svc_rbf_std, X_train_std, y_train, X_test_std, y_test)
    
    svc_rbf2_std = SVC(kernel='rbf', max_iter=1000, C=100)
    train_acc2[i], test_acc2[i] = svc_acc(svc_rbf2_std, X_train_std, y_train, X_test_std, y_test)
    
    svc_poly_std = SVC(kernel='poly', degree=2, max_iter=1000, C=1)
    train_acc3[i], test_acc3[i] = svc_acc(svc_poly_std, X_train_std, y_train, X_test_std, y_test)
    
    svc_poly2_std = SVC(kernel='poly', degree=2, max_iter=1000, C=10)
    train_acc4[i], test_acc4[i] = svc_acc(svc_poly2_std, X_train_std, y_train, X_test_std, y_test)

In [40]:
print(train_acc1)
print(test_acc1)

[0.953125 0.984375 0.96875  0.96875  0.953125]
[0.8125 0.75   0.875  1.     0.875 ]


In [38]:
def svc_acc(model, X_train_, y_train_, X_test_, y_test_):
    model.fit(X_train_, y_train_)
    
    y_train_pred_ = model.predict(X_train_)
    y_test_pred_ = model.predict(X_test_)
    
    train_acc_ = accuracy_score(y_train_, y_train_pred_)
    test_acc_ = accuracy_score(y_test_, y_test_pred_)
    return train_acc_, test_acc_