In [59]:
import numpy as np
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

### SVM(Support Vector Machines)
SVM은 다차원 공간에 존재하는 점들을 나누는 경계 중에서 간격(마진)이 최대가 되는 평면을 찾는 방법이라고 생각하면 된다. 이 평면은 초평면(hyperplane)이라고 불리며, 나뉜 각 구역에 가장 균질한 점들이 남도록 한다

어떤 형태의 데이터에도 적용 가능하지만 <strong>관측값이 상대적으로 매우 높은 차원일 경우에 특히 장점을 가진다</strong>

동작 원리에 따라 크게 세 가지로 분류된다
- 최대 마진 분류기
    - 각 영역을 분리한 초평면 사이의 간격이 최대가 되게 하는 것
- 서포트 벡터 분류기
    - 최대 마진 분류기의 확장된 개념
    - 정확한 분류가 불가능한 경우, 허용된 범위 내의 오차를 용인해 최적의 적합화를 시도한다
- 서포트 벡터 머신
    - 결정 경계면이 선형이 아니어서 어떤 비용 함수를 사용하더라도 서포트 벡터 분류기로 분리할 수 없을 경우에 사용한다

#### 커널 함수
주어진 원시 특징 벡터에 관해 그 벡터를 매핑한 특징 벡터의 내적과 같은 값을 반환해주는 함수 

커널 함수를 사용하면 특징 벡터를 고차원 공간으로 매핑한 후 직접 내적을 계산하는 수고를 하지 않아도 이와 똑같은 결과를 얻을 수 있다.
- 다항식 커널
    - 다항식 커널이 주로 사용되는데, 이 중 특히 2차 다항식이 많이 사용된다.
- 래디얼 베이시스 함수(Radial Basis Function, RBF)
    - 비선형 문제를 해결할 때 우선적으로 고려해볼 만한 기법
    - SVM을 사용할 때 특징값 스케일링(feature scaling)은 일반적으로 권장되는 정도지만, RBF 커널에서는 아주 중요하다
    - scikit-learn의 RBF 커널 $K(x, x') = \exp(-\gamma \lVert x - x' \rVert^2)$ 기준으로, 감마 값이 크면 고차원에서 뾰족한 요철이 생기고 값이 작아질수록 요철은 부드러워지고 넓어진다.
    - 감마 값이 크면 낮은 편중과 높은 분산을 가진 해를 도출하고, 감마 값이 작으면 높은 편중과 낮은 분산을 가진 해를 도출한다

### UCI 문자 인식 데이터

In [21]:
# Dataset description: echo the accompanying names file line by line
with open('data/letter-recognition_names.csv') as names_file:
    for raw_line in names_file:
        # Drop the line's newline because print() appends its own
        print(raw_line.rstrip('\n'))

1. Title: Letter Image Recognition Data 

2. Source Information
   -- Creator: David J. Slate
     -- Odesta Corporation; 1890 Maple Ave; Suite 115; Evanston, IL 60201
   -- Donor: David J. Slate (dave@math.nwu.edu) (708) 491-3867   
   -- Date: January, 1991

3. Past Usage:
   -- P. W. Frey and D. J. Slate (Machine Learning Vol 6 #2 March 91):
	"Letter Recognition Using Holland-style Adaptive Classifiers".

   	The research for this article investigated the ability of several
	variations of Holland-style adaptive classifier systems to learn to
	correctly guess the letter categories associated with vectors of 16
	integer attributes extracted from raster scan images of the letters.
	The best accuracy obtained was a little over 80%.  It would be
	interesting to see how well other methods do with the same data.

4. Relevant Information:

   The objective is to identify each of a large number of black-and-white
   rectangular pixel displays as one of the 26 capital letters in the English
 

In [24]:
# Column names for the letter-recognition data, in file column order:
# the target letter first, followed by the 16 integer attributes.
# The eighth name is 'y-bar' (it was misspelled 'y-br' before).
labels = [
    'lettr',
    'x-box', 'y-box', 'width', 'high', 'onpix',
    'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar',
    'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx',
]

In [26]:
# Load the data; header=None tells pandas the file has no header row,
# so the column names are taken from `labels`
df = pd.read_csv(
    'data/letter-recognition.csv',
    names=labels,
    header=None,
)
df

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-br,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,D,2,2,3,3,2,7,7,7,6,6,6,4,2,8,3,7
19996,C,7,10,8,8,4,4,8,6,9,12,9,13,2,9,3,7
19997,T,6,9,6,7,5,6,11,3,7,11,9,5,2,12,2,4
19998,S,2,3,4,2,1,8,7,2,6,10,6,8,1,9,5,8


In [27]:
# Input / output data: the letter column is the target, every other column is a feature
y_data = df['lettr']
x_data = df.drop(columns='lettr')

In [32]:
# Build the train / test sets: hold out 20% of the rows, stratified on the letter labels
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=0,
    stratify=y_data,
)

# Show the shapes of the four resulting pieces
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((16000, 16), (4000, 16), (16000,), (4000,))

#### 최대 마진 분류기 - 선형 커널

In [33]:
# Create and train the model: SVM with a linear kernel
model_svm = SVC(C=1.0, kernel='linear', random_state=0).fit(x_train, y_train)
model_svm

SVC(kernel='linear', random_state=0)

In [34]:
# Predict with the linear-kernel model on both the training and the test features
y_train_pred, y_test_pred = (
    model_svm.predict(features) for features in (x_train, x_test)
)

In [36]:
# Train confusion matrix (rows: actual letter, columns: predicted letter)
pd.crosstab(
    index=y_train,
    columns=y_train_pred,
    colnames=['Predicted'],
    rownames=['Actual'],
)

Predicted,A,B,C,D,E,F,G,H,I,J,...,Q,R,S,T,U,V,W,X,Y,Z
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,610,1,0,1,0,0,0,0,0,2,...,0,2,2,0,3,0,0,0,4,0
B,2,545,1,7,2,1,6,7,1,1,...,0,21,10,0,0,1,0,2,0,0
C,0,0,531,0,13,1,14,4,0,0,...,0,0,0,2,3,0,0,0,0,0
D,0,19,0,589,0,0,2,7,2,2,...,0,7,0,0,3,0,0,0,0,0
E,0,6,6,0,526,3,29,1,1,0,...,5,2,13,6,0,0,0,2,0,5
F,0,0,1,5,6,551,5,5,3,3,...,0,0,11,11,0,0,0,1,5,0
G,2,2,30,10,4,3,487,5,0,0,...,18,5,15,0,0,9,2,1,0,0
H,1,10,9,30,0,7,5,420,0,2,...,7,35,0,1,5,2,0,3,2,0
I,0,1,2,7,0,11,1,0,539,20,...,0,0,9,0,0,0,0,6,0,7
J,9,0,0,3,0,3,0,3,29,529,...,0,0,10,0,0,0,0,1,0,6


In [38]:
# Train score: per-class precision / recall / F1 for the linear-kernel model
print(classification_report(y_true=y_train, y_pred=y_train_pred))

              precision    recall  f1-score   support

           A       0.92      0.97      0.94       631
           B       0.84      0.89      0.86       613
           C       0.88      0.90      0.89       589
           D       0.82      0.91      0.87       644
           E       0.84      0.86      0.85       614
           F       0.83      0.89      0.86       620
           G       0.74      0.79      0.76       619
           H       0.71      0.72      0.71       587
           I       0.91      0.89      0.90       604
           J       0.89      0.88      0.89       598
           K       0.83      0.84      0.83       591
           L       0.92      0.89      0.90       609
           M       0.93      0.94      0.93       634
           N       0.95      0.92      0.94       626
           O       0.87      0.75      0.81       603
           P       0.95      0.87      0.91       642
           Q       0.88      0.82      0.85       626
           R       0.81    

In [40]:
# Test confusion matrix (rows: actual letter, columns: predicted letter)
pd.crosstab(
    index=y_test,
    columns=y_test_pred,
    colnames=['Predicted'],
    rownames=['Actual'],
)

Predicted,A,B,C,D,E,F,G,H,I,J,...,Q,R,S,T,U,V,W,X,Y,Z
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,149,0,1,0,0,0,0,0,0,4,...,0,0,1,0,0,0,0,0,2,0
B,0,126,0,2,1,0,4,7,0,1,...,0,5,3,0,0,1,0,0,0,0
C,0,0,129,0,7,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
D,1,5,0,149,0,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
E,0,2,5,0,127,0,6,0,0,0,...,2,2,3,3,0,0,0,2,0,0
F,0,2,0,0,1,142,0,0,0,1,...,0,0,2,3,0,0,0,0,1,0
G,0,0,9,2,0,1,129,1,0,0,...,1,2,3,0,0,0,0,0,0,0
H,0,3,1,8,0,3,1,103,0,0,...,3,3,0,2,3,0,0,1,0,0
I,0,0,0,2,2,5,0,0,131,5,...,0,0,2,0,0,0,0,2,0,0
J,3,0,0,0,0,2,0,0,5,134,...,0,0,3,0,0,0,0,0,0,2


In [43]:
# Test score: per-class precision / recall / F1 for the linear-kernel model
print(classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           A       0.93      0.94      0.93       158
           B       0.78      0.82      0.80       153
           C       0.83      0.88      0.85       147
           D       0.80      0.93      0.86       161
           E       0.80      0.82      0.81       154
           F       0.78      0.92      0.84       155
           G       0.71      0.84      0.77       154
           H       0.67      0.70      0.68       147
           I       0.89      0.87      0.88       151
           J       0.85      0.90      0.88       149
           K       0.75      0.84      0.79       148
           L       0.94      0.89      0.92       152
           M       0.92      0.96      0.94       158
           N       0.90      0.89      0.90       157
           O       0.86      0.76      0.81       150
           P       0.96      0.84      0.90       161
           Q       0.90      0.75      0.82       157
           R       0.83    

#### 다항 커널

In [49]:
# Create and train the model: SVM with a degree-3 polynomial kernel
model_svm_poly = SVC(C=1.0, kernel='poly', degree=3).fit(x_train, y_train)
model_svm_poly

SVC(kernel='poly')

In [50]:
# Predict with the polynomial-kernel model on both the training and the test features
y_train_pred2, y_test_pred2 = (
    model_svm_poly.predict(features) for features in (x_train, x_test)
)

In [51]:
# Train score: per-class precision / recall / F1 for the polynomial-kernel model
print(classification_report(y_true=y_train, y_pred=y_train_pred2))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00       631
           B       0.95      0.96      0.96       613
           C       0.98      0.97      0.97       589
           D       0.94      0.98      0.96       644
           E       0.96      0.98      0.97       614
           F       0.96      0.99      0.97       620
           G       0.95      0.97      0.96       619
           H       0.95      0.90      0.93       587
           I       0.98      0.96      0.97       604
           J       0.97      0.97      0.97       598
           K       0.96      0.96      0.96       591
           L       0.99      0.96      0.98       609
           M       0.99      0.99      0.99       634
           N       1.00      0.98      0.99       626
           O       0.97      0.98      0.97       603
           P       1.00      0.96      0.98       642
           Q       0.99      0.99      0.99       626
           R       0.92    

In [52]:
# Test score: per-class precision / recall / F1 for the polynomial-kernel model
print(classification_report(y_true=y_test, y_pred=y_test_pred2))

              precision    recall  f1-score   support

           A       0.99      0.99      0.99       158
           B       0.89      0.94      0.91       153
           C       0.94      0.95      0.95       147
           D       0.89      0.95      0.92       161
           E       0.91      0.94      0.93       154
           F       0.94      0.96      0.95       155
           G       0.92      0.94      0.93       154
           H       0.89      0.91      0.90       147
           I       0.95      0.97      0.96       151
           J       0.99      0.96      0.97       149
           K       0.90      0.96      0.93       148
           L       1.00      0.93      0.96       152
           M       0.97      0.98      0.97       158
           N       0.96      0.94      0.95       157
           O       0.92      0.93      0.93       150
           P       0.98      0.91      0.95       161
           Q       0.94      0.94      0.94       157
           R       0.94    

선형 분류기의 정확도는 85%였는데, 다항 커널을 사용하자 정확도가 95%로 약 10%p 향상되었다

#### RBF 커널

In [53]:
# Create and train the model: SVM with an RBF kernel (gamma fixed at 0.1)
model_svm_rbf = SVC(C=1.0, kernel='rbf', gamma=0.1).fit(x_train, y_train)
model_svm_rbf

SVC(gamma=0.1)

In [55]:
# Predict with the RBF-kernel model on both the training and the test features
y_train_pred3, y_test_pred3 = (
    model_svm_rbf.predict(features) for features in (x_train, x_test)
)

In [56]:
# Train score: per-class precision / recall / F1 for the RBF-kernel model
print(classification_report(y_true=y_train, y_pred=y_train_pred3))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00       631
           B       0.99      1.00      1.00       613
           C       1.00      1.00      1.00       589
           D       1.00      1.00      1.00       644
           E       1.00      1.00      1.00       614
           F       1.00      1.00      1.00       620
           G       1.00      1.00      1.00       619
           H       0.99      0.99      0.99       587
           I       1.00      0.99      0.99       604
           J       0.99      1.00      0.99       598
           K       1.00      1.00      1.00       591
           L       1.00      1.00      1.00       609
           M       1.00      1.00      1.00       634
           N       1.00      1.00      1.00       626
           O       1.00      1.00      1.00       603
           P       1.00      1.00      1.00       642
           Q       1.00      1.00      1.00       626
           R       1.00    

In [57]:
# Test score: per-class precision / recall / F1 for the RBF-kernel model
print(classification_report(y_true=y_test, y_pred=y_test_pred3))

              precision    recall  f1-score   support

           A       1.00      1.00      1.00       158
           B       0.93      0.97      0.95       153
           C       0.98      0.98      0.98       147
           D       0.95      0.98      0.96       161
           E       0.98      0.97      0.97       154
           F       0.96      0.98      0.97       155
           G       0.95      0.98      0.96       154
           H       0.92      0.92      0.92       147
           I       0.97      0.96      0.97       151
           J       0.97      0.96      0.96       149
           K       0.93      0.95      0.94       148
           L       0.99      0.98      0.99       152
           M       0.94      0.99      0.96       158
           N       0.99      0.97      0.98       157
           O       0.97      0.97      0.97       150
           P       0.97      0.94      0.96       161
           Q       0.97      0.99      0.98       157
           R       0.97    

RBF 커널을 사용했을 때 테스트 세트 정확도가 97%로 세 커널 중 가장 좋은 성능을 보였다