## Example
breast_cancer 데이터와 SVM 알고리즘을 이용해 예측모델 만들기

In [122]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer # 사용할 데이터셋
from sklearn.svm import SVC # 사용할 알고리즘
from sklearn.model_selection import train_test_split # 데이터 분할

### 데이터 불러오기

In [123]:
# Load the breast-cancer dataset bundled with scikit-learn, then print the
# shape of the feature matrix followed by the shape of the label vector.
cancer = load_breast_cancer()
print(*(arr.shape for arr in (cancer.data, cancer.target)))

(569, 30) (569,)


In [128]:
# Show the class names; the position in this array is the integer label
# used in cancer.target (0 = malignant, 1 = benign).
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [129]:
# Count the samples per class (0 = malignant, 1 = benign).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(cancer['target']).value_counts()

1    357
0    212
dtype: int64

### 데이터 분할하기 : train, test

In [130]:
# Keep 80% of the samples for training and hold the rest out as the test set.
# Stratifying on the labels keeps the class ratio the same in both parts;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    train_size=0.8,
    stratify=cancer.target,
    random_state=1234,
)

print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(455, 30) (455,) (114, 30) (114,)


### 데이터 분할 : train, validation set

In [131]:
# Carve a validation set out of the training data. An integer test_size is an
# absolute number of samples, so the validation set gets 114 rows.
X_train_d, X_validation, y_train_d, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=114,
    stratify=y_train,
    random_state=321,
)

print(*(part.shape for part in (X_train_d, y_train_d, X_validation, y_validation)))

(341, 30) (341,) (114, 30) (114,)


### 모델 구축 (1)
- Learning Algorithm = SVM
- Hyper-parameters = gamma, C
- Hold out

In [132]:
# Candidate hyper-parameter values: six points evenly spaced on a log scale
# from 10**-3 to 10**2.
np.logspace(start=-3, stop=2, num=6)

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [133]:
# Hold-out grid search over a 6 x 6 grid of (gamma, C) values: each candidate
# is trained on the reduced training set and scored on the validation set.
scores = {}
for gamma in np.logspace(-3, 2, num=6):
    for C in np.logspace(-3, 2, num=6):
        # fit() returns the estimator itself, so the call can be chained.
        model = SVC(gamma=gamma, C=C).fit(X_train_d, y_train_d)
        scores[(gamma, C)] = model.score(X_validation, y_validation)

# Turn the dict into a Series keyed by (gamma, C) so the best pair can be
# looked up directly, then report it with its validation accuracy.
scores = pd.Series(scores)
print(scores.idxmax(), scores.max())

(0.001, 1.0) 0.9385964912280702


In [134]:
# Select the final model: take the (gamma, C) pair with the highest score
# straight from the search results instead of hard-coding the printed values,
# so the choice cannot drift from the search when the data or seed changes.
gamma, C = scores.idxmax()
model = SVC(gamma=gamma, C=C)

In [136]:
# Evaluate the final model: refit on the full training set (training plus
# validation rows) and report the accuracy on the untouched test set.
model.fit(X_train, y_train).score(X_test, y_test)

0.9473684210526315

### 모델 구축 (2)
- Learning Algorithm = SVM
- Hyper-parameters = gamma, C
- K-Fold CV

In [137]:
from sklearn.model_selection import cross_val_score

# Grid search over the same 6 x 6 grid of (gamma, C) values, but each
# candidate is now scored by the mean of a 10-fold cross-validation on the
# training set instead of a single hold-out split.
scores = {}
for gamma in np.logspace(-3, 2, num=6):
    for C in np.logspace(-3, 2, num=6):
        model = SVC(gamma=gamma, C=C)
        scores[(gamma, C)] = np.mean(cross_val_score(model, X_train, y_train, cv=10))

# Report the best (gamma, C) pair and its mean cross-validation score.
scores = pd.Series(scores)
print(scores.idxmax(), scores.max())

(0.001, 1.0) 0.9167632850241546


In [138]:
# Select the final model: take the (gamma, C) pair with the highest mean
# cross-validation score straight from the search results instead of
# hard-coding the printed values.
gamma, C = scores.idxmax()
model = SVC(gamma=gamma, C=C)

# Refit on the full training set and report accuracy on the held-out test set.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9473684210526315

### 모델 평가

In [139]:
from sklearn.metrics import confusion_matrix

# Compare the test-set labels with the final model's predictions.
# In the resulting matrix, rows are true classes and columns are predicted
# classes, ordered by label (0 = malignant, 1 = benign).
y_true = y_test
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[38,  4],
       [ 2, 70]], dtype=int64)

In [140]:
# negative = malignant (0), positive = benign (1)
# Unpack the 2 x 2 matrix row by row: first row is (tn, fp), second is (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp

(38, 4, 2, 70)

In [141]:
# Metrics with benign (1) as the positive class and malignant (0) as negative.
# Compute each quantity once, then print.
precision = tp / (tp + fp)      # share of predicted positives that are correct
sensitivity = tp / (tp + fn)    # share of actual positives that were found
f1 = 2 / (1 / precision + 1 / sensitivity)  # harmonic mean of the two

print('accuracy = ', (tn + tp) / (tn + fp + fn + tp))
print('Precision = ', precision)
print('Sensitivity = ', sensitivity)
print('f1 score = ', f1)

accuracy =  0.9473684210526315
Precision =  0.9459459459459459
Sensitivity =  0.9722222222222222
f1 score =  0.958904109589041


In [142]:
# negative = benign (1), positive = malignant (0)
# With class 0 as the positive class the matrix rows read (tp, fn) then (fp, tn).
(tp, fn), (fp, tn) = confusion_matrix(y_true, y_pred)

# Compute each quantity once, then print.
precision = tp / (tp + fp)      # share of predicted positives that are correct
sensitivity = tp / (tp + fn)    # share of actual positives that were found
f1 = 2 / (1 / precision + 1 / sensitivity)  # harmonic mean of the two

print('accuracy = ', (tn + tp) / (tn + fp + fn + tp))
print('Precision = ', precision)
print('Sensitivity = ', sensitivity)
print('f1 score = ', f1)

accuracy =  0.9473684210526315
Precision =  0.95
Sensitivity =  0.9047619047619048
f1 score =  0.9268292682926829


In [143]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
# Library F1 score for comparison with the hand-computed values; called with
# its defaults, so label 1 (benign) is treated as the positive class.
from sklearn.metrics import f1_score
f1_score(y_true=y_true, y_pred=y_pred)

0.9589041095890412