# 분류 모델 적용 및 성능평가
knn, svm, tree

##  데이터 로드 및 전처리

In [92]:
# 기본 라이브러리 불러오기
from sklearn import metrics
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

'''
[Step 1] 데이터 준비/ 기본 설정
'''

# Breast Cancer 데이터셋 가져오기 (출처: UCI ML Repository)
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/\
breast-cancer-wisconsin/breast-cancer-wisconsin.data'
df = pd.read_csv(uci_path, header=None)

# 열 이름 지정
df.columns = ['id', 'clump', 'cell_size', 'cell_shape', 'adhesion', 'epithlial',
              'bare_nuclei', 'chromatin', 'normal_nucleoli', 'mitoses', 'class']

# 데이터셋의 크기
print(df.shape)

(699, 11)


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               699 non-null    int64 
 1   clump            699 non-null    int64 
 2   cell_size        699 non-null    int64 
 3   cell_shape       699 non-null    int64 
 4   adhesion         699 non-null    int64 
 5   epithlial        699 non-null    int64 
 6   bare_nuclei      699 non-null    object
 7   chromatin        699 non-null    int64 
 8   normal_nucleoli  699 non-null    int64 
 9   mitoses          699 non-null    int64 
 10  class            699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [94]:
df.describe()

Unnamed: 0,id,clump,cell_size,cell_shape,adhesion,epithlial,chromatin,normal_nucleoli,mitoses,class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [95]:
df.describe(include='object')

Unnamed: 0,bare_nuclei
count,699
unique,11
top,1
freq,402


In [96]:
df['bare_nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [97]:
df['bare_nuclei'] = df['bare_nuclei'].replace('?',np.nan)
df = df.dropna(subset=['bare_nuclei'], axis=0)
df['bare_nuclei'] = df['bare_nuclei'].astype('int')

In [98]:
df['bare_nuclei'].unique()

array([ 1, 10,  2,  4,  3,  9,  7,  5,  8,  6])

In [99]:
print(df['class'].unique())

[2 4]


In [100]:
# 타켓 변경 2->0, 4->1
df['class'] = df['class'].map({2: 0, 4: 1})

## 모델링 - baseline

In [None]:
# 데이터 분리 7:3
X= df.drop(columns=['id','class']).to_numpy() #타깃변수 제거해야함!!!!!!!!!!!!!!!!!
y = df['class']
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3, random_state=42)

In [105]:
# 학습
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

knn = KNeighborsClassifier()
svm = LinearSVC()
tree = DecisionTreeClassifier()

knn.fit(X_train,y_train)
svm.fit(X_train,y_train)
tree.fit(X_train,y_train)

pred_knn = knn.predict(X_test)
pred_svm = svm.predict(X_test)
pred_tree = tree.predict(X_test)

In [106]:
from sklearn.metrics import classification_report

knn_report = metrics.classification_report(y_test,pred_knn)
svm_report = metrics.classification_report(y_test,pred_svm)
tree_report = metrics.classification_report(y_test,pred_tree)

print(f'<knn> {knn_report}')
print(f'<svm> {svm_report}')
print(f'<tree> {tree_report}')

<knn>               precision    recall  f1-score   support

           0       0.95      0.98      0.97       127
           1       0.97      0.91      0.94        78

    accuracy                           0.96       205
   macro avg       0.96      0.95      0.95       205
weighted avg       0.96      0.96      0.96       205

<svm>               precision    recall  f1-score   support

           0       0.95      0.98      0.97       127
           1       0.97      0.91      0.94        78

    accuracy                           0.96       205
   macro avg       0.96      0.95      0.95       205
weighted avg       0.96      0.96      0.96       205

<tree>               precision    recall  f1-score   support

           0       0.93      0.98      0.95       127
           1       0.96      0.87      0.91        78

    accuracy                           0.94       205
   macro avg       0.94      0.92      0.93       205
weighted avg       0.94      0.94      0.94       205



##### 평가
- Train data 평가
- Test data 평가
- 두 수치가 차이가 많이 나면 과대적합(Train > Test), 0.95
- 두 수치가 차이가 많이 나면 과소적합(Train < Test), 0.90 ...0.05이상이면 보통 과적합

## 모델링 - tunning

In [115]:
from sklearn.model_selection import GridSearchCV

tree = DecisionTreeClassifier()
param_grid = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : range(1,10)
}

grid_searchcv = GridSearchCV(tree, param_grid=param_grid)
grid_searchcv.fit(X_train,y_train)

0,1,2
,estimator,DecisionTreeClassifier()
,param_grid,"{'criterion': ['gini', 'entropy', ...], 'max_depth': range(1, 10)}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [112]:
print(grid_searchcv.best_params_)
print(grid_searchcv.best_score_)

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 2}
0.9644078947368421


In [None]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_split=2)
tree.fit(X_train,y_train)
pred_tree = tree.predict(X_test) #파라미터 튜닝한 모델의 예측값 덮어쓰기!!!!!!!!!!!!!!! 잊지마

In [None]:
tree_grid_report = metrics.classification_report(y_test,pred_tree)

print(f'<Tunning> {tree_grid_report}') #모델 성능이 향상됨
print(f'<Baseline> {tree_report}')

<Tunning>               precision    recall  f1-score   support

           0       0.96      0.98      0.97       127
           1       0.97      0.94      0.95        78

    accuracy                           0.97       205
   macro avg       0.97      0.96      0.96       205
weighted avg       0.97      0.97      0.97       205

<Baseline>               precision    recall  f1-score   support

           0       0.93      0.98      0.95       127
           1       0.96      0.87      0.91        78

    accuracy                           0.94       205
   macro avg       0.94      0.92      0.93       205
weighted avg       0.94      0.94      0.94       205

