# KNN분류 연습

In [1]:
import pandas as pd 
import seaborn as sns

In [2]:
# Load the Titanic passenger dataset that ships with seaborn.
df = sns.load_dataset('titanic')

In [3]:
# Let pandas render up to 15 columns, then print a summary of df.
pd.options.display.max_columns = 15
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [4]:
# Drop `deck` (many NaN values) and `embark_town` (same content as `embarked`).
redundant_columns = ['deck', 'embark_town']
rdf = df.drop(columns=redundant_columns)
rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   pclass      891 non-null    int64   
 2   sex         891 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       891 non-null    int64   
 5   parch       891 non-null    int64   
 6   fare        891 non-null    float64 
 7   embarked    889 non-null    object  
 8   class       891 non-null    category
 9   who         891 non-null    object  
 10  adult_male  891 non-null    bool    
 11  alive       891 non-null    object  
 12  alone       891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.5+ KB


In [5]:
# Remove every row that has no value in the `age` column.
rdf = rdf.dropna(subset=['age'])
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    712 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB


In [10]:
# Find the embarkation port that occurs most often; it is the value used to
# replace the NaNs in `embarked`.
embarked_counts = rdf['embarked'].value_counts(dropna=True)
most_freq = embarked_counts.idxmax()
most_freq

'S'

In [12]:
# Replace the NaNs in `embarked` with the most frequent port (most_freq).
# The filled column is assigned back via `assign` instead of calling
# `fillna(..., inplace=True)` on `rdf['embarked']`: that chained in-place form
# operates on an intermediate Series, is deprecated in recent pandas, and does
# not update `rdf` when Copy-on-Write is enabled.
rdf = rdf.assign(embarked=rdf['embarked'].fillna(most_freq))
rdf['embarked'].value_counts()

S    556
C    130
Q     28
Name: embarked, dtype: int64

In [13]:
# Keep only the columns (attributes) used in the analysis.
selected_columns = ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']
ndf = rdf[selected_columns]
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  714 non-null    int64  
 1   pclass    714 non-null    int64  
 2   sex       714 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     714 non-null    int64  
 5   parch     714 non-null    int64  
 6   embarked  714 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 44.6+ KB


In [14]:
# Summary statistics for every column (include='all' covers non-numeric ones too).
ndf.describe(include='all')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked
count,714.0,714.0,714,714.0,714.0,714.0,714
unique,,,2,,,,3
top,,,male,,,,S
freq,,,453,,,,556
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,
min,0.0,1.0,,0.42,0.0,0.0,
25%,0.0,1.0,,20.125,0.0,0.0,
50%,0.0,2.0,,28.0,0.0,0.0,
75%,1.0,3.0,,38.0,1.0,1.0,


In [15]:
# One-hot encode `sex` with get_dummies(): the categorical values become
# numeric indicator columns that the model can work with.
onehot_sex = pd.get_dummies(ndf['sex'])
onehot_sex.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [17]:
# Show the first few raw `sex` labels.
ndf['sex'].head()

0      male
1    female
2    female
3    female
4      male
Name: sex, dtype: object

In [18]:
# Append the one-hot `sex` columns to ndf.
# Copies of those columns left behind by an earlier run of this cell are
# dropped first, so re-running the cell does not accumulate duplicate columns.
ndf = pd.concat(
    [ndf.drop(columns=onehot_sex.columns, errors='ignore'), onehot_sex],
    axis=1,
)
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male
0,0,3,male,22.0,1,0,S,0,1
1,1,1,female,38.0,1,0,C,1,0
2,1,3,female,26.0,0,0,S,1,0
3,1,1,female,35.0,1,0,S,1,0
4,0,3,male,35.0,0,0,S,0,1


In [19]:
# One-hot encode `embarked`; the generated columns carry the prefix "town".
embarked_values = ndf['embarked']
onehot_embarked = pd.get_dummies(embarked_values, prefix='town')
onehot_embarked

Unnamed: 0,town_C,town_Q,town_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
885,0,1,0
886,0,0,1
887,0,0,1
889,1,0,0


In [20]:
# Append the one-hot `embarked` columns to ndf.
# Copies of those columns left behind by an earlier run of this cell are
# dropped first, so re-running the cell does not accumulate duplicate columns.
ndf = pd.concat(
    [ndf.drop(columns=onehot_embarked.columns, errors='ignore'), onehot_embarked],
    axis=1,
)
ndf.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male,town_C,town_Q,town_S
0,0,3,male,22.0,1,0,S,0,1,0,0,1
1,1,1,female,38.0,1,0,C,1,0,1,0,0
2,1,3,female,26.0,0,0,S,1,0,0,0,1
3,1,1,female,35.0,1,0,S,1,0,0,0,1
4,0,3,male,35.0,0,0,S,0,1,0,0,1


In [21]:
# X holds the feature (explanatory / independent) variables,
# Y holds the `survived` column.
feature_columns = [
    'pclass', 'age', 'sibsp', 'parch',
    'female', 'male',
    'town_C', 'town_Q', 'town_S',
]
X = ndf[feature_columns]
Y = ndf['survived']
X.head()

Unnamed: 0,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,3,22.0,1,0,0,1,0,0,1
1,1,38.0,1,0,1,0,1,0,0
2,3,26.0,0,0,1,0,0,0,1
3,1,35.0,1,0,1,0,0,0,1
4,3,35.0,0,0,0,1,0,0,1


In [22]:
# Rescale the explanatory variables before modelling (the scaling cell below
# uses StandardScaler, i.e. standardization).
# The magnitude of the raw values can influence the result of the analysis.
# `age` spans a wide range, so all features are brought onto a common scale.
from sklearn import preprocessing
import numpy as np 

In [24]:
# Standardize the feature columns with StandardScaler and rebind X to the result.
# NOTE(review): the scaler is fit on the full feature matrix, i.e. before the
# train/test split performed further down, so test rows contribute to the
# scaling statistics. Consider fitting on the training split only.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 0.91123237, -0.53037664,  0.52457013, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364,  0.57183099,  0.52457013, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237, -0.25482473, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53307848],
       ...,
       [-1.47636364, -0.73704057, -0.55170307, ..., -0.47180795,
        -0.20203051,  0.53307848],
       [-1.47636364, -0.25482473, -0.55170307, ...,  2.11950647,
        -0.20203051, -1.87589641],
       [ 0.91123237,  0.15850313, -0.55170307, ..., -0.47180795,
         4.94974747, -1.87589641]])

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 30% of the rows as test data; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=10,
)

In [27]:
# Report the shape of the train and test feature arrays.
for label, split in (('train data 개수:', X_train), ('test data 개수:', X_test)):
    print(label, split.shape)

train data 개수: (499, 9)
test data 개수: (215, 9)


In [29]:
# KNN classifier (scikit-learn).
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors=5: use the 5 nearest neighbours of each sample.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_hat = knn.predict(X_test)

# First ten predictions, followed by the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])

[0 0 1 0 0 1 1 1 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [30]:
# Evaluate the model: confusion matrix of the true labels vs. the predictions.
from sklearn import metrics
knn_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
print(knn_matrix)

[[109  16]
 [ 25  65]]


In [None]:
confusion_matrix (scikit-learn 출력: 행 = 실제값, 열 = 예측값, 레이블 순서 0(F), 1(T))

          예측 F  예측 T
실제 F [ 109     16 ]  [TN FP]
실제 T [  25     65 ]  [FN TP]

TP: True Positive 실제값 T, 예측값 T
FP: False Positive 실제값 F, 예측값 T
FN: False Negative 실제값 T, 예측값 F
TN: True Negative 실제값 F, 예측값 F

Precision(정밀도)
True로 예측한 대상이 실제 True인 비율: TP / (TP + FP)
정밀도가 높다는 것은 FP 오류가 적다는 것을 말한다

Recall(재현율)
실제값이 True인 분석 대상 중 True로 예측한 비율: TP / (TP + FN)
재현율이 높다는 것은 FN 오류가 적다는 것을 말한다

F1-Score(F1지표)
정밀도와 재현율의 조화 평균을 계산한 값: 2 * Precision * Recall / (Precision + Recall)
모형의 예측력을 평가하는 지표

In [31]:
# Evaluate the model: compute the evaluation metrics.
# Rows of the printed report:
#   accuracy     - overall accuracy
#   macro avg    - simple (unweighted) average
#   weighted avg - average weighted by the number of samples
knn_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       125
           1       0.80      0.72      0.76        90

    accuracy                           0.81       215
   macro avg       0.81      0.80      0.80       215
weighted avg       0.81      0.81      0.81       215



# SVM(Support Vector Machine) 분류

In [32]:
# SVM classifier (scikit-learn).
# Import the SVM module from the sklearn library.
from sklearn import svm
# The model object is created in the next cell with kernel='rbf'.
# Kernel: the function that maps the data into a vector space.
# rbf: radial basis function
# Linear
# Polynomial
# Sigmoid

In [33]:
# Train an RBF-kernel SVM on the training data.
svm_model = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Predict (classify) the test data.
y_hat = svm_model.predict(X_test)

# First ten predictions, followed by the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])

[0 0 1 0 0 0 1 0 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [34]:
# Evaluate the model: compute the confusion matrix.
svm_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
print(svm_matrix)

[[120   5]
 [ 35  55]]


In [35]:
# Evaluate the model: compute the evaluation metrics.
svm_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)
print(svm_report)

              precision    recall  f1-score   support

           0       0.77      0.96      0.86       125
           1       0.92      0.61      0.73        90

    accuracy                           0.81       215
   macro avg       0.85      0.79      0.80       215
weighted avg       0.83      0.81      0.81       215

