## 다양한 알고리즘으로 훈련
* 다양한 머신러닝 알고리즘을 이용해서
* 교차검증 방식으로 모델을 훈련시키고
* 예측정확도를 평가해 봄

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

In [3]:
# 데이터 불러오기
titanic = pd.read_csv('titanic3.csv')
titanic.head()

Unnamed: 0,titles,age,sibsp,parch,fare,gender,embark_town,pclass,survived
0,16,29.0,0.0,0.0,211.3375,0,2,0,1
1,13,0.9167,1.0,2.0,151.55,1,2,0,1
2,16,2.0,1.0,2.0,151.55,0,2,0,0
3,19,30.0,1.0,2.0,151.55,1,2,0,0
4,20,25.0,1.0,2.0,151.55,0,2,0,0


In [4]:
# data, target으로 나눔
data = titanic.iloc[:, :8]
target = titanic.survived

In [5]:
data

Unnamed: 0,titles,age,sibsp,parch,fare,gender,embark_town,pclass
0,16,29.000000,0.0,0.0,211.3375,0,2,0
1,13,0.916700,1.0,2.0,151.5500,1,2,0
2,16,2.000000,1.0,2.0,151.5500,0,2,0
3,19,30.000000,1.0,2.0,151.5500,1,2,0
4,20,25.000000,1.0,2.0,151.5500,0,2,0
...,...,...,...,...,...,...,...,...
1301,16,14.500000,1.0,0.0,14.4542,0,0,2
1302,16,29.881135,1.0,0.0,14.4542,0,0,2
1303,19,26.500000,0.0,0.0,7.2250,1,0,2
1304,19,27.000000,0.0,0.0,7.2250,1,0,2


In [6]:
titanic.survived.value_counts()

survived
0    808
1    498
Name: count, dtype: int64

In [7]:
# train, test로 나눔
X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.7, 
                                                    random_state=2309220945, stratify=target)

## 머신러닝 알고리즘 적용

#### 의사결정나무 분류기

In [8]:
dtclf = DecisionTreeClassifier()

dtclf.fit(X_train, y_train)
pred = dtclf.predict(X_test)
accuracy_score(y_test, pred)

0.7397959183673469

#### 로지스틱 회귀 분류기

In [9]:
lrclf = LogisticRegression()

lrclf.fit(X_train, y_train)
pred = lrclf.predict(X_test)
accuracy_score(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7602040816326531

#### 랜덤포레스트 분류기

In [10]:
rfclf = RandomForestClassifier()

rfclf.fit(X_train, y_train)
pred = rfclf.predict(X_test)
accuracy_score(y_test, pred)

0.7831632653061225

#### K이웃 분류기

In [11]:
# 'Flags' object has no attribute 'c_contiguous'

In [12]:
# pip install scikit-learn==1.2.2

In [13]:
knclf = KNeighborsClassifier(n_neighbors=5)

knclf.fit(X_train, y_train)
pred = knclf.predict(X_test)
accuracy_score(y_test, pred)

0.6964285714285714

### 교차검증 cross validation
* 데이터수가 적은 경우 데이터의 일부인 평가 데이터도 작음
    + 성능 평가의 신뢰도 의심
* 데이터를 동일한 크기로 k개 나누고
* 이들 중 훈련/평가 데이터로 구분지어 평가를 k번 실시함
    + KFold 교차검증이라 함
* sklearn 패키지에서는 cross_val_score 함수 이용
* cross_val_score(분류기, 독립변수, 종속변수, 평가방식, 검증횟수)

In [14]:
dtclf = DecisionTreeClassifier()
scores = cross_val_score(dtclf, data, target, scoring='accuracy', cv=10)
scores, np.mean(scores)

(array([0.64122137, 0.7480916 , 0.83206107, 0.70992366, 0.71755725,
        0.71755725, 0.66923077, 0.62307692, 0.66923077, 0.68461538]),
 0.7012566059894304)

In [15]:
lrclf = LogisticRegression(max_iter=500)
scores = cross_val_score(lrclf, data, target, scoring='accuracy', cv=10)
scores, np.mean(scores)

(array([0.60305344, 0.80916031, 0.89312977, 0.83969466, 0.80152672,
        0.83206107, 0.73846154, 0.64615385, 0.73846154, 0.75384615]),
 0.765554903112155)

In [16]:
rfclf = RandomForestClassifier()
scores = cross_val_score(rfclf, data, target, scoring='accuracy', cv=10)
scores, np.mean(scores)

(array([0.69465649, 0.80916031, 0.83969466, 0.77862595, 0.77862595,
        0.81679389, 0.72307692, 0.64615385, 0.69230769, 0.72307692]),
 0.7502172636523782)

In [17]:
knclf = KNeighborsClassifier(n_neighbors=7)
scores = cross_val_score(knclf, data, target, scoring='accuracy', cv=10)
scores, np.mean(scores)

(array([0.55725191, 0.6870229 , 0.67938931, 0.80152672, 0.6259542 ,
        0.6870229 , 0.66153846, 0.59230769, 0.63846154, 0.65384615]),
 0.6584321785085143)