<a href="https://colab.research.google.com/github/mgk0422/python-basic/blob/master/scikit_learn_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import glob
import sys
from openpyxl import load_workbook
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# 코랩 한글깨짐 설정
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()

In [None]:
# Read the Titanic training spreadsheet and show the resulting frame
titanic_df = pd.read_excel(
    '/content/sample_data/titanic_train.xlsx'
)
titanic_df

In [None]:
# Dataset summary
print('###### titanic 정보 ######')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
titanic_df.info()

In [None]:
# Replace missing values with fillna(), using either the mean or a fixed placeholder:
#   Age -> column mean; Cabin and Embarked -> 'N'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form
# operates on an intermediate object and does not reliably update titanic_df
# (it warns in pandas 2.x and has no effect under Copy-on-Write).
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')

In [None]:
# Total number of missing cells across every column
print(
    '데이터 세트의 Null 값 갯수',
    titanic_df.isna().sum().sum(),
)

In [None]:
# Keep only the first character of each Cabin value
titanic_df['Cabin'] = titanic_df['Cabin'].str.slice(stop=1)
titanic_df['Cabin'].head()

In [None]:
# How did sex affect the chance of survival?
(
    titanic_df
    .groupby(['Sex', 'Survived'])['Survived']
    .count()
)

In [None]:
# Mean of Survived for each value of Sex
sns.barplot(data=titanic_df, x='Sex', y='Survived')

In [None]:
# Mean of Survived for each Pclass, with one bar per Sex
sns.barplot(data=titanic_df, x='Pclass', y='Survived', hue='Sex')

In [None]:
# Return the age-group label for a given age
def get_category(age):
    """Map a numeric age to its age-group label.

    Bands are inclusive upper bounds, checked in ascending order:
    <= -1 'UnKnown', <= 5 'Baby', <= 12 'Child', <= 18 'Teenager',
    <= 25 'Student', <= 35 'Young Adult', <= 68 'Adult', otherwise 'Elderly'.

    Parameters
    ----------
    age : number, NaN or None
        Age to classify.

    Returns
    -------
    str
        One of the labels above. Missing ages (NaN/None) yield 'UnKnown'.
    """
    # A missing age fails every "<=" comparison below and would otherwise
    # fall through to 'Elderly'; report it as unknown instead.
    if pd.isna(age):
        return 'UnKnown'

    age_bands = (
        (-1, 'UnKnown'),
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (68, 'Adult'),
    )
    for upper_bound, label in age_bands:
        if age <= upper_bound:
            return label
    return 'Elderly'

# Survival rate per age group and sex, with bars in the order of group_names
plt.figure(figsize=(10, 6))
group_names = [
    'UnKnown', 'Baby', 'Child', 'Teenager',
    'Student', 'Young Adult', 'Adult', 'Elderly',
]

# The Age_cat column is only needed for this plot, so it is added to a
# temporary copy; titanic_df itself ends up without the column.
sns.barplot(
    data=titanic_df.assign(Age_cat=titanic_df['Age'].apply(get_category)),
    x='Age_cat',
    y='Survived',
    hue='Sex',
    order=group_names,
);

In [None]:
# 인코딩

from sklearn import preprocessing

def encode_features(dataDF):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns of ``dataDF``.

    A separate LabelEncoder is fitted on each column and the column is
    overwritten with the encoded values. The frame is modified in place and
    the same object is returned.
    """
    for column in ('Cabin', 'Sex', 'Embarked'):
        encoder = preprocessing.LabelEncoder()
        dataDF[column] = encoder.fit_transform(dataDF[column])

    return dataDF


titanic_df = encode_features(titanic_df)
titanic_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Missing-value handling
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age is filled with the column mean, Cabin and Embarked with the
    placeholder 'N', and Fare with 0. The columns of ``df`` are overwritten
    and the same DataFrame object is returned.

    The filled Series are assigned back to the frame rather than calling
    ``df[col].fillna(..., inplace=True)``: that chained in-place form acts on
    an intermediate object and does not reliably update ``df`` (it warns in
    pandas 2.x and has no effect under Copy-on-Write).
    """
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Remove attributes that are not needed by the machine-learning algorithms
def drop_fetures(df):
    """Drop the 'PassengerId', 'Name' and 'Ticket' columns from ``df`` in place.

    Returns the same DataFrame object that was passed in.
    """
    unneeded_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unneeded_columns, inplace=True)
    return df

# Perform label encoding
def format_features(df):
    """Shorten Cabin to its first character, then label-encode three columns.

    'Cabin', 'Sex' and 'Embarked' are each encoded with their own
    LabelEncoder fitted on that column. ``df`` is modified in place and the
    same object is returned.
    """
    df['Cabin'] = df['Cabin'].str[:1]
    for column in ('Cabin', 'Sex', 'Embarked'):
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df

def transform_features(df):
    """Run the full preprocessing pipeline on ``df`` and return the result.

    Steps, in order: fillna() -> drop_fetures() -> format_features().
    """
    return format_features(drop_fetures(fillna(df)))

In [None]:
# Extract the feature data set and the label data set

# Reload the raw spreadsheet so preprocessing starts from unmodified data
titanic_df = pd.read_excel('/content/sample_data/titanic_train.xlsx')

# Label: the Survived column; features: every other column, preprocessed
y_titanic_df = titanic_df['Survived']
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the seed is fixed at 11
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
# NOTE(review): LogisticRegression() runs with library defaults; if a
# ConvergenceWarning shows up on these unscaled features, consider scaling
# the inputs or raising max_iter - confirm against the actual run.
lr_clf = LogisticRegression()

# Accuracies are printed with 4 decimal places. The previous spec "{0:4f}"
# meant "minimum width 4" and therefore printed 6 decimals.

# DecisionTreeClassifier: fit / predict / evaluate
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTree 정확도:{0:.4f}'.format(accuracy_score(y_test, dt_pred)))

# RandomForestClassifier: fit / predict / evaluate
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForest 정확도:{0:.4f}'.format(accuracy_score(y_test, rf_pred)))

# LogisticRegression: fit / predict / evaluate
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('LogisticRegression 정확도:{0:.4f}'.format(accuracy_score(y_test, lr_pred)))

In [None]:
from sklearn.model_selection import KFold

def exec_kfold(clf,folds=5):
    """Cross-validate ``clf`` with KFold and print per-fold and mean accuracy.

    Reads the module-level ``X_titanic_df`` and ``y_titanic_df``; splits are
    produced by ``KFold(n_splits=folds)``. ``clf`` is refitted on every fold.
    Nothing is returned.
    """
    splitter = KFold(n_splits=folds)
    feature_matrix = X_titanic_df.values
    label_vector = y_titanic_df.values
    fold_accuracies = []

    # Run the k-fold cross validation
    for fold_no, (fit_rows, eval_rows) in enumerate(splitter.split(X_titanic_df)):
        clf.fit(feature_matrix[fit_rows], label_vector[fit_rows])
        fold_predictions = clf.predict(feature_matrix[eval_rows])
        fold_accuracy = accuracy_score(label_vector[eval_rows], fold_predictions)
        fold_accuracies.append(fold_accuracy)
        print("교차검증 {0} 정확도 {1:.4f}".format(fold_no, fold_accuracy))

    print("평균 정확도:{0:.4f}".format(np.mean(fold_accuracies)))

exec_kfold(dt_clf, folds=5)

In [None]:
# Cross validation with cross_val_score()
from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)
for iter_count, accuracy in enumerate(scores):
    # ".4f" = 4 decimal places; the previous "4f" only set a minimum width
    print("교차검증 {0} 정확도:{1:.4f}".format(iter_count, accuracy))
print("평균 정확도:{0:.4f}".format(np.mean(scores)))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the decision tree
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# 5-fold grid search scored by accuracy, fitted on the training split only
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

print('GridSearchCV 최적 하이퍼 파라미터:', grid_dclf.best_params_)
# Use str.format on the template. The previous ",format(...)" passed the raw
# template and the builtin format() result as two separate print arguments,
# so the "{0:.4f}" placeholder was printed literally.
print('GridSearchCV 최고 정확도:{0:.4f}'.format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

In [None]:
# Evaluate the best estimator found by the grid search on the held-out test split
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
# Message fixed to spell the estimator's real class name (was "DecisionTress")
print('테스트 세트에서의 DecisionTreeClassifier 정확도:{0:.4f}'.format(accuracy))

##### 데이터의 전처리 작업
*   오류 데이터의 보정이나 결손값 처리 등의 다양한 데이터 클렌징 작업, 레이블 인코딩, 원-핫 인코딩, 데이터의 스케일링/정규화 작업 등으로 머신러닝 알고리즘이 최적으로 수행될 수 있게 데이터를 사전 처리하는 것

##### 머신러닝 모델링
*   학습 데이터 세트로 학습한 뒤 반드시 별도의 테스트 데이터 세트로 평가해야 한다.
*   테스트 데이터의 건수 부족이나 고정된 테스트 데이터 세트를 이용한 반복적인 모델 학습과 평가는 해당 테스트 데이터 세트에만 치우친 빈약한 머신러닝 모델을 만들 가능성이 높다.





