#### 타이타닉 생존자 예측

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Default figure size (width, height) applied to plots that do not set their own.
plt.rcParams['figure.figsize']=(4,3) 

In [None]:
# Load data/train.csv and print the frame summary (column dtypes and
# non-null counts) to see which columns contain missing values.
titanic_df= pd.read_csv('data/train.csv')
titanic_df.info()

In [None]:
# Fill missing values: the mean age for Age, the placeholder 'N' for Cabin
# and Embarked.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on a column selection: that chained form is
# deprecated in pandas and does not update the frame under Copy-on-Write.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')
print('NaN값: ',titanic_df.isnull().sum().sum())

In [None]:
# Inspect the value distribution of the string-typed features.
print('Sex 값 분포', titanic_df['Sex'].value_counts())
print('\n Cabin 값 분포', titanic_df['Cabin'].value_counts())
print('\n Embarked 값 분포', titanic_df['Embarked'].value_counts())

In [None]:
# Keep only the first character of every Cabin value, then show the counts.
titanic_df['Cabin'] = titanic_df['Cabin'].str.slice(stop=1)
print(titanic_df['Cabin'].value_counts())

In [None]:
# Row count for each (Sex, Survived) combination.
titanic_df.groupby(['Sex','Survived'])['Survived'].size()

In [None]:
# Bar chart of Survived aggregated per Sex
# (NOTE(review): seaborn's barplot aggregates with the mean by default — confirm for the installed version).
sns.barplot(x='Sex', y='Survived', data=titanic_df)

In [None]:
# Bar chart of Survived aggregated per Pclass, with one bar per Sex inside each class.
sns.barplot(x='Pclass', y='Survived', data=titanic_df, hue='Sex')

In [None]:
# Age에 따른 생존
def get_category(age):
    """Return the age-group label for a numeric age.

    Args:
        age: Numeric age. A missing value (NaN/None) or any value <= -1 is
            treated as unknown.

    Returns:
        One of 'Unkown', 'Baby', 'Child', 'Teenager', 'Student',
        'Young Adult', 'Adult' or 'Eldery'.
    """
    # NaN fails every `<=` comparison, so without this guard a missing age
    # would fall through all branches and be labelled 'Eldery'.
    if pd.isna(age) or age <= -1:
        return 'Unkown'

    # (inclusive upper bound, label) in ascending order. The label spellings
    # are kept exactly as they are because callers match on these strings
    # (e.g. the `group_names` order used when plotting).
    brackets = (
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for upper_bound, label in brackets:
        if age <= upper_bound:
            return label
    return 'Eldery'

# Derive an age-group column and plot Survived per group, split by Sex.
# `group_names` fixes the left-to-right order of the bars.
group_names = [
    'Unkown', 'Baby', 'Child', 'Teenager',
    'Student', 'Young Adult', 'Adult', 'Eldery',
]
titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)

plt.figure(figsize=(10, 6))
sns.barplot(
    x='Age_cat',
    y='Survived',
    hue='Sex',
    order=group_names,
    data=titanic_df,
)
    

In [None]:
# Preview the first rows of the frame.
titanic_df.head()

In [None]:
# 문자열 피처를  숫자형으로 변환
from sklearn.preprocessing import LabelEncoder

def encode_features(dataDF):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns in place.

    A fresh LabelEncoder is fitted on each column's own values and the
    column is overwritten with the resulting codes.

    Args:
        dataDF: DataFrame holding the three columns; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in ('Cabin', 'Sex', 'Embarked'):
        dataDF[column] = LabelEncoder().fit_transform(dataDF[column])
    return dataDF

# Encode the string columns as integers and preview the result.
# encode_features overwrites the columns of the frame it receives and
# returns that same frame.
titanic_df=encode_features(titanic_df)
titanic_df.head()

In [None]:
## 전처리 과정 함수화

In [None]:
# null값 정리
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age is filled with the column mean, Cabin and Embarked with the
    placeholder 'N', and Fare with 0.

    Args:
        df: DataFrame containing the four columns; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Assign each filled column back to the frame. Calling
    # fillna(inplace=True) on a column selection is chained assignment:
    # pandas deprecates it, and under Copy-on-Write it leaves `df` unchanged.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

In [None]:
# 알고리즘에 불필요한 피처 제거
def drop_features(df):
    """Remove the 'PassengerId', 'Name' and 'Ticket' columns in place.

    Args:
        df: DataFrame containing the three columns; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

In [None]:
#레이블 인코딩 
def format_features(df):
    """Shorten Cabin to its first character and label-encode the string columns.

    'Cabin', 'Sex' and 'Embarked' are each overwritten with the codes of a
    LabelEncoder fitted on that column's own values.

    Args:
        df: DataFrame containing the three columns; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

In [None]:
## 전처리 함수 호출
def transform_features(df):
    """Run the preprocessing steps in order: fillna, drop_features, format_features.

    Args:
        df: Raw feature DataFrame; the individual steps modify it in place.

    Returns:
        The DataFrame returned by the final step.
    """
    return format_features(drop_features(fillna(df)))

In [None]:
# Survived 분리해서 결정값(종속변수) 데이터세트 만들기 / 드롭하여 피처(독립변수) 데이터세트 만들기

# Reload the raw CSV, take Survived as the target and keep every other
# column as the feature frame.
titanic_df = pd.read_csv('./data/train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop(columns=['Survived'])

In [None]:
# Run the preprocessing pipeline on the feature frame and display the result.
X_titanic_df=transform_features(X_titanic_df)
X_titanic_df

In [None]:
# Distinct values present in the Age column.
X_titanic_df['Age'].unique()

In [None]:
### train, test 데이터세트 분리
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=11,
)

In [None]:
### ML 알고리즘
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# The three classifiers to compare. The tree-based models get a fixed seed
# so that repeated runs give the same result.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')

In [None]:
# DecisionTreeClassifier: fit on the training split, predict the test split
# and print the accuracy. fit() returns the estimator, so the calls chain.
dt_pred = dt_clf.fit(X_train, y_train).predict(X_test)
print('DecisionTreeClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

In [None]:
# RandomForestClassifier: fit on the training split, predict the test split
# and print the accuracy. fit() returns the estimator, so the calls chain.
rf_pred = rf_clf.fit(X_train, y_train).predict(X_test)
print('RandomForestClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, rf_pred)))

In [None]:
# LogisticRegression: fit on the training split, predict the test split and
# print the accuracy.
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
# The printed model name previously read 'LogisticRegressionr' (stray 'r').
print('LogisticRegression 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

In [None]:
### 교차검증
# KFold
from sklearn.model_selection import KFold
def exec_kfold(clf, folds=5):
    """Cross-validate `clf` with KFold on the module-level Titanic data.

    Reads the globals X_titanic_df and y_titanic_df. For every fold the
    estimator is refitted on the training rows and scored on the held-out
    rows; the fold index (starting at 0) and its accuracy are printed,
    followed by the mean accuracy over all folds.

    Args:
        clf: Estimator exposing fit(X, y) and predict(X).
        folds: Number of splits passed to KFold.
    """
    splitter = KFold(n_splits=folds)
    # Index into the underlying arrays, not the pandas objects.
    features = X_titanic_df.values
    labels = y_titanic_df.values

    fold_scores = []
    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X_titanic_df)):
        clf.fit(features[train_idx], labels[train_idx])
        fold_accuracy = accuracy_score(labels[test_idx], clf.predict(features[test_idx]))
        fold_scores.append(fold_accuracy)
        print(fold_no, fold_accuracy)

    print(np.mean(fold_scores))
# Evaluate the decision tree with 5-fold cross-validation.
exec_kfold(dt_clf, folds=5)
        

In [None]:
# cross_val_score
from sklearn.model_selection import cross_val_score
# Cross-validate the decision tree with cross_val_score (cv=5), then print
# each fold's accuracy (numbered from 1) and the mean over the folds.
scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)
for iter_count, accuracy in enumerate(scores, start=1):
    print("교차검증 cross_val_score 결과: {}회, 정확도: {:.4f}".format(iter_count, accuracy))
print('평균정확도 결과: {:.4f}'.format(scores.mean()))

In [None]:
# Display the per-fold scores returned by cross_val_score.
scores

In [None]:
# GridSearchCV 최적 하이퍼 파라미터
from sklearn.model_selection import GridSearchCV
# Search decision-tree hyperparameters with 5-fold CV on the training split,
# then score the best estimator found on the held-out test split.
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}
grid_dclf = GridSearchCV(
    dt_clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
)
grid_dclf.fit(X_train, y_train)

print('최적의파라미터: ', grid_dclf.best_params_)
print('최고 정확도: {:.4f}'.format(grid_dclf.best_score_))

best_dclf = grid_dclf.best_estimator_
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
print('테스트 세트에서의 DecisionTreeClassifier 정확도: {:.4f}'.format(accuracy))