### Titanic Data analysis

### Data load

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

# Load the Titanic training set; the path is relative to the notebook's working directory
titanic = pd.read_csv('../data/titanic_train.csv')
# Preview the first three rows
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [2]:
titanic.info()
# 11 feature columns plus the target column `Survived`

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# Count the missing values in every column
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Frequency tables for ticket class, ticket number and fare
for column_name in ('Pclass', 'Ticket', 'Fare'):
    print(titanic[column_name].value_counts())

3    491
1    216
2    184
Name: Pclass, dtype: int64
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
35.0000     1
28.5000     1
6.2375      1
14.0000     1
10.5167     1
Name: Fare, Length: 248, dtype: int64


### EDA

In [5]:
# Frequency tables for cabin and port of embarkation
for column_name in ('Cabin', 'Embarked'):
    print(titanic[column_name].value_counts())

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [6]:
# Handle missing values.
# Age is filled with the column mean; Cabin and Embarked are filled with the
# literal string 'NaN' so that "missing" becomes a category of its own.
#
# The result is assigned back to the column instead of calling
# `titanic[col].fillna(..., inplace=True)`: that form mutates a Series selected
# from the frame, which under pandas Copy-on-Write does not update `titanic`.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
titanic['Cabin'] = titanic['Cabin'].fillna('NaN')
titanic['Embarked'] = titanic['Embarked'].fillna('NaN')

In [7]:
# Re-count missing values per column after the fill step
titanic.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [8]:
# Remove the columns that are not used as features: PassengerId and Name
titanic.drop(columns=['PassengerId', 'Name'], inplace=True)

In [9]:
# Re-inspect the schema after dropping PassengerId and Name
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Ticket    891 non-null    object 
 7   Fare      891 non-null    float64
 8   Cabin     891 non-null    object 
 9   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


In [10]:
# Reduce each Cabin value to its first character
titanic['Cabin'] = titanic['Cabin'].str.slice(stop=1)

In [11]:
# Preview the rows after Cabin was reduced to its first character
titanic.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,N,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,N,S


In [12]:
# List the object-dtype columns; these are the ones to label-encode
titanic.dtypes.loc[lambda column_dtypes: column_dtypes == 'object'].index.to_list()

['Sex', 'Ticket', 'Cabin', 'Embarked']

In [13]:
from sklearn.preprocessing import LabelEncoder
# Names of all object-dtype columns
object_col = titanic.dtypes[titanic.dtypes == 'object'].index.to_list()

# Replace each of those columns with integer codes, using a fresh encoder per column
for col in object_col:
    le = LabelEncoder()
    titanic[col] = le.fit_transform(titanic[col])

titanic.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,523,7.25,7,3
1,1,1,0,38.0,1,0,596,71.2833,2,0
2,1,3,0,26.0,0,0,669,7.925,7,3


In [26]:
# Check the class balance of the target
titanic['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

### 모델 적용하기

In [15]:
# train test set split
from sklearn.model_selection import train_test_split

# Separate the features from the target, then hold out 20% of the rows for testing
X = titanic.drop(columns='Survived')
y = titanic['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

### To do list
1. cross validation으로 일반화 성능 평가
2. GridSearchCV로 최적 매개변수 찾기
3. 분류 모델 6개 : Decision Tree, RandomForest, LogisticRegression, SVM, ensemble(XGBoost, AdaBoost)

In [27]:
# Baseline classifiers: Decision Tree, Random Forest, Logistic Regression, XGBoost, AdaBoost, SVM
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Baseline classifiers with otherwise default hyperparameters.
# `random_state` is fixed on the estimators that previously left it unset, so
# re-running the notebook is reproducible (the unseeded random forest scored
# differently from one run to the next).
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(n_estimators=5, random_state=42)
lr_clf = LogisticRegression(solver='liblinear', random_state=42)
xgb_clf = XGBClassifier(random_state=42)
ada_clf = AdaBoostClassifier(random_state=42)
svm_clf = SVC()

# Fit every baseline on the training split
for clf in (dt_clf, rf_clf, lr_clf, xgb_clf, ada_clf, svm_clf):
    clf.fit(X_train, y_train)

In [21]:
from sklearn.metrics import accuracy_score

# Predict on the held-out test split with each fitted baseline
fitted_models = (dt_clf, rf_clf, lr_clf, xgb_clf, ada_clf, svm_clf)
dt_pred, rf_pred, lr_pred, xgb_pred, ada_pred, svm_pred = [
    fitted.predict(X_test) for fitted in fitted_models
]

# Accuracy with untuned (default) hyperparameters.
# Each message template is paired with the predictions it reports on.
accuracy_reports = [
    ("dt_clf accuracy:{:.2f} ", dt_pred),
    ("rf_clf accuracy:{:.2f} ", rf_pred),
    ("lr_clf accuracy:{:.2f} ", lr_pred),
    ("xgb_clf accuracy:{:.2f} ", xgb_pred),
    ("ada_clf accuracy:{:.2f} ", ada_pred),
    ("svm_clf accuracy:{:.2f} ", svm_pred),
]
for template, pred in accuracy_reports:
    print(template.format(accuracy_score(y_test, pred)))

dt_clf accuracy:0.79 
rf_clf accuracy:0.81 
lr_clf accuracy:0.80 
xgb_clf accuracy:0.83 
ada_clf accuracy:0.82 
svm_clf accuracy:0.70 


In [28]:
# 학습 및 평가 함수 만들기
def train_predict(model, x_train, y_train, x_test, y_test):
    """Fit `model` on the training data and print its accuracy on the test data.

    Args:
        model: estimator exposing `fit(X, y)` and `predict(X)`.
        x_train: training features passed to `fit`.
        y_train: training labels passed to `fit`.
        x_test: features passed to `predict`.
        y_test: true labels the predictions are scored against.

    Returns:
        None. The score is only printed, labelled with the model's class name.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    model_name = model.__class__.__name__
    score = accuracy_score(y_test, predictions)
    # NOTE: "accuaracy" is misspelled in the emitted text; it is left unchanged
    # so the printed lines keep matching previously recorded output.
    print('{0} accuaracy score: {1:.2f}'.format(model_name, score))

In [29]:
# Refit and score every baseline through the shared helper
for clf in (dt_clf, rf_clf, lr_clf, xgb_clf, ada_clf, svm_clf):
    train_predict(clf, X_train, y_train, X_test, y_test)

DecisionTreeClassifier accuaracy score: 0.78
RandomForestClassifier accuaracy score: 0.82
LogisticRegression accuaracy score: 0.80
XGBClassifier accuaracy score: 0.83
AdaBoostClassifier accuaracy score: 0.82
SVC accuaracy score: 0.70


In [30]:
# GridSearchCV 적용하기
# 1. DecisionTree
from sklearn.model_selection import GridSearchCV

# Candidate depths for the decision tree
params = {
    'max_depth': [2, 3, 4]
}

# 5-fold grid search scored with accuracy, a classification metric.
# The previous 'neg_mean_squared_error' is a regression scorer; on 0/1 labels it
# equals the negated error rate, so it ranks candidates the same way but reports
# a misleading quantity.
grid_search = GridSearchCV(estimator=dt_clf, param_grid=params, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)
print("best_params: {}".format(grid_search.best_params_))

best_params: {'max_depth': 3}


In [31]:
# Rebuild the tree from the grid search result instead of a hand-copied
# max_depth, so this cell stays in sync if the grid or the data change.
dt_clf = DecisionTreeClassifier(random_state=42, **grid_search.best_params_)
train_predict(dt_clf, X_train, y_train, X_test, y_test)

DecisionTreeClassifier accuaracy score: 0.80


In [32]:
# GridSearchCV
def gridSearchCV(model, parmas, X=None, y=None, scoring='accuracy', cv=5):
    """Grid-search `model` over `parmas` and return the best parameter set.

    Args:
        model: estimator to tune.
        parmas: parameter grid, passed to `GridSearchCV` as `param_grid`.
        X: training features. Defaults to the notebook-level `X_train`.
        y: training labels. Defaults to the notebook-level `y_train`.
        scoring: scorer name passed to `GridSearchCV`. Defaults to 'accuracy',
            a classification metric. The earlier hard-coded
            'neg_mean_squared_error' is a regression scorer; on 0/1 labels it
            equals the negated error rate, so the ranking is the same.
        cv: number of cross-validation folds.

    Returns:
        dict: `best_params_` of the fitted search (also printed).
    """
    # Fall back to the notebook globals so existing two-argument calls keep working
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid_search = GridSearchCV(estimator=model, param_grid=parmas, scoring=scoring, cv=cv)
    grid_search.fit(X, y)
    print('best_params: {}'.format(grid_search.best_params_))
    return grid_search.best_params_

In [33]:
# 1. Decision tree: tune max_depth through the helper, then refit and score a fresh tree
params = {
    'max_depth': [2, 3, 4, 5],
}
best_params = gridSearchCV(dt_clf, params)

dt_clf = DecisionTreeClassifier(random_state=42, **best_params)
train_predict(dt_clf, X_train, y_train, X_test, y_test)

best_params: {'max_depth': 3}


In [37]:
# 2. RandomForest
params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [2, 3, 4, 5]
}

# Seed both the forest used in the search and the final model. Unseeded forests
# give different scores on every run, which makes small differences between
# configurations impossible to interpret.
best_params = gridSearchCV(RandomForestClassifier(random_state=42), params)

rf_clf = RandomForestClassifier(**best_params, random_state=42)
train_predict(rf_clf, X_train, y_train, X_test, y_test)
# Note from the recorded run: accuracy was 0.02 lower than with n_estimators=5.
# That run was unseeded, so re-check the comparison after re-running.

best_params: {'max_depth': 5, 'n_estimators': 10}
RandomForestClassifier accuaracy score: 0.80


In [44]:
# 4. XGBoost 
# HyperOpt로 하이퍼파라미터 튜닝
# 베이지안 최적화를 만들기 위한 요소 : SearchSpace(입력값 범위), objective func(목적 함수), fmin(목적함수 반환 최소값 유추)
from hyperopt import hp

# Hyperopt search space:
#   max_depth        - 5 to 20 in steps of 1 (sampled as a float; cast to int by the objective)
#   min_child_weight - 1 to 2 in steps of 1 (sampled as a float; cast to int by the objective)
#   learning_rate    - uniform between 0.01 and 0.2
#   colsample_bytree - uniform between 0.5 and 1
search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)


In [50]:
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK

def objective_func(search_space):
    """Hyperopt objective: negated mean 3-fold CV accuracy of an XGBoost classifier.

    Args:
        search_space: one sampled point, a mapping with the keys 'max_depth',
            'min_child_weight', 'learning_rate' and 'colsample_bytree'.

    Returns:
        dict: {'loss': -mean_accuracy, 'status': STATUS_OK}.
    """
    candidate = XGBClassifier(
        n_estimators=100,
        # The two integer-valued hyperparameters are cast to int before use
        max_depth=int(search_space['max_depth']),
        min_child_weight=int(search_space['min_child_weight']),
        learning_rate=search_space['learning_rate'],
        colsample_bytree=search_space['colsample_bytree'],
        eval_metric='logloss',
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, scoring='accuracy', cv=3)

    # Accuracy is "higher is better" while fmin minimizes the loss, so the mean
    # accuracy is multiplied by -1.
    mean_accuracy = np.mean(fold_scores)
    return {'loss': -1*mean_accuracy, 'status': STATUS_OK}

In [51]:
from hyperopt import fmin, tpe, Trials

# Run 50 TPE evaluations of the objective; `trial_val` is handed to fmin as its trials store
trial_val = Trials()
best = fmin(
    fn=objective_func,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trial_val,
    rstate=np.random.default_rng(seed=9),  # fixed seed for the search
)
print('best: ', best)

100%|██████████| 50/50 [00:09<00:00,  5.14trial/s, best loss: -0.838527816189767] 
best:  {'colsample_bytree': 0.9933226773354293, 'learning_rate': 0.05688427461830717, 'max_depth': 11.0, 'min_child_weight': 2.0}


In [54]:
# Refit XGBoost with the tuned hyperparameters, using a validation split for early stopping.
#
# `early_stopping_rounds` and `eval_metric` are given to the constructor: passing
# them to `fit()` has been deprecated since XGBoost 1.6 and is rejected by newer
# releases.
xgb_clf = XGBClassifier(n_estimators=400, learning_rate=round(best['learning_rate'], 5),
                        max_depth=int(best['max_depth']),
                        min_child_weight=int(best['min_child_weight']),
                        colsample_bytree=round(best['colsample_bytree'], 5),
                        early_stopping_rounds=50, eval_metric='logloss')

# Split 10% of the training data off as the validation set
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
# The validation pair is listed last in the evaluation sets
evals = [(X_tr, y_tr), (X_val, y_val)]
xgb_clf.fit(X_tr, y_tr, eval_set=evals, verbose=True)

preds = xgb_clf.predict(X_test)
print('{0} accuaracy score: {1:.2f}'.format(xgb_clf.__class__.__name__, accuracy_score(y_test, preds)))


Parameters: { "learng_rate" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-logloss:0.54489	validation_1-logloss:0.55714
[1]	validation_0-logloss:0.47887	validation_1-logloss:0.54372
[2]	validation_0-logloss:0.41727	validation_1-logloss:0.48309
[3]	validation_0-logloss:0.36943	validation_1-logloss:0.44610
[4]	validation_0-logloss:0.34248	validation_1-logloss:0.45830
[5]	validation_0-logloss:0.31272	validation_1-logloss:0.44363
[6]	validation_0-logloss:0.28978	validation_1-logloss:0.43492
[7]	validation_0-logloss:0.27077	validation_1-logloss:0.43070
[8]	validation_0-logloss:0.25680	validation_1-logloss:0.43285
[9]	validation_0-logloss:0.24439	validation_1-logloss:0.43554
[10]	validation_0-logloss:0.23540	validation_1-logloss:0.44475

In [None]:
# 5. AdaBoost: candidate numbers of estimators for the grid search
params = dict(n_estimators=[10, 50, 100])


