# 타이타닉 생존자 예측

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns    

In [2]:
# Load the 'titanic' example dataset through seaborn and preview the first 3 rows.
df = sns.load_dataset('titanic')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


## 데이터 전처리

In [3]:
# Feature selection: keep the target (survived) plus the candidate predictors.
# .copy() makes the subset an independent frame, so the column assignments
# performed in later cells do not raise SettingWithCopyWarning.
df = df[['survived','pclass','sex','age','sibsp','parch','fare','embarked','deck']].copy()
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,


In [5]:
# Sanity check: (rows, columns) of the selected frame.
df.shape

(891, 9)

In [4]:
# Missing-value (NaN) handling: start by counting the NaNs in each column.
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [6]:
# Replace missing ages with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df.age: the in-place form mutates an intermediate
# Series (chained assignment), which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.
# NOTE(review): the mean is taken over all rows, before the train/test split
# performed later in the notebook, so test rows influence the imputed value.
df['age'] = df['age'].fillna(df['age'].mean())
df.age.isna().sum()

0

In [9]:
# embarked will be filled with its most frequent value; inspect the counts first.
df.embarked.value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [11]:
# Fill missing embarked values with the most frequent port. The mode is
# computed from the data rather than hard-coded, and the result is assigned
# back to the column: fillna(inplace=True) on df.embarked is chained in-place
# mutation, which leaves df unchanged when pandas Copy-on-Write is enabled.
most_common_port = df['embarked'].mode()[0]
df['embarked'] = df['embarked'].fillna(most_common_port)
df.embarked.isna().sum()

0

In [12]:
# Drop the deck column.
# Rebinding df (instead of inplace=True) avoids mutating a frame that earlier
# cells already displayed; errors='ignore' lets this cell be re-run after the
# column is gone without raising KeyError.
df = df.drop(columns=['deck'], errors='ignore')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [14]:
# Convert the string columns sex and embarked to integer codes.
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column. Refitting a single shared encoder would
# overwrite the sex mapping when embarked is fitted, making it impossible to
# decode sex afterwards; with this dict, encoders[col].inverse_transform(...)
# and encoders[col].classes_ stay available for both columns.
encoders = {}
for col in ['sex', 'embarked']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2


## Train/Test dataset으로 분리

In [15]:
# Feature matrix: every column except the target. Label vector: survived.
X = df.drop(columns=['survived']).to_numpy()
y = df['survived'].to_numpy()
X.shape, y.shape

((891, 7), (891,))

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set, stratified on the label, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2021,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 7), (179, 7), (712,), (179,))

## RandomForest 모델 생성 및 학습

In [17]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with only the seed set explicitly; display the resulting
# hyperparameters so the defaults are visible.
rfc = RandomForestClassifier(random_state=2021)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [18]:
# Train the untuned forest on the training split.
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=2021)

## 성능 평가

In [19]:
# Evaluate the untuned forest on the held-out test split.
rfc.score(X_test, y_test)

0.8100558659217877

## GridSearchCV 수행

In [20]:
# Coarse hyperparameter grid for the first search (3 x 3 combinations).
params = dict(
    max_depth=[3, 5, 7],
    min_samples_split=[2, 3, 4],
)

In [22]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated search over `params`, ranked by accuracy; fitted on
# the training split only.
grid_rf = GridSearchCV(estimator=rfc, param_grid=params, cv=3, scoring='accuracy')
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=2021),
             param_grid={'max_depth': [3, 5, 7],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [23]:
# Best hyperparameter combination found by the first (coarse) search.
grid_rf.best_params_

{'max_depth': 7, 'min_samples_split': 2}

In [24]:
# Finer max_depth grid (6 through 9) for the second search; the
# min_samples_split candidates are the same as before.
params2 = dict(
    max_depth=list(range(6, 10)),
    min_samples_split=[2, 3, 4],
)

In [25]:
# Second 3-fold cross-validated search, this time over the finer `params2`
# grid, again ranked by accuracy and fitted on the training split only.
grid_rf2 = GridSearchCV(estimator=rfc, param_grid=params2, cv=3, scoring='accuracy')
grid_rf2.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(random_state=2021),
             param_grid={'max_depth': [6, 7, 8, 9],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [26]:
# Best hyperparameter combination found by the second (finer) search.
grid_rf2.best_params_

{'max_depth': 7, 'min_samples_split': 2}

In [27]:
# Evaluate the fitted search object on the held-out test split.
grid_rf2.score(X_test, y_test)

0.8324022346368715

## 테스트 데이터에 적용

In [28]:
# Inspect a single test sample: its true label and its feature vector.
y_test[100], X_test[100]

(1, array([ 1.,  1., 31.,  1.,  0., 57.,  2.]))

In [29]:
# Predict for test sample 100 alone; the one-row slice keeps the input 2-D
# with shape (1, n_features).
grid_rf2.predict(X_test[100:101])

array([0], dtype=int64)

## 타이타닉 엉터리 분류기

In [30]:
from sklearn.base import BaseEstimator

class MyClassfier(BaseEstimator):
    """Rule-based baseline: predict survival for women, death for everyone else.

    Nothing is learned from the data. ``predict`` reads only column index 1
    of ``X`` and treats the value 0 in that column as "female".
    """

    def fit(self, X, y=None):
        """Do nothing; present so the class offers the usual fit/predict pair.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            self, following the scikit-learn convention so calls can be chained.
        """
        return self

    def predict(self, X):
        """Predict 1 (survived) for rows whose column 1 equals 0, otherwise 0.

        Args:
            X: 2-D array-like of shape (n_samples, n_features) with at least
                two columns.

        Returns:
            1-D integer ndarray of shape (n_samples,) holding 0/1 labels.
            A flat label vector (rather than an (n_samples, 1) float column)
            matches what scikit-learn estimators return and avoids accidental
            broadcasting when compared element-wise with a 1-D ``y``.
        """
        # np.asarray lets DataFrames and nested lists work as well as ndarrays.
        features = np.asarray(X)
        # Vectorised form of "if column 1 is 0 (female), predict survived".
        return (features[:, 1] == 0).astype(int)

In [31]:
# Instantiate the rule-based classifier, call its fit (which learns nothing),
# and predict labels for the test split.
my_clf = MyClassfier()
my_clf.fit(X_train, y_train)
pred = my_clf.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score
# Accuracy of the rule-based predictions on the held-out test split.
accuracy_score(y_test, pred)

0.7877094972067039