### 타이타닉 생존자 예측

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [14]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


#### 1. Data Preprocessing

- Feature Selection

In [15]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'deck']]
print(df.shape)
df.head()

(891, 9)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


- NaN

In [16]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [17]:
# age - 평균으로 대체
df.age.fillna(df.age.mean(), inplace=True)
df.age.isnull().sum()

0

In [19]:
# embarked - 최빈값으로 대체
df.embarked.value_counts()
df.embarked.fillna('S', inplace=True)
df.embarked.isnull().sum()

0

In [20]:
# deck - 열 삭제
df.drop(['deck'], axis=1, inplace=True)
df.isna().sum().sum()

- Categorical Encoding

In [21]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [22]:
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)

df.head()

#### 2. Train / Test Split

In [24]:
X = df.drop(['survived'],axis=1).values
y = df['survived'].values
X.shape, y.shape

((891, 7), (891,))

In [26]:
# y값의 분포
# df.survived.value_counts()
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([549, 342], dtype=int64))

In [27]:
# train/test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2022
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 7), (179, 7), (712,), (179,))

In [28]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([439, 273], dtype=int64))

#### 3. RandomForest Model

In [31]:
import warnings
warnings.filterwarnings('ignore')

In [30]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 2022)
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [32]:
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=2022)

#### 4. Prediction and Evaluation

In [33]:
rf.score(X_test, y_test)

0.8324022346368715

#### 5. GridSearchCV

In [34]:
params = {
    'max_depth':[2,4,6,8],
    'min_samples_split':[2,4,6]
}

In [36]:
from sklearn.model_selection import GridSearchCV
gv_rf = GridSearchCV(rf, params, scoring='accuracy', cv=5)
gv_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2022),
             param_grid={'max_depth': [2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [37]:
gv_rf.best_params_

{'max_depth': 4, 'min_samples_split': 4}

In [39]:
params = {
    'max_depth':[3,4,5],
    'min_samples_split':[3,4,5]
}

gv_rf = GridSearchCV(rf, params, scoring='accuracy', cv=5)
%time gv_rf.fit(X_train, y_train)

gv_rf.best_params_

Wall time: 4.6 s


{'max_depth': 4, 'min_samples_split': 3}

In [40]:
best_rf = gv_rf.best_estimator_
best_rf.score(X_test, y_test)

0.8212290502793296

#### 6. Test set

In [41]:
X_test[25], y_test[25]

(array([ 3.  ,  1.  , 45.  ,  0.  ,  0.  ,  8.05,  2.  ]), 1)

In [42]:
best_rf.predict(X_test[25].reshape(1, -1))

array([0], dtype=int64)

#### 7. Mess Classifier

In [48]:
# 여성의 생존률
# df[df['sex']==0].survived.mean()
df.groupby('sex')['survived'].mean()

sex
0    0.742038
1    0.188908
Name: survived, dtype: float64

In [49]:
df.groupby(['sex','pclass'])['survived'].mean()

sex  pclass
0    1         0.968085
     2         0.921053
     3         0.500000
1    1         0.368852
     2         0.157407
     3         0.135447
Name: survived, dtype: float64

In [69]:
from sklearn.base import BaseEstimator

class MyClassifier(BaseEstimator):
    # fit(), predict() method만 재정의(Overriding)
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros(X.shape[0], int) # X의 행 갯수만큼 0으로 초기화한 배열을 생성
        for i in range(X.shape[0]):
            if X[i, 1] == 0:
                pred[i] = 1
        return pred

In [75]:
my_clf = MyClassifier()
my_clf.fit(X_train, y_train)

pred_my = my_clf.predict(X_test)

In [76]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_my)

0.7877094972067039

- 모델의 성능을 평가할 때, 무조건적으로 정확도를 사용하는 것은 지양해야함

In [72]:
pred_rf = best_rf.predict(X_test)
sdf = pd.DataFrame({'y':y_test, 'RF':pred_rf, 'My':pred_my})
sdf

Unnamed: 0,y,RF,My
0,1,0,0
1,0,0,0
2,1,0,1
3,0,0,0
4,0,0,0
...,...,...,...
174,0,0,0
175,1,1,1
176,0,0,0
177,1,1,1


In [73]:
# TN FP
# FN TP
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred_rf)

array([[103,   7],
       [ 25,  44]], dtype=int64)

In [77]:
confusion_matrix(y_test, pred_my)

array([[96, 14],
       [24, 45]], dtype=int64)

In [78]:
# 정밀도 (Precision Scroe TP / (FP+TP))
from sklearn.metrics import precision_score, recall_score
print(precision_score(y_test, pred_rf))
print(precision_score(y_test, pred_my))

0.8627450980392157
0.7627118644067796


In [79]:
# 재현율 (Recall TP / (FN+TP))
print(recall_score(y_test, pred_rf))
print(recall_score(y_test, pred_my))

0.6376811594202898
0.6521739130434783


In [81]:
# F1 Score (정밀도와 재현율의 조화 평균)
from sklearn.metrics import f1_score
f1_score(y_test, pred_rf), f1_score(y_test, pred_my)

(0.7333333333333333, 0.703125)

In [82]:
# ROC-AUC
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, pred_rf), roc_auc_score(y_test, pred_my)

(0.787022397891963, 0.7624505928853755)