<a href="https://colab.research.google.com/github/peisuke/ml-works/blob/main/9/9_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 9.1 タイタニック号の生存者予測問題
- 目的
    - これまでに機械学習の様々なロジックを学んだ
    - 本実験では、より実問題に近い形のタイタニック号生存者予測問題を解く

In [1]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline

# データの確認
- 数値データ以外に性別データやNaN（空）のデータが存在する事が確認できる

In [2]:
# Load the titanic dataset through seaborn and preview its first rows.
df = sns.load_dataset(name="titanic")
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Drop columns that must not, or need not, be used as features:
#   - "alive" mirrors the target `survived` as yes/no in the previewed rows,
#     so keeping it would hand the label to the model
#   - "embark_town" repeats `embarked` as a full town name in the previewed rows
#   - "deck" is empty for several of the previewed rows
drop_list = ["deck", "alive", "embark_town"]
# errors="ignore" keeps this cell re-runnable once the columns are already gone.
df = df.drop(columns=drop_list, errors="ignore")

# データの修正
- まずは欠損値の確認と補完
- 続いてカテゴリの変数を数値に変換

In [4]:
df.isna().sum()  # number of missing values in each column

survived        0
pclass          0
sex             0
age           177
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
alone           0
dtype: int64

In [5]:
# Fill missing ages with the mean of the known ages.
# NOTE(review): the mean is taken over every row, including rows that are
# later placed in the test split — confirm this is acceptable here.
mean_age = df["age"].mean()
df["age"] = df["age"].fillna(mean_age)

In [6]:
# Re-count missing values per column: age should now be complete,
# while embarked is still expected to have gaps.
missing_per_column = df.isna().sum()
print(missing_per_column)

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      2
class         0
who           0
adult_male    0
alone         0
dtype: int64


In [7]:
# Show how often each embarkation port occurs, to find the most frequent value
# (this is a frequency table of the categories, not a missing-value count).
print(df["embarked"].value_counts())

S    644
C    168
Q     77
Name: embarked, dtype: int64


In [8]:
# embarked is categorical, so a mean cannot be computed for it;
# fill the gaps with the most frequent value instead.
# The mode is read from the column rather than hard-coded, and the result is
# assigned back instead of using inplace=True.
most_common_port = df["embarked"].mode().iloc[0]
df["embarked"] = df["embarked"].fillna(most_common_port)

In [9]:
# Final check of the per-column missing-value counts after both imputations.
print(df.isna().sum())

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
class         0
who           0
adult_male    0
alone         0
dtype: int64


In [10]:
# Separate the label column (Y) from the remaining feature columns (X).
Y = df["survived"]
X = df.drop(columns=["survived"])

In [11]:
# Print the first five rows of the features, then of the target.
for frame in (X, Y):
    print(frame.head(5))

   pclass     sex   age  sibsp  parch  ...  embarked  class    who adult_male  alone
0       3    male  22.0      1      0  ...         S  Third    man       True  False
1       1  female  38.0      1      0  ...         C  First  woman      False  False
2       3  female  26.0      0      0  ...         S  Third  woman      False   True
3       1  female  35.0      1      0  ...         S  First  woman      False  False
4       3    male  35.0      0      0  ...         S  Third    man       True   True

[5 rows x 11 columns]
0    0
1    1
2    1
3    1
4    0
Name: survived, dtype: int64


In [12]:
# One-hot encode the categorical features and preview the result.
# Dummy encoding converts values that cannot be expressed directly as numbers
# (e.g. male/female) into numeric indicator columns.
X = pd.get_dummies(data=X)
X.head(n=5)

Unnamed: 0,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,class_First,class_Second,class_Third,who_child,who_man,who_woman
0,3,22.0,1,0,7.25,True,False,0,1,0,0,1,0,0,1,0,1,0
1,1,38.0,1,0,71.2833,False,False,1,0,1,0,0,1,0,0,0,0,1
2,3,26.0,0,0,7.925,False,True,1,0,0,0,1,0,0,1,0,0,1
3,1,35.0,1,0,53.1,False,False,1,0,0,0,1,1,0,0,0,0,1
4,3,35.0,0,0,8.05,True,True,0,1,0,0,1,0,0,1,0,1,0


# 学習の実施
- データを学習・検証用、テスト用に分割
- Cross Validationを利用してパラメータを決定
- テストデータを用いて精度の確認
- 機械学習モデルとしてはLightGBMというGradient Boostingのライブラリを使う

In [13]:
# Hold out 30% of the rows as the test set; the rest is kept for training and
# validation. random_state is fixed so the split is the same on every run.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [14]:
# Base LightGBM classifier with a binary objective.
gbm = lgb.LGBMClassifier(objective="binary")

# Candidate values to try for each hyperparameter.
params = dict(
    max_depth=[2, 3, 4, 5],
    reg_alpha=[0, 1, 10, 100],
    reg_lambda=[0, 1, 10, 100],
)

# Registering the candidates up front lets GridSearchCV cross-validate them
# (5 folds here) and report the best-scoring parameters.
grid_search = GridSearchCV(estimator=gbm, param_grid=params, cv=5)

In [15]:
# Run the grid search on the train/validation portion of the data.
grid_search.fit(X=X_trainval, y=Y_trainval)

# Report the best cross-validation score and the parameters that produced it.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.8411225806451613
{'max_depth': 5, 'reg_alpha': 0, 'reg_lambda': 10}


In [16]:
# Create a new classifier configured with the best parameters from the search.
gbm = lgb.LGBMClassifier(**grid_search.best_params_, objective="binary")

In [17]:
# Train the tuned classifier on the full train/validation portion.
gbm.fit(X=X_trainval, y=Y_trainval)

LGBMClassifier(max_depth=5, objective='binary', reg_alpha=0, reg_lambda=10)

In [18]:
# Predict the target for the held-out test features.
Y_pred = gbm.predict(X=X_test)

In [19]:
# Report the confusion matrix, accuracy, precision, recall and F1 score of the
# test-set predictions. The metric functions are imported in the notebook's
# top import cell rather than here, so all dependencies are visible in one place.
print('confusion matrix = \n', confusion_matrix(y_true=Y_test, y_pred=Y_pred))
print('accuracy = ', accuracy_score(y_true=Y_test, y_pred=Y_pred))
print('precision = ', precision_score(y_true=Y_test, y_pred=Y_pred))
print('recall = ', recall_score(y_true=Y_test, y_pred=Y_pred))
print('f1 score = ', f1_score(y_true=Y_test, y_pred=Y_pred))

confusion matrix = 
 [[150  18]
 [ 30  70]]
accuracy =  0.8208955223880597
precision =  0.7954545454545454
recall =  0.7
f1 score =  0.7446808510638298
