In [56]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [57]:
import os
import pandas as pd
import numpy as np

In [58]:
# データ保存ディレクトリへ移動
ls = os.getcwd().split('\\')
ls[-1] = 'data'
lsr=""
for i in ls:
    lsr += i+'\\'

# ディレクトリ移動
os.chdir(lsr)

In [59]:
train = pd.read_csv('./datasets_nb001/nb001_train.csv', index_col=0)
test = pd.read_csv('./datasets_nb001/nb001_test.csv', index_col=0)

In [60]:
# 提出データ用に受け皿を作成
PassengerID_df = test.PassengerId

In [61]:
print(train.columns)
print(test.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'CategoricalAge',
       'SibSp_0_1_2over', 'Parch_0_1_2_3over', 'FamilySize', 'IsAlone',
       'CategoricalFare', 'Title', 'Title_num', 'SexC'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'CategoricalAge',
       'SibSp_0_1_2over', 'Parch_0_1_2_3over', 'FamilySize', 'IsAlone',
       'Title', 'Title_num', 'SexC'],
      dtype='object')


In [62]:
# 不要な列の削除
train.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'CategoricalFare'],
            axis=1, inplace=True)

test.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin'],
            axis=1, inplace=True)

In [63]:
train.head(3)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked,CategoricalAge,SibSp_0_1_2over,Parch_0_1_2_3over,FamilySize,IsAlone,Title,Title_num,SexC
0,0,3,22.0,1,0,7.25,S,"(20.315, 30.263]",1,1,2,1,Mr,3,0.0
1,1,1,38.0,1,0,71.2833,C,"(30.263, 40.21]",1,1,2,1,Mrs,4,1.0
2,1,3,26.0,0,0,7.925,S,"(20.315, 30.263]",0,0,1,0,Miss,2,1.0


In [64]:
test.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,CategoricalAge,SibSp_0_1_2over,Parch_0_1_2_3over,FamilySize,IsAlone,Title,Title_num,SexC
0,3,34.5,0,0,7.8292,Q,"(28.606, 38.085]",0,0,1,0,Mr,3,0.0
1,3,47.0,1,0,7.0,S,"(38.085, 47.564]",1,1,2,1,Mrs,4,1.0
2,2,62.0,0,0,9.6875,Q,"(57.043, 66.521]",0,0,1,0,Mr,3,0.0


In [65]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Survived           891 non-null    int64  
 1   Pclass             891 non-null    int64  
 2   Age                714 non-null    float64
 3   SibSp              891 non-null    int64  
 4   Parch              891 non-null    int64  
 5   Fare               891 non-null    float64
 6   Embarked           889 non-null    object 
 7   CategoricalAge     714 non-null    object 
 8   SibSp_0_1_2over    891 non-null    int64  
 9   Parch_0_1_2_3over  891 non-null    int64  
 10  FamilySize         891 non-null    int64  
 11  IsAlone            891 non-null    int64  
 12  Title              891 non-null    object 
 13  Title_num          891 non-null    int64  
 14  SexC               891 non-null    float64
dtypes: float64(3), int64(9), object(3)
memory usage: 111.4+ KB


In [66]:
train.isnull().sum()

Survived               0
Pclass                 0
Age                  177
SibSp                  0
Parch                  0
Fare                   0
Embarked               2
CategoricalAge       177
SibSp_0_1_2over        0
Parch_0_1_2_3over      0
FamilySize             0
IsAlone                0
Title                  0
Title_num              0
SexC                   0
dtype: int64

# 欠損データの削除と分割

In [67]:
from sklearn.model_selection import train_test_split

# Ageに含まれる欠損値の行を削除
X_full = train.dropna(axis=0, subset=['Age'], inplace=False).copy()

# yとXをそれぞれ定義
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

# 話を簡単にするため、object型の列データを除外
X = X_full.select_dtypes(exclude=['object'])

# 分離
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# RandomForestClassifier による学習

In [68]:
from sklearn.ensemble import RandomForestClassifier

nb004_rfr = RandomForestClassifier(random_state=1)

nb004_rfr.fit(X_train, y_train)

In [69]:
from sklearn.metrics import mean_absolute_error

# 推定
y_pred = nb004_rfr.predict(X_valid)
val_mae = mean_absolute_error(y_valid, y_pred)
print(f"Validation MAE: ", round(val_mae,3))

Validation MAE:  0.238


In [70]:
# 学習モデルの評価を表示
model_score = nb004_rfr.score(X, y)
model_score = round(model_score, 3)
print('\nモデルのスコア: ', model_score)


モデルのスコア:  0.945


# max_leaf_nodesを変数として、最小MAEを探索

In [71]:
# 決定木のノード数に応じてMAEを取得する関数を定義
def get_mae(max_leaf_nodes, X_train, X_valid, y_train, y_valid):
    model = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(X_train, y_train)
    preds_val = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, preds_val)
    return(mae)

In [72]:
# 学習を行う決定木のノード数のレンジを設定
candidate_max_leaf_nodes = [x for x in range(2,102,1)]

In [73]:
# 最適な決定木のノード数を取得
scores = {leaf_size: get_mae(leaf_size, X_train, X_valid, y_train, y_valid) for leaf_size in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)
print('\n最適ツリーサイズ: ', best_tree_size)


最適ツリーサイズ:  5


In [74]:
# 最適なノード数で学習モデルを作成
nb004_rfr_fin = RandomForestClassifier(max_leaf_nodes=best_tree_size, random_state=1)

# 全データで学習モデルを作成
nb004_rfr_fin.fit(X, y)

In [75]:
# 推定
y_pred_fin = nb004_rfr_fin.predict(X_valid)
val_mae_fin = mean_absolute_error(y_valid, y_pred_fin)
print(f"Validation MAE: ", round(val_mae_fin,3))

Validation MAE:  0.182


In [76]:
# 学習モデルの評価を表示
model_score = nb004_rfr_fin.score(X, y)
model_score = round(model_score, 3)
print('\nモデルのスコア: ', model_score)


モデルのスコア:  0.818


# Kaggleへ提出

In [77]:
test.isnull().sum()

Pclass                0
Age                  86
SibSp                 0
Parch                 0
Fare                  1
Embarked              0
CategoricalAge       86
SibSp_0_1_2over       0
Parch_0_1_2_3over     0
FamilySize            0
IsAlone               0
Title                 0
Title_num             0
SexC                  0
dtype: int64

In [78]:
# Age列の抜けを補正
from sklearn.impute import SimpleImputer

X_test = test.select_dtypes(exclude=['object'])

# Imputation
my_imputer = SimpleImputer()
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test))

# Imputation removed column names; put them back
imputed_X_test.columns = X_test.columns

In [79]:
# 推定(Case-1)
y_pred = nb004_rfr.predict(imputed_X_test)

submission_1_df = pd.DataFrame()
submission_1_df['Survived'] = y_pred
submission_1_df['PassengerID'] = PassengerID_df
submission_1_df.head(3)

Unnamed: 0,Survived,PassengerID
0,0,892
1,0,893
2,0,894


In [80]:
# 推定(Case-2)
y_pred = nb004_rfr_fin.predict(imputed_X_test)

submission_2_df = pd.DataFrame()
submission_2_df['Survived'] = y_pred
submission_2_df['PassengerID'] = PassengerID_df
submission_2_df.head(3)

Unnamed: 0,Survived,PassengerID
0,0,892
1,1,893
2,0,894


In [81]:
# 出力
submission_1_df = submission_1_df.set_index('PassengerID')
submission_2_df = submission_2_df.set_index('PassengerID')

submission_1_df.to_csv(r"./submit/nb004_case1.csv", encoding='utf-8', index=True)
submission_2_df.to_csv(r"./submit/nb004_case2.csv", encoding='utf-8', index=True)