In [1]:
# 演習3 タイタニック

import os
import pandas as pd
import numpy as np

TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

In [2]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [8]:
# 必要な属性のみを取り出しデータを変換する関数

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributes):
        self.attributes = attributes
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attributes]

In [9]:
# 数値属性用のパイプライン

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Age","SibSp","Parch","Fare"])),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [10]:
# カテゴリ属性用のパイプライン

from sklearn.preprocessing import OneHotEncoder
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Pclass","Sex","Embarked"])),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

In [11]:
# パイプラインの統合

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [12]:
train_label = train_data["Survived"]
train_prepared = full_pipeline.fit_transform(train_data)
train_prepared

array([[-0.56573646,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.66386103,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276197, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [13]:
# RandomForestClassifierを訓練

from sklearn.ensemble import RandomForestClassifier

# グリッドサーチ

from sklearn.model_selection import GridSearchCV

param_grid = {"max_depth": [2, 3, 4],
              "n_estimators":[50, 100, 200, 300],
              "max_features": [4, 7, 10]}

forest_reg = RandomForestClassifier()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                          scoring='neg_mean_squared_error')

grid_search.fit(train_prepared, train_label)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [2, 3, 4], 'n_estimators': [50, 100, 200, 300], 'max_features': [4, 7, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [14]:
grid_search.best_params_

{'max_depth': 4, 'max_features': 7, 'n_estimators': 300}

In [15]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=7, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
cvresults = grid_search.cv_results_
for score, params in zip (cvresults["mean_test_score"],
                         cvresults["params"]):
    print(np.sqrt(-score), params)

0.4714045207910317 {'max_depth': 2, 'max_features': 4, 'n_estimators': 50}
0.46299663177235717 {'max_depth': 2, 'max_features': 4, 'n_estimators': 100}
0.46299663177235717 {'max_depth': 2, 'max_features': 4, 'n_estimators': 200}
0.46299663177235717 {'max_depth': 2, 'max_features': 4, 'n_estimators': 300}
0.4605661864718383 {'max_depth': 2, 'max_features': 7, 'n_estimators': 50}
0.4605661864718383 {'max_depth': 2, 'max_features': 7, 'n_estimators': 100}
0.4605661864718383 {'max_depth': 2, 'max_features': 7, 'n_estimators': 200}
0.46420708254852755 {'max_depth': 2, 'max_features': 7, 'n_estimators': 300}
0.4737793696791343 {'max_depth': 2, 'max_features': 10, 'n_estimators': 50}
0.4737793696791343 {'max_depth': 2, 'max_features': 10, 'n_estimators': 100}
0.4737793696791343 {'max_depth': 2, 'max_features': 10, 'n_estimators': 200}
0.4737793696791343 {'max_depth': 2, 'max_features': 10, 'n_estimators': 300}
0.44064028507448966 {'max_depth': 3, 'max_features': 4, 'n_estimators': 50}
0.43936

In [17]:
# テストセットで評価

fin_forest_model = grid_search.best_estimator_

test_prepared = full_pipeline.transform(test_data)

forest_predictions = fin_forest_model.predict(test_prepared)

In [18]:
# 交差検証で正解率の測定

from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(fin_forest_model, train_prepared, train_label, cv=5, scoring="accuracy")
forest_scores.mean()

0.821584928223302

In [19]:
# datasetsディレクトリに結果を出力

result_out = load_titanic_data("test.csv")
result_out["Survived"] = forest_predictions

submission_path = os.path.join(TITANIC_PATH, "submission.csv")
result_out[["PassengerId","Survived"]].to_csv(submission_path,index=False)