# ランダムフォレスト

In [0]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer

In [0]:
# Load scikit-learn's breast cancer dataset; later cells read its `.data`,
# `.target` and `.feature_names` attributes.
cancer = load_breast_cancer()

In [0]:
# Hold out part of the samples for evaluation. The seed is fixed so the same
# train/test partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=42,
)

In [0]:
# Classifiers to compare, keyed by the name under which their scores are
# reported. Both are seeded (random_state=0) so results are reproducible.
models = {
    'RandomForest': RandomForestClassifier(
        criterion='gini',
        n_estimators=100,
        max_depth=2,  # each tree is limited to depth 2
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For
        # classifiers it meant sqrt(n_features), so 'sqrt' is the identical
        # setting and is accepted by both old and new releases.
        max_features='sqrt',
        min_samples_leaf=1,
        n_jobs=None,
        oob_score=False,
        random_state=0,
        verbose=0,
        warm_start=False,
    ),
    'GradientBoost': GradientBoostingClassifier(random_state=0),
}

In [0]:
# Fit every candidate on the training split and record its accuracy on both
# splits under a (model name, score kind) key.
scores = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    scores.update({
        (model_name, 'train_score'): model.score(X_train, y_train),
        (model_name, 'test_score'): model.score(X_test, y_test),
    })

# Pivot the two-level keys into a table: one row per model, one column per
# score kind. Left as the final expression so the notebook displays it.
pd.Series(scores).unstack()

In [0]:
# Quantify which input variables played an important role while the random
# forest was being built, labelling each importance with its feature name.
s = pd.Series(
    data=models['RandomForest'].feature_importances_,
    index=cancer.feature_names,
)

# Show the importances as a bar chart, largest first.
s.sort_values(ascending=False).plot(kind='bar', color='C0')

## グリッドサーチとランダムフォレスト

## ボストンデータセットのランダムフォレスト

In [0]:
# Load the Boston dataset for the random forest example in this section.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import raises ImportError on current releases. Confirm the
# installed version, or switch to another dataset loader (e.g.
# `fetch_california_housing`) together with the cells that consume `boston`.
from sklearn.datasets import load_boston
boston = load_boston()