In [1]:
import sys
import numpy as np
import pandas as pd
import dill
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the training data. read_csv uses the first line of the file as the
# header row, and the columns below are selected by those header names.
df_raw = pd.read_csv('./data/room_training.txt')
# Actually drop exact duplicate rows (keeping the first occurrence). Before,
# only the shape of a de-duplicated view was printed while the features and
# labels were still taken from the un-deduplicated frame, so identical rows
# could end up in both the training and the test split.
df = df_raw.drop_duplicates()
print(df.shape)
# Separate the feature columns (x) from the label column (y).
x = df.loc[:, ['uses','floor','name','ceiling','width','height','hip','points']]
y = df.loc[:, "ups"]

(1511, 9)


In [3]:
# Split into training and test sets.
# train_test_split shuffles the rows, so without a fixed seed every run
# produces a different split. Passing random_state (not "random_seed") pins
# the shuffle and makes the split reproducible.
# The earlier unseeded call was removed: its result was overwritten right away.
seed = 42
train_x, test_x, train_y, test_y = train_test_split(x, y, random_state=seed)

In [4]:
# Baseline random forest (parameters left almost entirely at their defaults).
clf = RandomForestClassifier(random_state=seed, n_jobs=-1)
# Train, then report mean accuracy on the held-out split.
clf.fit(train_x, train_y)
print(clf.score(test_x, test_y))
# One unseen sample. It is wrapped in a DataFrame that reuses the training
# columns so the values are explicitly tied to the feature names the model was
# fitted with (a bare nested list relies on positional order and makes recent
# scikit-learn versions warn about missing feature names).
# NOTE(review): the trailing ID in the original comment is presumably the
# expected label for this sample: 002640300102631190202630290 - confirm.
unknown_data = pd.DataFrame([[1,1,1,2.4,2.73,2.73,0,4]], columns=train_x.columns)
# Inference
print(clf.predict(unknown_data))

0.06613756613756613
["'002640300102631190202630290'"]


In [27]:
# Hyperparameter search using grid search.
# The whole cell is kept commented out; the recorded run below covered
# 3500 candidates x 3 folds = 10500 fits.
# NOTE(review): x has 8 feature columns, so the max_features candidates
# 10, 15 and 20 exceed the number of features - confirm whether those
# candidates are valid before re-running this search.
# from sklearn.model_selection import GridSearchCV
# search_params = {
#       'n_estimators'      : [5, 10, 20, 30, 50, 100, 300],
#       'max_features'      : [3, 5, 10, 15, 20],
#       'random_state'      : [seed],
#       'n_jobs'            : [1],
#       'min_samples_split' : [3, 5, 10, 15, 20, 25, 30, 40, 50, 100],
#       'max_depth'         : [3, 5, 10, 15, 20, 25, 30, 40, 50, 100]
# }
# gs = GridSearchCV(RandomForestClassifier(),  # model to tune
#                   search_params,             # dictionary of parameters to search
#                   cv=3,                      # number of cross-validation folds
#                   verbose=True,              # show progress logs
#                   n_jobs=-1)                 # run in parallel
# gs.fit(train_x, train_y)
 
# print(gs.best_estimator_)

Fitting 3 folds for each of 3500 candidates, totalling 10500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 1388 tasks      | elapsed:   55.6s
[Parallel(n_jobs=-1)]: Done 2462 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 3786 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 5564 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 7586 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 9666 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 10500 out of 10500 | elapsed:  7.3min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
                       oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [5]:
# Random forest using the hyperparameters selected by the grid search.
# Only the tuned values are passed. The previous call was a paste of the
# estimator repr and spelled out every default as well, including
# min_impurity_split, which has since been removed from scikit-learn and makes
# the constructor raise TypeError on current versions. All omitted arguments
# were set to their default values, so the resulting model is unchanged.
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    max_features=3,
    min_samples_split=20,
    random_state=seed,
    n_jobs=-1,
)

# Train, then report mean accuracy on the held-out split.
clf.fit(train_x, train_y)
print(clf.score(test_x, test_y))
# One unseen sample, wrapped in a DataFrame that reuses the training columns so
# the values are explicitly tied to the feature names the model was fitted with.
# NOTE(review): the trailing ID in the original comment is presumably the
# expected label for this sample: 002640300102631190202630290 - confirm.
unknown_data = pd.DataFrame([[1,1,1,2.4,2.73,2.73,0,4]], columns=train_x.columns)
# Inference
print(clf.predict(unknown_data))

0.07407407407407407
["'002640300102631190202630290'"]


In [43]:
# Save the trained model to disk.
fp = 'joinery_autoup.model'
# Serialize inside a context manager so the file handle is always flushed and
# closed, even if dill.dump raises; the bare open() call never closed it.
with open(fp, 'wb') as model_file:
    dill.dump(clf, model_file)