# 機械学習をやってみる

## この章で扱うもの

- 前章で作った各種データを再利用
- 機械学習を行ってみる
- 学習結果の評価を行う
- 機械学習の改善方法


## この章で取り扱う手順

- 学習データとテストデータの分割
- sklearnを用いて学習
- 評価
- 交差検証
- グリッドサーチ
- 各種モデルを試す
- 評価結果の再確認


In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the pickled DataFrame (presumably the one saved in the previous chapter).
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df = pd.read_pickle("df.db")

# 学習データとテストデータの分割

http://qiita.com/terapyon/items/8f8d3518ee8eeb4f96b2

In [4]:
df.head()

Unnamed: 0,報告数,流行,増加,平均気温(℃),最高気温(℃),最低気温(℃),平均湿度(％),最小相対湿度(％),平均現地気圧(hPa),降水量の合計(mm),日照時間(時間),平均風速(m/s)
2014-01-01,0.178571,0,0,9.8,13.7,3.9,54.0,37.0,1005.3,0.0,9.2,5.3
2014-01-02,0.178571,0,0,8.0,12.9,4.4,41.0,26.0,1011.3,0.0,9.1,3.0
2014-01-03,0.178571,0,0,5.9,9.9,2.7,43.0,32.0,1014.9,0.0,4.1,1.6
2014-01-04,0.178571,0,0,6.7,11.5,2.1,47.0,29.0,1009.5,0.0,5.9,2.4
2014-01-05,0.178571,0,0,4.4,6.9,2.3,40.0,28.0,1016.6,0.0,1.1,2.5


In [5]:
# Feature matrix: every column from position 3 onward (the weather measurements);
# the first three columns (報告数, 流行, 増加) are left out.
X = df.iloc[:, 3:]

In [6]:
X.head()

Unnamed: 0,平均気温(℃),最高気温(℃),最低気温(℃),平均湿度(％),最小相対湿度(％),平均現地気圧(hPa),降水量の合計(mm),日照時間(時間),平均風速(m/s)
2014-01-01,9.8,13.7,3.9,54.0,37.0,1005.3,0.0,9.2,5.3
2014-01-02,8.0,12.9,4.4,41.0,26.0,1011.3,0.0,9.1,3.0
2014-01-03,5.9,9.9,2.7,43.0,32.0,1014.9,0.0,4.1,1.6
2014-01-04,6.7,11.5,2.1,47.0,29.0,1009.5,0.0,5.9,2.4
2014-01-05,4.4,6.9,2.3,40.0,28.0,1016.6,0.0,1.1,2.5


In [7]:
# Target vector: the integer 0/1 "流行" (epidemic) flag column.
y = df['流行']

In [8]:
y.head()

2014-01-01    0
2014-01-02    0
2014-01-03    0
2014-01-04    0
2014-01-05    0
Name: 流行, dtype: int32

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Keep 80% of the rows for training and hold out the rest for validation.
# No random_state is passed, so the split (and every score derived from it)
# changes from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

# ロジスティック回帰

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
clf = LogisticRegression()

In [13]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
y_train_pred = clf.predict(X_train)

## 正答率を確認

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
accuracy_score(y_train, y_train_pred)

0.86529680365296802

In [17]:
y_val_pred = clf.predict(X_val)

In [18]:
accuracy_score(y_val, y_val_pred)

0.92272727272727273

## 混同行列

In [19]:
from sklearn.metrics import confusion_matrix

In [20]:
cm = confusion_matrix(y_val, y_val_pred)

In [21]:
cm

array([[172,   9],
       [  8,  31]])

In [22]:
cm_t = confusion_matrix(y_train, y_train_pred)
cm_t

array([[679,  47],
       [ 71,  79]])

### 混同行列の評価

```
[[TN, FP],
 [FN, TP]]
```

行が実際のクラス、列が予測したクラスです(0: 流行していない、1: 流行している)。

# 適合率(precision)・再現率(recall)・F値(f1-score)

## 適合率
 
- 0(流行していない) と予測したもののうち、実際に 0 だった割合 (172 / (172+8) = 0.96)
- 1(流行している) と予測したもののうち、実際に 1 だった割合 (31 / (31+9) = 0.78)
 
## 再現率
 
- 実際に 0 のもののうち、0 と正しく予測できた割合 (172 / (172+9) = 0.95)
- 実際に 1 のもののうち、1 と正しく予測できた割合 (31 / (31+8) = 0.79)
 
 
## F値
 
`2 / (1/適合率 + 1/再現率) = 2 * 適合率 * 再現率 / (適合率 + 再現率)`
 
- 0のF値 `2 * 0.96 * 0.95 / (0.96 + 0.95) = 0.95`
- 1のF値 `2 * 0.78 * 0.79 / (0.78 + 0.79) = 0.78`

In [23]:
from sklearn.metrics import classification_report

In [24]:
print(classification_report(y_val, y_val_pred))

             precision    recall  f1-score   support

          0       0.96      0.95      0.95       181
          1       0.78      0.79      0.78        39

avg / total       0.92      0.92      0.92       220



## レポート関係を関数化し再利用可能にする

In [25]:
def report(y, pred):
    """Print evaluation metrics for a set of predictions.

    Prints, in order, the accuracy score, the confusion matrix and the
    classification report computed from the true labels ``y`` and the
    predicted labels ``pred``. Returns nothing.
    """
    # Each metric takes (true labels, predictions); compute and print one at a time.
    for metric in (accuracy_score, confusion_matrix, classification_report):
        print(metric(y, pred))

In [26]:
report(y_train, y_train_pred)

0.865296803653
[[679  47]
 [ 71  79]]
             precision    recall  f1-score   support

          0       0.91      0.94      0.92       726
          1       0.63      0.53      0.57       150

avg / total       0.86      0.87      0.86       876



## 学習から評価までを関数化


In [27]:
def fit_to_pred(clf, X_train, X_val, y_train, y_val):
    """Fit ``clf`` on the training split and print reports for both splits.

    The estimator is fitted on ``X_train``/``y_train``. Its predictions are
    then reported (via ``report``) first for the training data and then for
    the validation data, each preceded by a label line.

    Returns the same ``clf`` object, now fitted.
    """
    # Training
    clf.fit(X_train, y_train)

    # Evaluate on the training data first, then on the held-out data.
    splits = (
        ("y_train_pred: ", X_train, y_train),
        ("y_val_pred: ", X_val, y_val),
    )
    for heading, features, truth in splits:
        predicted = clf.predict(features)
        print(heading)
        report(truth, predicted)

    # Hand the fitted estimator back to the caller.
    return clf

In [28]:
clf = LogisticRegression()
fit_to_pred(clf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.865296803653
[[679  47]
 [ 71  79]]
             precision    recall  f1-score   support

          0       0.91      0.94      0.92       726
          1       0.63      0.53      0.57       150

avg / total       0.86      0.87      0.86       876

y_val_pred: 
0.922727272727
[[172   9]
 [  8  31]]
             precision    recall  f1-score   support

          0       0.96      0.95      0.95       181
          1       0.78      0.79      0.78        39

avg / total       0.92      0.92      0.92       220



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# その他の機械学習アルゴリズム

- サポートベクターマシン SVC
- カーネルSVM
- 決定木 DecisionTreeClassifier
- ランダムフォレスト RandomForestClassifier
- k近傍

In [29]:
from sklearn.svm import SVC

In [30]:
svc = SVC(kernel="linear")
fit_to_pred(svc, X_train, X_val, y_train, y_val)

y_train_pred: 
0.86301369863
[[683  43]
 [ 77  73]]
             precision    recall  f1-score   support

          0       0.90      0.94      0.92       726
          1       0.63      0.49      0.55       150

avg / total       0.85      0.86      0.86       876

y_val_pred: 
0.890909090909
[[171  10]
 [ 14  25]]
             precision    recall  f1-score   support

          0       0.92      0.94      0.93       181
          1       0.71      0.64      0.68        39

avg / total       0.89      0.89      0.89       220



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [31]:
k_svc = SVC(kernel="rbf")
fit_to_pred(k_svc, X_train, X_val, y_train, y_val)

y_train_pred: 
0.997716894977
[[726   0]
 [  2 148]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       726
          1       1.00      0.99      0.99       150

avg / total       1.00      1.00      1.00       876

y_val_pred: 
0.827272727273
[[181   0]
 [ 38   1]]
             precision    recall  f1-score   support

          0       0.83      1.00      0.91       181
          1       1.00      0.03      0.05        39

avg / total       0.86      0.83      0.75       220



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
tree = DecisionTreeClassifier(max_depth=2)
fit_to_pred(tree, X_train, X_val, y_train, y_val)

y_train_pred: 
0.86301369863
[[672  54]
 [ 66  84]]
             precision    recall  f1-score   support

          0       0.91      0.93      0.92       726
          1       0.61      0.56      0.58       150

avg / total       0.86      0.86      0.86       876

y_val_pred: 
0.922727272727
[[167  14]
 [  3  36]]
             precision    recall  f1-score   support

          0       0.98      0.92      0.95       181
          1       0.72      0.92      0.81        39

avg / total       0.94      0.92      0.93       220



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
rf = RandomForestClassifier()
fit_to_pred(rf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.988584474886
[[724   2]
 [  8 142]]
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       726
          1       0.99      0.95      0.97       150

avg / total       0.99      0.99      0.99       876

y_val_pred: 
0.918181818182
[[172   9]
 [  9  30]]
             precision    recall  f1-score   support

          0       0.95      0.95      0.95       181
          1       0.77      0.77      0.77        39

avg / total       0.92      0.92      0.92       220



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
knn = KNeighborsClassifier()
fit_to_pred(knn, X_train, X_val, y_train, y_val)

y_train_pred: 
0.884703196347
[[689  37]
 [ 64  86]]
             precision    recall  f1-score   support

          0       0.92      0.95      0.93       726
          1       0.70      0.57      0.63       150

avg / total       0.88      0.88      0.88       876

y_val_pred: 
0.890909090909
[[171  10]
 [ 14  25]]
             precision    recall  f1-score   support

          0       0.92      0.94      0.93       181
          1       0.71      0.64      0.68        39

avg / total       0.89      0.89      0.89       220



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# 交差検証(クロスバリデーション)

In [38]:
from sklearn.model_selection import cross_val_score

In [39]:
from sklearn.model_selection import KFold

In [40]:
# 5-fold splitter that shuffles the rows before splitting. No random_state is
# set, so the folds are not reproducible between runs.
cv = KFold(5, shuffle=True)

In [41]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv)

array([ 0.87272727,  0.87671233,  0.86757991,  0.89497717,  0.84474886])

In [42]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv)

array([ 0.87727273,  0.82191781,  0.83561644,  0.79908676,  0.8173516 ])

In [43]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv)

array([ 0.82727273,  0.8630137 ,  0.85388128,  0.85388128,  0.8630137 ])

## F1-score で評価

In [44]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv, scoring="f1")

array([ 0.69444444,  0.54545455,  0.56716418,  0.59649123,  0.57142857])

In [45]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv, scoring="f1")

  'precision', 'predicted', average, warn_for)


array([ 0.        ,  0.        ,  0.        ,  0.09756098,  0.05128205])

In [46]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv, scoring="f1")

array([ 0.54237288,  0.56756757,  0.44776119,  0.42424242,  0.61538462])

# グリッドサーチ

In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
# Search space: 8 max_depth values x 8 n_estimators values = 64 candidates.
param_grid = {'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]}

In [49]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)

In [50]:
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1)

In [51]:
grid_search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    5.5s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [52]:
grid_search.best_score_

0.86678832116788318

In [53]:
grid_search.best_params_

{'max_depth': 2, 'n_estimators': 30}

In [54]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    6.0s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=1)

In [55]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.638510312405
{'max_depth': 2, 'n_estimators': 2}


In [56]:
# NOTE(review): max_depth=3 / n_estimators=40 matches neither best_params_
# printed by the grid searches above; presumably hand-picked -- confirm.
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
# Cross-validated F1 score per fold for this configuration.
cross_val_score(rf, X, y, cv=cv, scoring="f1")

array([ 0.57894737,  0.64864865,  0.65168539,  0.62068966,  0.55263158])

# 最終確認

In [57]:
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
fit_to_pred(rf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.873287671233
[[671  55]
 [ 56  94]]
             precision    recall  f1-score   support

          0       0.92      0.92      0.92       726
          1       0.63      0.63      0.63       150

avg / total       0.87      0.87      0.87       876

y_val_pred: 
0.922727272727
[[168  13]
 [  4  35]]
             precision    recall  f1-score   support

          0       0.98      0.93      0.95       181
          1       0.73      0.90      0.80        39

avg / total       0.93      0.92      0.93       220



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [58]:
rf.predict(X_val)

array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [59]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored
# copy only on old installs that do not have it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [60]:
joblib.dump(rf, "clf_rf.db")

['clf_rf.db']