# 機械学習をやってみる

## この章で扱うもの

- 前章で作った各種データを再利用
- 機械学習を行ってみる
- 学習結果の評価を行う
- 機械学習の改善方法


## この章で取り扱う手順

- 学習データとテストデータの分割
- sklearnを用いて学習
- 評価
- 交差検証
- グリッドサーチ
- 各種モデルを試す
- 評価結果の再確認


In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_pickle("df.db")

# 学習データとテストデータの分割

http://qiita.com/terapyon/items/8f8d3518ee8eeb4f96b2

In [4]:
df.head()

Unnamed: 0,報告数,流行,増加,平均気温(℃),最高気温(℃),最低気温(℃),平均湿度(％),最小相対湿度(％),平均現地気圧(hPa),降水量の合計(mm),日照時間(時間),平均風速(m/s)
2014-01-01,0.178571,0,0,9.8,13.7,3.9,54.0,37.0,1005.3,0.0,9.2,5.3
2014-01-02,0.178571,0,0,8.0,12.9,4.4,41.0,26.0,1011.3,0.0,9.1,3.0
2014-01-03,0.178571,0,0,5.9,9.9,2.7,43.0,32.0,1014.9,0.0,4.1,1.6
2014-01-04,0.178571,0,0,6.7,11.5,2.1,47.0,29.0,1009.5,0.0,5.9,2.4
2014-01-05,0.178571,0,0,4.4,6.9,2.3,40.0,28.0,1016.6,0.0,1.1,2.5


In [5]:
X = df.iloc[:, 3:]

In [6]:
X.head()

Unnamed: 0,平均気温(℃),最高気温(℃),最低気温(℃),平均湿度(％),最小相対湿度(％),平均現地気圧(hPa),降水量の合計(mm),日照時間(時間),平均風速(m/s)
2014-01-01,9.8,13.7,3.9,54.0,37.0,1005.3,0.0,9.2,5.3
2014-01-02,8.0,12.9,4.4,41.0,26.0,1011.3,0.0,9.1,3.0
2014-01-03,5.9,9.9,2.7,43.0,32.0,1014.9,0.0,4.1,1.6
2014-01-04,6.7,11.5,2.1,47.0,29.0,1009.5,0.0,5.9,2.4
2014-01-05,4.4,6.9,2.3,40.0,28.0,1016.6,0.0,1.1,2.5


In [7]:
y = df['流行']

In [8]:
y.head()

2014-01-01    0
2014-01-02    0
2014-01-03    0
2014-01-04    0
2014-01-05    0
Name: 流行, dtype: int32

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Keep 80% of the rows for training and hold out the rest for validation.
# NOTE(review): no random_state is passed, so the split -- and every score
# printed below -- changes between runs; consider fixing random_state (and
# stratify=y, since class 1 is the minority) if reproducibility matters.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

# ロジスティック回帰

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
clf = LogisticRegression()

In [14]:
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
y_train_pred = clf.predict(X_train)

## 正答率を確認

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
accuracy_score(y_train, y_train_pred)

0.8710045662100456

In [18]:
y_val_pred = clf.predict(X_val)

In [19]:
accuracy_score(y_val, y_val_pred)

0.9

## 混同行列

In [20]:
from sklearn.metrics import confusion_matrix

In [21]:
cm = confusion_matrix(y_val, y_val_pred)

In [22]:
cm

array([[176,   9],
       [ 13,  22]])

In [23]:
cm_t = confusion_matrix(y_train, y_train_pred)
cm_t

array([[674,  48],
       [ 65,  89]])

### 混同行列の評価

```
[[TN, FP],
 [FN, TP]]
 ```

# 適合率(precision)・再現率(recall)・F値(f1-score)

## 適合率
 
- 0(今回の場合は、流行していない) と予測したもののうち、実際に0だった割合 (176 / (176+13) = 0.93)
- 1(今回の場合は、流行している) と予測したもののうち、実際に1だった割合 (22 / (22+9) = 0.71)

## 再現率

- 実際に0のもののうち、0と正しく予測できた割合 (176 / (176+9) = 0.95)
- 実際に1のもののうち、1と正しく予測できた割合 (22 / (22+13) = 0.63)


## F値

`2 / (1/適合率 + 1/再現率) = 2 * 適合率 * 再現率 / (適合率 + 再現率)`

- 0のF値 `2 * 0.93 * 0.95 / (0.93 + 0.95) = 0.94`
- 1のF値 `2 * 0.71 * 0.63 / (0.71 + 0.63) = 0.67`

In [24]:
from sklearn.metrics import classification_report

In [25]:
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       185
           1       0.71      0.63      0.67        35

    accuracy                           0.90       220
   macro avg       0.82      0.79      0.80       220
weighted avg       0.90      0.90      0.90       220



## レポート関係を関数化し再利用可能にする

In [26]:
def report(y, pred):
    """Print accuracy, confusion matrix and classification report.

    Args:
        y: True labels.
        pred: Predicted labels, aligned with ``y``.

    The three results are printed to stdout in that order; nothing is
    returned.
    """
    print(accuracy_score(y, pred))
    print(confusion_matrix(y, pred))
    print(classification_report(y, pred))

In [27]:
report(y_train, y_train_pred)

0.8710045662100456
[[674  48]
 [ 65  89]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       722
           1       0.65      0.58      0.61       154

    accuracy                           0.87       876
   macro avg       0.78      0.76      0.77       876
weighted avg       0.87      0.87      0.87       876



## 学習から評価までを関数化


In [28]:
def fit_to_pred(clf, X_train, X_val, y_train, y_val):
    """Fit ``clf`` on the training split and print a report for each split.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        X_train: Features used for fitting.
        X_val: Features of the held-out split.
        y_train: Labels matching ``X_train``.
        y_val: Labels matching ``X_val``.

    Returns:
        The same ``clf`` object, after fitting.
    """
    # Train on the training split only.
    clf.fit(X_train, y_train)

    # Evaluate on the training data first, then on the held-out data.
    splits = (
        ("y_train_pred: ", X_train, y_train),
        ("y_val_pred: ", X_val, y_val),
    )
    for heading, features, labels in splits:
        predicted = clf.predict(features)
        print(heading)
        report(labels, predicted)

    # Hand the fitted model back to the caller.
    return clf

In [29]:
clf = LogisticRegression()
fit_to_pred(clf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.8710045662100456
[[674  48]
 [ 65  89]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       722
           1       0.65      0.58      0.61       154

    accuracy                           0.87       876
   macro avg       0.78      0.76      0.77       876
weighted avg       0.87      0.87      0.87       876

y_val_pred: 
0.9
[[176   9]
 [ 13  22]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       185
           1       0.71      0.63      0.67        35

    accuracy                           0.90       220
   macro avg       0.82      0.79      0.80       220
weighted avg       0.90      0.90      0.90       220





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# その他の機械学習アルゴリズム

- サポートベクターマシン SVC
- カーネルSVM
- 決定木 DecisionTreeClassifier
- ランダムフォレスト RandomForestClassifier
- k近傍

In [30]:
from sklearn.svm import SVC

In [31]:
svc = SVC(kernel="linear")
fit_to_pred(svc, X_train, X_val, y_train, y_val)

y_train_pred: 
0.8618721461187214
[[668  54]
 [ 67  87]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       722
           1       0.62      0.56      0.59       154

    accuracy                           0.86       876
   macro avg       0.76      0.75      0.75       876
weighted avg       0.86      0.86      0.86       876

y_val_pred: 
0.8909090909090909
[[173  12]
 [ 12  23]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       185
           1       0.66      0.66      0.66        35

    accuracy                           0.89       220
   macro avg       0.80      0.80      0.80       220
weighted avg       0.89      0.89      0.89       220



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [32]:
k_svc = SVC(kernel="rbf")
fit_to_pred(k_svc, X_train, X_val, y_train, y_val)

y_train_pred: 
0.9988584474885844
[[722   0]
 [  1 153]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       722
           1       1.00      0.99      1.00       154

    accuracy                           1.00       876
   macro avg       1.00      1.00      1.00       876
weighted avg       1.00      1.00      1.00       876

y_val_pred: 
0.8363636363636363
[[183   2]
 [ 34   1]]
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       185
           1       0.33      0.03      0.05        35

    accuracy                           0.84       220
   macro avg       0.59      0.51      0.48       220
weighted avg       0.76      0.84      0.77       220





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [34]:
tree = DecisionTreeClassifier(max_depth=2)
fit_to_pred(tree, X_train, X_val, y_train, y_val)

y_train_pred: 
0.8744292237442922
[[660  62]
 [ 48 106]]
              precision    recall  f1-score   support

           0       0.93      0.91      0.92       722
           1       0.63      0.69      0.66       154

    accuracy                           0.87       876
   macro avg       0.78      0.80      0.79       876
weighted avg       0.88      0.87      0.88       876

y_val_pred: 
0.8772727272727273
[[170  15]
 [ 12  23]]
              precision    recall  f1-score   support

           0       0.93      0.92      0.93       185
           1       0.61      0.66      0.63        35

    accuracy                           0.88       220
   macro avg       0.77      0.79      0.78       220
weighted avg       0.88      0.88      0.88       220



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rf = RandomForestClassifier()
fit_to_pred(rf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.9920091324200914
[[722   0]
 [  7 147]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       722
           1       1.00      0.95      0.98       154

    accuracy                           0.99       876
   macro avg       1.00      0.98      0.99       876
weighted avg       0.99      0.99      0.99       876

y_val_pred: 
0.8681818181818182
[[175  10]
 [ 19  16]]
              precision    recall  f1-score   support

           0       0.90      0.95      0.92       185
           1       0.62      0.46      0.52        35

    accuracy                           0.87       220
   macro avg       0.76      0.70      0.72       220
weighted avg       0.86      0.87      0.86       220





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
knn = KNeighborsClassifier()
fit_to_pred(knn, X_train, X_val, y_train, y_val)

y_train_pred: 
0.8904109589041096
[[682  40]
 [ 56  98]]
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       722
           1       0.71      0.64      0.67       154

    accuracy                           0.89       876
   macro avg       0.82      0.79      0.80       876
weighted avg       0.89      0.89      0.89       876

y_val_pred: 
0.8545454545454545
[[171  14]
 [ 18  17]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       185
           1       0.55      0.49      0.52        35

    accuracy                           0.85       220
   macro avg       0.73      0.71      0.71       220
weighted avg       0.85      0.85      0.85       220



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

# 交差検証(クロスバリデーション)

In [45]:
from sklearn.model_selection import cross_val_score

In [46]:
from sklearn.model_selection import KFold

In [47]:
cv = KFold(5, shuffle=True)

In [48]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv)



array([0.85      , 0.8630137 , 0.87671233, 0.87214612, 0.88584475])

In [49]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv)



array([0.83181818, 0.8630137 , 0.81278539, 0.77625571, 0.85388128])

In [50]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv)



array([0.87272727, 0.81278539, 0.85844749, 0.87671233, 0.82191781])

## F1-score で評価

In [51]:
clf = LogisticRegression()
cross_val_score(clf, X, y, cv=cv, scoring="f1")



array([0.57831325, 0.63888889, 0.65671642, 0.60869565, 0.57575758])

In [52]:
k_svc = SVC(kernel="rbf")
cross_val_score(k_svc, X, y, cv=cv, scoring="f1")



array([0.0625    , 0.08510638, 0.        , 0.05263158, 0.        ])

In [53]:
rf = RandomForestClassifier()
cross_val_score(rf, X, y, cv=cv, scoring="f1")



array([0.49230769, 0.59259259, 0.50909091, 0.47058824, 0.31034483])

# グリッドサーチ

In [54]:
from sklearn.model_selection import GridSearchCV

In [55]:
param_grid = {'max_depth': [2, 3, 4, 5, 10, 15, 20, 30], 'n_estimators': [2, 3, 4, 5, 10, 20, 30, 40]}

In [56]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)

In [57]:
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1)

In [58]:
grid_search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    8.0s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=2, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                           

In [59]:
grid_search.best_score_

0.8777372262773723

In [60]:
grid_search.best_params_

{'max_depth': 3, 'n_estimators': 20}

In [61]:
rf = RandomForestClassifier(max_depth=2, n_estimators=2)
grid_search = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='f1')
grid_search.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 320 out of 320 | elapsed:    4.8s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=2, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                           

In [62]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.6184289744937692
{'max_depth': 2, 'n_estimators': 10}


In [63]:
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
cross_val_score(rf, X, y, cv=cv, scoring="f1")

array([0.64367816, 0.61971831, 0.68421053, 0.64864865, 0.63157895])

# 最終確認

In [64]:
# Final check: refit the chosen random forest on the training split and
# print the reports for both splits.
# NOTE(review): max_depth=3 / n_estimators=40 match neither best_params_
# output shown earlier -- confirm which grid-search run they came from.
# NOTE(review): the grid search was fitted on the full X, y (which contains
# X_val), so the validation scores below are not from data unseen during
# hyper-parameter tuning.
rf = RandomForestClassifier(max_depth=3, n_estimators=40)
fit_to_pred(rf, X_train, X_val, y_train, y_val)

y_train_pred: 
0.882420091324201
[[671  51]
 [ 52 102]]
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       722
           1       0.67      0.66      0.66       154

    accuracy                           0.88       876
   macro avg       0.80      0.80      0.80       876
weighted avg       0.88      0.88      0.88       876

y_val_pred: 
0.8772727272727273
[[172  13]
 [ 14  21]]
              precision    recall  f1-score   support

           0       0.92      0.93      0.93       185
           1       0.62      0.60      0.61        35

    accuracy                           0.88       220
   macro avg       0.77      0.76      0.77       220
weighted avg       0.88      0.88      0.88       220



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [65]:
rf.predict(X_val)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
      dtype=int32)

In [66]:
# scikit-learn 0.21 deprecated the bundled sklearn.externals.joblib and 0.23
# removed it; prefer the standalone package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [67]:
joblib.dump(rf, "clf_rf.db")

['clf_rf.db']