In [1]:
import pandas as pd

例11 特徴量を構築する関数

In [3]:
def makefeature(x):
    """Build a model-ready feature matrix from a raw bank-marketing frame.

    The seven numeric columns listed in ``cn_num`` are standardized to
    z-scores using the mean and sample standard deviation of ``x`` itself.
    The resulting frame is then passed through ``pd.get_dummies``, which
    one-hot encodes the remaining categorical columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain every column named in ``cn_num``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the standardized numeric columns followed by the
        dummy columns. ``x`` is not modified.
    """
    cn_num = ['age','balance','day','duration','campaign','pdays','previous']
    # Work on a copy: assigning the scaled columns straight into ``x`` would
    # silently overwrite the caller's raw data.
    x = x.copy()
    x_num = x[cn_num]
    x[cn_num] = (x_num - x_num.mean()) / x_num.std()
    return pd.get_dummies(x)

例12 訓練データとテストデータの作成

In [4]:
import numpy as np
try:
    # Current scikit-learn exposes train_test_split from model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only have the cross_validation module.
    from sklearn.cross_validation import train_test_split

データの読み込み

In [7]:
# The bank-marketing file is semicolon-separated rather than comma-separated.
bank = pd.read_csv("bank-full.csv", delimiter=";")
# Preview the first five rows as the cell output.
bank.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
# Split the target column 'y' from the predictors. The axis is given by
# keyword because newer pandas rejects the positional form drop('y', 1).
features, label = makefeature(bank.drop('y', axis=1)), bank.y
# Seeded generator so the split below uses a fixed random stream.
random_state = np.random.RandomState(123)
# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=.3, random_state=random_state)

例13 Bank Marketingデータセットに対するサポートベクタマシンを用いた予測モデルの構築

In [9]:
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report

In [10]:
# Support vector classifier with library-default hyperparameters.
clf = svm.SVC()
# The fitted estimator is the cell's displayed value.
clf.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

クラスラベルの予測

In [11]:
# Predict class labels for the held-out rows.
pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the test set.
print(metrics.classification_report(y_true=y_test, y_pred=pred,
                                    target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.91      0.98      0.94     11998
        yes       0.65      0.28      0.39      1566

avg / total       0.88      0.90      0.88     13564



例14 Bank Marketingデータセットに対するランダムフォレストを用いた予測モデル構築

In [12]:
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn.metrics import classification_report

In [15]:
# Fresh generator seeded with 123 for the forest.
random_state = np.random.RandomState(123)
clf = ensemble.RandomForestClassifier(
    random_state=random_state,
    n_estimators=500,
)
# The fitted estimator is the cell's displayed value.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False,
            random_state=<mtrand.RandomState object at 0x000000002660EB00>,
            verbose=0, warm_start=False)

In [16]:
# Predict class labels for the held-out rows with the fitted forest.
pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the test set.
print(metrics.classification_report(y_true=y_test, y_pred=pred,
                                    target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.97      0.95     11998
        yes       0.67      0.39      0.49      1566

avg / total       0.90      0.91      0.90     13564



例15 scikit-learnを用いた10分割のクロスバリデーションの実行

In [17]:
from sklearn import cross_validation as cv
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import svm

層別k分割

In [18]:
# Ten stratified folds built from the training labels.
skf = cv.StratifiedKFold(y_train, n_folds=10)
# Print the training and test row indices of each fold.
for train, test in skf:
    print("{0} {1}".format(train, test))

[ 3084  3098  3120 ..., 31644 31645 31646] [   0    1    2 ..., 3177 3178 3180]
[    0     1     2 ..., 31644 31645 31646] [3084 3098 3120 ..., 6367 6368 6369]
[    0     1     2 ..., 31644 31645 31646] [6071 6075 6076 ..., 9539 9540 9541]
[    0     1     2 ..., 31644 31645 31646] [ 9168  9173  9187 ..., 12759 12760 12761]
[    0     1     2 ..., 31644 31645 31646] [11997 11998 12004 ..., 15895 15896 15897]
[    0     1     2 ..., 31644 31645 31646] [15222 15231 15238 ..., 19059 19062 19063]
[    0     1     2 ..., 31644 31645 31646] [18552 18585 18587 ..., 22204 22205 22206]
[    0     1     2 ..., 31644 31645 31646] [21659 21662 21671 ..., 25345 25346 25347]
[    0     1     2 ..., 31644 31645 31646] [25106 25138 25143 ..., 28502 28503 28504]
[    0     1     2 ..., 28502 28503 28504] [28269 28282 28288 ..., 31644 31645 31646]


RBFカーネルのサポートベクタマシン

In [19]:
# Unfitted SVM; 'rbf' is the default kernel, spelled out here for the reader.
clf = svm.SVC(kernel='rbf')

クラスラベルを1,0に変換

In [21]:
# Learn the label set, then encode the labels as a flat 0/1 vector.
lb = preprocessing.LabelBinarizer()
lb.fit(y_train)
y_train_bin = lb.transform(y_train).ravel()

クロスバリデーションによる評価指標

In [22]:
# F1 score of the classifier on each of the folds in skf.
cv.cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train_bin,
    scoring='f1',
    cv=skf,
)

array([ 0.36329588,  0.38420108,  0.36111111,  0.38185255,  0.40740741,
        0.38931298,  0.39344262,  0.41666667,  0.38888889,  0.38356164])

クロスバリデーションによる予測結果

In [23]:
# Cross-validated predictions for the training rows, using the folds in skf.
pred = cv.cross_val_predict(estimator=clf, X=X_train, y=y_train, cv=skf)
# Per-class precision / recall / F1 for those predictions.
print(classification_report(y_true=y_train, y_pred=pred,
                            target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.91      0.98      0.94     27924
        yes       0.65      0.28      0.39      3723

avg / total       0.88      0.90      0.88     31647



例16 grid_searchモジュールを用いたハイパーパラメータのグリッドサーチ

In [24]:
try:
    # Current scikit-learn provides GridSearchCV in model_selection.
    from sklearn import model_selection as gs
except ImportError:
    # Fallback for old releases that only have the grid_search module.
    from sklearn import grid_search as gs
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import svm

探索するハイパーパラメータの範囲

In [25]:
# One grid: every combination of the two C values and two gamma values.
param_grid = [
    {
        'C': [0.5, 1],
        'gamma': [0.05, 0.1],
    },
]

各ハイパーパラメータに対するクロスバリデーションの実行

In [26]:
# Base estimator whose C / gamma are tuned by the search.
svc = svm.SVC()
# 10-fold cross-validation for every combination in param_grid.
clf = gs.GridSearchCV(estimator=svc, param_grid=param_grid, cv=10)
# The fitted search object is the cell's displayed value.
clf.fit(X=X_train, y=y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.5, 1], 'gamma': [0.05, 0.1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [27]:
# Predict the held-out rows with the fitted grid-search object.
pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_true=y_test, y_pred=pred,
                            target_names=['no', 'yes']))

             precision    recall  f1-score   support

         no       0.92      0.98      0.95     11998
        yes       0.66      0.35      0.46      1566

avg / total       0.89      0.90      0.89     13564

