<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#modeling" data-toc-modified-id="modeling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>modeling</a></span></li></ul></div>

In [1]:
import sys
import os
sys.path.append(os.path.abspath("../src/"))

In [33]:
from load_data import LoadData
import optuna
from sklearn.svm import SVC
from sklearn.externals import joblib
import numpy as np
import pandas as pd
import uuid

In [3]:
uid = uuid.uuid1()

In [5]:
docs = "../docs/"
finfo = docs + "info/" + "ddl.csv"

data = "../data/"
data_in, data_mid, data_out = [data + fld for fld in ["in/", "mid/", "out/"]]

fin, ftest = data_in + "train_data.csv", data_in + "test_data.csv"
fmid = data_mid + "svc_%s.csv"%uid
fout = data_out + "svc_%s.csv"%uid

models = "../models/"
mout = models + "svc_%s.pkl"%uid

loader = LoadData(fin, ftest, finfo)

In [6]:
X, y, test_X = loader.get_features_v1()

# modeling

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y)

In [10]:
def objective(trial):
    Cs = trial.suggest_loguniform('C', 2**-5, 2**15)
    gammas = trial.suggest_loguniform('gamma', 2**-15, 2**3)
    
    clf = SVC(C=Cs, gamma=gammas, kernel="rbf")
    clf.fit(train_X, train_y)
    pred = clf.predict(valid_X)
    return 1 - accuracy_score(y_true=valid_y, y_pred=pred)

In [11]:
%%time
# optuna
study = optuna.create_study()
study.optimize(objective, n_trials=10)

# 最適解
print(study.best_params)
print(study.best_value)
print(study.best_trial)

[I 2019-04-14 01:23:43,895] Finished trial#0 resulted in value: 0.22844444444444445. Current best value is 0.22844444444444445 with parameters: {'C': 0.19945420660277335, 'gamma': 0.0004374764915424126}.
[I 2019-04-14 01:35:59,378] Finished trial#1 resulted in value: 0.198962962962963. Current best value is 0.198962962962963 with parameters: {'C': 1794.3858923295886, 'gamma': 0.011798127918355137}.
[I 2019-04-14 01:42:54,951] Finished trial#2 resulted in value: 0.22696296296296292. Current best value is 0.198962962962963 with parameters: {'C': 1794.3858923295886, 'gamma': 0.011798127918355137}.
[I 2019-04-14 01:43:40,581] Finished trial#3 resulted in value: 0.22844444444444445. Current best value is 0.198962962962963 with parameters: {'C': 1794.3858923295886, 'gamma': 0.011798127918355137}.
[I 2019-04-14 01:46:32,079] Finished trial#4 resulted in value: 0.22844444444444445. Current best value is 0.198962962962963 with parameters: {'C': 1794.3858923295886, 'gamma': 0.011798127918355137}

{'C': 3.3431317323115555, 'gamma': 0.1314875632773491}
0.18785185185185183
FrozenTrial(number=9, state=<TrialState.COMPLETE: 1>, value=0.18785185185185183, datetime_start=datetime.datetime(2019, 4, 14, 2, 9, 24, 703597), datetime_complete=datetime.datetime(2019, 4, 14, 2, 11, 37, 453766), params={'C': 3.3431317323115555, 'gamma': 0.1314875632773491}, user_attrs={}, system_attrs={'_number': 9}, intermediate_values={}, params_in_internal_repr={'C': 3.3431317323115555, 'gamma': 0.1314875632773491}, trial_id=9)
Wall time: 48min 39s


In [21]:
param = study.best_params
param["kernel"] = "rbf"

In [25]:
clf = SVC(**study.best_params)

In [26]:
clf.fit(X, y)

SVC(C=3.3431317323115555, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1314875632773491,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [28]:
X[fmid] = clf.predict(X)
X[fmid].to_csv(fmid, index=False)

In [30]:
test_X["Y"] = clf.predict(test_X)

In [31]:
test_X["ID"] = test_X.index

In [36]:
test_X[["ID", "Y"]].to_csv(fout, index=False)

In [34]:
joblib.dump(clf, mout)

['../models/svc_5e08933a-5e08-11e9-9d4d-f06e0bb794dc.pkl']