In [3]:
from distributed import Executor, progress

In [4]:
# Create the executor object that every later cell submits work through.
# NOTE(review): the address is hard-coded; 'dscheduler:8786' is presumably the
# scheduler's host:port -- confirm for the target cluster.
e = Executor('dscheduler:8786')

In [60]:
# NOTE(review): this cell carries execution count 60, higher than every cell
# below it, so it was last run after the rest of the notebook. In a
# top-to-bottom run it executes before any task has been submitted.
e.restart()

In [39]:
from time import sleep
# Submit 4800 calls of sleep(0.1) through the executor and display their
# progress.
# NOTE(review): pure=False is presumably what keeps these identical calls from
# being collapsed into a single task -- confirm against the distributed docs.
futures = e.map(sleep, [0.1] * 4800, pure=False)
progress(futures)

In [40]:
import subprocess


def install_libs():
    """Install the libraries this notebook needs with ``conda``.

    Runs ``conda install -yq`` followed by the package names through
    ``subprocess.check_call`` and returns that call's result (the exit
    status, which is 0 on success; a non-zero status makes ``check_call``
    raise ``subprocess.CalledProcessError``).
    """
    packages = 'scikit-learn joblib pandas matplotlib'
    command = 'conda install -yq'.split()
    command.extend(packages.split())
    return subprocess.check_call(command)

In [41]:
# Run the conda install from the notebook's own process; the recorded output
# is the value returned by check_call (0).
install_libs()

0

In [42]:
# Run the same install function through the executor. The recorded output is
# a dict mapping 'host:port' addresses to the function's return value (0 for
# each); presumably one entry per worker -- confirm.
e.run(install_libs)

{'10.0.0.3:35530': 0,
 '10.0.0.5:59239': 0,
 '10.0.0.6:46781': 0,
 '10.0.0.7:45373': 0}

In [43]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split

In [44]:
from joblib import hash
from time import time


def evaluate_one(model_class, split_idx, parameters):
    """Fit and score one model on one train/validation split.

    The data returned by ``load_digits()`` is split 80/20 with
    ``train_test_split``, using ``split_idx`` as the ``random_state`` so that
    each split index selects its own partition. A model is built as
    ``model_class(**parameters)``, fitted on the training part and scored on
    both parts.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(**parameters)``; the result must provide
        ``fit(X, y)`` (returning the fitted model) and ``score(X, y)``.
    split_idx : int
        Forwarded as ``random_state`` to ``train_test_split`` and echoed in
        the result.
    parameters : dict
        Keyword arguments for ``model_class``.

    Returns
    -------
    dict
        Keys ``split_idx``, ``training_time`` (seconds spent constructing and
        fitting the model), ``train_score``, ``validation_score``,
        ``parameters_hash`` (``hash(parameters)``) and ``parameters``.
    """
    digits = load_digits()  # XXX: hard-coded
    partition = train_test_split(
        digits.data, digits.target, test_size=0.20, random_state=split_idx)
    X_train, X_val, y_train, y_val = partition

    # The timed region covers both model construction and fitting.
    started = time()
    estimator = model_class(**parameters)
    fitted = estimator.fit(X_train, y_train)
    elapsed = time() - started

    return {
        'split_idx': split_idx,
        'training_time': elapsed,
        'train_score': fitted.score(X_train, y_train),
        'validation_score': fitted.score(X_val, y_val),
        'parameters_hash': hash(parameters),
        'parameters': parameters,  # for debug
    }

In [45]:
import random


def launch_search(executor, model_class, param_space, n_combinations=100,
                  random_seed=None, n_splits=3):
    """Submit evaluations for a random subset of a parameter space.

    ``param_space`` is copied into a list, shuffled with a
    ``random.Random(random_seed)`` generator, and the first
    ``n_combinations`` entries are kept. For every kept parameter set one
    ``evaluate_one`` task is submitted per split index in
    ``range(n_splits)``.

    Parameters
    ----------
    executor : object
        Must provide ``submit(func, *args)``; its return values are collected.
    model_class : callable
        Forwarded unchanged as the first argument of ``evaluate_one``.
    param_space : iterable
        Candidate parameter sets. The iterable is copied, so the caller's
        object is not shuffled in place.
    n_combinations : int, optional
        Maximum number of parameter sets to evaluate (default 100). ``None``
        keeps every candidate, since it is used as a slice bound.
    random_seed : optional
        Seed for the shuffle. ``None`` (the default) leaves the generator
        unseeded, so the selected subset differs between runs.
    n_splits : int, optional
        Number of split indices evaluated per parameter set (default 3,
        the value that used to be hard-coded).

    Returns
    -------
    list
        The objects returned by ``executor.submit``, ordered by parameter set
        and, within a parameter set, by split index.
    """
    rng = random.Random(random_seed)
    candidates = list(param_space)
    rng.shuffle(candidates)
    return [
        executor.submit(evaluate_one, model_class, split_idx, params)
        for params in candidates[:n_combinations]
        for split_idx in range(n_splits)
    ]

In [46]:
from sklearn.grid_search import ParameterGrid
from sklearn.svm import SVC
import numpy as np


# Search grid for SVC: 13 values of C x 17 values of gamma x 4 values of tol
# (884 combinations), all on logarithmic scales.
param_space = ParameterGrid({
    'C': np.logspace(-6, 6, 13),
    'gamma': np.logspace(-8, 8, 17),
    'tol': np.logspace(-4, -1, 4),
})

# Evaluate a random sample of 30 combinations. The seed is fixed so that
# re-running the notebook samples the same combinations instead of a new
# random subset each time.
evaluations = launch_search(e, SVC, param_space, n_combinations=30,
                            random_seed=0)

In [47]:
# Display the progress of the submitted evaluation futures.
progress(evaluations)

In [48]:
# Collect the results of the futures that report done(); futures that are not
# done yet are skipped, so this cell can be re-run while the search is still
# in progress.
# NOTE(review): if done() is also true for failed futures, result() would
# presumably raise here -- confirm against the distributed docs.
ready_list = [
    done_future.result()
    for done_future in filter(lambda fut: fut.done(), evaluations)
]
len(ready_list)

90

In [49]:
# One row per finished evaluation; the columns are the keys of the result
# dicts returned by evaluate_one.
ready = pd.DataFrame(ready_list)
ready.head()

Unnamed: 0,parameters,parameters_hash,split_idx,train_score,training_time,validation_score
0,"{'tol': 0.1, 'gamma': 1000.0, 'C': 100000.0}",79348625a8323a1d3adcc84a4aa687b2,0,1,4.079076,0.080556
1,"{'tol': 0.1, 'gamma': 1000.0, 'C': 100000.0}",79348625a8323a1d3adcc84a4aa687b2,1,1,1.598318,0.083333
2,"{'tol': 0.1, 'gamma': 1000.0, 'C': 100000.0}",79348625a8323a1d3adcc84a4aa687b2,2,1,5.028648,0.077778
3,"{'tol': 0.001, 'gamma': 100000.0, 'C': 1000.0}",3832ef1fd77a4cbeaf0d0a8c568e1d81,0,1,5.014644,0.080556
4,"{'tol': 0.001, 'gamma': 100000.0, 'C': 1000.0}",3832ef1fd77a4cbeaf0d0a8c568e1d81,1,1,6.650279,0.083333


In [50]:
# Summary statistics of the numeric result columns collected so far.
ready.describe()

Unnamed: 0,split_idx,train_score,training_time,validation_score
count,90.0,90.0,90.0,90.0
mean,1.0,0.609627,2.596174,0.251358
std,0.821071,0.442923,1.435758,0.345247
min,0.0,0.105776,0.158288,0.077778
25%,0.0,0.105776,1.62259,0.077778
50%,1.0,0.996868,2.517442,0.080556
75%,2.0,1.0,3.295945,0.083333
max,2.0,1.0,7.307937,0.991667


In [51]:
# Average the scores and the training time over the splits of each parameter
# set; parameters_hash identifies the parameter set and is turned back into a
# regular column by reset_index().
mean_evaluations = (
    ready
    .groupby('parameters_hash')[
        ['train_score', 'validation_score', 'training_time']]
    .mean()
    .reset_index()
)
mean_evaluations.head()

Unnamed: 0,parameters_hash,train_score,validation_score,training_time
0,01ec66ef7212dae19e06dd8fc100525f,0.10624,0.080556,2.944398
1,25d862c8f645447812a6519f02828d3c,1.0,0.080556,3.501465
2,28bc903127c2f1b9be5c182bab52d99a,1.0,0.080556,2.190076
3,2d53d827062a137c47be4030037519f6,0.992577,0.983333,0.363173
4,31336609bd5800b97daf694e02e38230,0.10624,0.080556,2.302443


In [52]:
# Lookup table from parameters_hash back to the parameter values: one record
# per point of the grid, carrying the hash of the original parameter dict.
# `hash` is the function imported from joblib in an earlier cell, the same one
# evaluate_one uses, so the values match the 'parameters_hash' result column.
# Each record is a fresh dict, so the dicts yielded by the grid are not
# modified and the hash is computed before the extra key exists.
param_records = [
    dict(params, parameters_hash=hash(params))
    for params in param_space
]
all_params = pd.DataFrame(param_records)
all_params.head()

Unnamed: 0,C,gamma,parameters_hash,tol
0,1e-06,1e-08,d2ec602b9e9dd18340ebf2694f533f7e,0.0001
1,1e-06,1e-08,f7e165bdc9fafd1b8ef92cf22bfd4dc9,0.001
2,1e-06,1e-08,888edeb2aebb12929e939172ad8a2b8b,0.01
3,1e-06,1e-08,6f7d5e4e632119f8afdf02265ae22a2b,0.1
4,1e-06,1e-07,05bfe881972afabca8f123a6552889a7,0.0001


In [53]:
# Attach the parameter values to the averaged scores (the merge joins on the
# columns the two frames share), drop the hash that was only needed for the
# join, and add the absolute train/validation score difference as 'gap'.
evaluations_with_parameters = (
    mean_evaluations
    .merge(all_params)
    .drop(['parameters_hash'], axis=1)
    .assign(gap=lambda scores: np.abs(
        scores['train_score'] - scores['validation_score']))
)

In [59]:
# Rank the averaged evaluations, highest mean validation score first.
# NOTE(review): this displays the whole frame rather than a .head(); fine for
# a 30-combination search, noisy for larger ones.
evaluations_with_parameters.sort_values(by='validation_score', ascending=False)

Unnamed: 0,train_score,validation_score,training_time,C,gamma,tol,gap
17,1.0,0.984259,1.354743,10000.0,0.0001,0.01,0.015741
3,0.992577,0.983333,0.363173,100.0,1e-05,0.1,0.009244
22,1.0,0.981481,0.407052,1000000.0,1e-06,0.01,0.018519
19,1.0,0.981481,0.349499,10000.0,1e-06,0.0001,0.018519
9,0.915101,0.897222,1.074582,100.0,1e-07,0.01,0.017879
8,1.0,0.77963,2.462457,1.0,0.01,0.0001,0.22037
0,0.10624,0.080556,2.944398,1e-06,10.0,0.01,0.025684
28,1.0,0.080556,3.038641,100.0,100000000.0,0.001,0.919444
27,0.10624,0.080556,3.180577,0.01,1.0,0.001,0.025684
26,0.10624,0.080556,1.678649,1e-05,0.001,0.001,0.025684
