<a href="https://colab.research.google.com/github/jonrtaylor/example-scripts/blob/master/parameter_optimization_with_era_subsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade numerox
import numerox as nx
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model

# Download the Numerai dataset archive using numerox.
data = nx.download('numerai_dataset.zip')

# Select the eras to train and evaluate on. Each selection is taken once and
# reused, so the features, targets and era labels are guaranteed to come from
# the same slice (and the slicing work is not repeated).
train_data = data['era1':'era40']
validation_data = data['validation']

X1_train = train_data.x
X1_test = validation_data.x
y1_train = train_data.y['kazutsugi']
y1_test = validation_data.y['kazutsugi']
era_train = train_data.era_float  # used below as the grouping key for cross-validation

# Cross-validation iterator. GroupKFold with groups=era_train keeps all rows
# that share an era in the same fold, so no era is split across train and test.
CV = model_selection.GroupKFold(n_splits=3)

# Materialise the (train indices, test indices) pairs as a list so they can be
# passed directly as the `cv` argument of the grid search.
grp = list(CV.split(X=X1_train, y=y1_train, groups=era_train))

Requirement already up-to-date: numerox in /usr/local/lib/python3.6/dist-packages (4.1.6)


numerai_dataset.zip: 372MB [00:39, 16.9MB/s]                           

Let's tune the `alpha` and `tol` hyperparameters of [Ridge Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge) with a grid search:

In [2]:
# Base estimator for the search. Omit the keyword arguments for the
# parameter(s) you wish to optimize ('alpha' and 'tol' here); they are supplied
# by the parameter grid instead.
# `normalize` is intentionally not passed: False was its default, and the
# keyword was removed in scikit-learn 1.2, where passing it raises a TypeError.
R = linear_model.Ridge(copy_X=True, fit_intercept=True, max_iter=None, random_state=None, solver='auto')

# Every combination of these values is evaluated (5 x 5 = 25 candidates).
# NOTE(review): the results table shows identical scores for every 'tol' at a
# given 'alpha' -- presumably solver='auto' picks a direct solver that ignores
# 'tol'. Confirm against the Ridge docs before relying on the 'tol' search.
params1 = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0], 'tol': [.00001, .0001, .001, .01, 0.1]}

# cv=grp reuses the era-grouped splits; return_train_score=True also records
# the score on each training split in cv_results_.
GS = model_selection.GridSearchCV(estimator=R, param_grid=params1, cv=grp, return_train_score=True)
GS.fit(X1_train, y1_train)

GridSearchCV(cv=[(array([  2408,   2409,   2410, ..., 137657, 137658, 137659]),
                  array([     0,      1,      2, ..., 142094, 142095, 142096])),
                 (array([     0,      1,      2, ..., 142094, 142095, 142096]),
                  array([  2408,   2409,   2410, ..., 137657, 137658, 137659])),
                 (array([     0,      1,      2, ..., 142094, 142095, 142096]),
                  array([  7203,   7204,   7205, ..., 133291, 133292, 133293]))],
             error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
                         'tol': [1e-05, 0.0001, 0.001, 0.01, 0.1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None

What was the best score?

In [3]:
# Mean cross-validated score of the best parameter combination found by the search.
GS.best_score_

0.00023175979651816428

Which parameter value(s) give the best score?

In [4]:
# Print the estimator configured with the best-scoring parameters
# (available because the search was run with refit=True).
print(GS.best_estimator_)

Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=1e-05)


Want to see more data? No problem!

In [5]:
# GS.cv_results_ is a dictionary of per-candidate timings, parameters and
# scores. Passing it to pd.DataFrame() turns it into a table with one row per
# parameter combination, which is stored in the variable `scores`.
scores = pd.DataFrame(GS.cv_results_)

# A bare expression on the last line of a cell is displayed as the cell's
# output, which renders the table below.
scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_tol,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.55378,0.006401,0.033214,0.002122,0.001,1e-05,"{'alpha': 0.001, 'tol': 1e-05}",-0.001754,0.001209,0.001041,0.000165,0.001359,21,0.009104,0.007766,0.008057,0.008309,0.000575
1,0.542435,0.005866,0.032584,0.002471,0.001,0.0001,"{'alpha': 0.001, 'tol': 0.0001}",-0.001754,0.001209,0.001041,0.000165,0.001359,21,0.009104,0.007766,0.008057,0.008309,0.000575
2,0.550618,0.012439,0.031429,0.000543,0.001,0.001,"{'alpha': 0.001, 'tol': 0.001}",-0.001754,0.001209,0.001041,0.000165,0.001359,21,0.009104,0.007766,0.008057,0.008309,0.000575
3,0.547604,0.010559,0.03129,0.000683,0.001,0.01,"{'alpha': 0.001, 'tol': 0.01}",-0.001754,0.001209,0.001041,0.000165,0.001359,21,0.009104,0.007766,0.008057,0.008309,0.000575
4,0.547448,0.008178,0.031289,0.00052,0.001,0.1,"{'alpha': 0.001, 'tol': 0.1}",-0.001754,0.001209,0.001041,0.000165,0.001359,21,0.009104,0.007766,0.008057,0.008309,0.000575
5,0.547646,0.010054,0.031569,0.000851,0.01,1e-05,"{'alpha': 0.01, 'tol': 1e-05}",-0.001754,0.001209,0.001041,0.000165,0.001359,16,0.009104,0.007766,0.008057,0.008309,0.000575
6,0.553102,0.014821,0.03107,0.000995,0.01,0.0001,"{'alpha': 0.01, 'tol': 0.0001}",-0.001754,0.001209,0.001041,0.000165,0.001359,16,0.009104,0.007766,0.008057,0.008309,0.000575
7,0.546482,0.006615,0.031441,0.000723,0.01,0.001,"{'alpha': 0.01, 'tol': 0.001}",-0.001754,0.001209,0.001041,0.000165,0.001359,16,0.009104,0.007766,0.008057,0.008309,0.000575
8,0.549666,0.006136,0.031407,0.000579,0.01,0.01,"{'alpha': 0.01, 'tol': 0.01}",-0.001754,0.001209,0.001041,0.000165,0.001359,16,0.009104,0.007766,0.008057,0.008309,0.000575
9,0.550602,0.005078,0.034892,0.004736,0.01,0.1,"{'alpha': 0.01, 'tol': 0.1}",-0.001754,0.001209,0.001041,0.000165,0.001359,16,0.009104,0.007766,0.008057,0.008309,0.000575
