In [1]:
# import math
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [2]:
# Load the recipe metadata table; its `year` column is used as the
# prediction target further down.
recipes_data_df = pd.read_csv('out/recipes_data.csv')

In [3]:
# Preview the first rows of the metadata table.
recipes_data_df.head()

Unnamed: 0,title,year
0,terrina de melón con gelée de oporto,1987
1,mousse de trufa negra '87,1987
2,"ensalada de pasta fresca con caviar, tempura d...",1987
3,"raviolis de cigala, patatas y trufa negra",1987
4,tempura de flor de calabacín rellena de mozzar...,1987


In [4]:
# Load the feature table that is used as the model input matrix.
# NOTE(review): rows are presumably aligned one-to-one with
# out/recipes_data.csv -- confirm against the script that writes both files.
recipes_ml_df = pd.read_csv('out/recipes_ml.csv')

In [5]:
# Preview the first rows of the feature table.
recipes_ml_df.head()

Unnamed: 0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_chocolate troceado,i_pizza,i_jugo de trufa negra,i_fresitas liofilizadas,...,temp_CALIENTE/FRÍA,temp_HELADA/FRÍA,temp_TIBIA/CALIENTE,temp_FRÍA/HELADA,temp_CALIENTE/HELADA,temp_TIBIA,temp_HELADA/FRÍA/AMBIENTE,temp_TIBIA/AMBIENTE,w_DULCE,w_SALADO
0,7,4,1,17,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,15,5,0,22,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2
2,21,5,0,29,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,30,10,1,56,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
4,25,7,1,37,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Features: the whole feature table. Target: the recipe year.
X = recipes_ml_df
y = recipes_data_df['year']

In [7]:
# Hold out 10% of the rows as a test set. The split is stratified on the
# target and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    stratify=y,
)

In [8]:
# Per-year sample counts in the training split, the test split and overall,
# side by side, to check the stratification.
y_df = pd.DataFrame(
    dict(
        train=y_train.value_counts(),
        test=y_test.value_counts(),
        total=y.value_counts(),
    ),
    columns=['train', 'test', 'total'],
)

In [9]:
# Show the per-year train/test/total counts.
y_df

Unnamed: 0,train,test,total
1987,14,1,15
1988,23,3,26
1989,25,3,28
1990,29,3,32
1991,51,6,57
1992,34,4,38
1993,27,3,30
1994,54,6,60
1995,46,5,51
1996,50,6,56


In [10]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

# Per-classifier input overrides, keyed by classifier name: only
# MLPClassifier is mapped to the standardised matrices here.
X_train_dict = dict(MLPClassifier=X_train_norm)
X_test_dict = dict(MLPClassifier=X_test_norm)

In [11]:
def my_score(f, xs, ys, max_error=18):
    """Score paired values by their mean absolute error, rescaled through ``f``.

    The mean absolute difference (MAE) between corresponding elements of
    ``xs`` and ``ys`` is compared with a reference error ``max_error``::

        (f(max_error) - f(MAE)) / f(max_error)

    With the identity as ``f``, identical inputs score 1.0 and an MAE equal
    to ``max_error`` scores 0.0; a larger MAE gives a negative score.

    Args:
        f: Callable applied to both the reference error and the MAE.
        xs: Sized iterable of numbers.
        ys: Sized iterable of numbers, same length as ``xs``.
        max_error: Reference error that maps to a score of 0. Defaults to 18,
            the value that used to be hard-coded here (presumably the span of
            years in the data set -- TODO confirm).

    Returns:
        The rescaled score as a float.

    Raises:
        AssertionError: If ``xs`` and ``ys`` differ in length.
        ValueError: If the inputs are empty, since the MAE is undefined.
    """
    assert len(xs) == len(ys), 'xs and ys must have the same length'
    if len(xs) == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError('cannot compute a score for empty inputs')
    mean_abs_error = sum(abs(x - y) for x, y in zip(xs, ys)) / len(xs)
    reference = f(max_error)
    return (reference - f(mean_abs_error)) / reference

def my_linear_score(xs, ys):
    """Score ``xs`` against ``ys`` linearly in their mean absolute error.

    Delegates to ``my_score`` with the identity function as the transform.
    """
    def _identity(value):
        return value

    return my_score(_identity, xs, ys)

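# Disabled variant: it needs the `import math` that is also commented out at
# the top of the notebook. Despite the name, it applies math.sqrt (a square
# root), not a square.
# def my_squared_score(xs, ys):
#     return my_score(math.sqrt, xs, ys)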

In [12]:
# Load previously saved search results. The cells below use `results` as a
# mapping from classifier name to an object exposing best_score_,
# cv_results_, best_index_, n_splits_ and best_estimator_.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted run of this project.
with open('out/clf_results.pickle', 'rb') as f:
    results = pickle.load(f)

In [13]:
# One row per classifier holding its best_score_ in a single 'score' column.
summary_df = pd.DataFrame(
    {clf_name: results[clf_name].best_score_ for clf_name in results},
    index=['score'],
).T

In [14]:
# Show the best score of each classifier.
summary_df

Unnamed: 0,score
KNeighborsClassifier,0.858313
LinearSVC,0.938543
MLPClassifier,0.941697
RandomForestClassifier,0.942664
SVC,0.938187


In [15]:
def max_index_value(xs):
    """Locate the largest element of ``xs``.

    Returns:
        A ``(positions, largest)`` tuple, where ``positions`` lists every
        index whose element equals the maximum (several entries on ties).
    """
    largest = max(xs)
    positions = []
    for position, value in enumerate(xs):
        if value == largest:
            positions.append(position)
    return positions, largest

def min_index_value(xs):
    """Locate the smallest element of ``xs``.

    Returns:
        A ``(positions, smallest)`` tuple, where ``positions`` lists every
        index whose element equals the minimum (several entries on ties).
    """
    smallest = min(xs)
    positions = []
    for position, value in enumerate(xs):
        if value == smallest:
            positions.append(position)
    return positions, smallest

In [16]:
# NOTE(review): this import sits in a mid-notebook cell, while every other
# import is in the first cell.
from IPython.display import display

# For every classifier in `results`: print its best score and index, the
# per-candidate mean and std of the test scores, the candidates with the
# highest mean and the lowest std, then show the per-split test scores.
# The loop variables are plain notebook globals and keep the values of the
# last iteration after the cell finishes.
for clf_name in results:
    print('-' * 80)
    print('Classifier:', clf_name)
    print('-' * 80)
    grid_search_cv = results[clf_name]
    # Number of evaluated candidates, taken from the length of a
    # per-candidate array.
    n_opts = len(grid_search_cv.cv_results_['rank_test_score'])
    print('Best score:', grid_search_cv.best_score_)
    print('Best index:', grid_search_cv.best_index_, '[0 - ' + str(n_opts - 1) + ']')
    print('-' * 80)
    means = grid_search_cv.cv_results_['mean_test_score']
    stds = grid_search_cv.cv_results_['std_test_score']
    print('Means:', means)
    print('Stds:', stds)
    print('-' * 80)
    # Both helpers return (list of indices, extreme value); tied candidates
    # produce several indices.
    max_means = max_index_value(means)
    min_stds = min_index_value(stds)
    # Cross-report: the std of the best-mean candidates and the mean of the
    # lowest-std candidates.
    print('Max mean:', max_means, '- Std:', [(i, stds[i]) for i in max_means[0]])
    print('Min std:', min_stds, '- Mean:', [(i, means[i]) for i in min_stds[0]])
    print('-' * 80)
    # One row per CV split, one column per candidate.
    result_df = pd.DataFrame(
        [grid_search_cv.cv_results_['split' + str(i) + '_test_score'] for i in range(grid_search_cv.n_splits_)],
    )
    display(result_df)

--------------------------------------------------------------------------------
Classifier: MLPClassifier
--------------------------------------------------------------------------------
Best score: 0.941697191697
Best index: 8 [0 - 23]
--------------------------------------------------------------------------------
Means: [ 0.91966829  0.91966829  0.89850427  0.89850427  0.82371795  0.82371795
  0.93930606  0.93930606  0.94169719  0.94169719  0.88059626  0.88059626
  0.92892755  0.92892755  0.91107041  0.91107041  0.84366097  0.84366097
  0.93136956  0.93136956  0.91132479  0.91132479  0.89748677  0.89748677]
Stds: [ 0.00879911  0.00879911  0.01294261  0.01294261  0.02565244  0.02565244
  0.00603422  0.00603422  0.00743576  0.00743576  0.01382295  0.01382295
  0.0069485   0.0069485   0.01145507  0.01145507  0.02196844  0.02196844
  0.01014505  0.01014505  0.01205306  0.01205306  0.01502516  0.01502516]
--------------------------------------------------------------------------------
M

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.927966,0.927966,0.901601,0.901601,0.821563,0.821563,0.939266,0.939266,0.943974,0.943974,...,0.925141,0.925141,0.855461,0.855461,0.922787,0.922787,0.902072,0.902072,0.911959,0.911959
1,0.914272,0.914272,0.886494,0.886494,0.844828,0.844828,0.930556,0.930556,0.929119,0.929119,...,0.890805,0.890805,0.863027,0.863027,0.905651,0.905651,0.899425,0.899425,0.89272,0.89272
2,0.920019,0.920019,0.903257,0.903257,0.794061,0.794061,0.930556,0.930556,0.944444,0.944444,...,0.898946,0.898946,0.811303,0.811303,0.933429,0.933429,0.918582,0.918582,0.896073,0.896073
3,0.93412,0.93412,0.918879,0.918879,0.843166,0.843166,0.944936,0.944936,0.951327,0.951327,...,0.911504,0.911504,0.849558,0.849558,0.93707,0.93707,0.914946,0.914946,0.903147,0.903147
4,0.912844,0.912844,0.917431,0.917431,0.795107,0.795107,0.931193,0.931193,0.946483,0.946483,...,0.909276,0.909276,0.832314,0.832314,0.942915,0.942915,0.927625,0.927625,0.915392,0.915392
5,0.929387,0.929387,0.895119,0.895119,0.869159,0.869159,0.942887,0.942887,0.940291,0.940291,...,0.903946,0.903946,0.874351,0.874351,0.937695,0.937695,0.897715,0.897715,0.892523,0.892523
6,0.921693,0.921693,0.885714,0.885714,0.830159,0.830159,0.946561,0.946561,0.935979,0.935979,...,0.92381,0.92381,0.826455,0.826455,0.932275,0.932275,0.930159,0.930159,0.911111,0.911111
7,0.903846,0.903846,0.875,0.875,0.785791,0.785791,0.942842,0.942842,0.952457,0.952457,...,0.90438,0.90438,0.810363,0.810363,0.934829,0.934829,0.912393,0.912393,0.868056,0.868056
8,0.919633,0.919633,0.898598,0.898598,0.81014,0.81014,0.942287,0.942287,0.93096,0.93096,...,0.923948,0.923948,0.841963,0.841963,0.934196,0.934196,0.893204,0.893204,0.873786,0.873786
9,0.910891,0.910891,0.90099,0.90099,0.843234,0.843234,0.943894,0.943894,0.941694,0.941694,...,0.921342,0.921342,0.871837,0.871837,0.935644,0.935644,0.918042,0.918042,0.907591,0.907591


--------------------------------------------------------------------------------
Classifier: RandomForestClassifier
--------------------------------------------------------------------------------
Best score: 0.942663817664
Best index: 98 [0 - 107]
--------------------------------------------------------------------------------
Means: [ 0.78947904  0.78947904  0.76541514  0.76541514  0.75579976  0.75579976
  0.83786121  0.83786121  0.85597273  0.85597273  0.85933048  0.85933048
  0.89255189  0.89255189  0.89718152  0.89718152  0.89707977  0.89707977
  0.85129223  0.85129223  0.85907611  0.85907611  0.85927961  0.85927961
  0.89397639  0.89397639  0.90532153  0.90532153  0.90577941  0.90577941
  0.91381766  0.91381766  0.92094017  0.92094017  0.9231278   0.9231278
  0.88298738  0.88298738  0.93264143  0.93264143  0.93894994  0.93894994
  0.91442816  0.91442816  0.93579569  0.93579569  0.93935694  0.93935694
  0.91778592  0.91778592  0.92531543  0.92531543  0.92791005  0.92791005
  0.838

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.796139,0.796139,0.761299,0.761299,0.747175,0.747175,0.813089,0.813089,0.855461,0.855461,...,0.930791,0.930791,0.934557,0.934557,0.916667,0.916667,0.920904,0.920904,0.923258,0.923258
1,0.74569,0.74569,0.745211,0.745211,0.735153,0.735153,0.838123,0.838123,0.857759,0.857759,...,0.934387,0.934387,0.936782,0.936782,0.91954,0.91954,0.925287,0.925287,0.923851,0.923851
2,0.788314,0.788314,0.745211,0.745211,0.744732,0.744732,0.826149,0.826149,0.833333,0.833333,...,0.943008,0.943008,0.939655,0.939655,0.912356,0.912356,0.937261,0.937261,0.94205,0.94205
3,0.779744,0.779744,0.757129,0.757129,0.742871,0.742871,0.792527,0.792527,0.820059,0.820059,...,0.946903,0.946903,0.942478,0.942478,0.926254,0.926254,0.935595,0.935595,0.925762,0.925762
4,0.781346,0.781346,0.751274,0.751274,0.747197,0.747197,0.869521,0.869521,0.87156,0.87156,...,0.950051,0.950051,0.946993,0.946993,0.926606,0.926606,0.942915,0.942915,0.940877,0.940877
5,0.798546,0.798546,0.76947,0.76947,0.759605,0.759605,0.821911,0.821911,0.85514,0.85514,...,0.95379,0.95379,0.948598,0.948598,0.911734,0.911734,0.923676,0.923676,0.926791,0.926791
6,0.805291,0.805291,0.771429,0.771429,0.766667,0.766667,0.84709,0.84709,0.860317,0.860317,...,0.938624,0.938624,0.948148,0.948148,0.921693,0.921693,0.939683,0.939683,0.937037,0.937037
7,0.803419,0.803419,0.788462,0.788462,0.783654,0.783654,0.844017,0.844017,0.870726,0.870726,...,0.942842,0.942842,0.946581,0.946581,0.925214,0.925214,0.933761,0.933761,0.935897,0.935897
8,0.798274,0.798274,0.789105,0.789105,0.773463,0.773463,0.855448,0.855448,0.868393,0.868393,...,0.93096,0.93096,0.926106,0.926106,0.925566,0.925566,0.929342,0.929342,0.922869,0.922869
9,0.80363,0.80363,0.782728,0.782728,0.764026,0.764026,0.879538,0.879538,0.872387,0.872387,...,0.957096,0.957096,0.951045,0.951045,0.937294,0.937294,0.944444,0.944444,0.946095,0.946095


--------------------------------------------------------------------------------
Classifier: KNeighborsClassifier
--------------------------------------------------------------------------------
Best score: 0.858312983313
Best index: 1 [0 - 17]
--------------------------------------------------------------------------------
Means: [ 0.81659544  0.85831298  0.83898046  0.85470085  0.84279609  0.8503256
  0.81664632  0.85800773  0.83887871  0.8548026   0.84310134  0.85108873
  0.81679894  0.85719373  0.83567359  0.85388685  0.84238909  0.85027473]
Stds: [ 0.01462143  0.0156129   0.01151133  0.01530145  0.01213505  0.01232171
  0.01549183  0.01505218  0.0112015   0.01459152  0.01050521  0.01253393
  0.01443898  0.01385064  0.01137132  0.01531033  0.01188337  0.01237564]
--------------------------------------------------------------------------------
Max mean: ([1], 0.85831298331298334) - Std: [(1, 0.015612897076307699)]
Min std: ([10], 0.010505214475383348) - Mean: [(10, 0.843101343101343

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.842279,0.879002,0.859699,0.87806,0.847458,0.84887,0.84322,0.875235,0.862524,0.875235,0.848399,0.851224,0.84275,0.874765,0.857345,0.877589,0.846987,0.846987
1,0.811303,0.850096,0.837165,0.834291,0.855843,0.864943,0.812261,0.851054,0.837165,0.834291,0.85249,0.8659,0.812261,0.851054,0.831418,0.833812,0.854406,0.864943
2,0.797893,0.851533,0.840517,0.84387,0.82567,0.825192,0.798851,0.851533,0.842433,0.845307,0.826628,0.82567,0.802203,0.850096,0.842912,0.84387,0.823755,0.82567
3,0.797935,0.854474,0.815634,0.828909,0.818584,0.833825,0.795477,0.852999,0.818092,0.830383,0.821534,0.833825,0.794985,0.855949,0.816618,0.830875,0.817601,0.832842
4,0.804791,0.849134,0.83843,0.848624,0.833843,0.851172,0.804791,0.849134,0.841998,0.850153,0.840979,0.852192,0.803772,0.849134,0.823649,0.840979,0.83945,0.851682
5,0.814642,0.871236,0.825545,0.870717,0.851506,0.867601,0.812565,0.871236,0.833853,0.873313,0.853063,0.86812,0.819315,0.869678,0.829699,0.870717,0.847871,0.865524
6,0.83545,0.88836,0.843915,0.858201,0.851852,0.854497,0.837037,0.88836,0.841799,0.856614,0.850265,0.856614,0.834392,0.88254,0.841799,0.857672,0.848677,0.857672
7,0.813568,0.831731,0.84562,0.858974,0.852564,0.854701,0.813034,0.831731,0.831197,0.858974,0.849893,0.854167,0.813034,0.831731,0.838141,0.860043,0.85203,0.856303
8,0.829558,0.852211,0.837109,0.866775,0.847357,0.852211,0.832794,0.85383,0.832255,0.865156,0.842503,0.855448,0.828479,0.851672,0.829558,0.864617,0.8452,0.854369
9,0.820132,0.854235,0.845985,0.861386,0.845435,0.852585,0.817932,0.854235,0.845985,0.861386,0.847085,0.849835,0.817932,0.854235,0.844884,0.861386,0.850385,0.849285


--------------------------------------------------------------------------------
Classifier: SVC
--------------------------------------------------------------------------------
Best score: 0.938186813187
Best index: 16 [0 - 31]
--------------------------------------------------------------------------------
Means: [ 0.93452381  0.93452381  0.71275946  0.71275946  0.71377696  0.71377696
  0.71815222  0.71815222  0.93452381  0.93452381  0.71275946  0.71275946
  0.71377696  0.71377696  0.71815222  0.71815222  0.93818681  0.93818681
  0.74537037  0.74537037  0.75529101  0.75529101  0.68167481  0.68167481
  0.93818681  0.93818681  0.74537037  0.74537037  0.75529101  0.75529101
  0.68167481  0.68167481]
Stds: [ 0.00482109  0.00482109  0.00726976  0.00726976  0.00679318  0.00679318
  0.0083594   0.0083594   0.00482109  0.00482109  0.00726976  0.00726976
  0.00679318  0.00679318  0.0083594   0.0083594   0.00516983  0.00516983
  0.01204597  0.01204597  0.01191897  0.01191897  0.01116556  0.011

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.935499,0.935499,0.702919,0.702919,0.70339,0.70339,0.707156,0.707156,0.935499,0.935499,...,0.6629,0.6629,0.938795,0.938795,0.736347,0.736347,0.751883,0.751883,0.6629,0.6629
1,0.929119,0.929119,0.70546,0.70546,0.707375,0.707375,0.709291,0.709291,0.929119,0.929119,...,0.683429,0.683429,0.934387,0.934387,0.741379,0.741379,0.748563,0.748563,0.683429,0.683429
2,0.932471,0.932471,0.703544,0.703544,0.707375,0.707375,0.709291,0.709291,0.932471,0.932471,...,0.674808,0.674808,0.933908,0.933908,0.725575,0.725575,0.735632,0.735632,0.674808,0.674808
3,0.929204,0.929204,0.70944,0.70944,0.70944,0.70944,0.710423,0.710423,0.929204,0.929204,...,0.670108,0.670108,0.939036,0.939036,0.733038,0.733038,0.735497,0.735497,0.670108,0.670108
4,0.940367,0.940367,0.710499,0.710499,0.712029,0.712029,0.718145,0.718145,0.940367,0.940367,...,0.673293,0.673293,0.944444,0.944444,0.753313,0.753313,0.75841,0.75841,0.673293,0.673293
5,0.93406,0.93406,0.716511,0.716511,0.719107,0.719107,0.723261,0.723261,0.93406,0.93406,...,0.687954,0.687954,0.934579,0.934579,0.746106,0.746106,0.761682,0.761682,0.687954,0.687954
6,0.945503,0.945503,0.719577,0.719577,0.722751,0.722751,0.724868,0.724868,0.945503,0.945503,...,0.684656,0.684656,0.940741,0.940741,0.74709,0.74709,0.764021,0.764021,0.684656,0.684656
7,0.931624,0.931624,0.722222,0.722222,0.722222,0.722222,0.727564,0.727564,0.931624,0.931624,...,0.692308,0.692308,0.930556,0.930556,0.746795,0.746795,0.767628,0.767628,0.692308,0.692308
8,0.932039,0.932039,0.721683,0.721683,0.721683,0.721683,0.725998,0.725998,0.932039,0.932039,...,0.690399,0.690399,0.936893,0.936893,0.766451,0.766451,0.768069,0.768069,0.690399,0.690399
9,0.936194,0.936194,0.719472,0.719472,0.715622,0.715622,0.729923,0.729923,0.936194,0.936194,...,0.70187,0.70187,0.949395,0.949395,0.762926,0.762926,0.766777,0.766777,0.70187,0.70187


--------------------------------------------------------------------------------
Classifier: LinearSVC
--------------------------------------------------------------------------------
Best score: 0.938542938543
Best index: 3 [0 - 3]
--------------------------------------------------------------------------------
Means: [ 0.92114367  0.93162393  0.93711844  0.93854294]
Stds: [ 0.01246695  0.00654971  0.00585091  0.00637784]
--------------------------------------------------------------------------------
Max mean: ([3], 0.93854293854293858) - Std: [(3, 0.0063778385811532057)]
Min std: ([2], 0.0058509128016236136) - Mean: [(2, 0.93711843711843701)]
--------------------------------------------------------------------------------


Unnamed: 0,0,1,2,3
0,0.915725,0.932203,0.93597,0.939736
1,0.909004,0.920019,0.925287,0.926245
2,0.904693,0.923851,0.938218,0.936782
3,0.917896,0.933137,0.938053,0.936087
4,0.925586,0.93527,0.931193,0.931702
5,0.928349,0.935099,0.943406,0.940291
6,0.94709,0.945503,0.943386,0.948677
7,0.936966,0.928953,0.932158,0.9375
8,0.910464,0.93096,0.942826,0.946063
9,0.919142,0.932893,0.942244,0.944444


In [17]:
# Take a closer look at the random-forest search, which has the highest
# score in the summary table above.
# NOTE(review): the classifier name is hard-coded rather than derived from
# the scores, and `grid_search_cv` rebinds the variable left over from the
# reporting loop.
grid_search_cv = results['RandomForestClassifier']
best_estimator = grid_search_cv.best_estimator_

In [18]:
# Show the selected estimator together with its parameters.
best_estimator

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=True)

In [19]:
# Show the parameter combination that achieved the best score.
grid_search_cv.best_params_

{'class_weight': None,
 'max_depth': None,
 'max_features': 'auto',
 'n_estimators': 100,
 'n_jobs': -1,
 'random_state': 0,
 'warm_start': True}

In [20]:
# Show the raw cross-validation results of the selected search.
# NOTE(review): this dumps the entire dict of arrays into the output;
# a tabular view of selected columns would be easier to read.
grid_search_cv.cv_results_

{'mean_fit_time': array([  1.35362220e-01,   1.31701541e-01,   3.38694358e-01,
          3.15328646e-01,   2.75847230e+00,   2.89844530e+00,
          1.31958127e-01,   1.37901211e-01,   4.41786289e-01,
          4.66601968e-01,   5.10951388e+00,   5.28257387e+00,
          8.52949023e-01,   6.77123737e-01,   8.64378912e+00,
          9.68664005e+00,   1.04791235e+02,   1.03196397e+02,
          2.77846861e-01,   1.95573044e-01,   5.42894053e-01,
          5.16217971e-01,   5.27265134e+00,   5.42738438e+00,
          1.73891187e-01,   1.44829822e-01,   1.06491604e+00,
          1.14952736e+00,   1.11739807e+01,   1.12098390e+01,
          1.59857678e+00,   1.39608605e+00,   1.56050876e+01,
          1.55422227e+01,   1.55624943e+02,   1.48978384e+02,
          2.94592094e-01,   1.58010268e-01,   7.84864068e-01,
          8.50752497e-01,   8.69089534e+00,   8.44566281e+00,
          2.13960361e-01,   1.40977597e-01,   1.42526162e+00,
          1.37072380e+00,   1.47418383e+01,   1.50284