In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the recipe table from CSV (the preview in the next cell shows a title and a year per row).
recipes_data_df = pd.read_csv('out/recipes_data.csv')

In [3]:
# Preview the first rows of the recipe table.
recipes_data_df.head()

Unnamed: 0,title,year
0,terrina de melón con gelée de oporto,1987
1,mousse de trufa negra '87,1987
2,"ensalada de pasta fresca con caviar, tempura d...",1987
3,"raviolis de cigala, patatas y trufa negra",1987
4,tempura de flor de calabacín rellena de mozzar...,1987


In [4]:
# Load the per-recipe feature table from CSV (num_* counts plus prefixed columns such as i_*, temp_*, w_*).
recipes_ml_df = pd.read_csv('out/recipes_ml.csv')

In [5]:
# Preview the first rows of the feature table.
recipes_ml_df.head()

Unnamed: 0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_chocolate troceado,i_pizza,i_jugo de trufa negra,i_fresitas liofilizadas,...,temp_CALIENTE/FRÍA,temp_HELADA/FRÍA,temp_TIBIA/CALIENTE,temp_FRÍA/HELADA,temp_CALIENTE/HELADA,temp_TIBIA,temp_HELADA/FRÍA/AMBIENTE,temp_TIBIA/AMBIENTE,w_DULCE,w_SALADO
0,7,4,1,17,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,15,5,0,22,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2
2,21,5,0,29,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,30,10,1,56,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
4,25,7,1,37,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Keep only the columns whose name starts with 'i_' or 't_' as model features.
# NOTE(review): X and y come from two different CSV files and are paired by row
# position only -- confirm both files list the recipes in the same order.
valid_cols = [
    column
    for column in recipes_ml_df.columns
    if column.startswith(('i_', 't_'))
]
X = recipes_ml_df[valid_cols]
y = recipes_data_df['year']

In [7]:
# Hold out 10% of the rows as a test set, stratified on y (the year) and with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [8]:
# Tabulate how many samples of each year ended up in the train split, the test
# split and the full data set, one row per year.
split_counts = {
    'train': y_train.value_counts(),
    'test': y_test.value_counts(),
    'total': y.value_counts(),
}
y_df = pd.DataFrame(split_counts, columns=['train', 'test', 'total'])

In [9]:
# Show the per-year sample counts for the train / test / total columns.
y_df

Unnamed: 0,train,test,total
1987,14,1,15
1988,23,3,26
1989,25,3,28
1990,29,3,32
1991,51,6,57
1992,34,4,38
1993,27,3,30
1994,54,6,60
1995,46,5,51
1996,50,6,56


In [10]:
# (estimator class, parameter grid) pairs; each pair is handed to its own
# GridSearchCV run in the grid-search cell.
clfs_params = [
    (RandomForestClassifier, {
        'n_estimators': (10, 100, 1000),
        # 'sqrt' is exactly what 'auto' meant for classifiers. The 'auto' alias
        # was deprecated in scikit-learn 1.1 and removed in 1.3, so the explicit
        # spelling keeps the grid valid on both old and new versions.
        'max_features': ('sqrt', None),
        'n_jobs': (-1,),
        'random_state': (0,),
        'class_weight': ('balanced', None),
    }),
]

In [11]:
def my_score(f, xs, ys, max_error=18):
    """Score paired values by their mean absolute error, rescaled through ``f``.

    The mean absolute difference between corresponding elements of ``xs`` and
    ``ys`` is computed and mapped to
    ``(f(max_error) - f(mean_error)) / f(max_error)``.  With an ``f`` such that
    ``f(0) == 0`` (e.g. the identity) a perfect match scores 1 and a mean
    error equal to ``max_error`` scores 0; larger errors give negative scores.

    Parameters
    ----------
    f : callable
        Transformation applied to both the reference error and the observed
        mean error. ``f(max_error)`` must be non-zero.
    xs, ys : sized iterables of numbers
        Values to compare element by element; must have the same, non-zero
        length.
    max_error : number, optional
        Reference error that maps to a score of 0. Defaults to 18, the value
        that used to be hard-coded.
        NOTE(review): presumably the widest possible year gap in the data set
        -- confirm.

    Returns
    -------
    float
        The rescaled score.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` differ in length or are empty.
    """
    n = len(xs)
    # Explicit checks instead of ``assert``: assertions vanish under ``-O`` and
    # ``zip`` would then silently truncate to the shorter input.
    if n != len(ys):
        raise ValueError(
            'xs and ys must have the same length, got %d and %d' % (n, len(ys))
        )
    if n == 0:
        raise ValueError('cannot score empty inputs')

    mean_abs_error = sum(abs(x - y) for x, y in zip(xs, ys)) / n
    reference = f(max_error)
    return (reference - f(mean_abs_error)) / reference

def my_linear_score(xs, ys):
    """Return ``my_score`` of ``xs`` and ``ys`` with the error left untransformed.

    The transformation passed to ``my_score`` is the identity, so the score
    is linear in the mean absolute error.
    """
    def identity(value):
        return value

    return my_score(identity, xs, ys)

In [12]:
%%time

# Run one 10-fold grid search per (estimator class, parameter grid) pair and
# keep every fitted search object under the estimator's class name.
# `grid_search_cv` stays bound to the last fitted search after the loop; the
# inspection cells further down read it directly.
linear_scorer = make_scorer(my_linear_score)

results = {}
for clf, params in clfs_params:
    grid_search_cv = GridSearchCV(
        clf(),
        params,
        cv=10,
        scoring=linear_scorer,
        error_score=0,
        n_jobs=-1,
    )
    grid_search_cv.fit(X_train, y_train)
    results[clf.__name__] = grid_search_cv
    print(clf.__name__, 'done.')

RandomForestClassifier done.
CPU times: user 21.1 s, sys: 540 ms, total: 21.7 s
Wall time: 10min 23s


In [13]:
# Persist every fitted grid search so the long run above need not be repeated.
# NOTE(review): the pickled scorer refers to my_linear_score by name, so that
# function presumably has to be defined again before unpickling -- confirm.
with open('out/clf_results_2.pickle', 'wb') as results_file:
    pickle.dump(results, results_file)

In [14]:
# Show the configuration of the last grid search fitted by the loop above.
grid_search_cv

GridSearchCV(cv=10, error_score=0,
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'class_weight': ('balanced', None), 'n_estimators': (10, 100, 1000), 'max_features': ('auto', None), 'random_state': (0,), 'n_jobs': (-1,)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(my_linear_score), verbose=0)

In [15]:
# Mean cross-validated my_linear_score of the best parameter combination.
grid_search_cv.best_score_

0.90776353276353294

In [16]:
# Feature importances reported by the refit best estimator.
grid_search_cv.best_estimator_.feature_importances_

array([  4.05908276e-05,   9.75644568e-05,   1.50292835e-03, ...,
         4.47602104e-03,   4.41519013e-03,   2.29949548e-03])