In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Load the per-recipe info table; the `year` column is used as the target later on.
recipes_info_df = pd.read_csv('out/recipes_info.csv')

In [3]:
# Preview the first rows of the info table.
recipes_info_df.head()

Unnamed: 0,title,year
0,terrina de melón con gelée de oporto,1987
1,mousse de trufa negra '87,1987
2,"ensalada de pasta fresca con caviar, tempura d...",1987
3,"raviolis de cigala, patatas y trufa negra",1987
4,tempura de flor de calabacín rellena de mozzar...,1987


In [4]:
# Load the per-recipe feature table; it is used unchanged as the model input X.
recipes_data_df = pd.read_csv('out/recipes_data.csv')

In [5]:
# Preview the first rows of the feature table.
recipes_data_df.head()

Unnamed: 0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_consomé de tucuppí,i_anchoas en salazón,i_tamarindo,i_leche de nuez,...,temp_TIBIA/AMBIENTE,temp_FRÍA/AMBIENTE,temp_HELADA/FRÍA/AMBIENTE,temp_CALIENTE/TIBIA,temp_FRÍA/AMBIENTE/HELADA,temp_FRÍA,temp_CALIENTE/FRÍA,temp_FRÍA/HELADA,w_SALADO,w_DULCE
0,7,4,1,8,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
1,15,5,0,12,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
2,21,5,0,17,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
3,30,10,1,38,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
4,25,7,1,21,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Features: every column of the data table. Target: the recipe year.
# NOTE(review): assumes both CSVs list the recipes in the same row order — confirm.
X, y = recipes_data_df, recipes_info_df['year']

In [7]:
# Hold out 10% of the recipes for evaluation, stratified by year, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    stratify=y,
)

In [8]:
def my_score(f, xs, ys, max_error=18):
    """Score paired values by their mean absolute error relative to a baseline.

    The result is ``(f(max_error) - f(mae)) / f(max_error)``, where ``mae`` is
    the mean of ``abs(x - y)`` over the paired elements of ``xs`` and ``ys``.
    The score is 0 when ``mae == max_error``; with the identity ``f`` it is 1
    when every pair matches exactly.

    Parameters
    ----------
    f : callable
        Transform applied to both the baseline and the mean absolute error
        before they are compared.
    xs, ys : sized iterables of numbers
        Paired values (e.g. predictions and targets); must have equal length.
    max_error : number, default 18
        Baseline error that maps to a score of 0.
        NOTE(review): 18 is presumably the span of years in the data — confirm.

    Returns
    -------
    float
        The relative score described above.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` differ in length or are empty.
    """
    n = len(xs)
    if n != len(ys):
        raise ValueError(
            'xs and ys must have the same length (got %d and %d)' % (n, len(ys))
        )
    if n == 0:
        # Without this guard the mean below fails with a bare ZeroDivisionError.
        raise ValueError('xs and ys must not be empty')

    mean_abs_error = sum(abs(x - y) for x, y in zip(xs, ys)) / n
    baseline = f(max_error)
    return (baseline - f(mean_abs_error)) / baseline

def my_linear_score(xs, ys):
    """Return ``my_score`` for ``xs`` and ``ys`` using the identity transform.

    With no transform applied, the mean absolute error is compared to the
    baseline on its raw (linear) scale.
    """
    def identity(value):
        return value

    return my_score(identity, xs, ys)

In [9]:
# Random forest of 1000 unpruned trees, trained on all CPU cores with a fixed seed.
# 'sqrt' is what the old 'auto' alias meant for classifiers; 'auto' was deprecated
# in scikit-learn 1.1 and removed in 1.3, so the explicit value keeps the same
# behaviour while letting this cell run on current releases.
clf = RandomForestClassifier(
    class_weight=None,
    max_depth=None,
    max_features='sqrt',
    n_estimators=1000,
    n_jobs=-1,
    random_state=0,
)

In [10]:
# Train the forest on the training split.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
# Predict a year for every held-out recipe.
y_pred = clf.predict(X_test)

In [12]:
# Held-out score: 1 - MAE / 18, i.e. the mean absolute year error measured
# against my_score's baseline of 18 (1.0 means every year was predicted exactly).
my_linear_score(y_pred, y_test)

0.93715846994535523

In [13]:
# One row per feature: its column name in X and the importance the forest assigned.
# Building the frame from a dict of columns keeps 'importance' as a float column;
# stacking the two sequences as rows and transposing yields object-dtype columns.
features_df = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_,
})

In [14]:
# Number of importance values the fitted model exposes (paired with X.columns above).
len(clf.feature_importances_)

2643

In [15]:
# Show the 20 features with the highest importance.
features_df.sort_values('importance', ascending=False).head(20)

Unnamed: 0,feature,importance
0,num_ingredients,0.0151832
5,num_worlds,0.0146502
3,num_techniques,0.0145047
4,num_techniquesR,0.0130641
1,num_preparations,0.0116046
2641,w_SALADO,0.0112679
2166,pf_SALSAS,0.00794124
2,num_styles,0.00756563
2227,pf_TOQUES,0.00657253
2320,pf_OTRAS ELABORACIONES: Elaboraciones compradas,0.00595962


Conclusions:<br>
The 6 count metrics (num_\*) are all among the 10 most important features, at ranks 1, 2, 3, 4, 5, and 8.

In [16]:
# Names of the features starting with 'i_', ordered from most to least important.
top_ingredients = [
    name
    for name in features_df.sort_values('importance', ascending=False)['feature']
    if name.startswith('i_')
]

In [17]:
# The 20 most important 'i_' features.
top_ingredients[:20]

['i_agar',
 'i_agua',
 'i_agar-agar en polvo',
 'i_azúcar',
 'i_sal',
 'i_hoja de gelatina',
 'i_perifollo',
 'i_aceite de oliva',
 'i_sal maldon',
 'i_glucosa',
 'i_nata líquida',
 'i_mantequilla',
 'i_aceite',
 'i_cebollino fresco',
 'i_aceite de girasol',
 'i_nata',
 'i_harina de trigo',
 'i_limón',
 'i_claras de huevo',
 'i_isomalt']

In [18]:
# Names of the features starting with 't_', ordered from most to least important.
top_techniques = [
    name
    for name in features_df.sort_values('importance', ascending=False)['feature']
    if name.startswith('t_')
]

In [19]:
# The 20 most important 't_' features.
top_techniques[:20]

['t_hervir',
 't_pasar',
 't_cocer',
 't_horno',
 't_triturar',
 't_mezclar',
 't_estirar',
 't_reposar',
 't_pelar',
 't_colar',
 't_secar',
 't_escurrir',
 't_disolver',
 't_escaldar',
 't_sal',
 't_puré',
 't_agua',
 't_sartén',
 't_espuma',
 't_montar']