In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
recipes_data_df = pd.read_csv('out/recipes_data.csv')

In [3]:
recipes_data_df.head()

Unnamed: 0,title,year
0,terrina de melón con gelée de oporto,1987
1,mousse de trufa negra '87,1987
2,"ensalada de pasta fresca con caviar, tempura d...",1987
3,"raviolis de cigala, patatas y trufa negra",1987
4,tempura de flor de calabacín rellena de mozzar...,1987


In [4]:
recipes_ml_df = pd.read_csv('out/recipes_ml.csv')

In [5]:
recipes_ml_df.head()

Unnamed: 0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_chocolate troceado,i_pizza,i_jugo de trufa negra,i_fresitas liofilizadas,...,temp_CALIENTE/FRÍA,temp_HELADA/FRÍA,temp_TIBIA/CALIENTE,temp_FRÍA/HELADA,temp_CALIENTE/HELADA,temp_TIBIA,temp_HELADA/FRÍA/AMBIENTE,temp_TIBIA/AMBIENTE,w_DULCE,w_SALADO
0,7,4,1,17,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,15,5,0,22,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2
2,21,5,0,29,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,30,10,1,56,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
4,25,7,1,37,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
X = recipes_ml_df
y = recipes_data_df.year

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [8]:
def my_score(f, xs, ys):
    assert(len(xs) == len(ys))
    a = 18
    b = sum(abs(x - y) for x, y in zip(xs, ys)) / len(xs)
    return (f(a) - f(b)) / f(a)

def my_linear_score(xs, ys):
    return my_score(lambda x: x, xs, ys)

In [9]:
clf = RandomForestClassifier(**{'class_weight': None,
 'max_depth': None,
 'max_features': 'auto',
 'n_estimators': 100,
 'n_jobs': -1,
 'random_state': 0,
 'warm_start': True})

In [10]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=True)

In [11]:
y_pred = clf.predict(X_test)

In [12]:
my_linear_score(y_pred, y_test)

0.94307832422586513

In [13]:
features_df = pd.DataFrame(
    [X.columns, clf.feature_importances_],
    index=['feature', 'importance']
).T

In [14]:
features_df.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
3,num_techniques,0.0132332
5,num_worlds,0.0132191
0,num_ingredients,0.0126246
4,num_techniquesR,0.0122363
2736,w_SALADO,0.00983503
1,num_preparations,0.00942976
2684,t_horno,0.00745607
2334,pf_SALSAS,0.00743707
2487,t_guardar,0.00682988
2,num_styles,0.00674511


In [15]:
len(clf.feature_importances_)

2737

Conclusions:<br>
The 6 metrics (num_\*) are the top important features: rank 1, 2, 3, 4, 6, and 10

In [16]:
# Top 20 techniques
c = 0
for i, x in features_df.sort_values('importance', ascending=False).iterrows():
    if x.feature.startswith('t_'):
        print(c, x.feature)
        c += 1
        if c >= 20:
            break

0 t_horno
1 t_guardar
2 t_hervir
3 t_remover
4 t_terminar
5 t_agua
6 t_introducir
7 t_cocer
8 t_juntar
9 t_cocinar
10 t_calentar
11 t_retirar
12 t_enfriar
13 t_añadir
14 t_colocar
15 t_triturar
16 t_obtener
17 t_dejar
18 t_servir
19 t_pasar


In [17]:
# Top 20 ingredients
c = 0
for i, x in features_df.sort_values('importance', ascending=False).iterrows():
    if x.feature.startswith('i_'):
        print(c, x.feature)
        c += 1
        if c >= 20:
            break

0 i_agar
1 i_azúcar
2 i_hoja de gelatina
3 i_agua
4 i_sal
5 i_perifollo
6 i_agar-agar en polvo
7 i_sal maldon
8 i_nata líquida
9 i_aceite de oliva
10 i_nata
11 i_glucosa
12 i_aceite de girasol
13 i_aceite
14 i_mantequilla
15 i_yemas de huevo
16 i_claras de huevo
17 i_gelatina
18 i_limón
19 i_isomalt
