In [1]:
# import math
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [2]:
# Load the per-recipe metadata table; its `year` column is used as the target further down.
recipes_data_df = pd.read_csv('out/recipes_data.csv')

In [3]:
# Show the loaded metadata frame as this cell's output.
recipes_data_df

Unnamed: 0,title,year
0,terrina de melón con gelée de oporto,1987
1,mousse de trufa negra '87,1987
2,"ensalada de pasta fresca con caviar, tempura d...",1987
3,"raviolis de cigala, patatas y trufa negra",1987
4,tempura de flor de calabacín rellena de mozzar...,1987
5,muslitos de codorniz a la salsa de soja,1987
6,gourmandise de salmón y patata confitada a la ...,1987
7,nido de judías verdes con pinzas de bogavante ...,1987
8,"canapé de patata, salmonete y puré de trufa de...",1987
9,salmonetes Gaudí,1987


In [4]:
# Load the feature table that is used as the model input `X` further down.
recipes_ml_df = pd.read_csv('out/recipes_ml.csv')

In [5]:
# Preview the first rows of the feature table.
recipes_ml_df.head()

Unnamed: 0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_chocolate troceado,i_pizza,i_jugo de trufa negra,i_fresitas liofilizadas,...,temp_CALIENTE/FRÍA,temp_HELADA/FRÍA,temp_TIBIA/CALIENTE,temp_FRÍA/HELADA,temp_CALIENTE/HELADA,temp_TIBIA,temp_HELADA/FRÍA/AMBIENTE,temp_TIBIA/AMBIENTE,w_DULCE,w_SALADO
0,7,4,1,17,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,15,5,0,22,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2
2,21,5,0,29,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,30,10,1,56,0,3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
4,25,7,1,37,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Features: every column of the feature table. Target: the recipe year.
# NOTE(review): X and y come from two separately loaded CSVs; this assumes
# their rows are in the same order — confirm against the step that writes them.
X = recipes_ml_df
y = recipes_data_df.year

In [7]:
# Number of recipes per year, ordered by year, to inspect class balance.
y.value_counts().sort_index()

1987     15
1988     26
1989     28
1990     32
1991     57
1992     38
1993     30
1994     60
1995     51
1996     56
1997     61
1998     73
1999     88
2000    101
2001    109
2003    138
2004    126
2005    125
Name: year, dtype: int64

In [8]:
# Hold out a test set using the default split size. The split is stratified on
# the year label and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

In [9]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

# Per-classifier overrides, keyed by class name: estimators listed here are
# given the scaled matrices; lookups elsewhere fall back to the raw frames.
X_train_dict = dict(MLPClassifier=X_train_norm)
X_test_dict = dict(MLPClassifier=X_test_norm)

In [10]:
# Per-year counts within the training split, ordered by year.
y_train.value_counts().sort_index()

1987     11
1988     20
1989     21
1990     24
1991     43
1992     28
1993     22
1994     45
1995     38
1996     42
1997     46
1998     55
1999     66
2000     76
2001     82
2003    103
2004     94
2005     94
Name: year, dtype: int64

In [11]:
# Map each year label to its share of the training rows.
# `Series.items()` is used because `Series.iteritems()` was removed in pandas 2.0;
# `items()` yields the same (label, count) pairs on older versions too.
# NOTE(review): these weights grow with class frequency, so frequent years get
# the larger weight — confirm this is intended rather than inverse-frequency weights.
class_weights = {
    label: count / y_train.count()
    for label, count in y_train.value_counts().items()
}

In [12]:
# Full search space: one (estimator class, parameter grid) pair per model family.
# NOTE(review): `clfs_params` is assigned again in later cells, so on a
# top-to-bottom run only the last assignment is actually searched.
#
# `warm_start` is pinned to False in every grid: GridSearchCV fits a fresh clone
# of the estimator for each candidate, so warm_start=True cannot reuse anything
# and only doubled the number of fits. The key is kept so `best_params_` still
# contains it.
clfs_params = [
    (RandomForestClassifier, {
        'n_estimators': (10, 100, 1000),
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' was removed in
        # scikit-learn 1.3 and would otherwise fail (scored as error_score).
        'max_features': (10, 'sqrt', None),
        'max_depth': (10, 20, None),
        'n_jobs': (-1,),
        'random_state': (0,),
        'warm_start': (False,),
        'class_weight': (class_weights, None),
    }),
    (KNeighborsClassifier, {
        'n_neighbors': (5, 10, 20),
        'weights': ('uniform', 'distance'),
        'leaf_size': (10, 30, 60),
        'n_jobs': (-1,),
    }),
    (MLPClassifier, {
        'hidden_layer_sizes': ((100,),),
        'activation': ('identity', 'logistic', 'tanh', 'relu'),
        'solver': ('lbfgs', 'sgd', 'adam'),
        'max_iter': (4000,),
        'random_state': (0,),
        'warm_start': (False,),
    }),
    (LinearSVC, {
        'loss': ('hinge', 'squared_hinge'),
        'class_weight': (class_weights, None),
        'random_state': (0,),
    }),
    (SVC, {
        'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        'shrinking': (True, False),
        'class_weight': (class_weights, None),
        'decision_function_shape': ('ovr', 'ovo'),
        'random_state': (0,),
    }),
]

In [13]:
# Narrowed search: MLP only, one hidden layer of 100 units, max_iter 2000.
# This assignment replaces any `clfs_params` value set by an earlier cell.
# `warm_start` is pinned to False: GridSearchCV fits a fresh clone per candidate,
# so warm_start=True has nothing to reuse and only doubled the number of fits.
clfs_params = [
    (MLPClassifier, {
        'hidden_layer_sizes': ((100,),),
        'activation': ('identity', 'logistic', 'tanh', 'relu'),
        'solver': ('lbfgs', 'sgd', 'adam'),
        'max_iter': (2000,),
        'random_state': (0,),
        'warm_start': (False,),
    }),
]

In [14]:
# Narrowed search: MLP only, two hidden layers (200 then 100 units), max_iter 2000.
# This assignment replaces any `clfs_params` value set by an earlier cell.
# `warm_start` is pinned to False: GridSearchCV fits a fresh clone per candidate,
# so warm_start=True has nothing to reuse and only doubled the number of fits.
clfs_params = [
    (MLPClassifier, {
        'hidden_layer_sizes': ((200, 100,),),
        'activation': ('identity', 'logistic', 'tanh', 'relu'),
        'solver': ('lbfgs', 'sgd', 'adam'),
        'max_iter': (2000,),
        'random_state': (0,),
        'warm_start': (False,),
    }),
]

In [None]:
def my_score(f, xs, ys, max_error=18):
    """Score paired values by how far their mean absolute error is from `max_error`.

    Computes ``(f(max_error) - f(mae)) / f(max_error)``, where ``mae`` is the
    mean of ``|x - y|`` over the paired elements of `xs` and `ys`. With an
    identity `f` the result is 1.0 for a perfect match and 0.0 when the mean
    error equals `max_error`.

    Args:
        f: One-argument callable applied to `max_error` and to the mean error.
        xs: Sized iterable of numbers.
        ys: Sized iterable of numbers, the same length as `xs`.
        max_error: Error treated as the worst case. Defaults to 18, the value
            that was previously hard-coded.
            NOTE(review): presumably the distance between the first (1987) and
            last (2005) year label — confirm.

    Returns:
        The normalised score described above.

    Raises:
        ValueError: If `xs` and `ys` differ in length, or if they are empty
            (the mean error is undefined for zero pairs).
    """
    # Explicit checks instead of `assert`, which is skipped under `python -O`.
    if len(xs) != len(ys):
        raise ValueError(
            'xs and ys must have the same length, got %d and %d' % (len(xs), len(ys))
        )
    if len(xs) == 0:
        raise ValueError('xs and ys must not be empty')

    worst = f(max_error)
    mean_abs_error = sum(abs(x - y) for x, y in zip(xs, ys)) / len(xs)
    return (worst - f(mean_abs_error)) / worst

def my_linear_score(xs, ys):
    """Return `my_score` for `xs` and `ys` with the error left untransformed."""
    def identity(value):
        return value

    return my_score(identity, xs, ys)

# def my_squared_score(xs, ys):
#     return my_score(math.sqrt, xs, ys)

In [None]:
%%time

# Run one grid search per (estimator class, parameter grid) pair and keep the
# fitted search objects keyed by the estimator's class name.
results = {}
for clf, params in clfs_params:
    # Estimators with an entry in X_train_dict get that matrix; others use X_train.
    X_train_ = X_train_dict.get(clf.__name__, X_train)
    y_train_ = y_train

    grid_search_cv = GridSearchCV(
        estimator=clf(),
        param_grid=params,
        scoring=make_scorer(my_linear_score),
        n_jobs=-1,
        error_score=0,
    )
    # fit() returns the search object itself, so the fitted search is what gets stored.
    results[clf.__name__] = grid_search_cv.fit(X_train_, y_train_)

    print(clf.__name__, ' done.')

In [None]:
# For each fitted search, print its name, its best cross-validation score and
# its score on the held-out test split, followed by a blank line.
for clf_name, grid_search_cv in results.items():
    # Use the per-classifier test matrix when one is registered, else X_test.
    X_test_ = X_test_dict.get(clf_name, X_test)
    y_pred = grid_search_cv.predict(X_test_)
    score = my_linear_score(y_test, y_pred)

    print(clf_name, grid_search_cv.best_score_, score, '', sep='\n')

In [None]:
# Persist the `results` dict to disk in binary pickle format.
# NOTE(review): the pickled searches were built with a scorer wrapping
# `my_linear_score`, which is defined in this notebook; loading the file
# elsewhere presumably needs that function to be defined/importable — confirm.
with open('out/clf_results.pickle', 'wb') as f:
    pickle.dump(results, f)