In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [2]:
# Load the per-recipe metadata table; its `year` column is later used as the
# classification target.
recipes_info_df = pd.read_csv('out/recipes_info.csv')

In [3]:
# Preview the first rows of the metadata table as a quick sanity check.
recipes_info_df.head()

Unnamed: 0,title,year
0,terrina de melón con gelée de oporto,1987
1,mousse de trufa negra '87,1987
2,"ensalada de pasta fresca con caviar, tempura d...",1987
3,"raviolis de cigala, patatas y trufa negra",1987
4,tempura de flor de calabacín rellena de mozzar...,1987


In [4]:
# Load the per-recipe feature table; it is later used unchanged as the model
# input matrix.
recipes_data_df = pd.read_csv('out/recipes_data.csv')

In [5]:
# Preview the first rows of the feature table as a quick sanity check.
recipes_data_df.head()

Unnamed: 0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_consomé de tucuppí,i_anchoas en salazón,i_tamarindo,i_leche de nuez,...,temp_TIBIA/AMBIENTE,temp_FRÍA/AMBIENTE,temp_HELADA/FRÍA/AMBIENTE,temp_CALIENTE/TIBIA,temp_FRÍA/AMBIENTE/HELADA,temp_FRÍA,temp_CALIENTE/FRÍA,temp_FRÍA/HELADA,w_SALADO,w_DULCE
0,7,4,1,8,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
1,15,5,0,12,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
2,21,5,0,17,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
3,30,10,1,38,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
4,25,7,1,21,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Model inputs are every column of the feature table; the target is the
# `year` column of the metadata table.
X = recipes_data_df
y = recipes_info_df['year']

In [7]:
# Hold out 10% of the rows for testing. Stratifying on the target keeps the
# per-year proportions similar in both splits, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

In [8]:
# Per-year sample counts for the training split, the test split and the full
# label series, side by side.
y_df = pd.DataFrame(
    {
        split: years.value_counts()
        for split, years in (('train', y_train), ('test', y_test), ('total', y))
    },
    columns=['train', 'test', 'total'],
)

In [9]:
# Display the per-year counts to check how the stratified split distributed
# the labels.
y_df

Unnamed: 0,train,test,total
1987,14,1,15
1988,23,3,26
1989,25,3,28
1990,29,3,32
1991,51,6,57
1992,34,4,38
1993,27,3,30
1994,54,6,60
1995,46,5,51
1996,50,6,56


In [10]:
# Standardize the features with statistics learned from the training split
# only, then apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_norm = scaler.transform(X_train)
X_test_norm = scaler.transform(X_test)

# Per-classifier overrides of the input matrices, keyed by estimator class
# name. Only the MLP is given the standardized data here.
X_train_dict = {'MLPClassifier': X_train_norm}
X_test_dict = {'MLPClassifier': X_test_norm}

In [11]:
# Weight each year by its share of the training labels (count / total).
# `Series.items()` is used because `Series.iteritems()` was removed in
# pandas 2.0.
# NOTE(review): these weights grow with class frequency, which is the opposite
# of scikit-learn's "balanced" heuristic -- confirm this is intended.
n_train_labels = y_train.count()
class_weights = {
    year: count / n_train_labels
    for year, count in y_train.value_counts().items()
}

In [12]:
# Search space: one (estimator class, parameter grid) pair per classifier
# family. Each grid maps a constructor argument name to the tuple of values to
# try; one-element tuples pin an argument to a single value.
clfs_params = [
    (RandomForestClassifier, {
        'n_estimators': (10, 100, 1000),
        # 'sqrt' is what 'auto' meant for forest classifiers. 'auto' was
        # deprecated in scikit-learn 1.1 and removed in 1.3, where passing it
        # makes the fit fail.
        'max_features': (10, 'sqrt', None),
        'max_depth': (10, 20, None),
        'n_jobs': (-1,),
        'random_state': (0,),
        'class_weight': (class_weights, None),
    }),
    (KNeighborsClassifier, {
        'n_neighbors': (5, 10, 20),
        'weights': ('uniform', 'distance'),
        'leaf_size': (10, 30, 60),
        'n_jobs': (-1,),
    }),
    (MLPClassifier, {
        # hidden_layer_sizes is not searched; the estimator default is used.
        'activation': ('identity', 'logistic', 'tanh', 'relu'),
        'solver': ('lbfgs', 'sgd', 'adam'),
        'max_iter': (2000,),
        'random_state': (0,),
    }),
    (LinearSVC, {
        'loss': ('hinge', 'squared_hinge'),
        'class_weight': (class_weights, None),
        'random_state': (0,),
    }),
    (SVC, {
        'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
        'shrinking': (True, False),
        'class_weight': (class_weights, None),
        'decision_function_shape': ('ovr', 'ovo'),
        'random_state': (0,),
    }),
]

In [13]:
def my_score(f, xs, ys):
    """Turn the mean absolute error between two sequences into a score.

    The mean absolute difference of the paired items of ``xs`` and ``ys`` is
    compared against a fixed reference error of 18, after both have been
    passed through ``f``. The result is ``(f(18) - f(mae)) / f(18)``, so a
    mean error equal to the reference scores 0 and smaller errors score
    higher.

    Args:
        f: One-argument callable applied to the reference error and to the
            mean absolute error.
        xs: First sequence of numbers.
        ys: Second sequence of numbers; must have the same length as ``xs``.

    Returns:
        The rescaled score described above.

    Raises:
        AssertionError: If ``xs`` and ``ys`` differ in length.
        ZeroDivisionError: If the sequences are empty.
    """
    assert len(xs) == len(ys)

    # Reference error that maps to a score of 0.
    # NOTE(review): presumably a span in years -- confirm.
    reference_error = 18

    total_abs_error = 0
    for left, right in zip(xs, ys):
        total_abs_error += abs(left - right)
    mean_abs_error = total_abs_error / len(xs)

    gain = f(reference_error) - f(mean_abs_error)
    return gain / f(reference_error)

def my_linear_score(xs, ys):
    """Score two sequences with ``my_score`` using the identity rescaling.

    Args:
        xs: First sequence of numbers.
        ys: Second sequence of numbers, the same length as ``xs``.

    Returns:
        Whatever ``my_score`` returns when ``f`` leaves its argument unchanged.
    """
    def identity(value):
        return value

    return my_score(identity, xs, ys)

In [14]:
%%time

# Run a 10-fold grid search for every classifier family and keep each fitted
# search object under the classifier's class name.
results = {}
for clf, params in clfs_params:
    grid_search_cv = GridSearchCV(
        estimator=clf(),
        param_grid=params,
        scoring=make_scorer(my_linear_score),
        cv=10,
        n_jobs=-1,
        # A parameter combination whose fit raises is scored 0 instead of
        # aborting the whole search.
        error_score=0,
    )
    # Use the classifier-specific training matrix when one is registered,
    # otherwise fall back to the unscaled features.
    X_train_ = X_train_dict.get(clf.__name__, X_train)
    y_train_ = y_train
    # fit() returns the search object itself, so it can be stored directly.
    results[clf.__name__] = grid_search_cv.fit(X_train_, y_train_)
    print(clf.__name__, 'done.')

RandomForestClassifier done.
KNeighborsClassifier done.
MLPClassifier done.
LinearSVC done.
SVC done.
CPU times: user 4min 32s, sys: 1min 55s, total: 6min 28s
Wall time: 1h 23min 43s


In [15]:
# Persist the dict of fitted grid searches to disk; the search above took
# over an hour of wall time.
with open('out/clf_results.pickle', mode='wb') as f:
    pickle.dump(results, f)