In [1]:
import json
import pickle
from time import time

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Classifiers obtained from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

# Original data

## Preparing data

In [2]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [3]:
# Derive the creativity label from the recipe year alone:
# 1987-1997 -> 10, 1998-2001 -> 20, 2002-2005 -> 30; any other year falls back
# to the string sentinel 'UNKOWN'. Mapping over the 'year' column avoids
# building a row object per recipe just to read a single field.
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df['year'].map(
    lambda year: 10 if 1987 <= year <= 1997 else
                 20 if 1998 <= year <= 2001 else
                 30 if 2002 <= year <= 2005 else
                 'UNKOWN')

In [4]:
elbulli_raw_recipes_df['creativity'].value_counts()

10    454
30    389
20    371
Name: creativity, dtype: int64

In [5]:
# Size of the cookpad sample: the mean number of elBulli recipes per
# creativity label, truncated to an integer.
per_label_counts = elbulli_raw_recipes_df['creativity'].value_counts()
cookpad_size = int(per_label_counts.values.mean())

In [6]:
cookpad_size

404

In [7]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')

In [8]:
# Draw cookpad_size recipes; the fixed seed makes the draw repeatable.
cookpad_sample = cookpad_raw_recipes_df.sample(n=cookpad_size, random_state=0)

In [9]:
# Row labels of the sampled recipes in ascending order; my_sample uses them
# to pull the same rows out of every cookpad CSV variant.
cookpad_indices = cookpad_sample.sort_index().index

In [10]:
cookpad_indices

Int64Index([  33,   38,   39,   44,   49,   50,   72,  125,  134,  152,
            ...
            7809, 7847, 7854, 7855, 7880, 7904, 7918, 7923, 7940, 7943],
           dtype='int64', length=404)

In [11]:
def my_sample(df, indices=None):
    """Return the rows of ``df`` whose index labels are listed in ``indices``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select rows from.
    indices : list-like of index labels, optional
        Labels to keep. When omitted, the notebook-level ``cookpad_indices``
        is looked up at call time, so re-running the sampling cells takes
        effect without re-defining this function.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows, in the order of ``indices``.
    """
    if indices is None:
        # Resolved here rather than in the signature: a default argument would
        # freeze whatever cookpad_indices held when the function was defined.
        indices = cookpad_indices
    # Explicit copy: callers add columns to the sample afterwards.
    return df.loc[indices].copy()

## Raw ingredients and techniques

### elbulli

In [12]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [13]:
elbulli_raw_recipes_df.shape

(1214, 5)

In [14]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimiento"",""pimiento rojo"",""piñones tos...","[""sal"",""marcar"",""cocción"",""asado"",""hirviendo"",..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""al horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""fría""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""asar"",""cocer"",""hervido"",""confitar"",""ca..."


In [15]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
elbulli_raw_recipes_df['ingredients'] = elbulli_raw_recipes_df['ingredients'].map(json.loads)
elbulli_raw_recipes_df['techniques'] = elbulli_raw_recipes_df['techniques'].map(json.loads)

In [16]:
# Derive the creativity label from the recipe year alone:
# 1987-1997 -> 10, 1998-2001 -> 20, 2002-2005 -> 30; any other year falls back
# to the string sentinel 'UNKOWN'. Mapping over the 'year' column avoids
# building a row object per recipe just to read a single field.
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df['year'].map(
    lambda year: 10 if 1987 <= year <= 1997 else
                 20 if 1998 <= year <= 2001 else
                 30 if 2002 <= year <= 2005 else
                 'UNKOWN')

In [17]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


### cookpad

In [18]:
# Re-read the raw cookpad CSV and keep only the sampled rows
# (my_sample defaults to the row labels stored in cookpad_indices).
cookpad_raw_recipes_df = my_sample(pd.read_csv('data/dbs/recipes_cookpad_raw.csv'))

In [19]:
cookpad_raw_recipes_df.shape

(404, 5)

In [20]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
33,100475,langostinos rebozados,2009,"[""levadura"",""clara"",""langostinos"",""pimienta"",""...","[""freir""]"
38,100485,lomos de bacalao rebozados,2009,"[""bacalao fresco"",""harina"",""huevo"",""pimienta""]","[""freir""]"
39,100486,cazuela de berenjenas,2009,"[""tomates maduros"",""berenjenas asadas"",""aceite...","[""al horno"",""gratinar"",""plancha"",""freir"",""sart..."
44,100549,pimientos y patatas,2009,"[""sal"",""aceite de oliva"",""vinagre"",""atún""]","[""sal"",""cocer"",""salar""]"
49,100588,muffins de moka,2009,"[""chocolate"",""mantequilla"",""harina de maíz"",""h...","[""rellenar"",""hornear"",""baño maría""]"


In [21]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
cookpad_raw_recipes_df['ingredients'] = cookpad_raw_recipes_df['ingredients'].map(json.loads)
cookpad_raw_recipes_df['techniques'] = cookpad_raw_recipes_df['techniques'].map(json.loads)

In [22]:
# All cookpad recipes share the single creativity label 0. A scalar assignment
# broadcasts to every row, so no per-row apply is needed.
cookpad_raw_recipes_df['creativity'] = 0

In [23]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
33,100475,langostinos rebozados,2009,"[levadura, clara, langostinos, pimienta, aceit...",[freir],0
38,100485,lomos de bacalao rebozados,2009,"[bacalao fresco, harina, huevo, pimienta]",[freir],0
39,100486,cazuela de berenjenas,2009,"[tomates maduros, berenjenas asadas, aceite de...","[al horno, gratinar, plancha, freir, sartén, s...",0
44,100549,pimientos y patatas,2009,"[sal, aceite de oliva, vinagre, atún]","[sal, cocer, salar]",0
49,100588,muffins de moka,2009,"[chocolate, mantequilla, harina de maíz, huevo...","[rellenar, hornear, baño maría]",0


### elbulli & cookpad

In [24]:
# Stack the elBulli rows on top of the cookpad rows and renumber the index 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
raw_recipes_df = pd.concat([elbulli_raw_recipes_df, cookpad_raw_recipes_df], ignore_index=True)

In [25]:
raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


## Representative ingredients and techniques

### elbulli

In [26]:
elbulli_repr_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_representatives.csv')

In [27]:
elbulli_repr_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [28]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento rojo"",""piñones to...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [29]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
elbulli_repr_recipes_df['ingredients'] = elbulli_repr_recipes_df['ingredients'].map(json.loads)
elbulli_repr_recipes_df['techniques'] = elbulli_repr_recipes_df['techniques'].map(json.loads)

In [30]:
# Derive the creativity label from the recipe year alone:
# 1987-1997 -> 10, 1998-2001 -> 20, 2002-2005 -> 30; any other year falls back
# to the string sentinel 'UNKOWN'. Mapping over the 'year' column avoids
# building a row object per recipe just to read a single field.
elbulli_repr_recipes_df['creativity'] = elbulli_repr_recipes_df['year'].map(
    lambda year: 10 if 1987 <= year <= 1997 else
                 20 if 1998 <= year <= 2001 else
                 30 if 2002 <= year <= 2005 else
                 'UNKOWN')

In [31]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [32]:
# Load the representative-term cookpad recipes and keep only the sampled rows.
# NOTE(review): my_sample selects by row label, and the labels were drawn from
# recipes_cookpad_raw.csv; this assumes both CSVs list the recipes in the same
# row order. Confirm.
cookpad_repr_recipes_df = my_sample(pd.read_csv('data/dbs/recipes_cookpad_representatives.csv'))

In [33]:
cookpad_repr_recipes_df.shape

(404, 5)

In [34]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
33,100475,langostinos rebozados,2009,"[""levadura"",""claras"",""langostinos"",""pimienta"",...","[""horno""]"
38,100485,lomos de bacalao rebozados,2009,"[""bacalao fresco"",""harina"",""huevos"",""pimienta""]","[""horno""]"
39,100486,cazuela de berenjenas,2009,"[""tomates maduros"",""berenjena asada"",""aceite d...","[""horno"",""horno"",""plancha"",""horno"",""sartén"",""s..."
44,100549,pimientos y patatas,2009,"[""sal"",""aceite de oliva"",""vinagre"",""atún""]","[""sal"",""cocer"",""sal""]"
49,100588,muffins de moka,2009,"[""chocolate"",""mantequilla"",""harina de maíz"",""h...","[""relleno"",""horno"",""al baño maria""]"


In [35]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
cookpad_repr_recipes_df['ingredients'] = cookpad_repr_recipes_df['ingredients'].map(json.loads)
cookpad_repr_recipes_df['techniques'] = cookpad_repr_recipes_df['techniques'].map(json.loads)

In [36]:
# All cookpad recipes share the single creativity label 0. A scalar assignment
# broadcasts to every row, so no per-row apply is needed.
cookpad_repr_recipes_df['creativity'] = 0

In [37]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
33,100475,langostinos rebozados,2009,"[levadura, claras, langostinos, pimienta, acei...",[horno],0
38,100485,lomos de bacalao rebozados,2009,"[bacalao fresco, harina, huevos, pimienta]",[horno],0
39,100486,cazuela de berenjenas,2009,"[tomates maduros, berenjena asada, aceite de o...","[horno, horno, plancha, horno, sartén, sal, en...",0
44,100549,pimientos y patatas,2009,"[sal, aceite de oliva, vinagre, atún]","[sal, cocer, sal]",0
49,100588,muffins de moka,2009,"[chocolate, mantequilla, harina de maíz, huevo...","[relleno, horno, al baño maria]",0


### elbulli & cookpad

In [38]:
# Stack the elBulli rows on top of the cookpad rows and renumber the index 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
repr_recipes_df = pd.concat([elbulli_repr_recipes_df, cookpad_repr_recipes_df], ignore_index=True)

In [39]:
repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


## Superclasses of ingredients and techniques

### elbulli

In [40]:
elbulli_super_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_superclasses.csv')

In [41]:
elbulli_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [42]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite"",""agua"",""pimienta"",""gelatina...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento"",""piñones"",""vinag...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras"",""avellanas"",""yemas"",""azú...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo"",""limón"",""alginato"",""agua"",""té"",""azúcar""]","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""ceps"",""carragenato"",""romero"",""ceps"",""c...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [43]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
elbulli_super_recipes_df['ingredients'] = elbulli_super_recipes_df['ingredients'].map(json.loads)
elbulli_super_recipes_df['techniques'] = elbulli_super_recipes_df['techniques'].map(json.loads)

In [44]:
# Derive the creativity label from the recipe year alone:
# 1987-1997 -> 10, 1998-2001 -> 20, 2002-2005 -> 30; any other year falls back
# to the string sentinel 'UNKOWN'. Mapping over the 'year' column avoids
# building a row object per recipe just to read a single field.
elbulli_super_recipes_df['creativity'] = elbulli_super_recipes_df['year'].map(
    lambda year: 10 if 1987 <= year <= 1997 else
                 20 if 1998 <= year <= 2001 else
                 30 if 2002 <= year <= 2005 else
                 'UNKOWN')

In [45]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [46]:
# Load the superclass-term cookpad recipes and keep only the sampled rows.
# NOTE(review): my_sample selects by row label, and the labels were drawn from
# recipes_cookpad_raw.csv; this assumes both CSVs list the recipes in the same
# row order. Confirm.
cookpad_super_recipes_df = my_sample(pd.read_csv('data/dbs/recipes_cookpad_superclasses.csv'))

In [47]:
cookpad_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [48]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
33,100475,langostinos rebozados,2009,"[""levadura"",""claras"",""langostinos"",""pimienta"",...","[""horno""]"
38,100485,lomos de bacalao rebozados,2009,"[""bacalao"",""harina"",""huevos"",""pimienta""]","[""horno""]"
39,100486,cazuela de berenjenas,2009,"[""tomates"",""berenjena"",""aceite"",""queso"",""ajo"",...","[""horno"",""horno"",""plancha"",""horno"",""sartén"",""s..."
44,100549,pimientos y patatas,2009,"[""sal"",""aceite"",""vinagre"",""atún""]","[""sal"",""cocer"",""sal""]"
49,100588,muffins de moka,2009,"[""chocolate"",""mantequilla"",""harina"",""huevos"",""...","[""relleno"",""horno"",""al""]"


In [49]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
cookpad_super_recipes_df['ingredients'] = cookpad_super_recipes_df['ingredients'].map(json.loads)
cookpad_super_recipes_df['techniques'] = cookpad_super_recipes_df['techniques'].map(json.loads)

In [50]:
# All cookpad recipes share the single creativity label 0. A scalar assignment
# broadcasts to every row, so no per-row apply is needed.
cookpad_super_recipes_df['creativity'] = 0

In [51]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
33,100475,langostinos rebozados,2009,"[levadura, claras, langostinos, pimienta, acei...",[horno],0
38,100485,lomos de bacalao rebozados,2009,"[bacalao, harina, huevos, pimienta]",[horno],0
39,100486,cazuela de berenjenas,2009,"[tomates, berenjena, aceite, queso, ajo, sal, ...","[horno, horno, plancha, horno, sartén, sal, ro...",0
44,100549,pimientos y patatas,2009,"[sal, aceite, vinagre, atún]","[sal, cocer, sal]",0
49,100588,muffins de moka,2009,"[chocolate, mantequilla, harina, huevos, café,...","[relleno, horno, al]",0


### elbulli & cookpad

In [52]:
# Stack the elBulli rows on top of the cookpad rows and renumber the index 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
super_recipes_df = pd.concat([elbulli_super_recipes_df, cookpad_super_recipes_df], ignore_index=True)

In [53]:
super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


## Types of ingredients, representative techniques

### elbulli

In [54]:
elbulli_types_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_types.csv')

In [55]:
elbulli_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [56]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""western"",""western"",""1"",""western"",""modernist""...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""western"",""western"",""western"",""1"",""western"",""...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""western"",""western"",""1"",""western"",""western"",""...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""1"",""western"",""modernist"",""1"",""western"",""west...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""western"",""modernist"",""modernist"",""modernist""...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [57]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
elbulli_types_recipes_df['ingredients'] = elbulli_types_recipes_df['ingredients'].map(json.loads)
elbulli_types_recipes_df['techniques'] = elbulli_types_recipes_df['techniques'].map(json.loads)

In [58]:
# Derive the creativity label from the recipe year alone:
# 1987-1997 -> 10, 1998-2001 -> 20, 2002-2005 -> 30; any other year falls back
# to the string sentinel 'UNKOWN'. Mapping over the 'year' column avoids
# building a row object per recipe just to read a single field.
elbulli_types_recipes_df['creativity'] = elbulli_types_recipes_df['year'].map(
    lambda year: 10 if 1987 <= year <= 1997 else
                 20 if 1998 <= year <= 2001 else
                 30 if 2002 <= year <= 2005 else
                 'UNKOWN')

In [59]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [60]:
# Load the ingredient-type cookpad recipes and keep only the sampled rows.
# NOTE(review): my_sample selects by row label, and the labels were drawn from
# recipes_cookpad_raw.csv; this assumes both CSVs list the recipes in the same
# row order. Confirm.
cookpad_types_recipes_df = my_sample(pd.read_csv('data/dbs/recipes_cookpad_types.csv'))

In [61]:
cookpad_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [62]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
33,100475,langostinos rebozados,2009,"[""western"",""western"",""western"",""western"",""west...","[""horno""]"
38,100485,lomos de bacalao rebozados,2009,"[""western"",""western"",""western"",""western""]","[""horno""]"
39,100486,cazuela de berenjenas,2009,"[""western"",""western"",""western"",""western"",""west...","[""horno"",""horno"",""plancha"",""horno"",""sartén"",""s..."
44,100549,pimientos y patatas,2009,"[""western"",""western"",""western"",""western""]","[""sal"",""cocer"",""sal""]"
49,100588,muffins de moka,2009,"[""western"",""western"",""western"",""western"",""west...","[""relleno"",""horno"",""al baño maria""]"


In [63]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
cookpad_types_recipes_df['ingredients'] = cookpad_types_recipes_df['ingredients'].map(json.loads)
cookpad_types_recipes_df['techniques'] = cookpad_types_recipes_df['techniques'].map(json.loads)

In [64]:
# All cookpad recipes share the single creativity label 0. A scalar assignment
# broadcasts to every row, so no per-row apply is needed.
cookpad_types_recipes_df['creativity'] = 0

In [65]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
33,100475,langostinos rebozados,2009,"[western, western, western, western, western, ...",[horno],0
38,100485,lomos de bacalao rebozados,2009,"[western, western, western, western]",[horno],0
39,100486,cazuela de berenjenas,2009,"[western, western, western, western, western, ...","[horno, horno, plancha, horno, sartén, sal, en...",0
44,100549,pimientos y patatas,2009,"[western, western, western, western]","[sal, cocer, sal]",0
49,100588,muffins de moka,2009,"[western, western, western, western, western, ...","[relleno, horno, al baño maria]",0


### elbulli & cookpad

In [66]:
# Stack the elBulli rows on top of the cookpad rows and renumber the index 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
types_recipes_df = pd.concat([elbulli_types_recipes_df, cookpad_types_recipes_df], ignore_index=True)

In [67]:
types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


## Cuisines of ingredients, representative techniques

### elbulli

In [68]:
elbulli_cuis_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_cuisines.csv')

In [69]:
elbulli_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [70]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""other"",""spicies_and_condimients"",""drinks"",""2...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""spicies_and_condimients"",""vegetables"",""veget...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""sweets"",""spicies_and_condimients"",""nuts"",""sp...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""14"",""fruits"",""other"",""drinks"",""drinks"",""swee...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""spicies_and_condimients"",""other"",""other"",""sp...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [71]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
elbulli_cuis_recipes_df['ingredients'] = elbulli_cuis_recipes_df['ingredients'].map(json.loads)
elbulli_cuis_recipes_df['techniques'] = elbulli_cuis_recipes_df['techniques'].map(json.loads)

In [72]:
# Derive the creativity label from the recipe year alone:
# 1987-1997 -> 10, 1998-2001 -> 20, 2002-2005 -> 30; any other year falls back
# to the string sentinel 'UNKOWN'. Mapping over the 'year' column avoids
# building a row object per recipe just to read a single field.
elbulli_cuis_recipes_df['creativity'] = elbulli_cuis_recipes_df['year'].map(
    lambda year: 10 if 1987 <= year <= 1997 else
                 20 if 1998 <= year <= 2001 else
                 30 if 2002 <= year <= 2005 else
                 'UNKOWN')

In [73]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [74]:
# Load the cuisine-coded cookpad recipes and keep only the sampled rows.
# NOTE(review): my_sample selects by row label, and the labels were drawn from
# recipes_cookpad_raw.csv; this assumes both CSVs list the recipes in the same
# row order. Confirm.
cookpad_cuis_recipes_df = my_sample(pd.read_csv('data/dbs/recipes_cookpad_cuisines.csv'))

In [75]:
cookpad_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [76]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
33,100475,langostinos rebozados,2009,"[""other"",""spicies_and_condimients"",""seafood"",""...","[""horno""]"
38,100485,lomos de bacalao rebozados,2009,"[""seafood"",""cereals"",""spicies_and_condimients""...","[""horno""]"
39,100486,cazuela de berenjenas,2009,"[""vegetables"",""vegetables"",""spicies_and_condim...","[""horno"",""horno"",""plancha"",""horno"",""sartén"",""s..."
44,100549,pimientos y patatas,2009,"[""spicies_and_condimients"",""spicies_and_condim...","[""sal"",""cocer"",""sal""]"
49,100588,muffins de moka,2009,"[""sweets"",""dairy"",""cereals"",""spicies_and_condi...","[""relleno"",""horno"",""al baño maria""]"


In [77]:
# Decode the JSON-encoded 'ingredients' and 'techniques' strings into Python lists.
# json.loads is mapped over each column directly instead of through a row-wise
# DataFrame.apply, whose result shape for list-returning functions has varied
# across pandas versions.
# Not idempotent: re-running this cell on already-decoded lists raises TypeError.
cookpad_cuis_recipes_df['ingredients'] = cookpad_cuis_recipes_df['ingredients'].map(json.loads)
cookpad_cuis_recipes_df['techniques'] = cookpad_cuis_recipes_df['techniques'].map(json.loads)

In [78]:
# All cookpad recipes share the single creativity label 0. A scalar assignment
# broadcasts to every row, so no per-row apply is needed.
cookpad_cuis_recipes_df['creativity'] = 0

In [79]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
33,100475,langostinos rebozados,2009,"[other, spicies_and_condimients, seafood, 1, s...",[horno],0
38,100485,lomos de bacalao rebozados,2009,"[seafood, cereals, spicies_and_condimients, 1]",[horno],0
39,100486,cazuela de berenjenas,2009,"[vegetables, vegetables, spicies_and_condimien...","[horno, horno, plancha, horno, sartén, sal, en...",0
44,100549,pimientos y patatas,2009,"[spicies_and_condimients, spicies_and_condimie...","[sal, cocer, sal]",0
49,100588,muffins de moka,2009,"[sweets, dairy, cereals, spicies_and_condimien...","[relleno, horno, al baño maria]",0


### elbulli & cookpad

In [80]:
# Stack the elBulli rows on top of the cookpad rows and renumber the index 0..n-1.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
cuis_recipes_df = pd.concat([elbulli_cuis_recipes_df, cookpad_cuis_recipes_df], ignore_index=True)

In [81]:
cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


# Formatting data

In [82]:
def join_ingredients_and_techniques(ingr_list, tech_list):
    """Flatten one recipe into a single space-separated string of tokens.

    Whitespace inside each term is collapsed to single underscores so that a
    multi-word term stays one token. Ingredient tokens are prefixed with
    ``i_`` and technique tokens with ``t_``; ingredients come first, and both
    groups keep their input order.
    """
    tokens = []
    for prefix, terms in (('i_', ingr_list), ('t_', tech_list)):
        for term in terms:
            tokens.append(prefix + '_'.join(term.split()))
    return ' '.join(tokens)

In [83]:
columns = ['text', 'creativity']

In [84]:
# One row per recipe: 'text' is the recipe's ingredients and techniques
# flattened into a token string, 'creativity' is copied over unchanged.
raw_text_df = pd.DataFrame({
    'text': list(map(join_ingredients_and_techniques,
                     raw_recipes_df['ingredients'],
                     raw_recipes_df['techniques'])),
    'creativity': raw_recipes_df['creativity'],
}, columns=columns)

In [85]:
raw_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimiento i_pimiento_rojo i_piñones_tos...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [86]:
# One row per recipe: 'text' is the recipe's ingredients and techniques
# flattened into a token string, 'creativity' is copied over unchanged.
repr_text_df = pd.DataFrame({
    'text': list(map(join_ingredients_and_techniques,
                     repr_recipes_df['ingredients'],
                     repr_recipes_df['techniques'])),
    'creativity': repr_recipes_df['creativity'],
}, columns=columns)

In [87]:
repr_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimientos i_pimiento_rojo i_piñones_to...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [88]:
# One row per recipe: 'text' is the recipe's ingredients and techniques
# flattened into a token string, 'creativity' is copied over unchanged.
super_text_df = pd.DataFrame({
    'text': list(map(join_ingredients_and_techniques,
                     super_recipes_df['ingredients'],
                     super_recipes_df['techniques'])),
    'creativity': super_recipes_df['creativity'],
}, columns=columns)

In [89]:
super_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite i_agua i_pimienta i_gelatina...,10
1,i_sal i_pimientos i_pimiento i_piñones i_vinag...,10
2,i_chocolate i_claras i_avellanas i_yemas i_azú...,10
3,i_zumo i_limón i_alginato i_agua i_té i_azúcar...,30
4,i_sal i_ceps i_carragenato i_romero i_ceps i_c...,30


In [90]:
# One row per recipe: 'text' is the recipe's ingredients and techniques
# flattened into a token string, 'creativity' is copied over unchanged.
types_text_df = pd.DataFrame({
    'text': list(map(join_ingredients_and_techniques,
                     types_recipes_df['ingredients'],
                     types_recipes_df['techniques'])),
    'creativity': types_recipes_df['creativity'],
}, columns=columns)

In [91]:
types_text_df.head()

Unnamed: 0,text,creativity
0,i_western i_western i_1 i_western i_modernist ...,10
1,i_western i_western i_western i_1 i_western i_...,10
2,i_western i_western i_1 i_western i_western i_...,10
3,i_1 i_western i_modernist i_1 i_western i_west...,30
4,i_western i_modernist i_modernist i_modernist ...,30


In [92]:
# One row per recipe: 'text' is the recipe's ingredients and techniques
# flattened into a token string, 'creativity' is copied over unchanged.
cuis_text_df = pd.DataFrame({
    'text': list(map(join_ingredients_and_techniques,
                     cuis_recipes_df['ingredients'],
                     cuis_recipes_df['techniques'])),
    'creativity': cuis_recipes_df['creativity'],
}, columns=columns)

In [93]:
cuis_text_df.head()

Unnamed: 0,text,creativity
0,i_other i_spicies_and_condimients i_drinks i_2...,10
1,i_spicies_and_condimients i_vegetables i_veget...,10
2,i_sweets i_spicies_and_condimients i_nuts i_sp...,10
3,i_14 i_fruits i_other i_drinks i_drinks i_swee...,30
4,i_spicies_and_condimients i_other i_other i_sp...,30


# Classification

In [94]:
def update(d1, d2):
    """Return a new dict holding the entries of ``d1`` overlaid with those of ``d2``.

    Neither argument is modified; on duplicate keys the value from ``d2`` wins.
    """
    merged = {}
    merged.update(d1)
    merged.update(d2)
    return merged

In [95]:
# Passed as the second argument to StratifiedKFold in every feature-set section below,
# i.e. the number of cross-validation folds used by the grid search.
K = 10

In [96]:
# Parameter-grid entries shared by every classifier. All of them are currently commented
# out, so the dict is empty. Keys use the '<step>__<param>' form, where the step names
# ('vect', 'tfidf', 'clf') are the ones given to the Pipeline built in the grid-search cells.
parameters = {
#     'vect__max_df': (0.5, 0.8, 1.0),
#     'vect__min_df': (0.0, 0.2, 1),
#     'tfidf__norm': (None, 'l1', 'l2'),
#     'tfidf__use_idf': (True, False),
}

# Each entry is (display name, estimator class, parameter grid). The grid-search cells
# instantiate the class and hand the grid to GridSearchCV. Only BernoulliNB is enabled;
# the remaining candidates are kept commented out.
classifiers = [
    ('BernoulliNB', BernoulliNB, update(parameters, {
#         'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3),
#         'clf__fit_prior': (False, True),
    })),
#     ('KNeighborsClassifier', KNeighborsClassifier, update(parameters, {
#         'clf__weights': ('uniform', 'distance'),
#         'clf__metric': ('euclidean', 'manhattan', 'minkowski')
#     })),
#     ('LinearSVC', LinearSVC, [
#         update(parameters, {
#         'clf__loss': ('squared_hinge', 'hinge',),
#         'clf__tol': (1e-1, 1e-2, 1e-3),
#         }),
#         update(parameters, {
#         'clf__loss': ('hinge',),
#         'clf__penalty': ('l1', 'l2'),
#         'clf__dual': (False,),
#         'clf__tol': (1e-1, 1e-2, 1e-3),
#         }),
#     ]),
#     ('MultinomialNB', MultinomialNB, update(parameters, {
#         'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3),
#         'clf__fit_prior': (False, True),
#     })),
#     ('NearestCentroid', NearestCentroid, update(parameters, {})),
#     ('PassiveAggressiveClassifier', PassiveAggressiveClassifier, update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'clf__loss': ('squared_hinge', 'hinge'),
#         'clf__n_iter': (5, 10, 20),
#         'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
#         'clf__warm_start': (False, True),
#     })),
#     ('Perceptron', Perceptron, update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'tfidf__norm': (None, 'l2'),
#         'clf__penalty': (None, 'l1', 'l2', 'elasticnet'),
#         'clf__alpha': (1e-4, 1e-5, 1e-6),
#         'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
#         'clf__warm_start': (False, True),
#     })),
#     ('RandomForestClassifier', RandomForestClassifier, update(parameters, {
#         'clf__n_estimators': (10, 20, 30),
#         'clf__criterion': ('gini', 'entropy'),
#         'clf__max_depth': (None, 5, 10),
#         'clf__class_weight': (None, {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
#         'clf__warm_start': (False, True),})),
#     ('RidgeClassifier', RidgeClassifier, update(parameters, {
#         'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3),
#         'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
#         'clf__normalize': (True, False),
#         'clf__tol': (1e-1, 1e-2, 1e-3),
#     })),
#     ('SGDClassifier', SGDClassifier, [
#         update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'clf__loss': ('hinge', 'log', 'modified_huber', 'perceptron',),
#         'clf__penalty': ('none', 'l1', 'l2', 'elasticnet'),
#         'clf__class_weight': (None, 'balanced'),
#         'clf__warm_start': (False, True),
#         }),
#         update(parameters, {
#         'vect__min_df': (0.0, 1),
#         'clf__loss': ('hinge', 'log', 'modified_huber', 'perceptron',),
#         'clf__penalty': ('none', 'l1', 'l2', 'elasticnet'),
#         'clf__alpha': (1e-5, 1e-6),
#         'clf__class_weight': ({0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24},),
#         'clf__warm_start': (False, True),
#         }),
#     ]),
]

In [97]:
def benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name):
    """Fit a grid search, evaluate it on the held-out split and print a report.

    Parameters
    ----------
    grid_search_cv : object exposing ``fit``, ``predict``, ``best_score_``,
        ``best_params_`` and ``best_estimator_`` (a GridSearchCV in this notebook).
    X_train, y_train : data the search is fitted on.
    X_test, y_test : data the refitted search is scored on.
    name : label used in the progress messages.

    Returns
    -------
    dict with the keys 'best_estimator', 'best_score', 'best_parameters',
    'accuracy', 'precision', 'recall' and 'f1_score'. Every score is rounded
    to two decimals; precision, recall and F1 use ``average='weighted'``.

    The classification report hard-codes four target names
    ('None', 'Low', 'Medium', 'High'), so it presumably expects exactly four
    label classes in ``y_test`` -- verify before reusing with other labels.
    """
    def two_decimals(value):
        # Round by formatting to two decimals and parsing the text back.
        return float('%0.2f' % value)

    print('Training %s...' % name)
    fit_started = time()
    grid_search_cv.fit(X_train, y_train)
    print('Training time: %0.3fs' % (time() - fit_started))
    print()
    score = two_decimals(grid_search_cv.best_score_)
    print('Best score:', score)
    print('Best parameters:', grid_search_cv.best_params_)
    print()

    print('Testing %s...' % name)
    predict_started = time()
    predicted = grid_search_cv.predict(X_test)
    print('Testing time:  %0.3fs' % (time() - predict_started))
    print()

    print('Metrics:')
    accuracy = two_decimals(metrics.accuracy_score(y_test, predicted))
    print('accuracy  =', accuracy)
    precision = two_decimals(metrics.precision_score(y_test, predicted, average='weighted'))
    print('precision =', precision)
    recall = two_decimals(metrics.recall_score(y_test, predicted, average='weighted'))
    print('recall    =', recall)
    f1_score = two_decimals(metrics.f1_score(y_test, predicted, average='weighted'))
    print('f1_score  =', f1_score)
    print()

    print('Classification report:')
    class_names = ['None', 'Low', 'Medium', 'High']
    print(metrics.classification_report(y_test, predicted, target_names=class_names))
    print()
    print('Confusion matrix:')
    print(metrics.confusion_matrix(y_test, predicted))
    print()

    return {
        'best_estimator': grid_search_cv.best_estimator_,
        'best_score': score,
        'best_parameters': grid_search_cv.best_params_,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
    }

In [98]:
def update(d1, d2):
    """Return a fresh dict with ``d1``'s entries overridden/extended by ``d2``'s.

    NOTE(review): this repeats the ``update`` definition from the earlier cell
    (In [94]) verbatim in behavior; the redefinition is redundant and can be removed.
    """
    combined = dict(d1)
    combined.update(d2)
    return combined

## Raw ingredients and techniques

In [99]:
# Features are the 'text' column, labels the 'creativity' column of the raw-text frame.
X, y = raw_text_df['text'], raw_text_df['creativity']

In [100]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [101]:
# Hold out 10% of the samples as the test set (test_size=0.1); stratify=y splits per label
# and random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [102]:
# Stratified folds over the training labels, with K as the fold-count argument;
# passed as `cv` to GridSearchCV in the next cell.
skf = StratifiedKFold(y_train, K)

In [103]:
# Grid-search every configured classifier on the raw ingredients/techniques text and
# collect each benchmark() result dict in best_estimators.
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
best_estimators = []
# The loop target is `param_grid`, not `parameters`: cells share one namespace, so reusing
# the name would rebind the module-level `parameters` dict to the last classifier's grid.
for name, clf, param_grid in classifiers:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf()),  # `clf` is the estimator class; a fresh instance is built per run
    ])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    best_estimators.append(benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name))
    print('-' * 80)

Performing grid search with cross-validation...

Training BernoulliNB...
Training time: 0.437s

Best score: 0.78
Best parameters: {}

Testing BernoulliNB...
Testing time:  0.005s

Metrics:
accuracy  = 0.78
precision = 0.78
recall    = 0.78
f1_score  = 0.77

Classification report:
             precision    recall  f1-score   support

       None       0.81      0.95      0.87        40
        Low       0.81      0.56      0.66        45
     Medium       0.68      0.73      0.70        37
       High       0.81      0.90      0.85        39

avg / total       0.78      0.78      0.77       161


Confusion matrix:
[[38  0  0  2]
 [ 3 25 13  4]
 [ 5  3 27  2]
 [ 1  3  0 35]]

--------------------------------------------------------------------------------


In [104]:
# Persist the benchmark results collected for the raw feature set.
with open('data/raw_best_estimators.pickle', mode='wb') as f:
    pickle.dump(best_estimators, file=f)

## Representative ingredients and techniques

In [105]:
# Features are the 'text' column, labels the 'creativity' column of the representative-text frame.
X, y = repr_text_df['text'], repr_text_df['creativity']

In [106]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [107]:
# Hold out 10% of the samples as the test set (test_size=0.1); stratify=y splits per label
# and random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [108]:
# Stratified folds over the training labels, with K as the fold-count argument;
# passed as `cv` to GridSearchCV in the next cell.
skf = StratifiedKFold(y_train, K)

In [109]:
# Grid-search every configured classifier on the representative ingredients/techniques
# text and collect each benchmark() result dict in best_estimators.
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
best_estimators = []
# The loop target is `param_grid`, not `parameters`: cells share one namespace, so reusing
# the name would rebind the module-level `parameters` dict to the last classifier's grid.
for name, clf, param_grid in classifiers:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf()),  # `clf` is the estimator class; a fresh instance is built per run
    ])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    best_estimators.append(benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name))
    print('-' * 80)

Performing grid search with cross-validation...

Training BernoulliNB...
Training time: 0.342s

Best score: 0.78
Best parameters: {}

Testing BernoulliNB...
Testing time:  0.006s

Metrics:
accuracy  = 0.76
precision = 0.77
recall    = 0.76
f1_score  = 0.76

Classification report:
             precision    recall  f1-score   support

       None       0.89      0.97      0.93        40
        Low       0.79      0.58      0.67        45
     Medium       0.68      0.68      0.68        37
       High       0.70      0.85      0.77        39

avg / total       0.77      0.76      0.76       161


Confusion matrix:
[[39  0  0  1]
 [ 2 26 10  7]
 [ 2  4 25  6]
 [ 1  3  2 33]]

--------------------------------------------------------------------------------


In [110]:
# Persist the benchmark results collected for the representative feature set.
with open('data/repr_best_estimators.pickle', mode='wb') as f:
    pickle.dump(best_estimators, file=f)

## Superclasses of ingredients and techniques

In [111]:
# Features are the 'text' column, labels the 'creativity' column of the superclass-text frame.
X, y = super_text_df['text'], super_text_df['creativity']

In [112]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [113]:
# Hold out 10% of the samples as the test set (test_size=0.1); stratify=y splits per label
# and random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [114]:
# Stratified folds over the training labels, with K as the fold-count argument;
# passed as `cv` to GridSearchCV in the next cell.
skf = StratifiedKFold(y_train, K)

In [115]:
# Grid-search every configured classifier on the superclass ingredients/techniques text
# and collect each benchmark() result dict in best_estimators.
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
best_estimators = []
# The loop target is `param_grid`, not `parameters`: cells share one namespace, so reusing
# the name would rebind the module-level `parameters` dict to the last classifier's grid.
for name, clf, param_grid in classifiers:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf()),  # `clf` is the estimator class; a fresh instance is built per run
    ])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    best_estimators.append(benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name))
    print('-' * 80)

Performing grid search with cross-validation...

Training BernoulliNB...
Training time: 0.338s

Best score: 0.71
Best parameters: {}

Testing BernoulliNB...
Testing time:  0.005s

Metrics:
accuracy  = 0.68
precision = 0.68
recall    = 0.68
f1_score  = 0.67

Classification report:
             precision    recall  f1-score   support

       None       0.84      0.90      0.87        40
        Low       0.71      0.44      0.55        45
     Medium       0.56      0.62      0.59        37
       High       0.61      0.77      0.68        39

avg / total       0.68      0.68      0.67       161


Confusion matrix:
[[36  0  0  4]
 [ 3 20 13  9]
 [ 3  5 23  6]
 [ 1  3  5 30]]

--------------------------------------------------------------------------------


In [116]:
# Persist the benchmark results collected for the superclass feature set.
with open('data/super_best_estimators.pickle', mode='wb') as f:
    pickle.dump(best_estimators, file=f)

## Types of ingredients, representative techniques

In [117]:
# Features are the 'text' column, labels the 'creativity' column of the types-text frame.
X, y = types_text_df['text'], types_text_df['creativity']

In [118]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [119]:
# Hold out 10% of the samples as the test set (test_size=0.1); stratify=y splits per label
# and random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [120]:
# Stratified folds over the training labels, with K as the fold-count argument;
# passed as `cv` to GridSearchCV in the next cell.
skf = StratifiedKFold(y_train, K)

In [121]:
# Grid-search every configured classifier on the ingredient-types / representative
# techniques text and collect each benchmark() result dict in best_estimators.
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
best_estimators = []
# The loop target is `param_grid`, not `parameters`: cells share one namespace, so reusing
# the name would rebind the module-level `parameters` dict to the last classifier's grid.
for name, clf, param_grid in classifiers:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf()),  # `clf` is the estimator class; a fresh instance is built per run
    ])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    best_estimators.append(benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name))
    print('-' * 80)

Performing grid search with cross-validation...

Training BernoulliNB...
Training time: 0.328s

Best score: 0.65
Best parameters: {}

Testing BernoulliNB...
Testing time:  0.005s

Metrics:
accuracy  = 0.6
precision = 0.58
recall    = 0.6
f1_score  = 0.58

Classification report:
             precision    recall  f1-score   support

       None       0.81      0.97      0.89        40
        Low       0.50      0.44      0.47        45
     Medium       0.43      0.35      0.39        37
       High       0.56      0.62      0.59        39

avg / total       0.58      0.60      0.58       161


Confusion matrix:
[[39  0  0  1]
 [ 5 20 11  9]
 [ 4 11 13  9]
 [ 0  9  6 24]]

--------------------------------------------------------------------------------


In [122]:
# Persist the benchmark results collected for the ingredient-types feature set.
with open('data/types_best_estimators.pickle', mode='wb') as f:
    pickle.dump(best_estimators, file=f)

## Cuisines of ingredients, representative techniques

In [123]:
# Features are the 'text' column, labels the 'creativity' column of the cuisines-text frame.
X, y = cuis_text_df['text'], cuis_text_df['creativity']

In [124]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [125]:
# Hold out 10% of the samples as the test set (test_size=0.1); stratify=y splits per label
# and random_state=0 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [126]:
# Stratified folds over the training labels, with K as the fold-count argument;
# passed as `cv` to GridSearchCV in the next cell.
skf = StratifiedKFold(y_train, K)

In [127]:
# Grid-search every configured classifier on the ingredient-cuisines / representative
# techniques text and collect each benchmark() result dict in best_estimators.
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
best_estimators = []
# The loop target is `param_grid`, not `parameters`: cells share one namespace, so reusing
# the name would rebind the module-level `parameters` dict to the last classifier's grid.
for name, clf, param_grid in classifiers:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf()),  # `clf` is the estimator class; a fresh instance is built per run
    ])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    best_estimators.append(benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name))
    print('-' * 80)

Performing grid search with cross-validation...

Training BernoulliNB...
Training time: 0.325s

Best score: 0.61
Best parameters: {}

Testing BernoulliNB...
Testing time:  0.004s

Metrics:
accuracy  = 0.52
precision = 0.49
recall    = 0.52
f1_score  = 0.5

Classification report:
             precision    recall  f1-score   support

       None       0.66      0.88      0.75        40
        Low       0.47      0.42      0.45        45
     Medium       0.32      0.22      0.26        37
       High       0.49      0.54      0.51        39

avg / total       0.49      0.52      0.50       161


Confusion matrix:
[[35  0  2  3]
 [ 7 19 11  8]
 [ 7 11  8 11]
 [ 4 10  4 21]]

--------------------------------------------------------------------------------


In [128]:
# Persist the benchmark results collected for the cuisines feature set.
with open('data/cuis_best_estimators.pickle', mode='wb') as f:
    pickle.dump(best_estimators, file=f)

In [129]:
# # split a training set and a test set
# y_train = y_data
# y_test = y_val

# vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)

# print("Extracting features from the training data using a sparse vectorizer")
# X_train = vectorizer.fit_transform(X_data)
# print("n_samples: %d, n_features: %d" % X_train.shape)
# print()

# print("Extracting features from the test data using the same vectorizer")
# X_test = vectorizer.transform(X_data)
# print("n_samples: %d, n_features: %d" % X_test.shape)
# print()

In [130]:
# # mapping from integer feature name to original token string
# feature_names = vectorizer.get_feature_names()
# opts_select_chi2 = 50

# print("Extracting %d best features by a chi-squared test" %
#       opts_select_chi2)
# ch2 = SelectKBest(chi2, k=opts_select_chi2)
# X_train = ch2.fit_transform(X_train, y_train)
# X_test = ch2.transform(X_test)
# if feature_names:
#     # keep selected feature names
#     feature_names = [feature_names[i] for i
#                      in ch2.get_support(indices=True)]
# print()

# feature_names = np.asarray(feature_names)

In [131]:
# feature_names