In [1]:
import json
from random import randint
from time import time

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Classifiers obtained from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

# Original data

## Preparing sets' sizes

In [2]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [3]:
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [4]:
elbulli_raw_recipes_df['creativity'].value_counts()

10    454
30    389
20    371
Name: creativity, dtype: int64

In [5]:
cookpad_size = int(np.average(elbulli_raw_recipes_df['creativity'].value_counts()))

In [6]:
cookpad_size

404

In [7]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')

In [8]:
cookpad_sample = cookpad_raw_recipes_df.sample(cookpad_size)

In [9]:
cookpad_indices = cookpad_sample.index.sort_values()

In [10]:
cookpad_indices

Int64Index([  48,  102,  131,  157,  181,  194,  227,  228,  243,  247,
            ...
            7713, 7741, 7778, 7841, 7863, 7866, 7911, 7960, 7963, 7970],
           dtype='int64', length=404)

In [11]:
def my_sample(df, indices=cookpad_indices):
    return df.ix[indices]

## Raw ingredients and techniques

### elbulli

In [12]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [13]:
elbulli_raw_recipes_df.shape

(1214, 5)

In [14]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimiento"",""pimiento rojo"",""piñones tos...","[""sal"",""marcar"",""cocción"",""asado"",""hirviendo"",..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""al horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""fría""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""asar"",""cocer"",""hervido"",""confitar"",""ca..."


In [15]:
elbulli_raw_recipes_df['ingredients'] = elbulli_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_raw_recipes_df['techniques'] = elbulli_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [16]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno..."
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,..."


In [17]:
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [18]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


### cookpad

In [19]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')
cookpad_raw_recipes_df = my_sample(cookpad_raw_recipes_df)

In [20]:
cookpad_raw_recipes_df.shape

(404, 5)

In [21]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[""tomate"",""tocino"",""lechugas"",""vino"",""mantequi...","[""al horno"",""sal"",""baño maría""]"
102,101608,ensalada de verano con patatas,2010,"[""naranjas"",""orégano"",""vinagre de modena"",""oli...","[""sal"",""agua"",""cocer""]"
131,102187,torta de pan con sardinas de bota,2009,"[""pan"",""aceite de oliva virgen extra"",""sardinas""]","[""fria"",""cocer"",""horno""]"
157,102249,setas revueltas con gulas,2009,"[""aceite de oliva virgen"",""gulas"",""ajo"",""huevo...","[""sartén"",""frito""]"
181,102280,helado de limón con yogur,2009,"[""zumo"",""azúcar"",""yogures""]","[""secamos""]"


In [22]:
cookpad_raw_recipes_df['ingredients'] = cookpad_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_raw_recipes_df['techniques'] = cookpad_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [23]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[tomate, tocino, lechugas, vino, mantequilla, ...","[al horno, sal, baño maría]"
102,101608,ensalada de verano con patatas,2010,"[naranjas, orégano, vinagre de modena, olivas,...","[sal, agua, cocer]"
131,102187,torta de pan con sardinas de bota,2009,"[pan, aceite de oliva virgen extra, sardinas]","[fria, cocer, horno]"
157,102249,setas revueltas con gulas,2009,"[aceite de oliva virgen, gulas, ajo, huevos, s...","[sartén, frito]"
181,102280,helado de limón con yogur,2009,"[zumo, azúcar, yogures]",[secamos]


In [24]:
cookpad_raw_recipes_df['creativity'] = cookpad_raw_recipes_df.apply(
    lambda x: 0, axis=1)

In [25]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
48,100571,pastel de fiambre,2009,"[tomate, tocino, lechugas, vino, mantequilla, ...","[al horno, sal, baño maría]",0
102,101608,ensalada de verano con patatas,2010,"[naranjas, orégano, vinagre de modena, olivas,...","[sal, agua, cocer]",0
131,102187,torta de pan con sardinas de bota,2009,"[pan, aceite de oliva virgen extra, sardinas]","[fria, cocer, horno]",0
157,102249,setas revueltas con gulas,2009,"[aceite de oliva virgen, gulas, ajo, huevos, s...","[sartén, frito]",0
181,102280,helado de limón con yogur,2009,"[zumo, azúcar, yogures]",[secamos],0


### elbulli & cookpad

In [26]:
raw_recipes_df = elbulli_raw_recipes_df.append(cookpad_raw_recipes_df, ignore_index=True)

In [27]:
raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


In [28]:
raw_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99269,pollo salteado con setas,2009,"[pimentón, aceite de oliva, ajo, perejil, seta...","[freír, salar]",0
1614,99683,gazpaxo cordobés,2009,"[tomates maduros, vinagre, aceite, ajo, pimien...","[sal, hirviendo, agua, cocer]",0
1615,99876,consomé de pollo,2009,"[chalote, apio, puerros, zanahorias, pechugas ...","[espuma, a fuego lento, cocer, reducido, fría,...",0
1616,99954,pimientos asados en microondas,2009,"[pimientos, aceite de oliva]","[crudo, microondas]",0
1617,99978,revuelto de espárragos con gambas,2009,"[esparragos, aceite de oliva, huevos, ajo, gam...","[sal, dorar, sartén]",0


## Representative ingredients and techniques

### elbulli

In [29]:
elbulli_repr_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_representatives.csv')

In [30]:
elbulli_repr_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [31]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento rojo"",""piñones to...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [32]:
elbulli_repr_recipes_df['ingredients'] = elbulli_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_repr_recipes_df['techniques'] = elbulli_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [33]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [34]:
elbulli_repr_recipes_df['creativity'] = elbulli_repr_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [35]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [36]:
cookpad_repr_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_representatives.csv')
cookpad_repr_recipes_df = my_sample(cookpad_repr_recipes_df)

In [37]:
cookpad_repr_recipes_df.shape

(404, 5)

In [38]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[""tomate"",""tocino"",""lechuga"",""vino"",""mantequil...","[""horno"",""sal"",""al baño maria""]"
102,101608,ensalada de verano con patatas,2010,"[""naranja"",""orégano"",""vinagre de módena"",""acei...","[""sal"",""agua"",""cocer""]"
131,102187,torta de pan con sardinas de bota,2009,"[""pan"",""aceite de oliva virgen extra"",""sardinas""]","[""horno"",""cocer"",""horno""]"
157,102249,setas revueltas con gulas,2009,"[""aceite de oliva virgen"",""gulas"",""ajo"",""huevo...","[""sartén"",""horno""]"
181,102280,helado de limón con yogur,2009,"[""zumo"",""azúcar"",""yogur""]","[""secar""]"


In [39]:
cookpad_repr_recipes_df['ingredients'] = cookpad_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_repr_recipes_df['techniques'] = cookpad_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [40]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[tomate, tocino, lechuga, vino, mantequilla, c...","[horno, sal, al baño maria]"
102,101608,ensalada de verano con patatas,2010,"[naranja, orégano, vinagre de módena, aceituna...","[sal, agua, cocer]"
131,102187,torta de pan con sardinas de bota,2009,"[pan, aceite de oliva virgen extra, sardinas]","[horno, cocer, horno]"
157,102249,setas revueltas con gulas,2009,"[aceite de oliva virgen, gulas, ajo, huevos, s...","[sartén, horno]"
181,102280,helado de limón con yogur,2009,"[zumo, azúcar, yogur]",[secar]


In [41]:
cookpad_repr_recipes_df['creativity'] = cookpad_repr_recipes_df.apply(
    lambda x: 0, axis=1)

In [42]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
48,100571,pastel de fiambre,2009,"[tomate, tocino, lechuga, vino, mantequilla, c...","[horno, sal, al baño maria]",0
102,101608,ensalada de verano con patatas,2010,"[naranja, orégano, vinagre de módena, aceituna...","[sal, agua, cocer]",0
131,102187,torta de pan con sardinas de bota,2009,"[pan, aceite de oliva virgen extra, sardinas]","[horno, cocer, horno]",0
157,102249,setas revueltas con gulas,2009,"[aceite de oliva virgen, gulas, ajo, huevos, s...","[sartén, horno]",0
181,102280,helado de limón con yogur,2009,"[zumo, azúcar, yogur]",[secar],0


### elbulli & cookpad

In [43]:
repr_recipes_df = elbulli_repr_recipes_df.append(cookpad_repr_recipes_df, ignore_index=True)

In [44]:
repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


In [45]:
repr_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99269,pollo salteado con setas,2009,"[pimentón, aceite de oliva, ajo, perejil, seta...","[horno, sal]",0
1614,99683,gazpaxo cordobés,2009,"[tomates maduros, vinagre, aceite, ajo, pimien...","[sal, hervir, agua, cocer]",0
1615,99876,consomé de pollo,2009,"[chalotes, apio, puerro, zanahoria, pechugas d...","[espuma, a fuego lento, cocer, reducir, horno,...",0
1616,99954,pimientos asados en microondas,2009,"[pimiento, aceite de oliva]","[crudo, microondas]",0
1617,99978,revuelto de espárragos con gambas,2009,"[espárragos, aceite de oliva, huevos, ajo, gam...","[sal, horno, sartén]",0


## Superclasses of ingredients and techniques

### elbulli

In [46]:
elbulli_super_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_superclasses.csv')

In [47]:
elbulli_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [48]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite"",""agua"",""pimienta"",""gelatina...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento"",""piñones"",""vinag...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras"",""avellanas"",""yemas"",""azú...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo"",""limón"",""alginato"",""agua"",""té"",""azúcar""]","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""ceps"",""carragenato"",""romero"",""ceps"",""c...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [49]:
elbulli_super_recipes_df['ingredients'] = elbulli_super_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_super_recipes_df['techniques'] = elbulli_super_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [50]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [51]:
elbulli_super_recipes_df['creativity'] = elbulli_super_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [52]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [53]:
cookpad_super_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_superclasses.csv')
cookpad_super_recipes_df = my_sample(cookpad_super_recipes_df)

In [54]:
cookpad_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [55]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[""tomate"",""tocino"",""lechuga"",""vino"",""mantequil...","[""horno"",""sal"",""al""]"
102,101608,ensalada de verano con patatas,2010,"[""naranja"",""orégano"",""vinagre"",""aceitunas"",""hu...","[""sal"",""agua"",""cocer""]"
131,102187,torta de pan con sardinas de bota,2009,"[""pan"",""aceite"",""sardinas""]","[""horno"",""cocer"",""horno""]"
157,102249,setas revueltas con gulas,2009,"[""aceite"",""gulas"",""ajo"",""huevos"",""sal"",""setas""]","[""sartén"",""horno""]"
181,102280,helado de limón con yogur,2009,"[""zumo"",""azúcar"",""yogur""]","[""secar""]"


In [56]:
cookpad_super_recipes_df['ingredients'] = cookpad_super_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_super_recipes_df['techniques'] = cookpad_super_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [57]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[tomate, tocino, lechuga, vino, mantequilla, c...","[horno, sal, al]"
102,101608,ensalada de verano con patatas,2010,"[naranja, orégano, vinagre, aceitunas, huevos,...","[sal, agua, cocer]"
131,102187,torta de pan con sardinas de bota,2009,"[pan, aceite, sardinas]","[horno, cocer, horno]"
157,102249,setas revueltas con gulas,2009,"[aceite, gulas, ajo, huevos, sal, setas]","[sartén, horno]"
181,102280,helado de limón con yogur,2009,"[zumo, azúcar, yogur]",[secar]


In [58]:
cookpad_super_recipes_df['creativity'] = cookpad_super_recipes_df.apply(
    lambda x: 0, axis=1)

In [59]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
48,100571,pastel de fiambre,2009,"[tomate, tocino, lechuga, vino, mantequilla, c...","[horno, sal, al]",0
102,101608,ensalada de verano con patatas,2010,"[naranja, orégano, vinagre, aceitunas, huevos,...","[sal, agua, cocer]",0
131,102187,torta de pan con sardinas de bota,2009,"[pan, aceite, sardinas]","[horno, cocer, horno]",0
157,102249,setas revueltas con gulas,2009,"[aceite, gulas, ajo, huevos, sal, setas]","[sartén, horno]",0
181,102280,helado de limón con yogur,2009,"[zumo, azúcar, yogur]",[secar],0


### elbulli & cookpad

In [60]:
super_recipes_df = elbulli_super_recipes_df.append(cookpad_super_recipes_df, ignore_index=True)

In [61]:
super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


In [62]:
super_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99269,pollo salteado con setas,2009,"[pimentón, aceite, ajo, perejil, setas, pollo,...","[horno, sal]",0
1614,99683,gazpaxo cordobés,2009,"[tomates, vinagre, aceite, ajo, pimienta, huev...","[sal, hervir, agua, cocer]",0
1615,99876,consomé de pollo,2009,"[chalotes, apio, puerro, zanahoria, pollo, hue...","[espuma, a, cocer, reducir, horno, cocción, ag...",0
1616,99954,pimientos asados en microondas,2009,"[pimiento, aceite]","[crudo, microondas]",0
1617,99978,revuelto de espárragos con gambas,2009,"[espárragos, aceite, huevos, ajo, gambas, sal]","[sal, horno, sartén]",0


## Types of ingredients, representative techniques

### elbulli

In [63]:
elbulli_types_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_types.csv')

In [64]:
elbulli_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [65]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""western"",""western"",""1"",""western"",""modernist""...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""western"",""western"",""western"",""1"",""western"",""...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""western"",""western"",""1"",""western"",""western"",""...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""1"",""western"",""modernist"",""1"",""western"",""west...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""western"",""modernist"",""modernist"",""modernist""...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [66]:
elbulli_types_recipes_df['ingredients'] = elbulli_types_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_types_recipes_df['techniques'] = elbulli_types_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [67]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua]
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [68]:
elbulli_types_recipes_df['creativity'] = elbulli_types_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [69]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [70]:
cookpad_types_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_types.csv')
cookpad_types_recipes_df = my_sample(cookpad_types_recipes_df)

In [71]:
cookpad_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [72]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[""western"",""western"",""western"",""western"",""west...","[""horno"",""sal"",""al baño maria""]"
102,101608,ensalada de verano con patatas,2010,"[""western"",""western"",""western"",""western"",""west...","[""sal"",""agua"",""cocer""]"
131,102187,torta de pan con sardinas de bota,2009,"[""western"",""western"",""western""]","[""horno"",""cocer"",""horno""]"
157,102249,setas revueltas con gulas,2009,"[""western"",""western"",""western"",""western"",""west...","[""sartén"",""horno""]"
181,102280,helado de limón con yogur,2009,"[""western"",""western"",""western""]","[""secar""]"


In [73]:
cookpad_types_recipes_df['ingredients'] = cookpad_types_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_types_recipes_df['techniques'] = cookpad_types_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [74]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[western, western, western, western, western, ...","[horno, sal, al baño maria]"
102,101608,ensalada de verano con patatas,2010,"[western, western, western, western, western, ...","[sal, agua, cocer]"
131,102187,torta de pan con sardinas de bota,2009,"[western, western, western]","[horno, cocer, horno]"
157,102249,setas revueltas con gulas,2009,"[western, western, western, western, western, ...","[sartén, horno]"
181,102280,helado de limón con yogur,2009,"[western, western, western]",[secar]


In [75]:
cookpad_types_recipes_df['creativity'] = cookpad_types_recipes_df.apply(
    lambda x: 0, axis=1)

In [76]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
48,100571,pastel de fiambre,2009,"[western, western, western, western, western, ...","[horno, sal, al baño maria]",0
102,101608,ensalada de verano con patatas,2010,"[western, western, western, western, western, ...","[sal, agua, cocer]",0
131,102187,torta de pan con sardinas de bota,2009,"[western, western, western]","[horno, cocer, horno]",0
157,102249,setas revueltas con gulas,2009,"[western, western, western, western, western, ...","[sartén, horno]",0
181,102280,helado de limón con yogur,2009,"[western, western, western]",[secar],0


### elbulli & cookpad

In [77]:
types_recipes_df = elbulli_types_recipes_df.append(cookpad_types_recipes_df, ignore_index=True)

In [78]:
types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


## Cuisines of ingredients, representative techniques

### elbulli

In [79]:
elbulli_cuis_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_cuisines.csv')

In [80]:
elbulli_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [81]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""other"",""spicies_and_condimients"",""drinks"",""2...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""spicies_and_condimients"",""vegetables"",""veget...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""sweets"",""spicies_and_condimients"",""nuts"",""sp...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""14"",""fruits"",""other"",""drinks"",""drinks"",""swee...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""spicies_and_condimients"",""other"",""other"",""sp...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [82]:
elbulli_cuis_recipes_df['ingredients'] = elbulli_cuis_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_cuis_recipes_df['techniques'] = elbulli_cuis_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [83]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua]
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [84]:
elbulli_cuis_recipes_df['creativity'] = elbulli_cuis_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [85]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [86]:
cookpad_cuis_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_cuisines.csv')
cookpad_cuis_recipes_df = my_sample(cookpad_cuis_recipes_df)

In [87]:
cookpad_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [88]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[""vegetables"",""meats"",""vegetables"",""drinks"",""d...","[""horno"",""sal"",""al baño maria""]"
102,101608,ensalada de verano con patatas,2010,"[""fruits"",""spicies_and_condimients"",""spicies_a...","[""sal"",""agua"",""cocer""]"
131,102187,torta de pan con sardinas de bota,2009,"[""cereals"",""spicies_and_condimients"",""seafood""]","[""horno"",""cocer"",""horno""]"
157,102249,setas revueltas con gulas,2009,"[""spicies_and_condimients"",""seafood"",""spicies_...","[""sartén"",""horno""]"
181,102280,helado de limón con yogur,2009,"[""drinks"",""sweets"",""dairy""]","[""secar""]"


In [89]:
cookpad_cuis_recipes_df['ingredients'] = cookpad_cuis_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_cuis_recipes_df['techniques'] = cookpad_cuis_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [90]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
48,100571,pastel de fiambre,2009,"[vegetables, meats, vegetables, drinks, dairy,...","[horno, sal, al baño maria]"
102,101608,ensalada de verano con patatas,2010,"[fruits, spicies_and_condimients, spicies_and_...","[sal, agua, cocer]"
131,102187,torta de pan con sardinas de bota,2009,"[cereals, spicies_and_condimients, seafood]","[horno, cocer, horno]"
157,102249,setas revueltas con gulas,2009,"[spicies_and_condimients, seafood, spicies_and...","[sartén, horno]"
181,102280,helado de limón con yogur,2009,"[drinks, sweets, dairy]",[secar]


In [91]:
cookpad_cuis_recipes_df['creativity'] = cookpad_cuis_recipes_df.apply(
    lambda x: 0, axis=1)

In [92]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
48,100571,pastel de fiambre,2009,"[vegetables, meats, vegetables, drinks, dairy,...","[horno, sal, al baño maria]",0
102,101608,ensalada de verano con patatas,2010,"[fruits, spicies_and_condimients, spicies_and_...","[sal, agua, cocer]",0
131,102187,torta de pan con sardinas de bota,2009,"[cereals, spicies_and_condimients, seafood]","[horno, cocer, horno]",0
157,102249,setas revueltas con gulas,2009,"[spicies_and_condimients, seafood, spicies_and...","[sartén, horno]",0
181,102280,helado de limón con yogur,2009,"[drinks, sweets, dairy]",[secar],0


### elbulli & cookpad

In [93]:
cuis_recipes_df = elbulli_cuis_recipes_df.append(cookpad_cuis_recipes_df, ignore_index=True)

In [94]:
cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


# Formatting data

In [95]:
def join_ingredients_and_techniques(ingr_list, tech_list):
    i_list = ['i_' + '_'.join(x.split()) for x in ingr_list]
    t_list = ['t_' + '_'.join(x.split()) for x in tech_list]
    return ' '.join(i_list + t_list)

In [96]:
columns = ['text', 'creativity']

In [97]:
raw_text_df = pd.DataFrame(columns=columns)
raw_text_df['creativity'] = raw_recipes_df['creativity']
raw_text_df['text'] = raw_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [98]:
raw_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimiento i_pimiento_rojo i_piñones_tos...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [99]:
repr_text_df = pd.DataFrame(columns=columns)
repr_text_df['creativity'] = repr_recipes_df['creativity']
repr_text_df['text'] = repr_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [100]:
repr_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimientos i_pimiento_rojo i_piñones_to...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [101]:
super_text_df = pd.DataFrame(columns=columns)
super_text_df['creativity'] = super_recipes_df['creativity']
super_text_df['text'] = super_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [102]:
super_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite i_agua i_pimienta i_gelatina...,10
1,i_sal i_pimientos i_pimiento i_piñones i_vinag...,10
2,i_chocolate i_claras i_avellanas i_yemas i_azú...,10
3,i_zumo i_limón i_alginato i_agua i_té i_azúcar...,30
4,i_sal i_ceps i_carragenato i_romero i_ceps i_c...,30


In [103]:
types_text_df = pd.DataFrame(columns=columns)
types_text_df['creativity'] = types_recipes_df['creativity']
types_text_df['text'] = types_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [104]:
types_text_df.head()

Unnamed: 0,text,creativity
0,i_western i_western i_1 i_western i_modernist ...,10
1,i_western i_western i_western i_1 i_western i_...,10
2,i_western i_western i_1 i_western i_western i_...,10
3,i_1 i_western i_modernist i_1 i_western i_west...,30
4,i_western i_modernist i_modernist i_modernist ...,30


In [105]:
cuis_text_df = pd.DataFrame(columns=columns)
cuis_text_df['creativity'] = cuis_recipes_df['creativity']
cuis_text_df['text'] = cuis_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [106]:
cuis_text_df.head()

Unnamed: 0,text,creativity
0,i_other i_spicies_and_condimients i_drinks i_2...,10
1,i_spicies_and_condimients i_vegetables i_veget...,10
2,i_sweets i_spicies_and_condimients i_nuts i_sp...,10
3,i_14 i_fruits i_other i_drinks i_drinks i_swee...,30
4,i_spicies_and_condimients i_other i_other i_sp...,30


# Classifier

In [107]:
K = 10

In [108]:
labels = pd.Series(raw_recipes_df['creativity'])
skf = StratifiedKFold(labels, K)
random_stratification = list(skf)[randint(0, K - 1)]
train_indices = random_stratification[0]
test_indices = random_stratification[1]

In [109]:
print(len(labels), len(train_indices), len(test_indices))

1618 1455 163


In [110]:
def benchmark(clf, X_train, X_test, y_train, y_test, name):
    print("_" * 80)
    print("Training: %s" % name)
    print(clf)
    print()
    
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    print()
    
    print("Best score:", clf.best_score_)
    print("Best parameters:",clf.best_params_)
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    print()
    
    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    print()
    
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    
    precision = metrics.precision_score(y_test, pred)
    print("precision:   %0.3f" % precision)
    
    recall = metrics.recall_score(y_test, pred)
    print("recall:   %0.3f" % recall)
    
    f1_score = metrics.f1_score(y_test, pred)
    print("f1_score:   %0.3f" % f1_score)
    
    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=['None' ,'Low', 'Medium', 'High']))
    
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    
    print()
    return score, precision, recall, f1_score

In [111]:
classifiers = [
    ('BernoulliNB', BernoulliNB, {}),
    ('KNeighborsClassifier', KNeighborsClassifier, {}),
    ('LinearSVC', LinearSVC, {}),
#     ('MultinomialNB', MultinomialNB, {}),
#     ('NearestCentroid', NearestCentroid(), {}),
#     ('PassiveAggressiveClassifier', PassiveAggressiveClassifier, {}),
#     ('Perceptron', Perceptron(), {}),
#     ('RandomForestClassifier', RandomForestClassifier, {}),
#     ('RidgeClassifier', RidgeClassifier, {}),
#     ('SGDClassifier', SGDClassifier, {}),
]

## Raw ingredients and techniques

In [114]:
X = raw_text_df['text']
y = raw_text_df['creativity']

In [116]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [126]:
X_train, X_test, y_train, y_test = X[train_indices], X[test_indices], y[train_indices], y[test_indices]

In [128]:
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [130]:
for name, clf, parameters in classifiers:
    pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('TfidfTransformer', TfidfTransformer()),
        ('Classifier', clf()),
    ])
    grid_search = GridSearchCV(pipeline, parameters, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1, verbose=1)
#     benchmark_results = benchmark(grid_search, X_train, X_test, y_train, y_test, name)
#     benchmark_results = benchmark(pipeline, X_train, X_test, y_train, y_test, name)

In [131]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


JoblibIndexError: JoblibIndexError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/usr/lib/python3.4/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    165         sys.exit(msg)
    166     main_globals = sys.modules["__main__"].__dict__
    167     if alter_argv:
    168         sys.argv[0] = mod_spec.origin
    169     return _run_code(code, main_globals, None,
--> 170                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/site-packages/ipykernel/__main__.py')
    171 
    172 def run_module(mod_name, init_globals=None,
    173                run_name=None, alter_sys=False):
    174     """Execute a module's code without importing it

...........................................................................
/usr/lib/python3.4/runpy.py in _run_code(code=<code object <module> at 0x7f1ec741ef60, file "/...3.4/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/antonio/virtualenvs/elbulli/lib/python3.4/...ges/ipykernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/site-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/home/antoni.../python3.4/site-packages/ipykernel/kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/site-packages/ipykernel/__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7f1ec741ef60, file "/...3.4/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module 'builtins' (built-in)>, '__cached__': '/home/antonio/virtualenvs/elbulli/lib/python3.4/...ges/ipykernel/__pycache__/__main__.cpython-34.pyc', '__doc__': None, '__file__': '/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.4/site-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/home/antoni.../python3.4/site-packages/ipykernel/kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    584         
    585         If a global instance already exists, this reinitializes and starts it
    586         """
    587         app = cls.instance(**kwargs)
    588         app.initialize(argv)
--> 589         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    590 
    591 #-----------------------------------------------------------------------------
    592 # utility functions, for convenience
    593 #-----------------------------------------------------------------------------

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    400         
    401         if self.poller is not None:
    402             self.poller.start()
    403         self.kernel.start()
    404         try:
--> 405             ioloop.IOLoop.instance().start()
    406         except KeyboardInterrupt:
    407             pass
    408 
    409 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    255         if self.control_stream:
    256             self.control_stream.on_recv(self.dispatch_control, copy=False)
    257 
    258         def make_dispatcher(stream):
    259             def dispatcher(msg):
--> 260                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    261             return dispatcher
    262 
    263         for s in self.shell_streams:
    264             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'grid_search.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-07-04T00:17:59.863701', 'msg_id': '67E66C1549A44AFF8AEB53BCDE2644B5', 'msg_type': 'execute_request', 'session': '80E54FDFF6334C618BFF2DC4FCA953D0', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '67E66C1549A44AFF8AEB53BCDE2644B5', 'msg_type': 'execute_request', 'parent_header': {}})
    207             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    208         else:
    209             self.log.debug("%s: %s", msg_type, msg)
    210             self.pre_handler_hook()
    211             try:
--> 212                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'80E54FDFF6334C618BFF2DC4FCA953D0']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'grid_search.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-07-04T00:17:59.863701', 'msg_id': '67E66C1549A44AFF8AEB53BCDE2644B5', 'msg_type': 'execute_request', 'session': '80E54FDFF6334C618BFF2DC4FCA953D0', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '67E66C1549A44AFF8AEB53BCDE2644B5', 'msg_type': 'execute_request', 'parent_header': {}}
    213             except Exception:
    214                 self.log.error("Exception in message handler:", exc_info=True)
    215             finally:
    216                 self.post_handler_hook()

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'80E54FDFF6334C618BFF2DC4FCA953D0'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'grid_search.fit(X_train, y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-07-04T00:17:59.863701', 'msg_id': '67E66C1549A44AFF8AEB53BCDE2644B5', 'msg_type': 'execute_request', 'session': '80E54FDFF6334C618BFF2DC4FCA953D0', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '67E66C1549A44AFF8AEB53BCDE2644B5', 'msg_type': 'execute_request', 'parent_header': {}})
    365         if not silent:
    366             self.execution_count += 1
    367             self._publish_execute_input(code, parent, self.execution_count)
    368 
    369         reply_content = self.do_execute(code, silent, store_history,
--> 370                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    371 
    372         # Flush output before sending the reply.
    373         sys.stdout.flush()
    374         sys.stderr.flush()

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='grid_search.fit(X_train, y_train)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'grid_search.fit(X_train, y_train)'
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='grid_search.fit(X_train, y_train)', store_history=True, silent=False, shell_futures=True)
   2897                 self.displayhook.exec_result = result
   2898 
   2899                 # Execute the user code
   2900                 interactivity = "none" if silent else self.ast_node_interactivity
   2901                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2902                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2903 
   2904                 # Reset this so later displayed values do not modify the
   2905                 # ExecutionResult
   2906                 self.displayhook.exec_result = None

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>], cell_name='<ipython-input-131-79fead2bcf0e>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3007                     return True
   3008 
   3009             for i, node in enumerate(to_run_interactive):
   3010                 mod = ast.Interactive([node])
   3011                 code = compiler(mod, cell_name, "single")
-> 3012                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f1e8dec8c00, file "<ipython-input-131-79fead2bcf0e>", line 1>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   3013                     return True
   3014 
   3015             # Flush softspace
   3016             if softspace(sys.stdout, 0):

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f1e8dec8c00, file "<ipython-input-131-79fead2bcf0e>", line 1>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3061         outflag = 1  # happens in more places, so it's easier as default
   3062         try:
   3063             try:
   3064                 self.hooks.pre_run_code_hook()
   3065                 #rprint('Running code', repr(code_obj)) # dbg
-> 3066                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f1e8dec8c00, file "<ipython-input-131-79fead2bcf0e>", line 1>
        self.user_global_ns = {'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', 'import json\nfrom random import randint\nfrom time...es/text/document_classification_20newsgroups.html', "elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')", "elbulli_raw_recipes_df['creativity'] = elbulli_r...'] <= 2005 else \\\n              'UNKOWN', axis=1)", "elbulli_raw_recipes_df['creativity'].value_counts()", "cookpad_size = int(np.average(elbulli_raw_recipes_df['creativity'].value_counts()))", 'cookpad_size', "cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')", 'cookpad_sample = cookpad_raw_recipes_df.sample(cookpad_size)', 'cookpad_indices = cookpad_sample.index.sort_values()', 'cookpad_indices', 'def my_sample(df, indices=cookpad_indices):\n    return df.ix[indices]', "elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')", 'elbulli_raw_recipes_df.dtypes', 'elbulli_raw_recipes_df.head()', "elbulli_raw_recipes_df['ingredients'] = elbulli_...ly(lambda x: json.loads(x['techniques']), axis=1)", 'elbulli_raw_recipes_df.head()', "elbulli_raw_recipes_df['creativity'] = elbulli_r...'] <= 2005 else \\\n              'UNKOWN', axis=1)", 'elbulli_raw_recipes_df.head()', "cookpad_raw_recipes_df = pd.read_csv('data/dbs/r...aw_recipes_df = my_sample(cookpad_raw_recipes_df)", ...], 'K': 10, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NearestCentroid': <class 'sklearn.neighbors.nearest_centroid.NearestCentroid'>, 'Out': {4: 10    454
30    389
20    371
Name: creativity, dtype: int64, 6: 404, 10: Int64Index([   3,   23,   96,  100,  115,  120, ...930, 7970],
           dtype='int64', length=404), 13: _id             int64
title          object
year...nts    object
techniques     object
dtype: object, 14:     _id                                         ...al, asar, cocer, hervido, confitar, cazuela,...  , 16:     _id                                         ...al, asar, cocer, hervido, confitar, cazuela,...  , 18:     _id                                         ...cer, hervido, confitar, cazuela,...          30  , 20: _id             int64
title          object
year...nts    object
techniques     object
dtype: object, 21:         _id                        title  year  ...ofreir, cocer, sofreimos, hervir, sofrito, d...  , 23:         _id                        title  year  ...ofreir, cocer, sofreimos, hervir, sofrito, d...  , ...}, ...}
        self.user_ns = {'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'In': ['', 'import json\nfrom random import randint\nfrom time...es/text/document_classification_20newsgroups.html', "elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')", "elbulli_raw_recipes_df['creativity'] = elbulli_r...'] <= 2005 else \\\n              'UNKOWN', axis=1)", "elbulli_raw_recipes_df['creativity'].value_counts()", "cookpad_size = int(np.average(elbulli_raw_recipes_df['creativity'].value_counts()))", 'cookpad_size', "cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')", 'cookpad_sample = cookpad_raw_recipes_df.sample(cookpad_size)', 'cookpad_indices = cookpad_sample.index.sort_values()', 'cookpad_indices', 'def my_sample(df, indices=cookpad_indices):\n    return df.ix[indices]', "elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')", 'elbulli_raw_recipes_df.dtypes', 'elbulli_raw_recipes_df.head()', "elbulli_raw_recipes_df['ingredients'] = elbulli_...ly(lambda x: json.loads(x['techniques']), axis=1)", 'elbulli_raw_recipes_df.head()', "elbulli_raw_recipes_df['creativity'] = elbulli_r...'] <= 2005 else \\\n              'UNKOWN', axis=1)", 'elbulli_raw_recipes_df.head()', "cookpad_raw_recipes_df = pd.read_csv('data/dbs/r...aw_recipes_df = my_sample(cookpad_raw_recipes_df)", ...], 'K': 10, 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'MultinomialNB': <class 'sklearn.naive_bayes.MultinomialNB'>, 'NearestCentroid': <class 'sklearn.neighbors.nearest_centroid.NearestCentroid'>, 'Out': {4: 10    454
30    389
20    371
Name: creativity, dtype: int64, 6: 404, 10: Int64Index([   3,   23,   96,  100,  115,  120, ...930, 7970],
           dtype='int64', length=404), 13: _id             int64
title          object
year...nts    object
techniques     object
dtype: object, 14:     _id                                         ...al, asar, cocer, hervido, confitar, cazuela,...  , 16:     _id                                         ...al, asar, cocer, hervido, confitar, cazuela,...  , 18:     _id                                         ...cer, hervido, confitar, cazuela,...          30  , 20: _id             int64
title          object
year...nts    object
techniques     object
dtype: object, 21:         _id                        title  year  ...ofreir, cocer, sofreimos, hervir, sofrito, d...  , 23:         _id                        title  year  ...ofreir, cocer, sofreimos, hervir, sofrito, d...  , ...}, ...}
   3067             finally:
   3068                 # Reset our crash handler in place
   3069                 sys.excepthook = old_excepthook
   3070         except SystemExit as e:

...........................................................................
/home/antonio/git/elbulli/<ipython-input-131-79fead2bcf0e> in <module>()
----> 1 
      2 
      3 
      4 
      5 
      6 grid_search.fit(X_train, y_train)
      7 
      8 
      9 
     10 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif...it=True, scoring='f1_weighted',
       verbose=1), X=0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object, y=0       10
1       10
2       10
3       30
4   ...3     0
1454     0
Name: creativity, dtype: int64)
    799         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    800             Target relative to X for classification or regression;
    801             None for unsupervised learning.
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...t=True, scoring='f1_weighted',
       verbose=1)>
        X = 0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object
        y = 0       10
1       10
2       10
3       30
4   ...3     0
1454     0
Name: creativity, dtype: int64
        self.param_grid = {}
    805 
    806 
    807 class RandomizedSearchCV(BaseSearchCV):
    808     """Randomized search on hyper parameters.

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif...it=True, scoring='f1_weighted',
       verbose=1), X=0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object, y=0       10
1       10
2       10
3       30
4   ...3     0
1454     0
Name: creativity, dtype: int64, parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    548         )(
    549             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    550                                     train, test, self.verbose, parameters,
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    554                 for train, test in cv)
    555 
    556         # Out is a list of triplet: score, estimator, n_test_samples
    557         n_fits = len(out)

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object <genexpr>>)
    805             if pre_dispatch == "all" or n_jobs == 1:
    806                 # The iterable was consumed all at once by the above for loop.
    807                 # No need to wait for async callbacks to trigger to
    808                 # consumption.
    809                 self._iterating = False
--> 810             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    811             # Make sure that we get a last message telling us we are done
    812             elapsed_time = time.time() - self._start_time
    813             self._print('Done %3i out of %3i | elapsed: %s finished',
    814                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
IndexError                                         Mon Jul  4 00:17:59 2016
PID: 16367      Python 3.4.3: /home/antonio/virtualenvs/elbulli/bin/python3
...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = <class 'list'> instance
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = <class 'tuple'> instance
        kwargs = {'error_score': 0, 'return_parameters': True}
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator=Pipeline(steps=[('CountVectorizer', CountVectori...random_state=None, tol=0.0001,
     verbose=0))]), X=0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object, y=0       10
1       10
2       10
3       30
4   ...3     0
1454     0
Name: creativity, dtype: int64, scorer=<class 'sklearn.metrics.scorer._PredictScorer'> instance, train=array([  45,   47,   48, ..., 1615, 1616, 1617]), test=array([   0,    1,    2,    3,    4,    5,    6,... 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254]), verbose=1, parameters={}, fit_params={}, return_train_score=False, return_parameters=True, error_score=0)
   1519     if parameters is not None:
   1520         estimator.set_params(**parameters)
   1521 
   1522     start_time = time.time()
   1523 
-> 1524     X_train, y_train = _safe_split(estimator, X, y, train)
        X_train = undefined
        y_train = undefined
        estimator = Pipeline(steps=[('CountVectorizer', CountVectori...random_state=None, tol=0.0001,
     verbose=0))])
        X = 0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object
        y = 0       10
1       10
2       10
3       30
4   ...3     0
1454     0
Name: creativity, dtype: int64
        train = array([  45,   47,   48, ..., 1615, 1616, 1617])
   1525     X_test, y_test = _safe_split(estimator, X, y, test, train)
   1526 
   1527     try:
   1528         if y_train is None:

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/cross_validation.py in _safe_split(estimator=Pipeline(steps=[('CountVectorizer', CountVectori...random_state=None, tol=0.0001,
     verbose=0))]), X=0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object, y=0       10
1       10
2       10
3       30
4   ...3     0
1454     0
Name: creativity, dtype: int64, indices=array([  45,   47,   48, ..., 1615, 1616, 1617]), train_indices=None)
   1586             if train_indices is None:
   1587                 X_subset = X[np.ix_(indices, indices)]
   1588             else:
   1589                 X_subset = X[np.ix_(indices, train_indices)]
   1590         else:
-> 1591             X_subset = safe_indexing(X, indices)
        X_subset = undefined
        X = 0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object
        indices = array([  45,   47,   48, ..., 1615, 1616, 1617])
   1592 
   1593     if y is not None:
   1594         y_subset = safe_indexing(y, indices)
   1595     else:

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/sklearn/utils/__init__.py in safe_indexing(X=0       i_oporto i_aceite_de_oliva i_agua i_pimi...liva i_huevos i_ajo ...
Name: text, dtype: object, indices=array([  45,   47,   48, ..., 1615, 1616, 1617]))
    147         Indices according to which X will be subsampled.
    148     """
    149     if hasattr(X, "iloc"):
    150         # Pandas Dataframes and Series
    151         try:
--> 152             return X.iloc[indices]
        X.iloc = <pandas.core.indexing._iLocIndexer object>
        indices = array([  45,   47,   48, ..., 1615, 1616, 1617])
    153         except ValueError:
    154             # Cython typed memoryviews internally used in pandas do not support
    155             # readonly buffers.
    156             warnings.warn("Copying input dataframe for slicing.",

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/pandas/core/indexing.py in __getitem__(self=<pandas.core.indexing._iLocIndexer object>, key=array([  45,   47,   48, ..., 1615, 1616, 1617]))
   1291             key = com._apply_if_callable(key, self.obj)
   1292 
   1293         if type(key) is tuple:
   1294             return self._getitem_tuple(key)
   1295         else:
-> 1296             return self._getitem_axis(key, axis=0)
        self._getitem_axis = <bound method _iLocIndexer._getitem_axis of <pandas.core.indexing._iLocIndexer object>>
        key = array([  45,   47,   48, ..., 1615, 1616, 1617])
   1297 
   1298     def _getitem_axis(self, key, axis=0):
   1299         raise NotImplementedError()
   1300 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/pandas/core/indexing.py in _getitem_axis(self=<pandas.core.indexing._iLocIndexer object>, key=array([  45,   47,   48, ..., 1615, 1616, 1617]), axis=0)
   1594         else:
   1595 
   1596             if is_list_like_indexer(key):
   1597 
   1598                 # validate list bounds
-> 1599                 self._is_valid_list_like(key, axis)
        self._is_valid_list_like = <bound method _iLocIndexer._is_valid_list_like of <pandas.core.indexing._iLocIndexer object>>
        key = array([  45,   47,   48, ..., 1615, 1616, 1617])
        axis = 0
   1600 
   1601                 # force an actual list
   1602                 key = list(key)
   1603 

...........................................................................
/home/antonio/virtualenvs/elbulli/lib/python3.4/site-packages/pandas/core/indexing.py in _is_valid_list_like(self=<pandas.core.indexing._iLocIndexer object>, key=array([  45,   47,   48, ..., 1615, 1616, 1617]), axis=0)
   1533         # coerce the key to not exceed the maximum size of the index
   1534         arr = np.array(key)
   1535         ax = self.obj._get_axis(axis)
   1536         l = len(ax)
   1537         if len(arr) and (arr.max() >= l or arr.min() < -l):
-> 1538             raise IndexError("positional indexers are out-of-bounds")
   1539 
   1540         return True
   1541 
   1542     def _getitem_tuple(self, tup):

IndexError: positional indexers are out-of-bounds
___________________________________________________________________________

In [None]:
# >>> from sklearn.metrics import confusion_matrix
# >>> y_true = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 2, 2, 2]
# >>> y_pred = [1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1]
# >>> print(confusion_matrix(y_true, y_pred))
# [[4 1 0]
#  [2 5 0]
#  [0 1 3]]

In [107]:
# Params
K = 10
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'tfidf__norm': (None, 'l1', 'l2'),
}

In [None]:
# # Classifiers
# clf_list = [
#     (RidgeClassifier(alpha=.00001, tol=1e-2, solver="lsqr"), "Ridge classifier"),
#     (Perceptron(alpha=.00001, n_iter=50), "Perceptron"),
#     (PassiveAggressiveClassifier(n_iter=50), "Passive-aggressive"),
#     (KNeighborsClassifier(n_neighbors=10), "kNN"),
#     (RandomForestClassifier(n_estimators=100), "Random Forest Classifier"),
# #     (RandomForestRegressor(n_estimators=100), "Random Forest Regressor"),
#     (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3), 'Linear SVC 1'),
#     (SGDClassifier(alpha=.000001, n_iter=50, penalty='l1'), 'SGDClassifier'),
# #     (SGDRegressor(alpha=.000001, n_iter=50, penalty='l1'), 'SGDRegressor'),
#     (NearestCentroid(), 'Nearest Centroid'),
#     (MultinomialNB(alpha=.00001), 'Multinomial NB'),
#     (BernoulliNB(alpha=.00001), 'Bernoulli NB'),
#     (LinearSVC(penalty="l1", dual=False, tol=1e-3), 'Linear SVC 2'),
#     (RidgeClassifier(), "Ridge classifier"),
#     (Perceptron(), "Perceptron"),
#     (PassiveAggressiveClassifier(), "Passive-aggressive"),
#     (KNeighborsClassifier(), "kNN"),
#     (RandomForestClassifier(), "Random Forest Classifier"),
# #     (RandomForestRegressor(), "Random Forest Regressor"),
#     (LinearSVC(), 'Linear SVC'),
#     (SGDClassifier(), 'SGDClassifier'),
# #     (SGDRegressor(), 'SGDRegressor'),
#     (NearestCentroid(), 'Nearest Centroid'),
#     (MultinomialNB(), 'Multinomial NB'),
#     (BernoulliNB(), 'Bernoulli NB'),
# ]

In [None]:
# # Classifiers
# clf_list = [
#     (LinearSVC(), 'Linear SVC'),
#     (LinearSVC(penalty="l1", dual=False, tol=1e-3), 'Linear SVC'),
#     (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3), 'Linear SVC'),
#     (LinearSVC(loss='hinge', penalty='l1', dual=True, tol=1e-1, multi_class='crammer_singer', random_state=100), 'Linear SVC'),
# ]

In [110]:
# Classifiers
clf_list = []
for loss in ['hinge', 'squared_hinge']:
    for tol in [0.0001, 0.001, 0.01, 0.1]:
        for multi_class in ['ovr', 'crammer_singer']:
            clf_list.append((
                    LinearSVC(loss=loss, tol=tol, multi_class=multi_class),
                    'LinearSVC(loss={}, tol={}, multi_class={})'.format(loss, tol, multi_class)
                ))
len(clf_list)

16

In [118]:
# X_train = X_data
# X_test = X_val
# y_train = y_data
# y_test = y_val

for clf, name in clf_list:
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer(use_idf=True)),
            ('clf', clf),
        ])
#         grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
#         benchmark_results = benchmark(grid_search, X_train, X_test, y_train, y_test, name)
        benchmark_results = benchmark(pipeline, X_train, X_test, y_train, y_test, name)
#         results[name].append(benchmark_results)

________________________________________________________________________________
Training: LinearSVC(loss=hinge, tol=0.0001, multi_class=ovr)
Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...e', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])
train time: 0.232s
test time:  0.018s
accuracy:   0.968
precision:   0.968
recall:   0.968
classification report:


  sample_weight=sample_weight)
  sample_weight=sample_weight)


IndexError: list index out of range

In [None]:
# split a training set and a test set
y_train = y_data
y_test = y_val

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)

print("Extracting features from the training data using a sparse vectorizer")
X_train = vectorizer.fit_transform(X_data)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
X_test = vectorizer.transform(X_data)
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

In [None]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
opts_select_chi2 = 50

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print()

feature_names = np.asarray(feature_names)

In [None]:
feature_names

In [None]:
count_d = {
    '1987-1997': {},
    '1998-2001': {},
    '2002-2005': {},
}
for y in ['1987-1997', '1998-2001', '2002-2005']:
    for f in feature_names:
        x = f[2:].replace('_', ' ')
        if f.startswith('i'):
            count_d[y][f] = 0
            for r in recipes_df[(int(y.split('-')[0]) <= recipes_df['year']) & (recipes_df['year'] <= int(y.split('-')[1]))].iterrows():
                count_d[y][f] += r[1]['ingredients'].count(x)
        elif f.startswith('t'):
            count_d[y][f] = 0
            for r in recipes_df[(int(y.split('-')[0]) <= recipes_df['year']) & (recipes_df['year'] <= int(y.split('-')[1]))].iterrows():
                count_d[y][f] += r[1]['techniques'].count(x)

In [None]:
count_d

In [None]:
s1 = pd.Series(count_d['1987-1997'])
s2 = pd.Series(count_d['1998-2001'])
s3 = pd.Series(count_d['2002-2005'])

In [None]:
df = pd.DataFrame({
        '1987-1997': s1,
        '1998-2001': s2,
        '2002-2005': s3,
    })

In [None]:
df.plot(kind='bar')

In [None]:
recipes_df[(1987 <= recipes_df['year']) & (recipes_df['year'] <= 1997)]

In [None]:
ig = nx.read_gexf('data/spanish_ingredients_lexicon_6.gexf')
tg = nx.read_gexf('data/spanish_techniques_lexicon_6.gexf')

In [None]:
ingrs = ig.nodes(data=True)
techs = tg.nodes(data=True)

In [None]:
ingrs[:2]

In [None]:
top_ingrs = sorted(ingrs, key=lambda x: x[1]['count'], reverse=True)
top_techs = sorted(techs, key=lambda x: x[1]['count'], reverse=True)

In [None]:
top_ingrs[:20]

In [None]:
i_aux = top_ingrs[:21]
i_aux.remove(('hojas de gelatina', {'count': 164, 'label': 'hojas de gelatina'}))
i_aux

In [None]:
top_techs[:10]

In [None]:
t_aux = top_techs[:14]
t_aux.remove(('agua', {'count': 891, 'label': 'agua'}))
t_aux.remove(('sal', {'count': 648, 'label': 'sal'}))
t_aux.remove(('min', {'count': 553, 'label': 'min'}))
t_aux.remove(('hirviendo', {'count': 296, 'label': 'hirviendo'}))
t_aux

In [None]:
s1

In [None]:
i_d = dict((x[0], x[1]['count']) for x in i_aux)
t_d = dict((x[0], x[1]['count']) for x in t_aux)

In [None]:
s1 = pd.Series(i_d)
s2 = pd.Series(t_d)

In [None]:
matplotlib.rcParams['figure.figsize'] = (8, 5)
s1.plot(kind='bar')

In [None]:
s2.plot(kind='bar')
matplotlib.rcParams['figure.figsize'] = (12, 7)