In [1]:
import json
from time import time

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Classifiers obtained from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

# Original data

## Preparing sets' sizes

In [2]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [3]:
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [4]:
elbulli_raw_recipes_df['creativity'].value_counts()

10    454
30    389
20    371
Name: creativity, dtype: int64

In [5]:
cookpad_size = int(np.average(elbulli_raw_recipes_df['creativity'].value_counts()))

In [6]:
cookpad_size

404

In [7]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')

In [8]:
cookpad_sample = cookpad_raw_recipes_df.sample(cookpad_size)

In [9]:
cookpad_indices = cookpad_sample.index.sort_values()

In [10]:
cookpad_indices

Int64Index([   5,   11,   40,   45,   57,  111,  118,  167,  177,  205,
            ...
            7783, 7801, 7825, 7873, 7885, 7889, 7934, 7935, 7940, 7946],
           dtype='int64', length=404)

In [11]:
def my_sample(df, indices=cookpad_indices):
    return df.loc[indices]

## Raw ingredients and techniques

### elbulli

In [12]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [13]:
elbulli_raw_recipes_df.shape

(1214, 5)

In [14]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimiento"",""pimiento rojo"",""piñones tos...","[""sal"",""marcar"",""cocción"",""asado"",""hirviendo"",..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""al horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""fría""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""asar"",""cocer"",""hervido"",""confitar"",""ca..."


In [15]:
elbulli_raw_recipes_df['ingredients'] = elbulli_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_raw_recipes_df['techniques'] = elbulli_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [16]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno..."
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,..."


In [17]:
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [18]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


### cookpad

In [19]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')
cookpad_raw_recipes_df = my_sample(cookpad_raw_recipes_df)

In [20]:
cookpad_raw_recipes_df.shape

(404, 5)

In [21]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[""romero fresco"",""pimienta negra"",""aceite de o...","[""freír"",""parrilla"",""rodajas"",""sartén"",""sal"",""..."
11,100067,hígado de ternera con cebolla,2009,"[""ternera"",""cebollas"",""perejil"",""aceite de oli...","[""cocer"",""a fuego lento"",""rodajas"",""sartén"",""r..."
40,100487,bocaditos de queso cabrales,2009,"[""leche"",""philadelphia"",""mantequilla"",""pan ral...","[""sal"",""freír"",""cocer"",""fria""]"
45,100550,caracoles con jamón serrano,2009,"[""tomate"",""comino"",""vino"",""laurel"",""aceite de ...","[""frito"",""hervir"",""sofreír"",""sofrito"",""fría"",""..."
57,101036,pastel de pollo con gelatina,2009,"[""apio"",""caldo de pollo"",""pimienta"",""perejil"",...","[""agua""]"


In [22]:
cookpad_raw_recipes_df['ingredients'] = cookpad_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_raw_recipes_df['techniques'] = cookpad_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [23]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[romero fresco, pimienta negra, aceite de oliv...","[freír, parrilla, rodajas, sartén, sal, fritos]"
11,100067,hígado de ternera con cebolla,2009,"[ternera, cebollas, perejil, aceite de oliva, ...","[cocer, a fuego lento, rodajas, sartén, rehoga..."
40,100487,bocaditos de queso cabrales,2009,"[leche, philadelphia, mantequilla, pan rallado...","[sal, freír, cocer, fria]"
45,100550,caracoles con jamón serrano,2009,"[tomate, comino, vino, laurel, aceite de oliva...","[frito, hervir, sofreír, sofrito, fría, sal, e..."
57,101036,pastel de pollo con gelatina,2009,"[apio, caldo de pollo, pimienta, perejil, gela...",[agua]


In [24]:
cookpad_raw_recipes_df['creativity'] = cookpad_raw_recipes_df.apply(
    lambda x: 0, axis=1)

In [25]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
5,100045,filete al romero,2009,"[romero fresco, pimienta negra, aceite de oliv...","[freír, parrilla, rodajas, sartén, sal, fritos]",0
11,100067,hígado de ternera con cebolla,2009,"[ternera, cebollas, perejil, aceite de oliva, ...","[cocer, a fuego lento, rodajas, sartén, rehoga...",0
40,100487,bocaditos de queso cabrales,2009,"[leche, philadelphia, mantequilla, pan rallado...","[sal, freír, cocer, fria]",0
45,100550,caracoles con jamón serrano,2009,"[tomate, comino, vino, laurel, aceite de oliva...","[frito, hervir, sofreír, sofrito, fría, sal, e...",0
57,101036,pastel de pollo con gelatina,2009,"[apio, caldo de pollo, pimienta, perejil, gela...",[agua],0


### elbulli & cookpad

In [26]:
raw_recipes_df = elbulli_raw_recipes_df.append(cookpad_raw_recipes_df, ignore_index=True)

In [27]:
raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


In [28]:
raw_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99294,sopa de hojas de tomate,2009,"[tomate, bacalao, ajos, aceite, cominos, pan, ...","[sopa, crudo, cocer, cocido]",0
1614,99739,capón asado con jamón serrano,2009,"[caldo de ave, jamón serrano, ajo, cebollas, c...","[hornear, cocción, salar, horno]",0
1615,99740,estofado de chorizos con cachelos,2009,"[vino blanco, sal, chorizo, patatas, laurel]","[cazuela, espesa, sal, en agua, cocer]",0
1616,99745,guiso gallego de mejillones,2009,"[huevo, laurel, vino blanco, pan rallado, pimi...","[freír, cocer, cazuela, gratinar, rehogar, hor...",0
1617,99751,estofado de cerdo con pimientos,2009,"[pimientos, vinagre, pimienta, ajo, perejil, c...","[a la cazuela, en rodajas, freír, cocer, rehogar]",0


## Representative ingredients and techniques

### elbulli

In [29]:
elbulli_repr_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_representatives.csv')

In [30]:
elbulli_repr_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [31]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento rojo"",""piñones to...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [32]:
elbulli_repr_recipes_df['ingredients'] = elbulli_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_repr_recipes_df['techniques'] = elbulli_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [33]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [34]:
elbulli_repr_recipes_df['creativity'] = elbulli_repr_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [35]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [36]:
cookpad_repr_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_representatives.csv')
cookpad_repr_recipes_df = my_sample(cookpad_repr_recipes_df)

In [37]:
cookpad_repr_recipes_df.shape

(404, 5)

In [38]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[""romero fresco"",""pimienta negra"",""aceite de o...","[""horno"",""parrilla"",""en rodajas"",""sartén"",""sal..."
11,100067,hígado de ternera con cebolla,2009,"[""ternera"",""cebolla"",""perejil"",""aceite de oliv...","[""cocer"",""a fuego lento"",""en rodajas"",""sartén""..."
40,100487,bocaditos de queso cabrales,2009,"[""leche"",""philadelphia"",""mantequilla"",""pan ral...","[""sal"",""horno"",""cocer"",""horno""]"
45,100550,caracoles con jamón serrano,2009,"[""tomate"",""comino"",""vino"",""laurel"",""aceite de ...","[""horno"",""hervir"",""horno"",""horno"",""horno"",""sal..."
57,101036,pastel de pollo con gelatina,2009,"[""apio"",""caldo de pollo"",""pimienta"",""perejil"",...","[""agua""]"


In [39]:
cookpad_repr_recipes_df['ingredients'] = cookpad_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_repr_recipes_df['techniques'] = cookpad_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [40]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[romero fresco, pimienta negra, aceite de oliv...","[horno, parrilla, en rodajas, sartén, sal, horno]"
11,100067,hígado de ternera con cebolla,2009,"[ternera, cebolla, perejil, aceite de oliva, p...","[cocer, a fuego lento, en rodajas, sartén, hor..."
40,100487,bocaditos de queso cabrales,2009,"[leche, philadelphia, mantequilla, pan rallado...","[sal, horno, cocer, horno]"
45,100550,caracoles con jamón serrano,2009,"[tomate, comino, vino, laurel, aceite de oliva...","[horno, hervir, horno, horno, horno, sal, agua..."
57,101036,pastel de pollo con gelatina,2009,"[apio, caldo de pollo, pimienta, perejil, gela...",[agua]


In [41]:
cookpad_repr_recipes_df['creativity'] = cookpad_repr_recipes_df.apply(
    lambda x: 0, axis=1)

In [42]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
5,100045,filete al romero,2009,"[romero fresco, pimienta negra, aceite de oliv...","[horno, parrilla, en rodajas, sartén, sal, horno]",0
11,100067,hígado de ternera con cebolla,2009,"[ternera, cebolla, perejil, aceite de oliva, p...","[cocer, a fuego lento, en rodajas, sartén, hor...",0
40,100487,bocaditos de queso cabrales,2009,"[leche, philadelphia, mantequilla, pan rallado...","[sal, horno, cocer, horno]",0
45,100550,caracoles con jamón serrano,2009,"[tomate, comino, vino, laurel, aceite de oliva...","[horno, hervir, horno, horno, horno, sal, agua...",0
57,101036,pastel de pollo con gelatina,2009,"[apio, caldo de pollo, pimienta, perejil, gela...",[agua],0


### elbulli & cookpad

In [43]:
repr_recipes_df = elbulli_repr_recipes_df.append(cookpad_repr_recipes_df, ignore_index=True)

In [44]:
repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


In [45]:
repr_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99294,sopa de hojas de tomate,2009,"[tomate, bacalao, ajo, aceite, comino, pan, sal]","[sopa, crudo, cocer, cocer]",0
1614,99739,capón asado con jamón serrano,2009,"[caldo de ave, jamón serrano, ajo, cebolla, ca...","[horno, cocción, sal, horno]",0
1615,99740,estofado de chorizos con cachelos,2009,"[vino blanco, sal, chorizo, patatas, laurel]","[cazuela, espesa, sal, agua, cocer]",0
1616,99745,guiso gallego de mejillones,2009,"[huevos, laurel, vino blanco, pan rallado, pim...","[horno, cocer, cazuela, horno, horno, horno, s...",0
1617,99751,estofado de cerdo con pimientos,2009,"[pimiento, vinagre, pimienta, ajo, perejil, ce...","[cazuela, en rodajas, horno, cocer, horno]",0


## Superclasses of ingredients and techniques

### elbulli

In [46]:
elbulli_super_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_superclasses.csv')

In [47]:
elbulli_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [48]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite"",""agua"",""pimienta"",""gelatina...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento"",""piñones"",""vinag...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras"",""avellanas"",""yemas"",""azú...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo"",""limón"",""alginato"",""agua"",""té"",""azúcar""]","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""ceps"",""carragenato"",""romero"",""ceps"",""c...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [49]:
elbulli_super_recipes_df['ingredients'] = elbulli_super_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_super_recipes_df['techniques'] = elbulli_super_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [50]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [51]:
elbulli_super_recipes_df['creativity'] = elbulli_super_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [52]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [53]:
cookpad_super_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_superclasses.csv')
cookpad_super_recipes_df = my_sample(cookpad_super_recipes_df)

In [54]:
cookpad_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [55]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[""romero"",""pimienta"",""aceite"",""solomillo"",""sal...","[""horno"",""parrilla"",""rodajas"",""sartén"",""sal"",""..."
11,100067,hígado de ternera con cebolla,2009,"[""ternera"",""cebolla"",""perejil"",""aceite"",""pimie...","[""cocer"",""a"",""rodajas"",""sartén"",""horno"",""horno..."
40,100487,bocaditos de queso cabrales,2009,"[""leche"",""philadelphia"",""mantequilla"",""pan"",""a...","[""sal"",""horno"",""cocer"",""horno""]"
45,100550,caracoles con jamón serrano,2009,"[""tomate"",""comino"",""vino"",""laurel"",""aceite"",""j...","[""horno"",""hervir"",""horno"",""horno"",""horno"",""sal..."
57,101036,pastel de pollo con gelatina,2009,"[""apio"",""pollo"",""pimienta"",""perejil"",""gelatina...","[""agua""]"


In [56]:
cookpad_super_recipes_df['ingredients'] = cookpad_super_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_super_recipes_df['techniques'] = cookpad_super_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [57]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[romero, pimienta, aceite, solomillo, salsa, h...","[horno, parrilla, rodajas, sartén, sal, horno]"
11,100067,hígado de ternera con cebolla,2009,"[ternera, cebolla, perejil, aceite, pimienta]","[cocer, a, rodajas, sartén, horno, horno, sal,..."
40,100487,bocaditos de queso cabrales,2009,"[leche, philadelphia, mantequilla, pan, aceite...","[sal, horno, cocer, horno]"
45,100550,caracoles con jamón serrano,2009,"[tomate, comino, vino, laurel, aceite, jamón, ...","[horno, hervir, horno, horno, horno, sal, agua..."
57,101036,pastel de pollo con gelatina,2009,"[apio, pollo, pimienta, perejil, gelatina, pol...",[agua]


In [58]:
cookpad_super_recipes_df['creativity'] = cookpad_super_recipes_df.apply(
    lambda x: 0, axis=1)

In [59]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
5,100045,filete al romero,2009,"[romero, pimienta, aceite, solomillo, salsa, h...","[horno, parrilla, rodajas, sartén, sal, horno]",0
11,100067,hígado de ternera con cebolla,2009,"[ternera, cebolla, perejil, aceite, pimienta]","[cocer, a, rodajas, sartén, horno, horno, sal,...",0
40,100487,bocaditos de queso cabrales,2009,"[leche, philadelphia, mantequilla, pan, aceite...","[sal, horno, cocer, horno]",0
45,100550,caracoles con jamón serrano,2009,"[tomate, comino, vino, laurel, aceite, jamón, ...","[horno, hervir, horno, horno, horno, sal, agua...",0
57,101036,pastel de pollo con gelatina,2009,"[apio, pollo, pimienta, perejil, gelatina, pol...",[agua],0


### elbulli & cookpad

In [60]:
super_recipes_df = elbulli_super_recipes_df.append(cookpad_super_recipes_df, ignore_index=True)

In [61]:
super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


In [62]:
super_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99294,sopa de hojas de tomate,2009,"[tomate, bacalao, ajo, aceite, comino, pan, sal]","[sopa, crudo, cocer, cocer]",0
1614,99739,capón asado con jamón serrano,2009,"[ave, jamón, ajo, cebolla, capón, sal]","[horno, cocción, sal, horno]",0
1615,99740,estofado de chorizos con cachelos,2009,"[vino, sal, chorizo, patatas, laurel]","[cazuela, espesa, sal, agua, cocer]",0
1616,99745,guiso gallego de mejillones,2009,"[huevos, laurel, vino, pan, pimienta, ajo, per...","[horno, cocer, cazuela, horno, horno, horno, s...",0
1617,99751,estofado de cerdo con pimientos,2009,"[pimiento, vinagre, pimienta, ajo, perejil, ce...","[cazuela, rodajas, horno, cocer, horno]",0


## Types of ingredients, representative techniques

### elbulli

In [63]:
elbulli_types_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_types.csv')

In [64]:
elbulli_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [65]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""western"",""western"",""1"",""western"",""modernist""...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""western"",""western"",""western"",""1"",""western"",""...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""western"",""western"",""1"",""western"",""western"",""...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""1"",""western"",""modernist"",""1"",""western"",""west...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""western"",""modernist"",""modernist"",""modernist""...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [66]:
elbulli_types_recipes_df['ingredients'] = elbulli_types_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_types_recipes_df['techniques'] = elbulli_types_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [67]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua]
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [68]:
elbulli_types_recipes_df['creativity'] = elbulli_types_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [69]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [70]:
cookpad_types_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_types.csv')
cookpad_types_recipes_df = my_sample(cookpad_types_recipes_df)

In [71]:
cookpad_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [72]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[""western"",""western"",""western"",""western"",""asia...","[""horno"",""parrilla"",""en rodajas"",""sartén"",""sal..."
11,100067,hígado de ternera con cebolla,2009,"[""western"",""western"",""western"",""western"",""west...","[""cocer"",""a fuego lento"",""en rodajas"",""sartén""..."
40,100487,bocaditos de queso cabrales,2009,"[""western"",""western"",""western"",""western"",""west...","[""sal"",""horno"",""cocer"",""horno""]"
45,100550,caracoles con jamón serrano,2009,"[""western"",""western"",""western"",""western"",""west...","[""horno"",""hervir"",""horno"",""horno"",""horno"",""sal..."
57,101036,pastel de pollo con gelatina,2009,"[""western"",""western"",""western"",""western"",""west...","[""agua""]"


In [73]:
cookpad_types_recipes_df['ingredients'] = cookpad_types_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_types_recipes_df['techniques'] = cookpad_types_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [74]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[western, western, western, western, asian, we...","[horno, parrilla, en rodajas, sartén, sal, horno]"
11,100067,hígado de ternera con cebolla,2009,"[western, western, western, western, western]","[cocer, a fuego lento, en rodajas, sartén, hor..."
40,100487,bocaditos de queso cabrales,2009,"[western, western, western, western, western, ...","[sal, horno, cocer, horno]"
45,100550,caracoles con jamón serrano,2009,"[western, western, western, western, western, ...","[horno, hervir, horno, horno, horno, sal, agua..."
57,101036,pastel de pollo con gelatina,2009,"[western, western, western, western, western, ...",[agua]


In [75]:
cookpad_types_recipes_df['creativity'] = cookpad_types_recipes_df.apply(
    lambda x: 0, axis=1)

In [76]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
5,100045,filete al romero,2009,"[western, western, western, western, asian, we...","[horno, parrilla, en rodajas, sartén, sal, horno]",0
11,100067,hígado de ternera con cebolla,2009,"[western, western, western, western, western]","[cocer, a fuego lento, en rodajas, sartén, hor...",0
40,100487,bocaditos de queso cabrales,2009,"[western, western, western, western, western, ...","[sal, horno, cocer, horno]",0
45,100550,caracoles con jamón serrano,2009,"[western, western, western, western, western, ...","[horno, hervir, horno, horno, horno, sal, agua...",0
57,101036,pastel de pollo con gelatina,2009,"[western, western, western, western, western, ...",[agua],0


### elbulli & cookpad

In [77]:
types_recipes_df = elbulli_types_recipes_df.append(cookpad_types_recipes_df, ignore_index=True)

In [78]:
types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


## Cuisines of ingredients, representative techniques

### elbulli

In [79]:
elbulli_cuis_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_cuisines.csv')

In [80]:
elbulli_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [81]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""other"",""spicies_and_condimients"",""drinks"",""2...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""spicies_and_condimients"",""vegetables"",""veget...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""sweets"",""spicies_and_condimients"",""nuts"",""sp...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""14"",""fruits"",""other"",""drinks"",""drinks"",""swee...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""spicies_and_condimients"",""other"",""other"",""sp...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [82]:
elbulli_cuis_recipes_df['ingredients'] = elbulli_cuis_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_cuis_recipes_df['techniques'] = elbulli_cuis_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [83]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua]
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [84]:
elbulli_cuis_recipes_df['creativity'] = elbulli_cuis_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [85]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [86]:
cookpad_cuis_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_cuisines.csv')
cookpad_cuis_recipes_df = my_sample(cookpad_cuis_recipes_df)

In [87]:
cookpad_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [88]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[""spicies_and_condimients"",""2"",""spicies_and_co...","[""horno"",""parrilla"",""en rodajas"",""sartén"",""sal..."
11,100067,hígado de ternera con cebolla,2009,"[""meats"",""vegetables"",""spicies_and_condimients...","[""cocer"",""a fuego lento"",""en rodajas"",""sartén""..."
40,100487,bocaditos de queso cabrales,2009,"[""dairy"",""dairy"",""dairy"",""cereals"",""spicies_an...","[""sal"",""horno"",""cocer"",""horno""]"
45,100550,caracoles con jamón serrano,2009,"[""vegetables"",""spicies_and_condimients"",""drink...","[""horno"",""hervir"",""horno"",""horno"",""horno"",""sal..."
57,101036,pastel de pollo con gelatina,2009,"[""vegetables"",""meats"",""1"",""spicies_and_condimi...","[""agua""]"


In [89]:
cookpad_cuis_recipes_df['ingredients'] = cookpad_cuis_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_cuis_recipes_df['techniques'] = cookpad_cuis_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [90]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
5,100045,filete al romero,2009,"[spicies_and_condimients, 2, spicies_and_condi...","[horno, parrilla, en rodajas, sartén, sal, horno]"
11,100067,hígado de ternera con cebolla,2009,"[meats, vegetables, spicies_and_condimients, s...","[cocer, a fuego lento, en rodajas, sartén, hor..."
40,100487,bocaditos de queso cabrales,2009,"[dairy, dairy, dairy, cereals, spicies_and_con...","[sal, horno, cocer, horno]"
45,100550,caracoles con jamón serrano,2009,"[vegetables, spicies_and_condimients, drinks, ...","[horno, hervir, horno, horno, horno, sal, agua..."
57,101036,pastel de pollo con gelatina,2009,"[vegetables, meats, 1, spicies_and_condimients...",[agua]


In [91]:
cookpad_cuis_recipes_df['creativity'] = cookpad_cuis_recipes_df.apply(
    lambda x: 0, axis=1)

In [92]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
5,100045,filete al romero,2009,"[spicies_and_condimients, 2, spicies_and_condi...","[horno, parrilla, en rodajas, sartén, sal, horno]",0
11,100067,hígado de ternera con cebolla,2009,"[meats, vegetables, spicies_and_condimients, s...","[cocer, a fuego lento, en rodajas, sartén, hor...",0
40,100487,bocaditos de queso cabrales,2009,"[dairy, dairy, dairy, cereals, spicies_and_con...","[sal, horno, cocer, horno]",0
45,100550,caracoles con jamón serrano,2009,"[vegetables, spicies_and_condimients, drinks, ...","[horno, hervir, horno, horno, horno, sal, agua...",0
57,101036,pastel de pollo con gelatina,2009,"[vegetables, meats, 1, spicies_and_condimients...",[agua],0


### elbulli & cookpad

In [93]:
cuis_recipes_df = elbulli_cuis_recipes_df.append(cookpad_cuis_recipes_df, ignore_index=True)

In [94]:
cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


# Formatting data

In [95]:
def join_ingredients_and_techniques(ingr_list, tech_list):
    i_list = ['i_' + '_'.join(x.split()) for x in ingr_list]
    t_list = ['t_' + '_'.join(x.split()) for x in tech_list]
    return ' '.join(i_list + t_list)

In [96]:
columns = ['text', 'creativity']

In [97]:
raw_text_df = pd.DataFrame(columns=columns)
raw_text_df['creativity'] = raw_recipes_df['creativity']
raw_text_df['text'] = raw_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [98]:
raw_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimiento i_pimiento_rojo i_piñones_tos...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [99]:
repr_text_df = pd.DataFrame(columns=columns)
repr_text_df['creativity'] = repr_recipes_df['creativity']
repr_text_df['text'] = repr_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [100]:
repr_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimientos i_pimiento_rojo i_piñones_to...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [101]:
super_text_df = pd.DataFrame(columns=columns)
super_text_df['creativity'] = super_recipes_df['creativity']
super_text_df['text'] = super_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [102]:
super_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite i_agua i_pimienta i_gelatina...,10
1,i_sal i_pimientos i_pimiento i_piñones i_vinag...,10
2,i_chocolate i_claras i_avellanas i_yemas i_azú...,10
3,i_zumo i_limón i_alginato i_agua i_té i_azúcar...,30
4,i_sal i_ceps i_carragenato i_romero i_ceps i_c...,30


In [103]:
types_text_df = pd.DataFrame(columns=columns)
types_text_df['creativity'] = types_recipes_df['creativity']
types_text_df['text'] = types_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [104]:
types_text_df.head()

Unnamed: 0,text,creativity
0,i_western i_western i_1 i_western i_modernist ...,10
1,i_western i_western i_western i_1 i_western i_...,10
2,i_western i_western i_1 i_western i_western i_...,10
3,i_1 i_western i_modernist i_1 i_western i_west...,30
4,i_western i_modernist i_modernist i_modernist ...,30


In [105]:
cuis_text_df = pd.DataFrame(columns=columns)
cuis_text_df['creativity'] = cuis_recipes_df['creativity']
cuis_text_df['text'] = cuis_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [106]:
cuis_text_df.head()

Unnamed: 0,text,creativity
0,i_other i_spicies_and_condimients i_drinks i_2...,10
1,i_spicies_and_condimients i_vegetables i_veget...,10
2,i_sweets i_spicies_and_condimients i_nuts i_sp...,10
3,i_14 i_fruits i_other i_drinks i_drinks i_swee...,30
4,i_spicies_and_condimients i_other i_other i_sp...,30


# Classifier

In [107]:
K = 10

In [108]:
def benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name):
    print('Training %s...' % name)
    print()
    t0 = time()
    grid_search_cv.fit(X_train, y_train)
    training_time = time() - t0
    print('Training time: %0.3fs' % training_time)
    print()
    print('Best score: %0.2f' % grid_search_cv.best_score_)
    print('Best parameters:', grid_search_cv.best_params_)
    print()
    print('Testing %s...' % name)
    print()
    t0 = time()
    pred = grid_search_cv.predict(X_test)
    testing_time = time() - t0
    print('Testing time:  %0.3fs' % testing_time)
    print()
    print('Metrics:')
    score = metrics.accuracy_score(y_test, pred)
    print('accuracy  = %0.2f' % score)
    precision = metrics.precision_score(y_test, pred, average='weighted')
    print('precision = %0.2f' % precision)
    recall = metrics.recall_score(y_test, pred, average='weighted')
    print('recall    = %0.2f' % recall)
    f1_score = metrics.f1_score(y_test, pred, average='weighted')
    print('f1_score  = %0.2f' % f1_score)
    print()
    print('Classification report:')
    print(metrics.classification_report(y_test, pred, target_names=['None' ,'Low', 'Medium', 'High']))
    print()
    print('Confusion matrix:')
    print(metrics.confusion_matrix(y_test, pred))
    print()
    return score, precision, recall, f1_score

In [109]:
def update(d1, d2):
    d = dict(d1)
    d.update(d2)
    return d

## Raw ingredients and techniques

In [115]:
X = raw_text_df['text']
y = raw_text_df['creativity']

In [116]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [118]:
skf = StratifiedKFold(y_train, K)

In [119]:
parameters = {
    'vect__max_df': (0.8, 0.9, 1.0),
    'vect__min_df': (0.0, 0.1, 1),
    'tfidf__norm': (None, 'l1', 'l2'),
    'tfidf__use_idf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'tfidf__sublinear_tf': (False, True),
}

classifiers = [
    ('BernoulliNB', BernoulliNB, update(parameters, {
        'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__fit_prior': (False, True),
    })),
    ('KNeighborsClassifier', KNeighborsClassifier, update(parameters, {
        'clf__n_neighbors': (1, 5, 10),
        'clf__weights': ('uniform', 'distance'),
        'clf__algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
        'clf__leaf_size': (20, 30, 40),
        'clf__metric': ('euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis'),
        'clf__p': (1, 2, 3),})),
    ('LinearSVC', LinearSVC, update(parameters, {
        'clf__C': (1.0, 2.0, 10.),
        'clf__loss': ('squared_hinge', 'hinge'),
        'clf__penalty': ('l1', 'l2'),
        'clf__dual': (True, False),
        'clf__tol': (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__multi_class': ('ovr', 'crammer_singer'),
        'clf__fit_intercept': (True, False),
        'clf__max_iter': (100, 1000, 2000),})),
    ('MultinomialNB', MultinomialNB, update(parameters, {
        'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__fit_prior': (False, True),})),
    ('NearestCentroid', NearestCentroid, update(parameters, {
        'clf__metric': ('euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis'),
        'clf__shrink_threshold': (None, 0.1, 1.0),})),
    ('PassiveAggressiveClassifier', PassiveAggressiveClassifier, update(parameters, {
        'clf__C': (1.0, 2.0, 10.),
        'clf__loss': ('squared_hinge', 'hinge'),
        'clf__fit_intercept': (True, False),
        'clf__n_iter': (5, 10, 20),
        'clf__shuffle': (True, False),
        'clf__random_state': (None, 0, 1),
        'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
        'clf__warm_start': (False, True),})),
    ('Perceptron', Perceptron, update(parameters, {
        'clf__penalty': (None, 'l1', 'l2', 'elasticnet'),
        'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__fit_intercept': (True, False),
        'clf__n_iter': (5, 10, 20),
        'clf__shuffle': (True, False),
        'clf__random_state': (None, 0, 1),
        'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
        'clf__warm_start': (False, True),})),
    ('RandomForestClassifier', RandomForestClassifier, update(parameters, {
        'clf__n_estimators': (10, 20, 30),
        'clf__criterion': ('guini', 'entropy'),
        'clf__max_features': (1, 10, 0.1, 0.5, 0.9, 'auto', 'sqrt', 'log2', None),
        'clf__max_depth': (None, 5, 10),
        'clf__min_samples_split': (1, 2, 3),
        'clf__min_samples_leaf': (1, 3, 10),
        'clf__min_weight_fraction_leaf': (0.2, 0.1, 0.01, 0.001, 0.0),
        'clf__bootstrap': (True, False),
        'clf__oob_score': (True, False),
        'clf__random_state': (None, 0, 1),
        'clf__class_weight': (None, 'balanced', 'balanced_subsample', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
        'clf__warm_start': (False, True),})),
    ('RidgeClassifier', RidgeClassifier, update(parameters, {
        'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__class_weight': (None, 'balanced', 'balanced_subsample', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
        'clf__copy_X': (True, False),
        'clf__fit_intercept': (True, False),
        'clf__max_iter': (None, 100, 1000, 2000),
        'clf__normalize': (True, False),
        'clf__solver': ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'),
        'clf__tol': (1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__random_state': (None, 0, 1),})),
    ('SGDClassifier', SGDClassifier, update(parameters, {
        'clf__loss': ('squared_hinge', 'hinge', 'log', 'modified_huber', 'perceptron',
                      'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'),
        'clf__penalty': (None, 'l1', 'l2', 'elasticnet'),
        'clf__alpha': (1.0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'clf__l1_ratio': (0.0, 0.15, 0.5, 0.85, 1.0),
        'clf__fit_intercept': (True, False),
        'clf__n_iter': (5, 10, 20),
        'clf__shuffle': (True, False),
        'clf__random_state': (None, 0, 1),
        'clf__class_weight': (None, 'balanced', {0: 0.25, 10: 0.28, 20: 0.23, 30: 0.24}),
        'clf__warm_start': (False, True),
        'clf__average': (False, True, 1, 10),})),
]

In [120]:
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
for name, clf, parameters in classifiers:
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', clf()),
    ])
    grid_search_cv = GridSearchCV(pipeline, parameters, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name)
    print('-' * 80)
#mirar esto tb http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#example-text-document-clustering-py

Performing grid search with cross-validation...

Training BernoulliNB...

Training time: 715.202s

Best score: 0.84
Best parameters: {'tfidf__use_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__smooth_idf': True, 'tfidf__norm': None, 'clf__alpha': 0.001, 'vect__min_df': 0.0, 'clf__fit_prior': False, 'vect__max_df': 0.8}

Testing BernoulliNB...

Testing time:  0.006s

Metrics:
accuracy  = 0.83
precision = 0.83
recall    = 0.83
f1_score  = 0.82

Classification report:
             precision    recall  f1-score   support

       None       0.90      0.95      0.93        40
        Low       0.82      0.62      0.71        45
     Medium       0.69      0.78      0.73        37
       High       0.88      0.97      0.93        39

avg / total       0.83      0.83      0.82       161


Confusion matrix:
[[38  1  0  1]
 [ 2 28 12  3]
 [ 2  5 29  1]
 [ 0  0  1 38]]

--------------------------------------------------------------------------------
Training KNeighborsClassifier...



ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'cheb

KeyboardInterrupt: 

ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'chebyshev' not valid for sparse input",)
ValueError("metric 'cheb

In [None]:
# split a training set and a test set
y_train = y_data
y_test = y_val

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)

print("Extracting features from the training data using a sparse vectorizer")
X_train = vectorizer.fit_transform(X_data)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
X_test = vectorizer.transform(X_data)
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

In [None]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
opts_select_chi2 = 50

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print()

feature_names = np.asarray(feature_names)

In [None]:
feature_names