In [1]:
import json

import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Classifiers obtained from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

# Original data

## Preparing sets' sizes

In [2]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [3]:
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [4]:
elbulli_raw_recipes_df['creativity'].value_counts()

10    454
30    389
20    371
Name: creativity, dtype: int64

In [5]:
cookpad_size = int(np.average(elbulli_raw_recipes_df['creativity'].value_counts()))

In [6]:
cookpad_size

404

In [7]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')

In [8]:
cookpad_sample = cookpad_raw_recipes_df.sample(cookpad_size)

In [9]:
cookpad_indices = cookpad_sample.index.sort_values()

In [10]:
cookpad_indices

Int64Index([  17,   18,   60,   77,   81,   85,  140,  148,  172,  189,
            ...
            7794, 7796, 7810, 7825, 7828, 7900, 7929, 7936, 7956, 7967],
           dtype='int64', length=404)

In [11]:
def my_sample(df, indices=cookpad_indices):
    return df.loc[indices]

## Raw ingredients and techniques

### elbulli

In [12]:
elbulli_raw_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_raw.csv')

In [13]:
elbulli_raw_recipes_df.shape

(1214, 5)

In [14]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimiento"",""pimiento rojo"",""piñones tos...","[""sal"",""marcar"",""cocción"",""asado"",""hirviendo"",..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""al horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""fría""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""asar"",""cocer"",""hervido"",""confitar"",""ca..."


In [15]:
elbulli_raw_recipes_df['ingredients'] = elbulli_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_raw_recipes_df['techniques'] = elbulli_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [16]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno..."
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,..."


In [17]:
elbulli_raw_recipes_df['creativity'] = elbulli_raw_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [18]:
elbulli_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


### cookpad

In [19]:
cookpad_raw_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_raw.csv')
cookpad_raw_recipes_df = my_sample(cookpad_raw_recipes_df)

In [20]:
cookpad_raw_recipes_df.shape

(404, 5)

In [21]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[""laurel"",""zanahorias"",""aceite de oliva"",""pimi...","[""cocer"",""a fuego lento"",""freir"",""sal"",""julian..."
18,100383,arroz con mariscos mejjicanna,2009,"[""guisantes"",""azafran"",""laurel"",""calamar"",""mar...","[""sofrito"",""frie"",""hervir"",""agua""]"
60,101081,garbanzos con tomate,2010,"[""salsa de tomate"",""ajos"",""garbanzos cocidos"",...","[""cocemos""]"
77,101360,brazo gitano de gamba y surimi,2012,"[""pimientos"",""mayonesa"",""surimi"",""huevos"",""gam...","[""rodajas"",""pure""]"
81,101393,espinacas a la andaluza,2009,"[""aceite de oliva virgen"",""comino"",""vino"",""esp...","[""frito"",""cocer"",""a fuego lento"",""cazuela"",""sa..."


In [22]:
cookpad_raw_recipes_df['ingredients'] = cookpad_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_raw_recipes_df['techniques'] = cookpad_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [23]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[laurel, zanahorias, aceite de oliva, pimienta...","[cocer, a fuego lento, freir, sal, juliana, agua]"
18,100383,arroz con mariscos mejjicanna,2009,"[guisantes, azafran, laurel, calamar, mariscos...","[sofrito, frie, hervir, agua]"
60,101081,garbanzos con tomate,2010,"[salsa de tomate, ajos, garbanzos cocidos, tom...",[cocemos]
77,101360,brazo gitano de gamba y surimi,2012,"[pimientos, mayonesa, surimi, huevos, gambas, ...","[rodajas, pure]"
81,101393,espinacas a la andaluza,2009,"[aceite de oliva virgen, comino, vino, espinac...","[frito, cocer, a fuego lento, cazuela, sartén,..."


In [24]:
cookpad_raw_recipes_df['creativity'] = cookpad_raw_recipes_df.apply(
    lambda x: 0, axis=1)

In [25]:
cookpad_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
17,100377,estofado de venado,2009,"[laurel, zanahorias, aceite de oliva, pimienta...","[cocer, a fuego lento, freir, sal, juliana, agua]",0
18,100383,arroz con mariscos mejjicanna,2009,"[guisantes, azafran, laurel, calamar, mariscos...","[sofrito, frie, hervir, agua]",0
60,101081,garbanzos con tomate,2010,"[salsa de tomate, ajos, garbanzos cocidos, tom...",[cocemos],0
77,101360,brazo gitano de gamba y surimi,2012,"[pimientos, mayonesa, surimi, huevos, gambas, ...","[rodajas, pure]",0
81,101393,espinacas a la andaluza,2009,"[aceite de oliva virgen, comino, vino, espinac...","[frito, cocer, a fuego lento, cazuela, sartén,...",0


### elbulli & cookpad

In [26]:
raw_recipes_df = elbulli_raw_recipes_df.append(cookpad_raw_recipes_df, ignore_index=True)

In [27]:
raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimiento, pimiento rojo, piñones tostado...","[sal, marcar, cocción, asado, hirviendo, horno...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, al horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, fría]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, asar, cocer, hervido, confitar, cazuela,...",30


In [28]:
raw_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99341,refrito de col y garbanzos,2009,"[aceite de oliva, pimienta, ajo, col, bacon, g...","[dorar, cocer, hervir, dore, sartén, sal, en a...",0
1614,99733,roscón de villalba,2009,"[huevos, guindillas, almendras, azúcar]","[al horno, dore, cocer]",0
1615,99741,tarta de santiago tradicional,2009,"[almendras, huevo, mantequilla, azúcar glas, h...","[dora, hornear, relleno, agua, horno]",0
1616,99839,cogote de merluza a la donostierra,2009,"[merluza, ajo, guindilla, oliva, vinagre]","[sarten, sal, plancha]",0
1617,99963,flan de huevo en microondas,2009,"[huevos, leche condensada, leche entera]","[microondas, caramelizado]",0


## Representative ingredients and techniques

### elbulli

In [29]:
elbulli_repr_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_representatives.csv')

In [30]:
elbulli_repr_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [31]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite de oliva"",""agua"",""pimienta b...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento rojo"",""piñones to...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras de huevo"",""avellanas tost...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo de limón"",""limón"",""alginato sódico"",""ag...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""caldo de ceps"",""carragenato kappa en p...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [32]:
elbulli_repr_recipes_df['ingredients'] = elbulli_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_repr_recipes_df['techniques'] = elbulli_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [33]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [34]:
elbulli_repr_recipes_df['creativity'] = elbulli_repr_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [35]:
elbulli_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [36]:
cookpad_repr_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_representatives.csv')
cookpad_repr_recipes_df = my_sample(cookpad_repr_recipes_df)

In [37]:
cookpad_repr_recipes_df.shape

(404, 5)

In [38]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[""laurel"",""zanahoria"",""aceite de oliva"",""pimie...","[""cocer"",""a fuego lento"",""horno"",""sal"",""julian..."
18,100383,arroz con mariscos mejjicanna,2009,"[""guisantes"",""azafrán"",""laurel"",""calamares"",""m...","[""horno"",""horno"",""hervir"",""agua""]"
60,101081,garbanzos con tomate,2010,"[""salsa de tomate"",""ajo"",""garbanzos cocidos"",""...","[""cocer""]"
77,101360,brazo gitano de gamba y surimi,2012,"[""pimiento"",""mayonesa"",""surimi"",""huevos"",""gamb...","[""en rodajas"",""puré""]"
81,101393,espinacas a la andaluza,2009,"[""aceite de oliva virgen"",""comino"",""vino"",""esp...","[""horno"",""cocer"",""a fuego lento"",""cazuela"",""sa..."


In [39]:
cookpad_repr_recipes_df['ingredients'] = cookpad_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_repr_recipes_df['techniques'] = cookpad_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [40]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[laurel, zanahoria, aceite de oliva, pimienta,...","[cocer, a fuego lento, horno, sal, juliana, agua]"
18,100383,arroz con mariscos mejjicanna,2009,"[guisantes, azafrán, laurel, calamares, marisc...","[horno, horno, hervir, agua]"
60,101081,garbanzos con tomate,2010,"[salsa de tomate, ajo, garbanzos cocidos, tomi...",[cocer]
77,101360,brazo gitano de gamba y surimi,2012,"[pimiento, mayonesa, surimi, huevos, gambas, p...","[en rodajas, puré]"
81,101393,espinacas a la andaluza,2009,"[aceite de oliva virgen, comino, vino, espinac...","[horno, cocer, a fuego lento, cazuela, sartén,..."


In [41]:
cookpad_repr_recipes_df['creativity'] = cookpad_repr_recipes_df.apply(
    lambda x: 0, axis=1)

In [42]:
cookpad_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
17,100377,estofado de venado,2009,"[laurel, zanahoria, aceite de oliva, pimienta,...","[cocer, a fuego lento, horno, sal, juliana, agua]",0
18,100383,arroz con mariscos mejjicanna,2009,"[guisantes, azafrán, laurel, calamares, marisc...","[horno, horno, hervir, agua]",0
60,101081,garbanzos con tomate,2010,"[salsa de tomate, ajo, garbanzos cocidos, tomi...",[cocer],0
77,101360,brazo gitano de gamba y surimi,2012,"[pimiento, mayonesa, surimi, huevos, gambas, p...","[en rodajas, puré]",0
81,101393,espinacas a la andaluza,2009,"[aceite de oliva virgen, comino, vino, espinac...","[horno, cocer, a fuego lento, cazuela, sartén,...",0


### elbulli & cookpad

In [43]:
repr_recipes_df = elbulli_repr_recipes_df.append(cookpad_repr_recipes_df, ignore_index=True)

In [44]:
repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite de oliva, agua, pimienta blanc...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento rojo, piñones tostad...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras de huevo, avellanas tostada...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo de limón, limón, alginato sódico, agua, ...","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, caldo de ceps, carragenato kappa en polv...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


In [45]:
repr_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99341,refrito de col y garbanzos,2009,"[aceite de oliva, pimienta, ajo, col, panceta,...","[horno, cocer, hervir, horno, sartén, sal, agua]",0
1614,99733,roscón de villalba,2009,"[huevos, guindilla, almendras, azúcar]","[horno, horno, cocer]",0
1615,99741,tarta de santiago tradicional,2009,"[almendras, huevos, mantequilla, azúcar glas, ...","[horno, horno, relleno, agua, horno]",0
1616,99839,cogote de merluza a la donostierra,2009,"[merluza, ajo, guindilla, aceitunas, vinagre]","[sartén, sal, plancha]",0
1617,99963,flan de huevo en microondas,2009,"[huevos, leche condensada, leche entera]","[microondas, caramelizado]",0


## Superclasses of ingredients and techniques

### elbulli

In [46]:
elbulli_super_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_superclasses.csv')

In [47]:
elbulli_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [48]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""oporto"",""aceite"",""agua"",""pimienta"",""gelatina...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""sal"",""pimientos"",""pimiento"",""piñones"",""vinag...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""chocolate"",""claras"",""avellanas"",""yemas"",""azú...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""zumo"",""limón"",""alginato"",""agua"",""té"",""azúcar""]","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""sal"",""ceps"",""carragenato"",""romero"",""ceps"",""c...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [49]:
elbulli_super_recipes_df['ingredients'] = elbulli_super_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_super_recipes_df['techniques'] = elbulli_super_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [50]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua]
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [51]:
elbulli_super_recipes_df['creativity'] = elbulli_super_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [52]:
elbulli_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [53]:
cookpad_super_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_superclasses.csv')
cookpad_super_recipes_df = my_sample(cookpad_super_recipes_df)

In [54]:
cookpad_super_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [55]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[""laurel"",""zanahoria"",""aceite"",""pimienta"",""ajo...","[""cocer"",""a"",""horno"",""sal"",""juliana"",""agua""]"
18,100383,arroz con mariscos mejjicanna,2009,"[""guisantes"",""azafrán"",""laurel"",""calamares"",""m...","[""horno"",""horno"",""hervir"",""agua""]"
60,101081,garbanzos con tomate,2010,"[""salsa"",""ajo"",""garbanzos"",""tomillo"",""laurel""]","[""cocer""]"
77,101360,brazo gitano de gamba y surimi,2012,"[""pimiento"",""mayonesa"",""surimi"",""huevos"",""gamb...","[""rodajas"",""puré""]"
81,101393,espinacas a la andaluza,2009,"[""aceite"",""comino"",""vino"",""espinacas"",""pimentó...","[""horno"",""cocer"",""a"",""cazuela"",""sartén"",""horno..."


In [56]:
cookpad_super_recipes_df['ingredients'] = cookpad_super_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_super_recipes_df['techniques'] = cookpad_super_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [57]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[laurel, zanahoria, aceite, pimienta, ajo, cie...","[cocer, a, horno, sal, juliana, agua]"
18,100383,arroz con mariscos mejjicanna,2009,"[guisantes, azafrán, laurel, calamares, marisc...","[horno, horno, hervir, agua]"
60,101081,garbanzos con tomate,2010,"[salsa, ajo, garbanzos, tomillo, laurel]",[cocer]
77,101360,brazo gitano de gamba y surimi,2012,"[pimiento, mayonesa, surimi, huevos, gambas, p...","[rodajas, puré]"
81,101393,espinacas a la andaluza,2009,"[aceite, comino, vino, espinacas, pimentón, ga...","[horno, cocer, a, cazuela, sartén, horno, horn..."


In [58]:
cookpad_super_recipes_df['creativity'] = cookpad_super_recipes_df.apply(
    lambda x: 0, axis=1)

In [59]:
cookpad_super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
17,100377,estofado de venado,2009,"[laurel, zanahoria, aceite, pimienta, ajo, cie...","[cocer, a, horno, sal, juliana, agua]",0
18,100383,arroz con mariscos mejjicanna,2009,"[guisantes, azafrán, laurel, calamares, marisc...","[horno, horno, hervir, agua]",0
60,101081,garbanzos con tomate,2010,"[salsa, ajo, garbanzos, tomillo, laurel]",[cocer],0
77,101360,brazo gitano de gamba y surimi,2012,"[pimiento, mayonesa, surimi, huevos, gambas, p...","[rodajas, puré]",0
81,101393,espinacas a la andaluza,2009,"[aceite, comino, vino, espinacas, pimentón, ga...","[horno, cocer, a, cazuela, sartén, horno, horn...",0


### elbulli & cookpad

In [60]:
super_recipes_df = elbulli_super_recipes_df.append(cookpad_super_recipes_df, ignore_index=True)

In [61]:
super_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[oporto, aceite, agua, pimienta, gelatina, mel...",[agua],10
1,10,salmonetes gaudí,1987,"[sal, pimientos, pimiento, piñones, vinagre, o...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[chocolate, claras, avellanas, yemas, azúcar, ...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[zumo, limón, alginato, agua, té, azúcar]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[sal, ceps, carragenato, romero, ceps, ceps, a...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


In [62]:
super_recipes_df.tail()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
1613,99341,refrito de col y garbanzos,2009,"[aceite, pimienta, ajo, col, panceta, garbanzo...","[horno, cocer, hervir, horno, sartén, sal, agua]",0
1614,99733,roscón de villalba,2009,"[huevos, guindilla, almendras, azúcar]","[horno, horno, cocer]",0
1615,99741,tarta de santiago tradicional,2009,"[almendras, huevos, mantequilla, azúcar, huevo...","[horno, horno, relleno, agua, horno]",0
1616,99839,cogote de merluza a la donostierra,2009,"[merluza, ajo, guindilla, aceitunas, vinagre]","[sartén, sal, plancha]",0
1617,99963,flan de huevo en microondas,2009,"[huevos, leche, leche]","[microondas, caramelizado]",0


## Types of ingredients, representative techniques

### elbulli

In [63]:
elbulli_types_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_types.csv')

In [64]:
elbulli_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [65]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""western"",""western"",""1"",""western"",""modernist""...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""western"",""western"",""western"",""1"",""western"",""...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""western"",""western"",""1"",""western"",""western"",""...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""1"",""western"",""modernist"",""1"",""western"",""west...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""western"",""modernist"",""modernist"",""modernist""...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [66]:
elbulli_types_recipes_df['ingredients'] = elbulli_types_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_types_recipes_df['techniques'] = elbulli_types_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [67]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua]
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [68]:
elbulli_types_recipes_df['creativity'] = elbulli_types_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [69]:
elbulli_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [70]:
cookpad_types_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_types.csv')
cookpad_types_recipes_df = my_sample(cookpad_types_recipes_df)

In [71]:
cookpad_types_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [72]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[""western"",""western"",""western"",""western"",""west...","[""cocer"",""a fuego lento"",""horno"",""sal"",""julian..."
18,100383,arroz con mariscos mejjicanna,2009,"[""western"",""western"",""western"",""western"",""west...","[""horno"",""horno"",""hervir"",""agua""]"
60,101081,garbanzos con tomate,2010,"[""western"",""western"",""western"",""western"",""west...","[""cocer""]"
77,101360,brazo gitano de gamba y surimi,2012,"[""western"",""western"",""asian"",""western"",""wester...","[""en rodajas"",""puré""]"
81,101393,espinacas a la andaluza,2009,"[""western"",""western"",""western"",""western"",""west...","[""horno"",""cocer"",""a fuego lento"",""cazuela"",""sa..."


In [73]:
cookpad_types_recipes_df['ingredients'] = cookpad_types_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_types_recipes_df['techniques'] = cookpad_types_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [74]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[western, western, western, western, western, ...","[cocer, a fuego lento, horno, sal, juliana, agua]"
18,100383,arroz con mariscos mejjicanna,2009,"[western, western, western, western, western, ...","[horno, horno, hervir, agua]"
60,101081,garbanzos con tomate,2010,"[western, western, western, western, western]",[cocer]
77,101360,brazo gitano de gamba y surimi,2012,"[western, western, asian, western, western, we...","[en rodajas, puré]"
81,101393,espinacas a la andaluza,2009,"[western, western, western, western, western, ...","[horno, cocer, a fuego lento, cazuela, sartén,..."


In [75]:
cookpad_types_recipes_df['creativity'] = cookpad_types_recipes_df.apply(
    lambda x: 0, axis=1)

In [76]:
cookpad_types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
17,100377,estofado de venado,2009,"[western, western, western, western, western, ...","[cocer, a fuego lento, horno, sal, juliana, agua]",0
18,100383,arroz con mariscos mejjicanna,2009,"[western, western, western, western, western, ...","[horno, horno, hervir, agua]",0
60,101081,garbanzos con tomate,2010,"[western, western, western, western, western]",[cocer],0
77,101360,brazo gitano de gamba y surimi,2012,"[western, western, asian, western, western, we...","[en rodajas, puré]",0
81,101393,espinacas a la andaluza,2009,"[western, western, western, western, western, ...","[horno, cocer, a fuego lento, cazuela, sartén,...",0


### elbulli & cookpad

In [77]:
types_recipes_df = elbulli_types_recipes_df.append(cookpad_types_recipes_df, ignore_index=True)

In [78]:
types_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[western, western, 1, western, modernist, west...",[agua],10
1,10,salmonetes gaudí,1987,"[western, western, western, 1, western, wester...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[western, western, 1, western, western, western]","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[1, western, modernist, 1, western, western]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[western, modernist, modernist, modernist, wes...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


## Cuisines of ingredients, representative techniques

### elbulli

In [79]:
elbulli_cuis_recipes_df = pd.read_csv('data/dbs/recipes_elbulli_cuisines.csv')

In [80]:
elbulli_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [81]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[""other"",""spicies_and_condimients"",""drinks"",""2...","[""agua""]"
1,10,salmonetes gaudí,1987,"[""spicies_and_condimients"",""vegetables"",""veget...","[""sal"",""marcar"",""cocción"",""horno"",""hervir"",""ho..."
2,100,oursins,1990,"[""sweets"",""spicies_and_condimients"",""nuts"",""sp...","[""horno"",""horno"",""secar""]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[""14"",""fruits"",""other"",""drinks"",""drinks"",""swee...","[""agua"",""cocer"",""horno""]"
4,1001,ceps en ámbar de su caldo,2004,"[""spicies_and_condimients"",""other"",""other"",""sp...","[""sal"",""horno"",""cocer"",""hervir"",""confitar"",""ca..."


In [82]:
elbulli_cuis_recipes_df['ingredients'] = elbulli_cuis_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
elbulli_cuis_recipes_df['techniques'] = elbulli_cuis_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [83]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua]
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s..."
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]"
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]"
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,..."


In [84]:
elbulli_cuis_recipes_df['creativity'] = elbulli_cuis_recipes_df.apply(
    lambda x: 10 if 1987 <= x['year'] <= 1997 else \
              20 if 1998 <= x['year'] <= 2001 else \
              30 if 2002 <= x['year'] <= 2005 else \
              'UNKOWN', axis=1)

In [85]:
elbulli_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


### cookpad

In [86]:
cookpad_cuis_recipes_df = pd.read_csv('data/dbs/recipes_cookpad_cuisines.csv')
cookpad_cuis_recipes_df = my_sample(cookpad_cuis_recipes_df)

In [87]:
cookpad_cuis_recipes_df.dtypes

_id             int64
title          object
year            int64
ingredients    object
techniques     object
dtype: object

In [88]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[""spicies_and_condimients"",""vegetables"",""spici...","[""cocer"",""a fuego lento"",""horno"",""sal"",""julian..."
18,100383,arroz con mariscos mejjicanna,2009,"[""vegetables"",""spicies_and_condimients"",""spici...","[""horno"",""horno"",""hervir"",""agua""]"
60,101081,garbanzos con tomate,2010,"[""spicies_and_condimients"",""spicies_and_condim...","[""cocer""]"
77,101360,brazo gitano de gamba y surimi,2012,"[""vegetables"",""spicies_and_condimients"",""seafo...","[""en rodajas"",""puré""]"
81,101393,espinacas a la andaluza,2009,"[""spicies_and_condimients"",""spicies_and_condim...","[""horno"",""cocer"",""a fuego lento"",""cazuela"",""sa..."


In [89]:
cookpad_cuis_recipes_df['ingredients'] = cookpad_cuis_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
cookpad_cuis_recipes_df['techniques'] = cookpad_cuis_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [90]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
17,100377,estofado de venado,2009,"[spicies_and_condimients, vegetables, spicies_...","[cocer, a fuego lento, horno, sal, juliana, agua]"
18,100383,arroz con mariscos mejjicanna,2009,"[vegetables, spicies_and_condimients, spicies_...","[horno, horno, hervir, agua]"
60,101081,garbanzos con tomate,2010,"[spicies_and_condimients, spicies_and_condimie...",[cocer]
77,101360,brazo gitano de gamba y surimi,2012,"[vegetables, spicies_and_condimients, seafood,...","[en rodajas, puré]"
81,101393,espinacas a la andaluza,2009,"[spicies_and_condimients, spicies_and_condimie...","[horno, cocer, a fuego lento, cazuela, sartén,..."


In [91]:
cookpad_cuis_recipes_df['creativity'] = cookpad_cuis_recipes_df.apply(
    lambda x: 0, axis=1)

In [92]:
cookpad_cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
17,100377,estofado de venado,2009,"[spicies_and_condimients, vegetables, spicies_...","[cocer, a fuego lento, horno, sal, juliana, agua]",0
18,100383,arroz con mariscos mejjicanna,2009,"[vegetables, spicies_and_condimients, spicies_...","[horno, horno, hervir, agua]",0
60,101081,garbanzos con tomate,2010,"[spicies_and_condimients, spicies_and_condimie...",[cocer],0
77,101360,brazo gitano de gamba y surimi,2012,"[vegetables, spicies_and_condimients, seafood,...","[en rodajas, puré]",0
81,101393,espinacas a la andaluza,2009,"[spicies_and_condimients, spicies_and_condimie...","[horno, cocer, a fuego lento, cazuela, sartén,...",0


### elbulli & cookpad

In [93]:
cuis_recipes_df = elbulli_cuis_recipes_df.append(cookpad_cuis_recipes_df, ignore_index=True)

In [94]:
cuis_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,creativity
0,1,terrina de melón con gelée de oporto,1987,"[other, spicies_and_condimients, drinks, 2, ot...",[agua],10
1,10,salmonetes gaudí,1987,"[spicies_and_condimients, vegetables, vegetabl...","[sal, marcar, cocción, horno, hervir, horno, s...",10
2,100,oursins,1990,"[sweets, spicies_and_condimients, nuts, spicie...","[horno, horno, secar]",10
3,1000,ravioli sférico de té con cubo helado de limón,2004,"[14, fruits, other, drinks, drinks, sweets]","[agua, cocer, horno]",30
4,1001,ceps en ámbar de su caldo,2004,"[spicies_and_condimients, other, other, spicie...","[sal, horno, cocer, hervir, confitar, cazuela,...",30


# Formatting data

In [95]:
def join_ingredients_and_techniques(ingr_list, tech_list):
    i_list = ['i_' + '_'.join(x.split()) for x in ingr_list]
    t_list = ['t_' + '_'.join(x.split()) for x in tech_list]
    return ' '.join(i_list + t_list)

In [96]:
columns = ['text', 'creativity']

In [97]:
raw_text_df = pd.DataFrame(columns=columns)
raw_text_df['creativity'] = raw_recipes_df['creativity']
raw_text_df['text'] = raw_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [98]:
raw_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimiento i_pimiento_rojo i_piñones_tos...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [99]:
repr_text_df = pd.DataFrame(columns=columns)
repr_text_df['creativity'] = repr_recipes_df['creativity']
repr_text_df['text'] = repr_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [100]:
repr_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite_de_oliva i_agua i_pimienta_b...,10
1,i_sal i_pimientos i_pimiento_rojo i_piñones_to...,10
2,i_chocolate i_claras_de_huevo i_avellanas_tost...,10
3,i_zumo_de_limón i_limón i_alginato_sódico i_ag...,30
4,i_sal i_caldo_de_ceps i_carragenato_kappa_en_p...,30


In [101]:
super_text_df = pd.DataFrame(columns=columns)
super_text_df['creativity'] = super_recipes_df['creativity']
super_text_df['text'] = super_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [102]:
super_text_df.head()

Unnamed: 0,text,creativity
0,i_oporto i_aceite i_agua i_pimienta i_gelatina...,10
1,i_sal i_pimientos i_pimiento i_piñones i_vinag...,10
2,i_chocolate i_claras i_avellanas i_yemas i_azú...,10
3,i_zumo i_limón i_alginato i_agua i_té i_azúcar...,30
4,i_sal i_ceps i_carragenato i_romero i_ceps i_c...,30


In [103]:
types_text_df = pd.DataFrame(columns=columns)
types_text_df['creativity'] = types_recipes_df['creativity']
types_text_df['text'] = types_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [104]:
types_text_df.head()

Unnamed: 0,text,creativity
0,i_western i_western i_1 i_western i_modernist ...,10
1,i_western i_western i_western i_1 i_western i_...,10
2,i_western i_western i_1 i_western i_western i_...,10
3,i_1 i_western i_modernist i_1 i_western i_west...,30
4,i_western i_modernist i_modernist i_modernist ...,30


In [105]:
cuis_text_df = pd.DataFrame(columns=columns)
cuis_text_df['creativity'] = cuis_recipes_df['creativity']
cuis_text_df['text'] = cuis_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

In [106]:
cuis_text_df.head()

Unnamed: 0,text,creativity
0,i_other i_spicies_and_condimients i_drinks i_2...,10
1,i_spicies_and_condimients i_vegetables i_veget...,10
2,i_sweets i_spicies_and_condimients i_nuts i_sp...,10
3,i_14 i_fruits i_other i_drinks i_drinks i_swee...,30
4,i_spicies_and_condimients i_other i_other i_sp...,30


# Classifier

In [202]:
K = 10

In [243]:
def benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name):
    print('Training %s...' % name)
    print()
    grid_search_cv.fit(X_train, y_train)
    print('Best score: %0.2f' % grid_search_cv.best_score_)
    print('Best parameters:', grid_search_cv.best_params_)
    print()
    print('Testing %s...' % name)
    print()
    pred = grid_search_cv.predict(X_test)
    print('Metrics:')
    score = metrics.accuracy_score(y_test, pred)
    print('accuracy  = %0.2f' % score)
    precision = metrics.precision_score(y_test, pred, average='weighted')
    print('precision = %0.2f' % precision)
    recall = metrics.recall_score(y_test, pred, average='weighted')
    print('recall    = %0.2f' % recall)
    f1_score = metrics.f1_score(y_test, pred, average='weighted')
    print('f1_score  = %0.2f' % f1_score)
    print()
    print('Classification report:')
    print(metrics.classification_report(y_test, pred, target_names=['None' ,'Low', 'Medium', 'High']))
    print()
    print('Confusion matrix:')
    print(metrics.confusion_matrix(y_test, pred))
    print()
    return score, precision, recall, f1_score

In [237]:
classifiers = [
    ('BernoulliNB', BernoulliNB, {}),
    ('KNeighborsClassifier', KNeighborsClassifier, {}),
    ('LinearSVC', LinearSVC, {}),
    ('MultinomialNB', MultinomialNB, {}),
    ('NearestCentroid', NearestCentroid, {}),
    ('PassiveAggressiveClassifier', PassiveAggressiveClassifier, {}),
    ('Perceptron', Perceptron, {}),
    ('RandomForestClassifier', RandomForestClassifier, {}),
    ('RidgeClassifier', RidgeClassifier, {}),
    ('SGDClassifier', SGDClassifier, {}),
]

## Raw ingredients and techniques

In [232]:
X = raw_text_df['text']
y = raw_text_df['creativity']

In [233]:
y.value_counts().sort_index()

0     404
10    454
20    371
30    389
Name: creativity, dtype: int64

In [234]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)

In [235]:
skf = StratifiedKFold(y_train, K)

In [248]:
print('Performing grid search with cross-validation...')
print('=' * 80)
print()
for name, clf, parameters in classifiers:
    pipeline = Pipeline([
        ('CountVectorizer', CountVectorizer()),
        ('TfidfTransformer', TfidfTransformer()),
        ('Classifier', clf()),
    ])
    grid_search_cv = GridSearchCV(pipeline, parameters, cv=skf, scoring='f1_weighted', error_score=0, n_jobs=-1)
    benchmark(grid_search_cv, X_train, X_test, y_train, y_test, name)
    print('-' * 80)
#mirar esto tb http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#example-text-document-clustering-py

Performing grid search with cross-validation...

Training BernoulliNB...

Best score: 0.78
Best parameters: {}

Testing BernoulliNB...

Metrics:
accuracy  = 0.79
precision = 0.79
recall    = 0.79
f1_score  = 0.78

Classification report:
             precision    recall  f1-score   support

       None       0.85      0.97      0.91        40
        Low       0.81      0.56      0.66        45
     Medium       0.67      0.76      0.71        37
       High       0.83      0.90      0.86        39

avg / total       0.79      0.79      0.78       161


Confusion matrix:
[[39  0  1  0]
 [ 3 25 13  4]
 [ 3  3 28  3]
 [ 1  3  0 35]]

--------------------------------------------------------------------------------
Training KNeighborsClassifier...

Best score: 0.82
Best parameters: {}

Testing KNeighborsClassifier...

Metrics:
accuracy  = 0.78
precision = 0.80
recall    = 0.78
f1_score  = 0.79

Classification report:
             precision    recall  f1-score   support

       None       0.

KeyboardInterrupt: 

In [None]:
# >>> from sklearn.metrics import confusion_matrix
# >>> y_true = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 2, 2, 2, 2]
# >>> y_pred = [1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 2, 2, 2, 1]
# >>> print(confusion_matrix(y_true, y_pred))
# [[4 1 0]
#  [2 5 0]
#  [0 1 3]]

In [107]:
# Params
K = 10
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'tfidf__norm': (None, 'l1', 'l2'),
}

In [None]:
# # Classifiers
# clf_list = [
#     (RidgeClassifier(alpha=.00001, tol=1e-2, solver="lsqr"), "Ridge classifier"),
#     (Perceptron(alpha=.00001, n_iter=50), "Perceptron"),
#     (PassiveAggressiveClassifier(n_iter=50), "Passive-aggressive"),
#     (KNeighborsClassifier(n_neighbors=10), "kNN"),
#     (RandomForestClassifier(n_estimators=100), "Random Forest Classifier"),
# #     (RandomForestRegressor(n_estimators=100), "Random Forest Regressor"),
#     (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3), 'Linear SVC 1'),
#     (SGDClassifier(alpha=.000001, n_iter=50, penalty='l1'), 'SGDClassifier'),
# #     (SGDRegressor(alpha=.000001, n_iter=50, penalty='l1'), 'SGDRegressor'),
#     (NearestCentroid(), 'Nearest Centroid'),
#     (MultinomialNB(alpha=.00001), 'Multinomial NB'),
#     (BernoulliNB(alpha=.00001), 'Bernoulli NB'),
#     (LinearSVC(penalty="l1", dual=False, tol=1e-3), 'Linear SVC 2'),
#     (RidgeClassifier(), "Ridge classifier"),
#     (Perceptron(), "Perceptron"),
#     (PassiveAggressiveClassifier(), "Passive-aggressive"),
#     (KNeighborsClassifier(), "kNN"),
#     (RandomForestClassifier(), "Random Forest Classifier"),
# #     (RandomForestRegressor(), "Random Forest Regressor"),
#     (LinearSVC(), 'Linear SVC'),
#     (SGDClassifier(), 'SGDClassifier'),
# #     (SGDRegressor(), 'SGDRegressor'),
#     (NearestCentroid(), 'Nearest Centroid'),
#     (MultinomialNB(), 'Multinomial NB'),
#     (BernoulliNB(), 'Bernoulli NB'),
# ]

In [None]:
# # Classifiers
# clf_list = [
#     (LinearSVC(), 'Linear SVC'),
#     (LinearSVC(penalty="l1", dual=False, tol=1e-3), 'Linear SVC'),
#     (LinearSVC(loss='squared_hinge', penalty='l2', dual=False, tol=1e-3), 'Linear SVC'),
#     (LinearSVC(loss='hinge', penalty='l1', dual=True, tol=1e-1, multi_class='crammer_singer', random_state=100), 'Linear SVC'),
# ]

In [110]:
# Classifiers
clf_list = []
for loss in ['hinge', 'squared_hinge']:
    for tol in [0.0001, 0.001, 0.01, 0.1]:
        for multi_class in ['ovr', 'crammer_singer']:
            clf_list.append((
                    LinearSVC(loss=loss, tol=tol, multi_class=multi_class),
                    'LinearSVC(loss={}, tol={}, multi_class={})'.format(loss, tol, multi_class)
                ))
len(clf_list)

16

In [118]:
# X_train = X_data
# X_test = X_val
# y_train = y_data
# y_test = y_val

for clf, name in clf_list:
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer(use_idf=True)),
            ('clf', clf),
        ])
#         grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
#         benchmark_results = benchmark(grid_search, X_train, X_test, y_train, y_test, name)
        benchmark_results = benchmark(pipeline, X_train, X_test, y_train, y_test, name)
#         results[name].append(benchmark_results)

________________________________________________________________________________
Training: LinearSVC(loss=hinge, tol=0.0001, multi_class=ovr)
Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...e', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))])
train time: 0.232s
test time:  0.018s
accuracy:   0.968
precision:   0.968
recall:   0.968
classification report:


  sample_weight=sample_weight)
  sample_weight=sample_weight)


IndexError: list index out of range

In [None]:
# split a training set and a test set
y_train = y_data
y_test = y_val

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5)

print("Extracting features from the training data using a sparse vectorizer")
X_train = vectorizer.fit_transform(X_data)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
X_test = vectorizer.transform(X_data)
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

In [None]:
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()
opts_select_chi2 = 50

print("Extracting %d best features by a chi-squared test" %
      opts_select_chi2)
ch2 = SelectKBest(chi2, k=opts_select_chi2)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)
if feature_names:
    # keep selected feature names
    feature_names = [feature_names[i] for i
                     in ch2.get_support(indices=True)]
print()

feature_names = np.asarray(feature_names)

In [None]:
feature_names

In [None]:
count_d = {
    '1987-1997': {},
    '1998-2001': {},
    '2002-2005': {},
}
for y in ['1987-1997', '1998-2001', '2002-2005']:
    for f in feature_names:
        x = f[2:].replace('_', ' ')
        if f.startswith('i'):
            count_d[y][f] = 0
            for r in recipes_df[(int(y.split('-')[0]) <= recipes_df['year']) & (recipes_df['year'] <= int(y.split('-')[1]))].iterrows():
                count_d[y][f] += r[1]['ingredients'].count(x)
        elif f.startswith('t'):
            count_d[y][f] = 0
            for r in recipes_df[(int(y.split('-')[0]) <= recipes_df['year']) & (recipes_df['year'] <= int(y.split('-')[1]))].iterrows():
                count_d[y][f] += r[1]['techniques'].count(x)

In [None]:
count_d

In [None]:
s1 = pd.Series(count_d['1987-1997'])
s2 = pd.Series(count_d['1998-2001'])
s3 = pd.Series(count_d['2002-2005'])

In [None]:
df = pd.DataFrame({
        '1987-1997': s1,
        '1998-2001': s2,
        '2002-2005': s3,
    })

In [None]:
df.plot(kind='bar')

In [None]:
recipes_df[(1987 <= recipes_df['year']) & (recipes_df['year'] <= 1997)]

In [None]:
ig = nx.read_gexf('data/spanish_ingredients_lexicon_6.gexf')
tg = nx.read_gexf('data/spanish_techniques_lexicon_6.gexf')

In [None]:
ingrs = ig.nodes(data=True)
techs = tg.nodes(data=True)

In [None]:
ingrs[:2]

In [None]:
top_ingrs = sorted(ingrs, key=lambda x: x[1]['count'], reverse=True)
top_techs = sorted(techs, key=lambda x: x[1]['count'], reverse=True)

In [None]:
top_ingrs[:20]

In [None]:
i_aux = top_ingrs[:21]
i_aux.remove(('hojas de gelatina', {'count': 164, 'label': 'hojas de gelatina'}))
i_aux

In [None]:
top_techs[:10]

In [None]:
t_aux = top_techs[:14]
t_aux.remove(('agua', {'count': 891, 'label': 'agua'}))
t_aux.remove(('sal', {'count': 648, 'label': 'sal'}))
t_aux.remove(('min', {'count': 553, 'label': 'min'}))
t_aux.remove(('hirviendo', {'count': 296, 'label': 'hirviendo'}))
t_aux

In [None]:
s1

In [None]:
i_d = dict((x[0], x[1]['count']) for x in i_aux)
t_d = dict((x[0], x[1]['count']) for x in t_aux)

In [None]:
s1 = pd.Series(i_d)
s2 = pd.Series(t_d)

In [None]:
matplotlib.rcParams['figure.figsize'] = (8, 5)
s1.plot(kind='bar')

In [None]:
s2.plot(kind='bar')
matplotlib.rcParams['figure.figsize'] = (12, 7)