In [1]:
import json
import pickle

import pandas as pd

In [2]:
def join_ingredients_and_techniques(ingr_list, tech_list):
    i_list = ['i_' + '_'.join(x.split()) for x in ingr_list]
    t_list = ['t_' + '_'.join(x.split()) for x in tech_list]
    return ' '.join(i_list + t_list)

## Raw ingredients and techniques

In [3]:
with open('data/raw_best_estimators_linearsvc.pickle', 'rb') as f:
    best_estimators = pickle.load(f)
    raw_classifier = best_estimators[0]['best_estimator']

### allrecipes

In [4]:
allrecipes_raw_recipes_df = pd.read_csv('data/dbs/recipes_allrecipes_raw_spa.csv')

In [5]:
allrecipes_raw_recipes_df.shape

(66671, 5)

In [6]:
allrecipes_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,10000,"""Chocolate Sandwich Cookies I""",2001,"[""cacao en polvo"", ""huevos"", ""leche"", ""az\u00f...","[""hornear"", ""emulsionar"", ""hornear"", ""relleno""..."
1,100008,"""Homemade Pickled Ginger (Gari)""",2007,"[""az\u00facar"", ""vinagre de arroz"", ""ra\u00edz""]","[""sal"", ""hirviendo""]"
2,10001,"""Chocolate Pizzelles""",2003,"[""cacao en polvo"", ""huevos"", ""az\u00facar"", ""h...","[""hornear"", ""sal"", ""rebozar""]"
3,100011,"""Pork and Black Bean Stew""",2007,"[""cebolla"", ""agua"", ""pimiento"", ""lomo de cerdo...","[""hervir"", ""cocer"", ""sal""]"
4,10002,"""Peter Pan Cookies""",2000,"[""az\u00facar moreno"", ""bicarbonato"", ""huevos""...","[""hornear"", ""emulsionar"", ""hornear"", ""sal""]"


In [37]:
allrecipes_raw_recipes_df['year'].value_counts()

2007    6442
2001    6283
2002    5609
2006    5145
2008    4661
2000    4245
2012    4234
2009    4148
2013    3791
2010    3621
2014    3569
2005    3382
2015    3239
2003    2808
2011    2640
2004    2492
2016     361
1999       1
Name: year, dtype: int64

In [38]:
epicurious_raw_recipes_df['year'].value_counts()

2004    18398
2006     2083
2007     1642
2005     1189
2008      889
2009       85
2010        9
1998        7
2003        7
2015        5
1999        4
2011        3
2000        2
2016        1
Name: year, dtype: int64

In [7]:
allrecipes_raw_recipes_df['ingredients'] = allrecipes_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
allrecipes_raw_recipes_df['techniques'] = allrecipes_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [8]:
allrecipes_raw_recipes_df['source'] = allrecipes_raw_recipes_df.apply(
    lambda x: 'allrecipes', axis=1)

allrecipes_raw_recipes_df['text'] = allrecipes_raw_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

allrecipes_raw_recipes_df['creativity'] = allrecipes_raw_recipes_df.apply(
    lambda x: raw_classifier.predict([x['text']])[0], axis=1)

In [9]:
allrecipes_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,source,text,creativity
0,10000,"""Chocolate Sandwich Cookies I""",2001,"[cacao en polvo, huevos, leche, azúcar, harina...","[hornear, emulsionar, hornear, relleno, reboza...",allrecipes,i_cacao_en_polvo i_huevos i_leche i_azúcar i_h...,0
1,100008,"""Homemade Pickled Ginger (Gari)""",2007,"[azúcar, vinagre de arroz, raíz]","[sal, hirviendo]",allrecipes,i_azúcar i_vinagre_de_arroz i_raíz t_sal t_hir...,30
2,10001,"""Chocolate Pizzelles""",2003,"[cacao en polvo, huevos, azúcar, harina, sal, ...","[hornear, sal, rebozar]",allrecipes,i_cacao_en_polvo i_huevos i_azúcar i_harina i_...,0
3,100011,"""Pork and Black Bean Stew""",2007,"[cebolla, agua, pimiento, lomo de cerdo, ajo]","[hervir, cocer, sal]",allrecipes,i_cebolla i_agua i_pimiento i_lomo_de_cerdo i_...,0
4,10002,"""Peter Pan Cookies""",2000,"[azúcar moreno, bicarbonato, huevos, leche, az...","[hornear, emulsionar, hornear, sal]",allrecipes,i_azúcar_moreno i_bicarbonato i_huevos i_leche...,0


In [10]:
allrecipes_raw_recipes_df['creativity'].value_counts()

0     52150
30     5416
20     5047
10     4058
Name: creativity, dtype: int64

### epicurious

In [11]:
epicurious_raw_recipes_df = pd.read_csv('data/dbs/recipes_epicurious_raw.csv')

In [12]:
epicurious_raw_recipes_df.shape

(24324, 5)

In [13]:
epicurious_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,White Bean and Pesto Dip,2004,"[""kidney beans"",""pesto"",""lemon juice""]","[""salt"",""prepared"",""season""]"
1,10,Lentil Salad with Mint and Feta,2004,"[""mint"",""green onions"",""lentils"",""olive oil"",""...","[""salt"",""cover"",""prepared"",""season"",""boil"",""ch..."
2,100,Moo Shu Pork,2004,"[""cloves"",""bok choy"",""red bell pepper"",""pork l...","[""brown"",""sauce"",""roll"",""wrap"",""mix"",""wok"",""st..."
3,1000,Butter Pastry Dough,2004,"[""salt"",""water"",""cut"",""flour""]","[""whisk"",""salt"",""wrap"",""dough"",""chilled"",""ice""]"
4,10001,Brown Bread,2004,"[""flour"",""salt"",""baking soda"",""molasses"",""rais...","[""knocking"",""whisk"",""grease"",""prepared"",""dry"",..."


In [14]:
epicurious_raw_recipes_df['ingredients'] = epicurious_raw_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
epicurious_raw_recipes_df['techniques'] = epicurious_raw_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [15]:
epicurious_raw_recipes_df['source'] = epicurious_raw_recipes_df.apply(
    lambda x: 'epicurious', axis=1)

epicurious_raw_recipes_df['text'] = epicurious_raw_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

epicurious_raw_recipes_df['creativity'] = epicurious_raw_recipes_df.apply(
    lambda x: raw_classifier.predict([x['text']])[0], axis=1)

In [16]:
epicurious_raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,source,text,creativity
0,1,White Bean and Pesto Dip,2004,"[kidney beans, pesto, lemon juice]","[salt, prepared, season]",epicurious,i_kidney_beans i_pesto i_lemon_juice t_salt t_...,0
1,10,Lentil Salad with Mint and Feta,2004,"[mint, green onions, lentils, olive oil, clove...","[salt, cover, prepared, season, boil, chilled,...",epicurious,i_mint i_green_onions i_lentils i_olive_oil i_...,0
2,100,Moo Shu Pork,2004,"[cloves, bok choy, red bell pepper, pork loin ...","[brown, sauce, roll, wrap, mix, wok, stir-fry]",epicurious,i_cloves i_bok_choy i_red_bell_pepper i_pork_l...,0
3,1000,Butter Pastry Dough,2004,"[salt, water, cut, flour]","[whisk, salt, wrap, dough, chilled, ice]",epicurious,i_salt i_water i_cut i_flour t_whisk t_salt t_...,0
4,10001,Brown Bread,2004,"[flour, salt, baking soda, molasses, raisins, ...","[knocking, whisk, grease, prepared, dry, slici...",epicurious,i_flour i_salt i_baking_soda i_molasses i_rais...,0


In [17]:
epicurious_raw_recipes_df['creativity'].value_counts()

0     22195
30     1385
10      470
20      274
Name: creativity, dtype: int64

### allrecipes & epicurious

In [18]:
raw_recipes_df = allrecipes_raw_recipes_df.append(epicurious_raw_recipes_df, ignore_index=True)

In [19]:
raw_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,source,text,creativity
0,10000,"""Chocolate Sandwich Cookies I""",2001,"[cacao en polvo, huevos, leche, azúcar, harina...","[hornear, emulsionar, hornear, relleno, reboza...",allrecipes,i_cacao_en_polvo i_huevos i_leche i_azúcar i_h...,0
1,100008,"""Homemade Pickled Ginger (Gari)""",2007,"[azúcar, vinagre de arroz, raíz]","[sal, hirviendo]",allrecipes,i_azúcar i_vinagre_de_arroz i_raíz t_sal t_hir...,30
2,10001,"""Chocolate Pizzelles""",2003,"[cacao en polvo, huevos, azúcar, harina, sal, ...","[hornear, sal, rebozar]",allrecipes,i_cacao_en_polvo i_huevos i_azúcar i_harina i_...,0
3,100011,"""Pork and Black Bean Stew""",2007,"[cebolla, agua, pimiento, lomo de cerdo, ajo]","[hervir, cocer, sal]",allrecipes,i_cebolla i_agua i_pimiento i_lomo_de_cerdo i_...,0
4,10002,"""Peter Pan Cookies""",2000,"[azúcar moreno, bicarbonato, huevos, leche, az...","[hornear, emulsionar, hornear, sal]",allrecipes,i_azúcar_moreno i_bicarbonato i_huevos i_leche...,0


## Representative ingredients and techniques

In [20]:
with open('data/repr_best_estimators_linearsvc.pickle', 'rb') as f:
    best_estimators = pickle.load(f)
    repr_classifier = best_estimators[0]['best_estimator']

### allrecipes

In [21]:
allrecipes_repr_recipes_df = pd.read_csv('data/dbs/recipes_allrecipes_representatives_spa.csv')

In [22]:
allrecipes_repr_recipes_df.shape

(66673, 5)

In [23]:
allrecipes_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,10000,"""Chocolate Sandwich Cookies I""",2001,"[""cacao en polvo"", ""huevos"", ""leche"", ""az\u00f...","[""hornear"", ""emulsionar"", ""hornear"", ""relleno""..."
1,100008,"""Homemade Pickled Ginger (Gari)""",2007,"[""az\u00facar"", ""vinagre de arroz"", ""ra\u00edz""]","[""sal"", ""hervir""]"
2,10001,"""Chocolate Pizzelles""",2003,"[""cacao en polvo"", ""huevos"", ""az\u00facar"", ""h...","[""hornear"", ""sal"", ""rebozar""]"
3,100011,"""Pork and Black Bean Stew""",2007,"[""cebolla"", ""laurel"", ""agua"", ""pimiento"", ""lom...","[""hervir"", ""cocer"", ""sal""]"
4,10002,"""Peter Pan Cookies""",2000,"[""az\u00facar moreno"", ""bicarbonato"", ""huevos""...","[""hornear"", ""emulsionar"", ""hornear"", ""sal""]"


In [24]:
allrecipes_repr_recipes_df['ingredients'] = allrecipes_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
allrecipes_repr_recipes_df['techniques'] = allrecipes_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [25]:
allrecipes_repr_recipes_df['source'] = allrecipes_repr_recipes_df.apply(
    lambda x: 'allrecipes', axis=1)

allrecipes_repr_recipes_df['text'] = allrecipes_repr_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

allrecipes_repr_recipes_df['creativity'] = allrecipes_repr_recipes_df.apply(
    lambda x: raw_classifier.predict([x['text']])[0], axis=1)

In [26]:
allrecipes_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,source,text,creativity
0,10000,"""Chocolate Sandwich Cookies I""",2001,"[cacao en polvo, huevos, leche, azúcar, harina...","[hornear, emulsionar, hornear, relleno, reboza...",allrecipes,i_cacao_en_polvo i_huevos i_leche i_azúcar i_h...,0
1,100008,"""Homemade Pickled Ginger (Gari)""",2007,"[azúcar, vinagre de arroz, raíz]","[sal, hervir]",allrecipes,i_azúcar i_vinagre_de_arroz i_raíz t_sal t_hervir,30
2,10001,"""Chocolate Pizzelles""",2003,"[cacao en polvo, huevos, azúcar, harina, sal, ...","[hornear, sal, rebozar]",allrecipes,i_cacao_en_polvo i_huevos i_azúcar i_harina i_...,0
3,100011,"""Pork and Black Bean Stew""",2007,"[cebolla, laurel, agua, pimiento, lomo de cerd...","[hervir, cocer, sal]",allrecipes,i_cebolla i_laurel i_agua i_pimiento i_lomo_de...,0
4,10002,"""Peter Pan Cookies""",2000,"[azúcar moreno, bicarbonato, huevos, leche, az...","[hornear, emulsionar, hornear, sal]",allrecipes,i_azúcar_moreno i_bicarbonato i_huevos i_leche...,0


In [27]:
allrecipes_repr_recipes_df['creativity'].value_counts()

0     53132
10     5008
20     4444
30     4089
Name: creativity, dtype: int64

### epicurious

In [28]:
epicurious_repr_recipes_df = pd.read_csv('data/dbs/recipes_epicurious_representatives_spa.csv')

In [29]:
epicurious_repr_recipes_df.shape

(24300, 5)

In [30]:
epicurious_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques
0,1,White Bean and Pesto Dip,2004,"[""jugo de lim\u00f3n""]","[""sal""]"
1,10,Lentil Salad with Mint and Feta,2004,"[""menta"", ""cebollas verdes"", ""lentejas"", ""acei...","[""sal"", ""hervir""]"
2,100,Moo Shu Pork,2004,"[""bok choy"", ""pimiento rojo"", ""salsa"", ""jengib...","[""dorar"", ""emulsionar"", ""stir-fry"", ""stir-fry""]"
3,1000,Butter Pastry Dough,2004,"[""sal"", ""agua"", ""harina""]","[""sal""]"
4,10001,Brown Bread,2004,"[""harina"", ""sal"", ""bicarbonato"", ""pasas"", ""az\...","[""secar"", ""rebozar"", ""hornear""]"


In [31]:
epicurious_repr_recipes_df['ingredients'] = epicurious_repr_recipes_df.apply(lambda x: json.loads(x['ingredients']), axis=1)
epicurious_repr_recipes_df['techniques'] = epicurious_repr_recipes_df.apply(lambda x: json.loads(x['techniques']), axis=1)

In [32]:
epicurious_repr_recipes_df['source'] = epicurious_repr_recipes_df.apply(
    lambda x: 'epicurious', axis=1)

epicurious_repr_recipes_df['text'] = epicurious_repr_recipes_df.apply(
    lambda x: join_ingredients_and_techniques(x['ingredients'], x['techniques']), axis=1)

epicurious_repr_recipes_df['creativity'] = epicurious_repr_recipes_df.apply(
    lambda x: raw_classifier.predict([x['text']])[0], axis=1)

In [33]:
epicurious_repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,source,text,creativity
0,1,White Bean and Pesto Dip,2004,[jugo de limón],[sal],epicurious,i_jugo_de_limón t_sal,0
1,10,Lentil Salad with Mint and Feta,2004,"[menta, cebollas verdes, lentejas, aceite de o...","[sal, hervir]",epicurious,i_menta i_cebollas_verdes i_lentejas i_aceite_...,0
2,100,Moo Shu Pork,2004,"[bok choy, pimiento rojo, salsa, jengibre, sal...","[dorar, emulsionar, stir-fry, stir-fry]",epicurious,i_bok_choy i_pimiento_rojo i_salsa i_jengibre ...,0
3,1000,Butter Pastry Dough,2004,"[sal, agua, harina]",[sal],epicurious,i_sal i_agua i_harina t_sal,0
4,10001,Brown Bread,2004,"[harina, sal, bicarbonato, pasas, azúcar moren...","[secar, rebozar, hornear]",epicurious,i_harina i_sal i_bicarbonato i_pasas i_azúcar_...,0


In [34]:
epicurious_repr_recipes_df['creativity'].value_counts()

0     16085
10     2953
20     2929
30     2333
Name: creativity, dtype: int64

### allrecipes & epicurious

In [35]:
repr_recipes_df = allrecipes_repr_recipes_df.append(epicurious_repr_recipes_df, ignore_index=True)

In [36]:
repr_recipes_df.head()

Unnamed: 0,_id,title,year,ingredients,techniques,source,text,creativity
0,10000,"""Chocolate Sandwich Cookies I""",2001,"[cacao en polvo, huevos, leche, azúcar, harina...","[hornear, emulsionar, hornear, relleno, reboza...",allrecipes,i_cacao_en_polvo i_huevos i_leche i_azúcar i_h...,0
1,100008,"""Homemade Pickled Ginger (Gari)""",2007,"[azúcar, vinagre de arroz, raíz]","[sal, hervir]",allrecipes,i_azúcar i_vinagre_de_arroz i_raíz t_sal t_hervir,30
2,10001,"""Chocolate Pizzelles""",2003,"[cacao en polvo, huevos, azúcar, harina, sal, ...","[hornear, sal, rebozar]",allrecipes,i_cacao_en_polvo i_huevos i_azúcar i_harina i_...,0
3,100011,"""Pork and Black Bean Stew""",2007,"[cebolla, laurel, agua, pimiento, lomo de cerd...","[hervir, cocer, sal]",allrecipes,i_cebolla i_laurel i_agua i_pimiento i_lomo_de...,0
4,10002,"""Peter Pan Cookies""",2000,"[azúcar moreno, bicarbonato, huevos, leche, az...","[hornear, emulsionar, hornear, sal]",allrecipes,i_azúcar_moreno i_bicarbonato i_huevos i_leche...,0
