In [1]:
import pickle

import pandas as pd

In [2]:
with open('out/recipes_list.pickle', 'rb') as f:
    recipes = pickle.load(f)

In [3]:
fields = set(recipes[0].keys())

In [4]:
fields

{'id',
 'ingredients',
 'num_ingredients',
 'num_preparations',
 'num_styles',
 'num_techniques',
 'num_techniquesR',
 'num_worlds',
 'preparations_families',
 'recipe_family',
 'styles_families',
 'techniques',
 'techniquesR_families',
 'temperature',
 'title',
 'worlds',
 'year'}

In [5]:
recipes_info = []
for r in recipes:
    r1 = {}
    r1['id'] = r['id']
    r1['title'] = r['title']
    r1['year'] = r['year']
    recipes_info.append(r1)

In [6]:
recipes_info_df = pd.DataFrame(recipes_info)
recipes_info_df.set_index('id', inplace=True)
recipes_info_df.sort_index(inplace=True)

In [7]:
recipes_info_df.head()

Unnamed: 0_level_0,title,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,terrina de melón con gelée de oporto,1987
2,mousse de trufa negra '87,1987
3,"ensalada de pasta fresca con caviar, tempura d...",1987
4,"raviolis de cigala, patatas y trufa negra",1987
5,tempura de flor de calabacín rellena de mozzar...,1987


In [8]:
all_ingredients = set()
for r in recipes:
    all_ingredients = all_ingredients.union(r['ingredients'])

In [9]:
all_preparations_families = set()
for r in recipes:
    all_preparations_families = all_preparations_families.union(r['preparations_families'])

In [10]:
all_recipe_families = set()
for r in recipes:
    all_recipe_families = all_recipe_families.union(r['recipe_family'])

In [11]:
all_styles_families = set()
for r in recipes:
    all_styles_families = all_styles_families.union(r['styles_families'])

In [12]:
all_techniques = set()
for r in recipes:
    all_techniques = all_techniques.union(r['techniques'])

In [13]:
all_techniquesR_families = set()
for r in recipes:
    all_techniquesR_families = all_techniquesR_families.union(r['techniquesR_families'])

In [14]:
all_temperatures = set()
for r in recipes:
    all_temperatures = all_temperatures.union(r['temperature'])

In [15]:
all_worlds = set()
for r in recipes:
    all_worlds = all_worlds.union(r['worlds'])

In [16]:
all_fields = \
    ['id'] + \
    ['num_ingredients', 'num_preparations', 'num_techniques'] + \
    ['num_styles', 'num_techniquesR', 'num_worlds'] + \
    ['i_' + x for x in all_ingredients] + \
    ['pf_' + x for x in all_preparations_families] + \
    ['rf_' + x for x in all_recipe_families] + \
    ['sf_' + x for x in all_styles_families] + \
    ['t_' + x for x in all_techniques] + \
    ['trf_' + x for x in all_techniquesR_families] + \
    ['temp_' + x for x in all_temperatures] + \
    ['w_' + x for x in all_worlds]

In [17]:
def count_vectorizer(xs, x):
    return xs.count(x)

def one_hot_vectorizer(xs, x):
    return int(bool(count_vectorizer(xs, x)))

vectorizer = one_hot_vectorizer

In [18]:
recipes_data = []
for r in recipes:
    r2 = {}
    r2['id'] = r['id']
    r2['num_ingredients'] = r['num_ingredients']
    r2['num_preparations'] = r['num_preparations']
    r2['num_styles'] = r['num_styles']
    r2['num_techniques'] = r['num_techniques']
    r2['num_techniquesR'] = r['num_techniquesR']
    r2['num_worlds'] = r['num_worlds']
    for x in r['ingredients']:
        r2['i_' + x] = vectorizer(r['ingredients'], x)
    for x in r['preparations_families']:
        r2['pf_' + x] = r['preparations_families'].count(x)
    for x in r['recipe_family']:
        r2['rf_' + x] = r['recipe_family'].count(x)
    for x in r['styles_families']:
        r2['sf_' + x] = r['styles_families'].count(x)
    for x in r['techniques']:
        r2['t_' + x] = vectorizer(r['techniques'], x)
    for x in r['techniquesR_families']:
        r2['trf_' + x] = r['techniquesR_families'].count(x)
    for x in r['temperature']:
        r2['temp_' + x] = r['temperature'].count(x)
    for x in r['worlds']:
        r2['w_' + x] = r['worlds'].count(x)
    recipes_data.append(r2)

In [19]:
recipes_data_df = pd.DataFrame(recipes_data, columns=all_fields)
recipes_data_df.fillna(0, inplace=True)
recipes_data_df = recipes_data_df.astype(int)
recipes_data_df.set_index('id', inplace=True)
recipes_data_df.sort_index(inplace=True)

In [20]:
recipes_data_df.head()

Unnamed: 0_level_0,num_ingredients,num_preparations,num_styles,num_techniques,num_techniquesR,num_worlds,i_consomé de tucuppí,i_anchoas en salazón,i_tamarindo,i_leche de nuez,...,temp_TIBIA/AMBIENTE,temp_FRÍA/AMBIENTE,temp_HELADA/FRÍA/AMBIENTE,temp_CALIENTE/TIBIA,temp_FRÍA/AMBIENTE/HELADA,temp_FRÍA,temp_CALIENTE/FRÍA,temp_FRÍA/HELADA,w_SALADO,w_DULCE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,4,1,8,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
2,15,5,0,12,0,2,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
3,21,5,0,17,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
4,30,10,1,38,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0
5,25,7,1,21,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [21]:
recipes_info_df.to_csv('out/recipes_info.csv', index=False)

In [22]:
recipes_data_df.to_csv('out/recipes_data.csv', index=False)