In [1]:
import os
import pickle
import string
from collections import defaultdict

import networkx as nx
import nltk
from nltk.util import ngrams

In [2]:
g_nlg = nx.read_gexf('out/elbulli_nlg.gexf')

In [3]:
g_dat = nx.read_gexf('out/elbulli_dat.gexf')

In [4]:
all_ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [5]:
ingredients = all_ingredients_graph.nodes()

In [6]:
len(ingredients)

26743

In [7]:
nlg_ingredients = set()
for n, data in g_nlg.nodes(data=True):
    ingr = None
    if data['nodetype'] == 'Producto':
        ingr = n[len('Producto:'):]
    elif data['nodetype'] == 'sabor':
        ingr = n[len('sabor:'):]
    elif data['nodetype'] == 'ingrediente':
        ingr = n
    if ingr:
        nlg_ingredients.add(ingr.lower())
ingredients = list(set(ingredients).union(nlg_ingredients))

In [8]:
len(ingredients)

26943

In [9]:
sorted(ingredients)[:10]

['3d',
 'aabalone',
 'aabalones',
 'abacate',
 'abacates',
 'abadejo',
 'abadejo desalado',
 'abadejo desalados',
 'abadejo fresco',
 'abadejo frescos']

In [10]:
all_techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [11]:
techniques = all_techniques_graph.nodes()

In [12]:
len(techniques)

4372

In [13]:
nlg_techniques = set()
for n, data in g_nlg.nodes(data=True):
    tech = None
    if data['nodetype'] == 'tecnica':
        tech = n
    elif data['nodetype'] == 'Familia Tecnica':
        tech = n
    if tech:
        tech = tech.lower()
        tech = tech.split(':')[0]
        techs = tech.split('/')
        techs = map(str.strip, techs)
        nlg_techniques = nlg_techniques.union(techs)
techniques = list(set(techniques).union(nlg_techniques))

In [14]:
len(techniques)

4389

In [15]:
sorted(techniques)[:10]

['3d',
 'a baja temperatura',
 'a bajado temperatura',
 'a bajamos temperatura',
 'a bajando temperatura',
 'a bajar temperatura',
 'a baje temperatura',
 'a fuego lento',
 'a la brasa',
 'a la brasa rui']

In [16]:
intersect = set(ingredients).intersection(techniques)

In [17]:
intersect

{'3d',
 'agua',
 'ahumado',
 'ahumados',
 'encurtido',
 'encurtidos',
 'ensaladas',
 'fetas',
 'fresa',
 'juliana',
 'nitrogeno liquido',
 'nitrogeno líquido',
 'nitrógeno líquido',
 'pluma',
 'pure',
 'puré',
 'sal',
 'seca',
 'sofrito',
 'su',
 'tempura',
 'teriyaki',
 'tsukemono'}

In [18]:
def my_split(s):
    if '#' in s:
        r = s.split('#')
    else:
        r = s.split('<br>')
    return r

def my_trim(s):
    return ' '.join(s.split())

def my_ngrams(technique):
    ngrms = []
    tokens = nltk.word_tokenize(technique)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

def in_any(e, es):
    return any(map(lambda x: e in x, es))

In [19]:
%%time

found_ingredients = set()
found_techniques = set()

for n, data in g_dat.nodes(data=True):
    if data['nodetype'] == 'Elaboracion':
        ingrs_str = data['ingrs']
        elems = my_split(ingrs_str)
        ingrs = map(my_trim, elems)
        for ingr in ingrs:
            ngrms = my_ngrams(ingr)
            ngrms.reverse()
            for ngrm in ngrms:
                if ngrm in ingredients:
                    v = ngrm
                    found_ingredients.add(v)
        desc = data['desc']
        elems = my_split(desc)
        steps = map(my_trim, elems)
        for step in steps:
            ngrms = my_ngrams(step)
            ngrms.reverse()
            for ngrm in ngrms:
                if ngrm in techniques:
                    v = ngrm
                    found_techniques.add(v)

CPU times: user 14min 12s, sys: 188 ms, total: 14min 12s
Wall time: 14min 12s


In [20]:
prep_prod_rels = ['bañado', 'alcohol', 'chocolate', 'lacteo', 'nuevaPasta', 'producto', 'relleno']
prep_ingr_rels = ['composicion']
prep_flav_rels = ['sabor']
prep_tech_rels = ['tecnica']

def get_prep_products(g, prep):
    return {k for k in g[prep] if g[prep][k]['edgetype'] in prep_prod_rels}

def get_prep_ingredients(g, prep):
    return {k for k in g[prep] if g[prep][k]['edgetype'] in prep_ingr_rels}

def get_prep_flavors(g, prep):
    return {k for k in g[prep] if g[prep][k]['edgetype'] in prep_flav_rels}

def get_prep_components(g, prep):
    prods = get_prep_products(g, prep)
    ingrs = get_prep_ingredients(g, prep)
    flavs = get_prep_flavors(g, prep)
    return prods.union(ingrs).union(flavs)

def get_prep_techniques(g, prep):
    return {k for k in g[prep] if g[prep][k]['edgetype'] in prep_tech_rels}

def get_recip_preparations(g, recip):
    return [k for k in g[recip] if g[recip][k]['edgetype'] == 'elaboracion']

def get_recip_products(g, recip):
    preps = get_recip_preparations(g, recip)
    return {k for prep in preps for k in get_prep_products(g, prep)}

def get_recip_ingredients(g, recip):
    preps = get_recip_preparations(g, recip)
    return {k for prep in preps for k in get_prep_ingredients(g, prep)}

def get_recip_flavors(g, recip):
    preps = get_recip_preparations(g, recip)
    return {k for prep in preps for k in get_prep_flavors(g, prep)}

def get_recip_components(g, recip):
    preps = get_recip_preparations(g, recip)
    return {k for prep in preps for k in get_prep_components(g, prep)}

def get_recip_techniques(g, recip):
    preps = get_recip_preparations(g, recip)
    return {k for prep in preps for k in get_prep_techniques(g, prep)}

In [21]:
ingrs_techs_nlg = defaultdict(set)
for n in g_nlg.nodes():
    comps = get_recip_components(g_nlg, n)
    for c in comps:
        if c.lower() in intersect:
            ingrs_techs_nlg[c].add('i')
    techs = get_recip_techniques(g_nlg, n)
    for t in techs:
        if t.lower() in intersect:
            ingrs_techs_nlg[t].add('t')

In [22]:
ingrs_techs_dat = defaultdict(set)
for c in found_ingredients:
    if c in intersect:
        ingrs_techs_dat[c].add('i')
for t in found_techniques:
    if t in intersect:
        ingrs_techs_dat[t].add('t')

In [23]:
ingrs_techs_nlg

defaultdict(set,
            {'agua': {'i'},
             'ahumado': {'t'},
             'juliana': {'i'},
             'sal': {'i'},
             'tempura': {'i'}})

In [24]:
ingrs_techs_dat

defaultdict(set,
            {'agua': {'i', 't'},
             'ahumado': {'i', 't'},
             'ensaladas': {'i', 't'},
             'fresa': {'i', 't'},
             'juliana': {'i', 't'},
             'nitrógeno líquido': {'i', 't'},
             'pluma': {'t'},
             'puré': {'i', 't'},
             'sal': {'i', 't'},
             'seca': {'i', 't'},
             'sofrito': {'i', 't'},
             'su': {'i', 't'}})

In [25]:
# From future analysis

# nlg
# {'agua': {'i'},
#  'ahumado': {'t'},
#  'juliana': {'i'},
#  'sal': {'i'},
#  'sofrito': {'i'},
#  'tempura': {'i'}}

# dat
# {'agua': {'i', 't'},
#  'ahumado': {'i', 't'},
#  'ensaladas': {'i', 't'},
#  'fresa': {'i', 't'},
#  'juliana': {'i', 't'},
#  'nitrógeno líquido': {'i', 't'},
#  'pluma': {'t'},
#  'puré': {'i', 't'},
#  'sal': {'i', 't'},
#  'seca': {'i', 't'},
#  'sofrito': {'i', 't'},
#  'su': {'i', 't'}}

# 3d
# nlg: Found as Producto
# dat: Not found
techniques.remove('3d')
# agua
# nlg: Found as ingrediente and sabor
# dat: Found as ingredient and technique
techniques.remove('agua')
# ahumado
# nlg: Found as tecnica
# dat: Found as ingredient and technique
ingredients.remove('ahumado')
# ahumados
# nlg: Not found
# dat: Not found
# Same case as ahumado
ingredients.remove('ahumados')
# encurtido
# nlg: Not found
# dat: Not found
# Same case as ahumado
ingredients.remove('encurtido')
# encurtidos
# nlg: Not found
# dat: Not found
# Same case as ahumado
ingredients.remove('encurtidos')
# ensaladas
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('ensaladas')
# fetas
# nlg: Not found
# dat: Not found
techniques.remove('fetas')
# fresa
# nlg: Found as Producto and sabor
# dat: Found as ingredient and technique
techniques.remove('fresa')
# juliana
# nlg: Found as ingrediente
# dat: Found as ingredient and technique
techniques.remove('juliana')
# nitrógeno líquido
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('nitrógeno líquido')
# nitrogeno liquido
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('nitrogeno liquido')
# nitrogeno líquido
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('nitrogeno líquido')
# pluma
# nlg: Not found (found as Familia Elaboración)
# dat: Found as technique
ingredients.remove('pluma')
# pure
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('pure')
# puré
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('puré')
# sal
# nlg: Found as ingrediente and sabor
# dat: Found as ingredient and technique
techniques.remove('sal')
techniques.append('en sal')
# seca
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('seca')
# sofrito
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('sofrito')
# su
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('su')
techniques.remove('su')
# tempura
# nlg: Found as ingrediente
# dat: Not found
techniques.remove('tempura')
techniques.append('en tempura')
# teriyaki
# nlg: Not found
# dat: Not found
ingredients.remove('teriyaki')
# tsukemono
# nlg: Not found
# dat: Not found
# Same case as teriyaki
ingredients.remove('tsukemono')

In [26]:
len(ingredients)

26927

In [27]:
len(techniques)

4383

In [28]:
with open('out/ingredients.pickle', 'wb') as f:
    pickle.dump(ingredients, f)

In [29]:
with open('out/techniques.pickle', 'wb') as f:
    pickle.dump(techniques, f)