In [1]:
import os
import pickle
import string
from collections import defaultdict

import networkx as nx
import nltk
from nltk.util import ngrams

import api

In [2]:
g_nlg = nx.read_gexf('out/elbulli_nlg.gexf')

In [3]:
g_dat = nx.read_gexf('out/elbulli_dat.gexf')

In [25]:
all_ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')

In [26]:
len(all_ingredients_graph)

26743

In [27]:
nlg_ingredients = api.all_ingredients(g_nlg)
for ingr in nlg_ingredients:
    ingr = ingr.lower()
    if ingr not in all_ingredients_graph:
        all_ingredients_graph.add_node(ingr, {'count': 0, 'label': ingr})

In [28]:
len(all_ingredients_graph)

26871

In [29]:
sorted(all_ingredients_graph.nodes())[:10]

['3d',
 'aabalone',
 'aabalones',
 'abacate',
 'abacates',
 'abadejo',
 'abadejo desalado',
 'abadejo desalados',
 'abadejo fresco',
 'abadejo frescos']

In [30]:
all_techniques_graph = nx.read_gexf('data/spanish_techniques_lexicon_5.gexf')

In [31]:
len(all_techniques_graph)

4372

In [32]:
nlg_techniques = api.all_techniques(g_nlg)
for tech in nlg_techniques:
    tech = tech.lower()
    if tech not in all_techniques_graph:
        all_techniques_graph.add_node(tech, {'count': 0, 'label': tech})

In [33]:
len(all_techniques_graph)

4385

In [34]:
sorted(all_techniques_graph.nodes())[:10]

['3d',
 'a baja temperatura',
 'a bajado temperatura',
 'a bajamos temperatura',
 'a bajando temperatura',
 'a bajar temperatura',
 'a baje temperatura',
 'a fuego lento',
 'a la brasa',
 'a la brasa rui']

In [35]:
intersect = set(all_ingredients_graph.nodes()).intersection(all_techniques_graph.nodes())

In [36]:
intersect

{'3d',
 'agua',
 'ahumado',
 'ahumados',
 'encurtido',
 'encurtidos',
 'ensaladas',
 'fetas',
 'fresa',
 'juliana',
 'nitrogeno liquido',
 'nitrogeno líquido',
 'nitrógeno líquido',
 'pluma',
 'pure',
 'puré',
 'sal',
 'seca',
 'sofrito',
 'su',
 'tempura',
 'teriyaki',
 'tsukemono'}

In [37]:
def my_split(s):
    if '#' in s:
        r = s.split('#')
    else:
        r = s.split('<br>')
    return r

def my_trim(s):
    return ' '.join(s.split())

def my_ngrams(technique):
    ngrms = []
    tokens = nltk.word_tokenize(technique)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

def in_any(e, es):
    return any(map(lambda x: e in x, es))

In [None]:
# with open('data/es_lexicon.csv') as f:
#     reader = csv.reader(
#         f,
#         delimiter=' ',
#     )
#     docs = []
#     count = 0
#     for row in reader:
#         for i in range(1, len(row[1:]), 2):
#             entry = {}
#             entry['flexion'] = row[0].lower()
#             entry['lemma'] = row[i].lower()
#             entry['eagle'] = row[i+1].lower()
#             docs.append(entry)
#             count += 1
#         if count % 1000 == 0:
#             db.es_lexicon.insert_many(docs)
#             docs = []
#     db.es_lexicon.insert_many(docs)
#     docs = []

In [53]:
%%time

found_ingredients = set()
found_techniques = set()

for n, data in g_dat.nodes(data=True):
    if data['nodetype'] == 'Elaboracion':
        ingrs_str = data['ingrs']
        elems = my_split(ingrs_str)
        ingrs = map(my_trim, elems)
        for ingr in ingrs:
            ngrms = my_ngrams(ingr)
            ngrms.reverse()
            for ngrm in ngrms:
                if ngrm in ingredients:
                    v = ngrm
                    found_ingredients.add(v)
        desc = data['desc']
        elems = my_split(desc)
        steps = map(my_trim, elems)
        for step in steps:
            ngrms = my_ngrams(step)
            ngrms.reverse()
            for ngrm in ngrms:
                if ngrm in techniques:
                    v = ngrm
                    found_techniques.add(v)

CPU times: user 12min 23s, sys: 132 ms, total: 12min 23s
Wall time: 12min 23s


In [62]:
ingrs_techs_nlg = defaultdict(set)
for n in g_nlg.nodes():
    comps = api.get_components_Recipe(g_nlg, n)
    for c in comps:
        if c.lower() in intersect:
            ingrs_techs_nlg[c].add('i')
    techs = api.get_Techniques_Recipe(g_nlg, n)
    for t in techs:
        if t.lower() in intersect:
            ingrs_techs_nlg[t].add('t')

In [58]:
ingrs_techs_dat = defaultdict(set)
for c in found_ingredients:
    if c in intersect:
        ingrs_techs_dat[c].add('i')
for t in found_techniques:
    if t in intersect:
        ingrs_techs_dat[t].add('t')

In [63]:
ingrs_techs_nlg

defaultdict(set,
            {'agua': {'i'},
             'ahumado': {'t'},
             'juliana': {'i'},
             'sal': {'i'},
             'tempura': {'i'}})

In [23]:
ingrs_techs_nlg

defaultdict(set,
            {'agua': {'i'},
             'ahumado': {'t'},
             'juliana': {'i'},
             'sal': {'i'},
             'tempura': {'i'}})

In [60]:
ingrs_techs_dat

defaultdict(set,
            {'agua': {'i', 't'},
             'ahumado': {'i', 't'},
             'ensaladas': {'i', 't'},
             'fresa': {'i', 't'},
             'juliana': {'i', 't'},
             'nitrógeno líquido': {'i', 't'},
             'pluma': {'t'},
             'puré': {'i', 't'},
             'sal': {'i', 't'},
             'seca': {'i', 't'},
             'sofrito': {'i', 't'},
             'su': {'i', 't'}})

In [25]:
# From future analysis

# nlg
# {'agua': {'i'},
#  'ahumado': {'t'},
#  'juliana': {'i'},
#  'sal': {'i'},
#  'sofrito': {'i'},
#  'tempura': {'i'}}

# dat
# {'agua': {'i', 't'},
#  'ahumado': {'i', 't'},
#  'ensaladas': {'i', 't'},
#  'fresa': {'i', 't'},
#  'juliana': {'i', 't'},
#  'nitrógeno líquido': {'i', 't'},
#  'pluma': {'t'},
#  'puré': {'i', 't'},
#  'sal': {'i', 't'},
#  'seca': {'i', 't'},
#  'sofrito': {'i', 't'},
#  'su': {'i', 't'}}

# 3d
# nlg: Found as Producto
# dat: Not found
techniques.remove('3d')
# agua
# nlg: Found as ingrediente and sabor
# dat: Found as ingredient and technique
techniques.remove('agua')
# ahumado
# nlg: Found as tecnica
# dat: Found as ingredient and technique
ingredients.remove('ahumado')
# ahumados
# nlg: Not found
# dat: Not found
# Same case as ahumado
ingredients.remove('ahumados')
# encurtido
# nlg: Not found
# dat: Not found
# Same case as ahumado
ingredients.remove('encurtido')
# encurtidos
# nlg: Not found
# dat: Not found
# Same case as ahumado
ingredients.remove('encurtidos')
# ensaladas
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('ensaladas')
# fetas
# nlg: Not found
# dat: Not found
techniques.remove('fetas')
# fresa
# nlg: Found as Producto and sabor
# dat: Found as ingredient and technique
techniques.remove('fresa')
# juliana
# nlg: Found as ingrediente
# dat: Found as ingredient and technique
techniques.remove('juliana')
# nitrógeno líquido
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('nitrógeno líquido')
# nitrogeno liquido
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('nitrogeno liquido')
# nitrogeno líquido
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('nitrogeno líquido')
# pluma
# nlg: Not found (found as Familia Elaboración)
# dat: Found as technique
ingredients.remove('pluma')
# pure
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('pure')
# puré
# nlg: Not found (found as Familia Elaboración)
# dat: Found as ingredient and technique
# Same case as nitrógeno líquido
ingredients.remove('puré')
# sal
# nlg: Found as ingrediente and sabor
# dat: Found as ingredient and technique
techniques.remove('sal')
techniques.append('en sal')
# seca
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('seca')
# sofrito
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('sofrito')
# su
# nlg: Not found
# dat: Found as ingredient and technique
ingredients.remove('su')
techniques.remove('su')
# tempura
# nlg: Found as ingrediente
# dat: Not found
techniques.remove('tempura')
techniques.append('en tempura')
# teriyaki
# nlg: Not found
# dat: Not found
ingredients.remove('teriyaki')
# tsukemono
# nlg: Not found
# dat: Not found
# Same case as teriyaki
ingredients.remove('tsukemono')

In [26]:
len(ingredients)

26927

In [27]:
len(techniques)

4383

In [28]:
with open('out/ingredients.pickle', 'wb') as f:
    pickle.dump(ingredients, f)

In [29]:
with open('out/techniques.pickle', 'wb') as f:
    pickle.dump(techniques, f)