In [1]:
import os
import pickle
import string

import networkx as nx
import nltk
from nltk.util import ngrams

In [2]:
# Load the graph stored in 'out/elbulli_nlg.gexf'.
# NOTE(review): g_nlg is not referenced again in the cells visible here —
# confirm it is still needed before keeping this (potentially slow) load.
g_nlg = nx.read_gexf('out/elbulli_nlg.gexf')

In [3]:
# Load the graph that the annotation loop further down extends in place.
# The last cell of the notebook writes the result back to this same path.
g_dat = nx.read_gexf('out/elbulli_dat.gexf')

In [4]:
# Load the collection of known ingredient names that n-grams are matched against.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
with open('out/ingredients.pickle', 'rb') as f:
    ingredients = pickle.load(f)

In [5]:
# Sanity check: how many ingredient entries were loaded.
len(ingredients)

26927

In [6]:
# Peek at the first ten ingredient entries in sorted order.
sorted(ingredients)[:10]

['3d',
 'aabalone',
 'aabalones',
 'abacate',
 'abacates',
 'abadejo',
 'abadejo desalado',
 'abadejo desalados',
 'abadejo fresco',
 'abadejo frescos']

In [7]:
# Load the collection of known technique phrases that n-grams are matched against.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted step of this project.
with open('out/techniques.pickle', 'rb') as f:
    techniques = pickle.load(f)

In [8]:
# Sanity check: how many technique entries were loaded.
len(techniques)

4383

In [9]:
# Peek at the first ten technique entries in sorted order.
sorted(techniques)[:10]

['a baja temperatura',
 'a bajado temperatura',
 'a bajamos temperatura',
 'a bajando temperatura',
 'a bajar temperatura',
 'a baje temperatura',
 'a fuego lento',
 'a la brasa',
 'a la brasa rui',
 'a la cazuela']

In [10]:
def my_split(s):
    """Split `s` into a list of pieces.

    If `s` contains at least one '#', it is split on '#'; otherwise it is
    split on the literal text '<br>'. A string containing neither separator
    comes back as a one-element list.
    """
    separator = '#' if '#' in s else '<br>'
    return s.split(separator)

def my_trim(s):
    """Return `s` with leading/trailing whitespace removed and every internal
    run of whitespace collapsed to a single space."""
    words = s.split()
    return ' '.join(words)

def my_ngrams(technique):
    """Return every word n-gram of `technique` as a space-joined string.

    The text is tokenized with nltk.word_tokenize. N-grams are listed by
    increasing size, from 1 up to the number of tokens, and within each size
    in the order nltk.util.ngrams yields them. An input that produces no
    tokens gives an empty list.
    """
    words = nltk.word_tokenize(technique)
    return [
        ' '.join(gram)
        for size in range(1, len(words) + 1)
        for gram in ngrams(words, size)
    ]

def in_any(e, es):
    """Return True if `e in x` holds for at least one element `x` of `es`.

    For strings this is a substring test against each element. An empty `es`
    gives False.
    """
    return any(e in candidate for candidate in es)

In [11]:
%%time

# Link every 'Elaboracion' node of g_dat to the ingredients and techniques
# mentioned in its 'ingrs' and 'desc' text attributes.
#
# Each text is cut into pieces with my_split, whitespace-normalised with
# my_trim, and expanded into word n-grams with my_ngrams. N-grams are scanned
# in reverse order of generation, so the longest candidates are tried first.

# Membership is tested once per n-gram; snapshotting into frozensets keeps the
# lookups O(1) even if the pickles hold lists (cheap if they are already sets).
ingredient_names = frozenset(ingredients)
technique_names = frozenset(techniques)

# The loop adds nodes to g_dat. On networkx >= 2.0 nodes(data=True) is a live
# view that must not change size during iteration, so iterate over a snapshot.
for n, data in list(g_dat.nodes(data=True)):
    # .get: nodes without a 'nodetype' attribute are simply skipped.
    if data.get('nodetype') != 'Elaboracion':
        continue

    # --- ingredients: at most one match per piece of the 'ingrs' text ---
    for ingr in map(my_trim, my_split(data['ingrs'])):
        for ngrm in reversed(my_ngrams(ingr)):
            if ngrm in ingredient_names:
                # Keyword attributes work on both networkx 1.x and 2.x; a
                # positional attribute dict raises TypeError on 2.x.
                g_dat.add_node(ngrm, label=ngrm, nodetype='ingrediente')
                g_dat.add_edge(n, ngrm, edgetype='composicion')
                break  # I assume one ingredient only per split

    # --- techniques: possibly several matches per piece of the 'desc' text ---
    for step in map(my_trim, my_split(data['desc'])):
        used_ngrams = set()
        for ngrm in reversed(my_ngrams(step)):
            # in_any is a substring test, so an n-gram contained in a technique
            # already linked for this step is not linked a second time.
            if ngrm in technique_names and not in_any(ngrm, used_ngrams):
                g_dat.add_node(ngrm, label=ngrm, nodetype='tecnica')
                g_dat.add_edge(n, ngrm, edgetype='tecnica')
                used_ngrams.add(ngrm)  # I assume one or more techniques per split

    # The 'ingrs' and 'desc' attributes are intentionally left on the node.

CPU times: user 4min 22s, sys: 36 ms, total: 4min 22s
Wall time: 4min 22s


In [12]:
# Persist the annotated graph. This is the same path g_dat was loaded from,
# so the original input file is replaced on disk.
nx.write_gexf(g_dat, 'out/elbulli_dat.gexf')