In [1]:
import networkx as nx
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams

import api

In [2]:
g_nlg = nx.read_gexf('out/elbulli_nlg_02.gexf')

In [3]:
g_dat = nx.read_gexf('out/elbulli_dat_03.gexf')

In [4]:
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as f:
    for line in f:
        word = line.strip()
        spanish_stopwords.add(word)
spanish_stopwords = spanish_stopwords.union(stopwords.words('spanish'))

In [5]:
all_ingredients_graph = nx.read_gexf('out/spanish_ingredients_lexicon_04.gexf')

In [6]:
len(all_ingredients_graph)

26871

In [7]:
ingredients = all_ingredients_graph.nodes()
ingredients = [i for i in ingredients if i not in spanish_stopwords and len(i) > 2]

In [8]:
len(ingredients)

26865

In [9]:
sorted(ingredients)[:10]

['aabalone',
 'aabalones',
 'abacate',
 'abacates',
 'abadejo',
 'abadejo desalado',
 'abadejo desalados',
 'abadejo fresco',
 'abadejo frescos',
 'abadejo salado']

In [10]:
all_techniques_graph = nx.read_gexf('out/spanish_techniques_lexicon_04.gexf')

In [11]:
len(all_techniques_graph)

4635

In [12]:
techniques = all_techniques_graph.nodes()
techniques = [t for t in techniques if t not in spanish_stopwords and len(t) > 2]

In [13]:
len(techniques)

4622

In [14]:
sorted(techniques)[:10]

['a baja temperatura',
 'a bajado temperatura',
 'a bajamos temperatura',
 'a bajando temperatura',
 'a bajar temperatura',
 'a baje temperatura',
 'a fuego lento',
 'a la brasa',
 'a la brasa rui',
 'a la cazuela']

In [15]:
def my_split(s):
    if '#' in s:
        r = s.split('#')
    else:
        r = s.split('<br>')
    return r

def my_trim(s):
    return ' '.join(nltk.word_tokenize(s))

def my_ngrams(s):
    ngrms = []
    tokens = nltk.word_tokenize(s)
    for i in range(1, len(tokens) + 1):
        ngrms.extend(ngrams(tokens, i))
    return list(map(lambda x: ' '.join(x), ngrms))

def in_any(e, es):
    return any(map(lambda x: e in x, es))

In [16]:
%%time

for n, data in api.nodes_by_type(g_dat, 'Elaboracion', data=True):
    ingrs_str = data['ingrs']
    ingrs_str = ingrs_str.lower()
    elems = my_split(ingrs_str)
    ingrs = map(my_trim, elems)
    for ingr in ingrs:
        ngrms = my_ngrams(ingr)
        ngrms.reverse()
        for ngrm in ngrms:
            if ngrm in ingredients:
                v = ngrm
                g_dat.add_node(v, {'label': v, 'nodetype': 'ingrediente'})
                g_dat.add_edge(n, v, {'edgetype': 'composicion'})
                all_ingredients_graph.node[v]['count'] += 1
                break # I assume only one ingredient per split
    desc = data['desc']
    desc = desc.lower()
    elems = my_split(desc)
    steps = map(my_trim, elems)
    for step in steps:
        used_ngrams = set()
        ngrms = my_ngrams(step)
        ngrms.reverse()
        for ngrm in ngrms:
            if ngrm in techniques and not in_any(ngrm, used_ngrams):
                v = ngrm
                g_dat.add_node(v, {'label': v, 'nodetype': 'tecnica'})
                g_dat.add_edge(n, v, {'edgetype': 'tecnica'})
                all_techniques_graph.node[v]['count'] += 1
                used_ngrams.add(ngrm) # I assume one or more techniques per split

CPU times: user 10min 7s, sys: 56 ms, total: 10min 7s
Wall time: 10min 7s


In [17]:
%%time

for n, data in api.nodes_by_type(g_dat, 'Acabacion', data=True):
    desc = data['desc']
    desc = desc.lower()
    elems = my_split(desc)
    steps = map(my_trim, elems)
    for step in steps:
        used_ingrs_ngrams = set()
        used_techs_ngrams = set()
        ngrms = my_ngrams(step)
        ngrms.reverse()
        for ngrm in ngrms:
            if ngrm in ingredients and not in_any(ngrm, used_ingrs_ngrams):
                v = ngrm
                g_dat.add_node(v, {'label': v, 'nodetype': 'ingrediente'})
                g_dat.add_edge(n, v, {'edgetype': 'composicion'})
                all_ingredients_graph.node[v]['count'] += 1
                used_ingrs_ngrams.add(ngrm) # I assume one or more ingredients per split
            if ngrm in techniques and not in_any(ngrm, used_techs_ngrams):
                v = ngrm
                g_dat.add_node(v, {'label': v, 'nodetype': 'tecnica'})
                g_dat.add_edge(n, v, {'edgetype': 'tecnica'})
                all_techniques_graph.node[v]['count'] += 1
                used_techs_ngrams.add(ngrm) # I assume one or more techniques per split

CPU times: user 38min 25s, sys: 1.56 s, total: 38min 26s
Wall time: 38min 27s


In [18]:
for cc in nx.connected_components(all_ingredients_graph):
    max_ingr = ''
    max_count = 0
    total_count = 0
    for ingr in cc:
        data = all_ingredients_graph.node[ingr]
        if data['count'] > max_count:
            max_ingr = ingr
            max_count = data['count']
        total_count += data['count']
    if max_ingr:
        all_ingredients_graph.node[max_ingr]['repr_count'] = total_count

In [19]:
for cc in nx.connected_components(all_techniques_graph):
    max_tech = ''
    max_count = 0
    total_count = 0
    for tech in cc:
        data = all_techniques_graph.node[tech]
        if data['count'] > max_count:
            max_tech = tech
            max_count = data['count']
        total_count += data['count']
    if max_tech:
        all_techniques_graph.node[max_tech]['repr_count'] = total_count

In [20]:
nx.write_gexf(g_dat, 'out/elbulli_dat_05.gexf')

In [21]:
nx.write_gexf(all_ingredients_graph, 'out/spanish_ingredients_lexicon_05.gexf')

In [22]:
nx.write_gexf(all_techniques_graph, 'out/spanish_techniques_lexicon_05.gexf')