In [1]:
import os
import string

import networkx as nx

In [2]:
# Collect the distinct field names used across all recipe .dat files.
# Each line is split at the first '='; the text before it is the field name.
# Trailing digits are stripped so numbered fields collapse into one entry.
fields = set()

path = 'data/recipes/elbulli/'
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        # A stray file next to the recipe folders would make listdir() fail.
        continue
    # File names are '<number>.dat'; sort numerically rather than lexically
    # and ignore anything that is not a .dat file (e.g. OS metadata files).
    numbers = sorted(int(name[:-4]) for name in os.listdir(folder_path)
                     if name.endswith('.dat'))
    for filename_number in numbers:
        filename = str(filename_number) + '.dat'
        with open(os.path.join(folder_path, filename)) as f:
            for line in f:
                field = line.split('=')[0]
                # Guard on ``field`` itself: an empty or all-digit name would
                # otherwise raise IndexError once nothing is left to index.
                while field and field[-1].isdecimal():
                    field = field[:-1]
                if field:
                    fields.add(field)

In [3]:
fields

{'&acabatipresentacio',
 '&any',
 '&cubiertos',
 '&descripcioelaboracio',
 '&familia',
 '&ingredientselaboracio',
 '&maneraDegustar',
 '&pers',
 '&temperatura',
 '&temporada',
 '&titol',
 '&titolelaboracio',
 'num'}

In [4]:
# Build the reference graph straight from the raw elBulli .dat files.
#
# Every file becomes one 'Receta' node whose id is the file number. Scalar
# fields are stored as attributes on that node; year, family and temperature
# become value nodes linked from the recipe; each '&titolelaboracio' line
# starts a new 'Elaboracion' node with id '<recipe id>-<running count>'.
g_dat = nx.DiGraph()

path = 'data/recipes/elbulli/'

# Line prefix -> attribute name written on the recipe node. '&titol=' keeps
# its '=' so that it cannot also match '&titolelaboracio...' lines.
recipe_attr_fields = {
    'num': 'label',
    '&titol=': 'titulo',
    '&cubiertos': 'cubiertos',
    '&maneraDegustar': 'maneraDegustar',
    '&temporada': 'temporada',
    '&pers': 'personas',
    '&acabatipresentacio': 'acabacion',
}

# Line prefix -> (nodetype of the value node, edgetype of recipe -> value).
recipe_link_fields = {
    '&any': ('Año', 'publicado en'),
    '&familia': ('Familia Receta', 'se clasifica'),
    '&temperatura': ('Temperatura', 'temperatura'),
}

# Line prefix -> attribute name written on the current preparation node.
prep_attr_fields = {
    '&ingredientselaboracio': 'ingrs',
    '&descripcioelaboracio': 'desc',
}


def _field_value(line):
    """Return everything after the first '=' in ``line``.

    Splitting only once keeps values that themselves contain '=' intact.
    Raises IndexError when the line has no '=' at all.
    """
    return line.split('=', 1)[1]


def _matching_prefix(line, prefixes):
    """Return the entry of ``prefixes`` that ``line`` starts with, or None.

    No prefix used in this cell is a prefix of another one, so at most one
    entry can match and the iteration order does not matter.
    """
    for prefix in prefixes:
        if line.startswith(prefix):
            return prefix
    return None


for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    if not os.path.isdir(folder_path):
        # A stray file next to the recipe folders would make listdir() fail.
        continue
    # File names are '<number>.dat'; sort numerically rather than lexically
    # and ignore anything that is not a .dat file.
    numbers = sorted(int(name[:-4]) for name in os.listdir(folder_path)
                     if name.endswith('.dat'))
    for filename_number in numbers:
        filename = str(filename_number) + '.dat'
        with open(os.path.join(folder_path, filename)) as f:
            ide = str(filename_number)
            # Attributes are passed as keywords rather than as a positional dict.
            g_dat.add_node(ide, nodetype='Receta')
            preps_count = 0
            # Reset per recipe so a preparation field can never attach to a
            # preparation left over from the previously parsed file.
            prep_id = None
            for line in f:
                line = line.strip()

                prefix = _matching_prefix(line, recipe_attr_fields)
                if prefix is not None:
                    g_dat.node[ide][recipe_attr_fields[prefix]] = _field_value(line)
                    continue

                prefix = _matching_prefix(line, recipe_link_fields)
                if prefix is not None:
                    nodetype, edgetype = recipe_link_fields[prefix]
                    v = _field_value(line)
                    g_dat.add_node(v, label=v, nodetype=nodetype)
                    g_dat.add_edge(ide, v, edgetype=edgetype)
                    continue

                if line.startswith('&titolelaboracio'):
                    preps_count += 1
                    prep_id = ide + '-' + str(preps_count)
                    g_dat.add_node(prep_id, label=prep_id, nodetype='Elaboracion')
                    g_dat.node[prep_id]['title'] = _field_value(line)
                    continue

                prefix = _matching_prefix(line, prep_attr_fields)
                if prefix is not None:
                    if prep_id is None:
                        raise ValueError(
                            'preparation field before any &titolelaboracio in '
                            + os.path.join(folder_path, filename))
                    g_dat.node[prep_id][prep_attr_fields[prefix]] = _field_value(line)
                    # NOTE(review): the recipe -> preparation edge is only
                    # created by the description field, so a preparation
                    # without a description stays unlinked - confirm intended.
                    if prefix == '&descripcioelaboracio':
                        g_dat.add_edge(ide, prep_id, edgetype='elaboracion')

In [5]:
def is_vowel(c):
    """Return True if ``c`` is exactly one lowercase, unaccented vowel.

    The length check matters because ``in`` on strings is a substring test:
    without it the empty string and runs such as 'ei' would count as vowels.
    """
    return len(c) == 1 and c in 'aeiou'

def is_consonant(c):
    """Return True for anything that is_vowel() does not accept."""
    if is_vowel(c):
        return False
    return True

def basic_plural(w):
    """Return a rough plural form of the word ``w``.

    Rules, applied in order:
    - words ending in 's' are returned unchanged;
    - a final 'z' is replaced by 'ces';
    - words whose last character satisfies is_consonant() get 'es';
    - everything else gets 's'.

    An empty string is returned unchanged instead of failing on ``w[-1]``.
    """
    if not w or w.endswith('s'):
        return w
    if w.endswith('z'):
        return w[:-1] + 'ces'
    if is_consonant(w[-1]):
        return w + 'es'
    return w + 's'

def special_replacements(t):
    """Apply a fixed list of hand-written substring fixes to ``t``.

    The pairs are applied one after another in the order listed, each on the
    result of the previous one, and the final string is returned.
    """
    fixes = (
        ('caracolesines', 'caracolines'),
        ('caracoleses', 'caracoles'),
        ('; de roquefort; de falso tartufo; de grasa de jamón ibérico y aceite de trufa blanca', ''),
    )
    for old, new in fixes:
        t = t.replace(old, new)
    return t

def clean_title(t):
    """Return a normalised form of title ``t`` for comparison.

    Steps: strip and lowercase, apply special_replacements(), map the
    accented vowels á é í ó ú to their plain forms, delete every character in
    string.punctuation, then run each whitespace-separated word through
    basic_plural() and join the words with single spaces.
    """
    normalized = special_replacements(t.strip().lower())
    # One translation pass: accented vowels are mapped, punctuation is deleted.
    table = str.maketrans('áéíóú', 'aeiou', string.punctuation)
    normalized = normalized.translate(table)
    return ' '.join(basic_plural(word) for word in normalized.split())

def similar_titles(t1, t2):
    """Return True when both titles are equal after clean_title()."""
    return clean_title(t1) == clean_title(t2)

In [6]:
def equals_nodes(d1, d2):
    """Compare two node attribute dicts and record the first mismatch.

    Checks, in order, 'label', 'nodetype' and - only when both dicts have
    one - 'titulo' (through similar_titles). The first failing check adds its
    message to the module-level ``errors`` set and the function returns
    False. True is returned when no check fails.
    """
    if d1['label'] != d2['label']:
        errors.add('Different labels')
        return False
    if d1['nodetype'] != d2['nodetype']:
        errors.add('Different nodetypes')
        return False
    both_titled = 'titulo' in d1 and 'titulo' in d2
    if both_titled and not similar_titles(d1['titulo'], d2['titulo']):
        errors.add('Different titles')
        return False
    return True

def get_edge_value(es, edgetype):
    """Return the single key of ``es`` whose entry has the given 'edgetype'.

    ``es`` maps keys to attribute dicts that each carry an 'edgetype' entry.
    An AssertionError is raised unless exactly one entry matches.
    """
    matches = []
    for key in es:
        if es[key]['edgetype'] == edgetype:
            matches.append(key)
    assert len(matches) == 1
    (only,) = matches
    return only

def equals_edges(d1, es1, d2, es2):
    """Compare the outgoing edges of two recipes and record the first mismatch.

    ``es1`` and ``es2`` map neighbour ids to edge attribute dicts. The year,
    temperature and family targets (each looked up with get_edge_value) are
    compared in that order, followed by the number of 'elaboracion' edges.
    The first difference adds its message to the module-level ``errors`` set
    and the function returns False; True is returned when nothing differs.
    ``d1`` and ``d2`` are accepted but not used.
    """
    single_valued_checks = (
        ('publicado en', 'Different years'),
        ('temperatura', 'Different temperatures'),
        ('se clasifica', 'Different recipe families'),
    )
    for edgetype, message in single_valued_checks:
        first = get_edge_value(es1, edgetype)
        second = get_edge_value(es2, edgetype)
        if first != second:
            errors.add(message)
            return False

    def count_preps(es):
        return len([k for k in es if es[k]['edgetype'] == 'elaboracion'])

    if count_preps(es1) != count_preps(es2):
        errors.add('Different number of preparations')
        return False
    return True

In [7]:
# Load the graph stored in out/elbulli_nlg.gexf; its 'Receta' nodes are the
# ones compared against g_dat further down.
g_nlg = nx.read_gexf('out/elbulli_nlg.gexf')

In [8]:
# Compare every recipe of the NLG graph with the node of the same id in the
# graph built from the .dat files. equals_nodes() and equals_edges() record
# each kind of mismatch in ``errors``; their boolean results are not needed.
errors = set()

# nodes(data=True) replaces nodes_iter(data=True), which only exists in
# networkx 1.x; both yield (node, attribute dict) pairs.
for n1, data1 in g_nlg.nodes(data=True):
    # .get() so that a node without a 'nodetype' is skipped, not a KeyError.
    if data1.get('nodetype') != 'Receta':
        continue
    data2 = g_dat.node[n1]
    equals_nodes(data1, data2)
    equals_edges(data1, g_nlg[n1], data2, g_dat[n1])

# Sorted so the report is identical from run to run (set order is arbitrary).
for e in sorted(errors):
    print(e)

Different number of preparations


In [9]:
# Save the graph built from the .dat files as GEXF in out/elbulli_dat.gexf.
nx.write_gexf(g_dat, 'out/elbulli_dat.gexf')