In [1]:
import os
from collections import defaultdict

import networkx as nx
import nltk

# General

## Type

In [2]:
# Build a mapping from each ingredient name to the set of type labels it is
# listed under. Every file in the directory contributes one label, derived
# from its file name.
type_dict = defaultdict(set)

path = 'data/classifications/type/'
for filename in os.listdir(path):
    # The label is the file name without its first two and last four
    # characters (presumably a short prefix and a '.txt'-style extension --
    # TODO confirm against the actual file names).
    label = filename[2:-4]
    with open(os.path.join(path, filename)) as f:
        for line in f:
            ingredient = line.strip().lower()
            # Skip blank lines (they would otherwise register '' as an
            # ingredient) and comment lines, including indented ones.
            if not ingredient or ingredient.startswith('#'):
                continue
            type_dict[ingredient].add(label)

# Convert to a plain dict so that looking up an unknown ingredient later
# raises KeyError instead of silently creating an empty entry.
type_dict = dict(type_dict)

In [3]:
# Ingredients that are listed under more than one type, shown together with
# their candidate labels.
ambiguous = [name for name, labels in type_dict.items() if len(labels) > 1]

for name in ambiguous:
    print(name, type_dict[name])

cabra {'dairy', 'meats'}
helado {'dairy', 'sweets'}
tomate {'vegetables', 'fruits'}
hinojo {'vegetables', 'spicies_and_condimients'}


In [4]:
# Manual disambiguation: for each ingredient listed under two types, drop the
# label given here so that a single one remains.
for name, rejected in (
    ('hinojo', 'vegetables'),
    ('tomate', 'fruits'),
    ('helado', 'sweets'),
    ('cabra', 'meats'),
):
    type_dict[name].remove(rejected)

In [5]:
# Sanity check: print any ingredient that still carries more than one type.
for name, labels in type_dict.items():
    if len(labels) > 1:
        print(name)

In [6]:
# Collapse each label set into a single bare label (set.pop() returns an
# arbitrary element, so this expects the sets to hold exactly one label).
for name, labels in type_dict.items():
    # Values that are no longer sets were already collapsed; leaving them
    # alone lets this cell be re-run without raising AttributeError.
    if isinstance(labels, set):
        type_dict[name] = labels.pop()

## Cuisine

In [7]:
# Build a mapping from each ingredient name to the set of cuisine labels it
# is listed under. Every file in the directory contributes one label, derived
# from its file name.
cuisine_dict = defaultdict(set)

path = 'data/classifications/cuisine/'
for filename in os.listdir(path):
    # The label is the file name without its first two and last four
    # characters (presumably a short prefix and a '.txt'-style extension --
    # TODO confirm against the actual file names).
    label = filename[2:-4]
    with open(os.path.join(path, filename)) as f:
        for line in f:
            ingredient = line.strip().lower()
            # Skip blank lines (they would otherwise register '' as an
            # ingredient) and comment lines, including indented ones.
            if not ingredient or ingredient.startswith('#'):
                continue
            cuisine_dict[ingredient].add(label)

# Convert to a plain dict so that looking up an unknown ingredient later
# raises KeyError instead of silently creating an empty entry.
cuisine_dict = dict(cuisine_dict)

In [8]:
# Ingredients that are listed under more than one cuisine, shown together
# with their candidate labels.
ambiguous = [name for name, labels in cuisine_dict.items() if len(labels) > 1]

for name in ambiguous:
    print(name, cuisine_dict[name])

legumbres {'western', 'asian'}
soja {'western', 'asian'}
hongos {'western', 'asian'}
leche de coco {'western', 'asian'}
bonito {'western', 'asian'}
atún {'western', 'asian'}
pepinos {'western', 'asian'}
vaca {'western', 'asian'}
cebollino {'western', 'asian'}
pimienta {'western', 'asian'}
frutos secos {'western', 'asian'}
agar-agar {'asian', 'molecular'}
surimi {'western', 'asian'}
ajíes {'western', 'asian'}
azúcar {'western', 'asian'}
pollo {'western', 'asian'}
pepino {'western', 'asian'}
mostaza {'western', 'asian'}
espinacas {'western', 'asian'}
sake {'western', 'asian'}
cerdo {'western', 'asian'}
arroz {'western', 'asian'}
batata {'western', 'asian'}
cilantro {'western', 'asian'}
nueces {'western', 'asian'}
puerro {'western', 'asian'}
espinaca {'western', 'asian'}
té {'western', 'asian'}
tofu {'western', 'asian'}
chalota {'western', 'asian'}
jengibre {'western', 'asian'}
lechuga {'western', 'asian'}
sal {'western', 'asian', 'molecular'}
cebolla {'western', 'asian'}
castaña {'wester

In [9]:
# Manual disambiguation: each pair names an ingredient and one cuisine label
# to drop from its label set. An ingredient may appear more than once when
# several labels have to go ('sal').
for name, rejected in (
    ('cilantro', 'western'),
    ('sake', 'western'),
    ('vaca', 'asian'),
    ('cebollino', 'asian'),
    ('puerro', 'asian'),
    ('huevos', 'asian'),
    ('cebolla', 'asian'),
    ('soja', 'western'),
    ('chalota', 'western'),
    ('pepinos', 'asian'),
    ('sal', 'asian'),
    ('sal', 'molecular'),
    ('tofu', 'western'),
    ('hongos', 'asian'),
    ('mostaza', 'asian'),
    ('ajíes', 'asian'),
    ('espinaca', 'asian'),
    ('arroz', 'asian'),
    ('batata', 'asian'),
    ('cerdo', 'asian'),
    ('pimienta', 'asian'),
    ('nabos', 'asian'),
    ('legumbres', 'asian'),
    ('pollo', 'asian'),
    ('nueces', 'asian'),
    ('surimi', 'western'),
    ('caballo', 'asian'),
    ('bonito', 'asian'),
    ('cordero', 'asian'),
    ('atún', 'asian'),
    ('encurtidos', 'asian'),
    ('ajo', 'asian'),
    ('frutos secos', 'asian'),
    ('berenjena', 'asian'),
    ('azúcar', 'asian'),
    ('agar-agar', 'asian'),
    ('marisco', 'asian'),
    ('espinacas', 'asian'),
    ('pepino', 'asian'),
    ('leche de coco', 'western'),
    ('té', 'asian'),
    ('castaña', 'asian'),
    ('caballa', 'asian'),
    ('lechuga', 'asian'),
    ('jengibre', 'western'),
):
    cuisine_dict[name].remove(rejected)

In [10]:
# Sanity check: print any ingredient that still carries more than one cuisine.
for name, labels in cuisine_dict.items():
    if len(labels) > 1:
        print(name)

In [11]:
# Collapse each label set into a single bare label (set.pop() returns an
# arbitrary element, so this expects the sets to hold exactly one label).
for name, labels in cuisine_dict.items():
    # Values that are no longer sets were already collapsed; leaving them
    # alone lets this cell be re-run without raising AttributeError.
    if isinstance(labels, set):
        cuisine_dict[name] = labels.pop()

# Synonym graph

In [2]:
# Load the ingredient lexicon graph from its GEXF file. Later cells treat each
# connected component of this graph as one group of synonyms and read/write
# per-node attributes such as 'count', 'type' and 'cuisine'.
ingredients_graph = nx.read_gexf('data/spanish_ingredients_lexicon_6.gexf')

## Type

In [190]:
# Tag every graph node whose name has an entry in type_dict with that type.
for name, attrs in ingredients_graph.nodes_iter(data=True):
    if name not in type_dict:
        continue
    attrs['type'] = type_dict[name]

In [191]:
# Find synonym groups (connected components) whose members disagree on
# 'type', then print each such group followed by its members' attributes.
ambiguous = []

for component in nx.connected_components(ingredients_graph):
    found = {
        ingredients_graph.node[member]['type']
        for member in component
        if 'type' in ingredients_graph.node[member]
    }
    if len(found) > 1:
        ambiguous.append(component)

for component in ambiguous:
    print(component)
    for member in component:
        print(member, ingredients_graph.node[member])
    print()

{'biscotes', 'biscote', 'tostadas', 'biscottes', 'tostada', 'biscotte'}
biscotes {'count': 0, 'label': 'biscotes'}
biscote {'type': 'cereals', 'count': 0, 'label': 'biscote'}
tostadas {'count': 16, 'label': 'tostadas'}
biscottes {'count': 0, 'label': 'biscottes'}
tostada {'type': 'spicies_and_condimients', 'count': 6, 'label': 'tostada'}
biscotte {'count': 0, 'label': 'biscotte'}

{'pimientas', 'pimienta'}
pimientas {'type': 'vegetables', 'count': 2, 'label': 'pimientas'}
pimienta {'type': 'spicies_and_condimients', 'count': 24, 'label': 'pimienta'}



In [192]:
# Manual overrides that make the conflicting synonym groups agree on a type.
for name, chosen in (
    ('tostada', 'cereals'),
    ('pimientas', 'spicies_and_condimients'),
):
    ingredients_graph.node[name]['type'] = chosen

In [193]:
# Sanity check: print any synonym group whose members still disagree on 'type'.
for component in nx.connected_components(ingredients_graph):
    found = {
        ingredients_graph.node[member]['type']
        for member in component
        if 'type' in ingredients_graph.node[member]
    }
    if len(found) > 1:
        print(component)

In [194]:
# Spread the type across each synonym group: the first member (in iteration
# order) that has a 'type' donates it to every member of the group. Groups
# where no member has a type are left untouched.
for component in nx.connected_components(ingredients_graph):
    member_attrs = (ingredients_graph.node[member] for member in component)
    donor = next((attrs for attrs in member_attrs if 'type' in attrs), None)
    if donor is None:
        continue
    label = donor['type']
    for member in component:
        ingredients_graph.node[member]['type'] = label

## Cuisine

In [195]:
# Tag every graph node whose name has an entry in cuisine_dict with that
# cuisine.
for name, attrs in ingredients_graph.nodes_iter(data=True):
    if name not in cuisine_dict:
        continue
    attrs['cuisine'] = cuisine_dict[name]

In [196]:
# Find synonym groups (connected components) whose members disagree on
# 'cuisine', then print each such group followed by its members' attributes.
ambiguous = []

for component in nx.connected_components(ingredients_graph):
    found = {
        ingredients_graph.node[member]['cuisine']
        for member in component
        if 'cuisine' in ingredients_graph.node[member]
    }
    if len(found) > 1:
        ambiguous.append(component)

for component in ambiguous:
    print(component)
    for member in component:
        print(member, ingredients_graph.node[member])
    print()

{'verdura encurtidas', 'vegetaleses vinagres', 'tsukemono', 'vegetal vinagre', 'verdura encurtida', 'encurtido', 'vegetales en vinagres', 'vegetal en vinagre', 'encurtido de verdura', 'vegetal en vinagres', 'vegetaleses en vinagre', 'vegetales vinagres', 'verduras encurtida', 'encurtidos verdurases', 'encurtido verdura', 'vegetal vinagres', 'encurtidos', 'tsukemonos', 'encurtido de verduras', 'encurtido verduras', 'vegetales en vinagre', 'vegetaleses vinagre', 'vegetales vinagre', 'vegetaleses en vinagres', 'verduras encurtidas', 'encurtidos de verdurases'}
verdura encurtidas {'count': 0, 'label': 'verdura encurtidas'}
vegetaleses vinagres {'count': 0, 'label': 'vegetaleses vinagres'}
tsukemono {'count': 0, 'cuisine': 'asian', 'label': 'tsukemono'}
vegetal vinagre {'count': 0, 'label': 'vegetal vinagre'}
verdura encurtida {'count': 0, 'label': 'verdura encurtida'}
encurtido {'count': 0, 'label': 'encurtido'}
vegetales en vinagres {'count': 0, 'label': 'vegetales en vinagres'}
vegetal e

In [197]:
# Manual overrides that make the conflicting synonym groups agree on a
# cuisine. Each pair is (node name, cuisine to set on that node).
for name, chosen in (
    ('encurtidos', 'asian'),
    ('ajonjolí', 'western'),
    ('sésamos', 'western'),
    ('boniatos', 'western'),
    ('batatas', 'western'),
    ('fideos', 'western'),
    ('chiles', 'western'),
    ('guindillas', 'western'),
    ('chancho', 'western'),
    ('culantro', 'asian'),
    ('tallarines', 'western'),
    ('kome', 'western'),
    ('retasu', 'western'),
    ('cacahuetes', 'western'),
    ('albaricoques', 'western'),
    ('alga', 'asian'),
):
    ingredients_graph.node[name]['cuisine'] = chosen

In [198]:
# Sanity check: print any synonym group whose members still disagree on
# 'cuisine'.
for component in nx.connected_components(ingredients_graph):
    found = {
        ingredients_graph.node[member]['cuisine']
        for member in component
        if 'cuisine' in ingredients_graph.node[member]
    }
    if len(found) > 1:
        print(component)

In [199]:
# Spread the cuisine across each synonym group: the first member (in
# iteration order) that has a 'cuisine' donates it to every member of the
# group. Groups where no member has a cuisine are left untouched.
for component in nx.connected_components(ingredients_graph):
    member_attrs = (ingredients_graph.node[member] for member in component)
    donor = next((attrs for attrs in member_attrs if 'cuisine' in attrs), None)
    if donor is None:
        continue
    label = donor['cuisine']
    for member in component:
        ingredients_graph.node[member]['cuisine'] = label

# Superclasses

In [3]:
# NOTE(review): repr_ingredients_dict is created here but not filled in by
# this cell; kept in case other cells rely on the name.
repr_ingredients_dict = {}

# Choose a representative for each synonym group: the member with the highest
# 'count'. Groups in which no member has a positive count get no 'repr'.
for component in nx.connected_components(ingredients_graph):
    # max() returns the first member among equal counts, which matches a
    # strict "greater than" scan over the same iteration order.
    top = max(component, key=lambda member: ingredients_graph.node[member]['count'])
    # The truthiness test on the name also rules out an empty-string node.
    if top and ingredients_graph.node[top]['count'] > 0:
        for member in component:
            ingredients_graph.node[member]['repr'] = top

In [4]:
# Derive a 'superclass' for every node that has a representative name.
# The representative is tokenized; the first token that is itself a graph
# node decides the superclass (that node's own 'repr' when it has one,
# otherwise the token). When no token is a graph node, the first token is used.
for name, attrs in ingredients_graph.nodes_iter(data=True):
    if 'repr' not in attrs:
        continue
    words = nltk.word_tokenize(attrs['repr'])
    for word in words:
        if word in ingredients_graph:
            attrs['superclass'] = ingredients_graph.node[word].get('repr', word)
            break
    else:
        # No token matched a node: fall back to the leading token.
        attrs['superclass'] = words[0]

In [5]:
# c: number of nodes that have a 'type'; d: number of nodes that have a
# 'cuisine'.
c = sum(1 for name, attrs in ingredients_graph.nodes_iter(data=True) if 'type' in attrs)
d = sum(1 for name, attrs in ingredients_graph.nodes_iter(data=True) if 'cuisine' in attrs)

In [24]:
# Among nodes with a positive count and a 'superclass':
#   b = how many there are,
#   c = how many have a superclass that is not a graph node (these are printed),
#   d = how many have a superclass that is a graph node.
b = c = d = 0
for name, attrs in ingredients_graph.nodes_iter(data=True):
    if not (attrs['count'] > 0 and 'superclass' in attrs):
        continue
    b += 1
    if attrs['superclass'] in ingredients_graph:
        d += 1
    else:
        print(attrs)
        c += 1

{'superclass': 'hierba', 'label': 'hierba luisa', 'count': 11, 'repr': 'hierba luisa'}
{'superclass': 'tosaka', 'label': 'tosaka roja', 'count': 1, 'repr': 'tosaka roja'}
{'superclass': 'hierba', 'label': 'hierba buena', 'count': 1, 'repr': 'hierba buena'}
{'superclass': 'hierba', 'label': 'hierba luisa fresca', 'count': 25, 'repr': 'hierba luisa fresca'}
{'superclass': 'colorante', 'label': 'colorante rojo', 'count': 3, 'repr': 'colorante rojo'}
{'superclass': 'jugo', 'label': 'jugo de frutos rojos', 'count': 2, 'repr': 'jugo de frutos rojos'}
{'superclass': 'frutos', 'label': 'frutos rojos', 'count': 1, 'repr': 'frutos rojos'}
{'superclass': 'láminas', 'label': 'láminas de crocant', 'count': 1, 'repr': 'láminas de crocant'}
{'superclass': 'diente', 'label': 'diente de león', 'count': 4, 'repr': 'diente de león'}
{'superclass': 'ficoide', 'label': 'ficoide glaciale', 'count': 3, 'repr': 'ficoide glaciale'}
{'superclass': 'emulsionante', 'label': 'emulsionante en polvo', 'count': 1, 'r

In [25]:
# Nodes with a positive count that carry a 'superclass' (tallied by the
# counting loop in the cell above).
b

1609

In [26]:
# Of those nodes, how many have a superclass that is not itself a graph node.
c

28

In [27]:
# Of those nodes, how many have a superclass that is a graph node.
d

1581

In [6]:
# Total number of nodes in the ingredient graph.
len(ingredients_graph)

26472

In [7]:
# NOTE(review): the recorded output (0) differs from the `c` shown a few cells
# above; judging by the execution counts it appears to come from the earlier
# type/cuisine tally cell rather than the superclass tally -- confirm which
# counter is meant, since a top-to-bottom run would show the latter.
c

0

In [8]:
# NOTE(review): the recorded output (0) differs from the `d` shown a few cells
# above; judging by the execution counts it appears to come from the earlier
# type/cuisine tally cell rather than the superclass tally -- confirm which
# counter is meant, since a top-to-bottom run would show the latter.
d

0

In [217]:
# Nodes without a 'type' inherit the type of their superclass node, when that
# superclass is a graph node and has a type. This is a single in-place pass,
# so a type assigned earlier in the pass can be inherited by a later node.
for name, attrs in ingredients_graph.nodes_iter(data=True):
    if 'type' in attrs or 'superclass' not in attrs:
        continue
    parent = attrs['superclass']
    if parent not in ingredients_graph:
        continue
    parent_attrs = ingredients_graph.node[parent]
    if 'type' in parent_attrs:
        attrs['type'] = parent_attrs['type']

In [222]:
# Among nodes with a positive count:
#   e = how many there are,
#   c = how many lack a 'type',
#   d = how many lack a 'cuisine'.
c = d = e = 0
for name, attrs in ingredients_graph.nodes_iter(data=True):
    if not attrs['count'] > 0:
        continue
    e += 1
    if 'type' not in attrs:
        c += 1
    if 'cuisine' not in attrs:
        d += 1

In [224]:
# Number of nodes with a positive count (from the tally cell above).
e

1609

In [225]:
# Nodes with a positive count that have no 'type' (from the tally cell above).
c

568

In [226]:
# Nodes with a positive count that have no 'cuisine' (from the tally cell above).
d

1222

# elBulli

In [60]:
# Coverage of the elBulli ingredient list by the classification dictionaries.
# Each line of the file (stripped) is looked up as one ingredient name.
with open('data/ingredients/es_elbulli_ingredients.txt') as f:
    elbulli_names = [line.strip() for line in f]

c = len(elbulli_names)                                   # lines read
a = sum(1 for name in elbulli_names if name in type_dict)      # found in type_dict
b = c - a                                                # not in type_dict
d = sum(1 for name in elbulli_names if name in cuisine_dict)   # found in cuisine_dict
e = c - d                                                # not in cuisine_dict

In [61]:
# elBulli lines found in type_dict (from the tally cell above).
a

162

In [62]:
# elBulli lines not found in type_dict.
b

1762

In [63]:
# elBulli lines found in cuisine_dict.
d

224

In [64]:
# elBulli lines not found in cuisine_dict.
e

1700

In [65]:
# Total number of elBulli lines examined.
c

1924

In [66]:
# Coverage of the elBulli ingredient list by the graph attributes. A line may
# hold several alternatives separated by ' o '; each alternative is counted
# on its own. Every alternative is looked up directly in the graph, so a name
# that is not a node raises KeyError.
#   a / b = alternatives with / without a 'type'
#   d / e = alternatives with / without a 'cuisine'
#   c     = alternatives examined
a = b = d = e = c = 0
with open('data/ingredients/es_elbulli_ingredients.txt') as f:
    for line in f:
        for option in line.strip().split(' o '):
            attrs = ingredients_graph.node[option]
            if 'type' in attrs:
                a += 1
            else:
                b += 1
            if 'cuisine' in attrs:
                d += 1
            else:
                e += 1
            c += 1

In [67]:
# elBulli alternatives whose graph node has a 'type' (from the tally cell above).
a

194

In [68]:
# elBulli alternatives whose graph node has no 'type'.
b

1734

In [69]:
# elBulli alternatives whose graph node has a 'cuisine'.
d

224

In [70]:
# elBulli alternatives whose graph node has no 'cuisine'.
e

1704

In [71]:
# Total number of elBulli alternatives examined.
c

1928

In [19]:
# Coverage of the quehayenlanevera ingredient list by type_dict. Each line of
# the file (stripped) is looked up as one ingredient name.
with open('data/ingredients/es_quehayenlanevera_ingredients.txt') as f:
    nevera_names = [line.strip() for line in f]

c = len(nevera_names)                                    # lines read
a = sum(1 for name in nevera_names if name in type_dict) # found in type_dict
b = c - a                                                # not in type_dict

1034