In [10]:
import csv
import itertools
import json
import os
import pickle
import time
from collections import defaultdict
from itertools import permutations
from itertools import product

import networkx as nx
import nltk
import requests
from nltk.corpus import cess_esp
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from pymongo import MongoClient

# Ingredients

In [2]:
def is_spanish_ingredients_file(filename):
    """Tell whether ``filename`` names a Spanish ingredients list.

    A matching name starts with ``es_`` and ends with ``_ingredients.txt``.
    """
    has_prefix = filename.startswith('es_')
    has_suffix = filename.endswith('_ingredients.txt')
    return has_prefix and has_suffix

In [50]:
def add_edge(g, n1, n2):
    """Connect ``n1`` and ``n2`` in ``g`` unless they are already linked.

    Nothing is added when both arguments are the same node or when a path
    between them already exists.
    """
    if n1 == n2:
        return
    if nx.has_path(g, n1, n2):
        return
    g.add_edge(n1, n2)

In [4]:
# Build the synonym graph from the Spanish ingredient files: names that appear on
# the same line (separated by ' / ', ' o ' or ' - ') are treated as synonyms.
graph_syn = nx.Graph()
ingredients_root = 'data/ingredients/'
for e in os.listdir(ingredients_root):
    file_path = ingredients_root + e
    if os.path.isfile(file_path):
        if is_spanish_ingredients_file(e):
            with open(file_path) as f:
                for line in f:
                    # Distinct names found on this line.
                    # NOTE: a blank line contributes the empty string '' as a name.
                    syn_set = set()
                    ingrs1 = line.strip()
                    for ingrs2 in ingrs1.split(' / '):
                        for ingrs3 in ingrs2.split(' o '):
                            for ingr in ingrs3.split(' - '):
                                syn_set.add(ingr)
                                # The 'count' attribute is incremented each time the name is seen again.
                                if not ingr in graph_syn:
                                    graph_syn.add_node(ingr, count=1)
                                else:
                                    graph_syn.node[ingr]['count'] += 1
                    # Link every other name of the line to one (arbitrary) member of the set;
                    # add_edge skips pairs that are already connected.
                    syn_set = list(syn_set)
                    i1 = syn_set[0]
                    for i2 in syn_set[1:]:
                        add_edge(graph_syn, i1, i2)

In [5]:
len(graph_syn)

3322

In [6]:
graph_syn.number_of_edges()

101

In [7]:
nx.number_connected_components(graph_syn)

3221

In [8]:
nx.write_gexf(graph_syn, 'data/spanish_lexicon_1.gexf')

# Lexicon

In [9]:
# Connect to MongoDB using pymongo's default connection arguments.
client = MongoClient()
# client.drop_database('lexicon')
# Handle to the ``lexicon`` database; later cells query its ``es_lexicon`` collection.
db = client.lexicon

In [10]:
# with open('data/es_lexicon.csv') as f:
#     reader = csv.reader(
#         f,
#         delimiter=' ',
#     )
#     docs = []
#     count = 0
#     for row in reader:
#         for i in range(1, len(row[1:]), 2):
#             entry = {}
#             entry['flexion'] = row[0].lower()
#             entry['lemma'] = row[i].lower()
#             entry['eagle'] = row[i+1].lower()
#             docs.append(entry)
#             count += 1
#         if count % 1000 == 0:
#             db.es_lexicon.insert_many(docs)
#             docs = []
#     db.es_lexicon.insert_many(docs)
#     docs = []

In [11]:
db.es_lexicon.count()

668825

# POS tagging

In [12]:
tagged_sp_sents = cess_esp.tagged_sents()

In [13]:
# Hold out the first 10% of the tagged sentences for testing; train on the rest.
size = int(len(tagged_sp_sents) * 0.1)
train_sp_sents = tagged_sp_sents[size:]
test_sp_sents = tagged_sp_sents[:size]

In [14]:
len(tagged_sp_sents) == len(train_sp_sents) + len(test_sp_sents)

True

In [15]:
tagged_sp_words = cess_esp.tagged_words()

In [16]:
tags = [tag for (word, tag) in tagged_sp_words]
most_freq_tags = nltk.FreqDist(tags)
most_freq_tags.most_common()[:10]

[('sps00', 25272),
 ('ncms000', 11428),
 ('Fc', 11420),
 ('ncfs000', 11008),
 ('da0fs0', 6838),
 ('da0ms0', 6012),
 ('rg', 5937),
 ('Fp', 5866),
 ('cc', 5854),
 ('ncmp000', 5711)]

In [17]:
default_tag = 'ncms000'

In [18]:
# Back-off chain: trigram -> bigram -> unigram -> default.
# The default tagger assigns ``None``; ingredient_tagger_first treats a falsy tag
# as "unknown" and falls back to a lexicon lookup.
t0 = nltk.DefaultTagger(None)
t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [19]:
sp_tagger.evaluate(test_sp_sents)

0.8812609457092819

In [20]:
# First letter of a (lower-cased) EAGLES tag -> coarse category name.
tag_mapping = {
    'a': 'adj',
    'r': 'adv',
    'd': 'det',
    'n': 'noun',
    'v': 'verb',
    'p': 'pron',
    'c': 'conj',
    'i': 'interj',
    's': 'prep',
    'f': 'punt',
    'z': 'num',
    'w': 'date-time',
}

def get_category(entry):
    """Return the coarse category of a lexicon entry.

    The category is looked up in ``tag_mapping`` using the first character of
    ``entry['eagle']``; an unmapped character raises ``KeyError``.
    """
    eagle_code = entry['eagle']
    initial = eagle_code[0]
    return tag_mapping[initial]

def has_category(category, entries):
    """Tell whether any entry in ``entries`` has the coarse category ``category``.

    Entries are examined in order and the scan stops at the first match.
    """
    return any(get_category(entry) == category for entry in entries)

def is_number(x):
    """Tell whether ``x`` is one of the Spanish number words recognised here."""
    number_words = (
        'un', 'una', 'dos', 'tres', 'cuatro',
        'cinco', 'seis', 'siete', 'ocho', 'nueve',
    )
    return x in number_words

def ingredient_tagger_first(x):
    """Assign a coarse POS tag to every token of the ingredient name ``x``.

    A single-token expression is returned as ``[(x, 'noun')]``. Otherwise each
    token is tagged with ``sp_tagger``:

    * tagged tokens become ``'num'`` when ``is_number`` accepts them, else the
      ``tag_mapping`` entry for the first letter of the lower-cased tag;
    * untagged tokens are looked up in ``db.es_lexicon`` by ``flexion`` and
      take the first available category among noun, adj, verb, num (or the
      category of the first hit); tokens absent from the lexicon are ``'noun'``.

    Returns a list of ``(token, tag)`` pairs.
    """
    tokens = nltk.word_tokenize(x)
    if len(tokens) == 1:
        return [(x, 'noun')]

    tagged = []
    for word, raw_tag in sp_tagger.tag(tokens):
        if raw_tag:
            if is_number(word):
                coarse = 'num'
            else:
                coarse = tag_mapping[raw_tag.lower()[0]]
        else:
            # The statistical tagger had no answer: consult the lexicon.
            entries = list(db.es_lexicon.find({'flexion': word}))
            if not entries:
                coarse = 'noun'
            else:
                for preferred in ('noun', 'adj', 'verb', 'num'):
                    if has_category(preferred, entries):
                        coarse = preferred
                        break
                else:
                    coarse = get_category(entries[0])
        tagged.append((word, coarse))
    return tagged

In [21]:
# with open('data/ingredients_postags.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for ingredient in graph_syn.nodes_iter():
#         pos_tag = ' '.join(tag for token, tag in ingredient_tagger_first(ingredient))
#         row = [ingredient, pos_tag]
#         writer.writerow(row)

In [22]:
# Load the cached POS tags: first CSV column -> second CSV column.
postags = {}
with open('data/ingredients_postags.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        postags[row[0]] = row[1]

In [23]:
def ingredient_tagger(x):
    """Return ``(token, tag)`` pairs for the ingredient name ``x``.

    The space-separated tag string comes from the ``postags`` cache when ``x``
    is a key there, otherwise it is computed with ``ingredient_tagger_first``.
    Tokens and tags are paired positionally with ``zip``.
    """
    if x not in postags:
        tag_string = ' '.join(label for _, label in ingredient_tagger_first(x))
    else:
        tag_string = postags[x]
    words = nltk.word_tokenize(x)
    labels = nltk.word_tokenize(tag_string)
    return list(zip(words, labels))

# Lemmatization

In [24]:
def first(cat, entries):
    """Return the first lexicon entry whose EAGLES tag starts with ``cat``.

    Args:
        cat: single-character EAGLES category initial (e.g. ``'n'``).
        entries: iterable of dicts that have an ``'eagle'`` string.

    Returns:
        The first matching entry, or ``None`` when nothing matches (this
        includes an empty ``entries``). The previous implementation returned
        the last entry when nothing matched and raised ``UnboundLocalError``
        on empty input.
    """
    for entry in entries:
        if entry['eagle'][0] == cat:
            return entry
    return None

def singularize_first(x):
    """Return ``x`` with its tokens replaced by singular forms where possible.

    Tokens are only processed when the whole expression ``x`` ends in ``'s'``.
    ``'los'`` becomes ``'el'`` and ``'dos'`` is kept. Any other token is looked
    up in ``db.es_lexicon`` by ``flexion``; the number attribute of the chosen
    entry's EAGLES tag is set to ``'s'`` and the lexicon is searched for a form
    with the same lemma and that tag. Tokens with no such form are kept as is.

    Returns:
        The resulting tokens joined with single spaces.
    """
    # (coarse category, EAGLES initial, 0-based index of the number attribute),
    # in priority order. The preposition tag is category/type/form/gender/number,
    # so its number attribute is at index 4 (the code used to overwrite index 3,
    # which is the gender attribute).
    number_slots = (
        ('noun', 'n', 3),
        ('adj', 'a', 4),
        ('verb', 'v', 5),
        ('det', 'd', 4),
        ('pron', 'p', 4),
        ('prep', 's', 4),
    )
    singular = []
    tokens = nltk.word_tokenize(x)
    # NOTE(review): the plural test looks at the whole expression, not at each
    # token, so plural tokens inside an expression that does not end in 's'
    # are left untouched. Kept as is; confirm whether that is intended.
    expression_ends_in_s = x.endswith('s')
    for token in tokens:
        sing = token
        if expression_ends_in_s:
            if token == 'los':
                sing = 'el'
            elif token == 'dos':
                sing = 'dos'
            else:
                res = list(db.es_lexicon.find({'flexion': token}))
                if res:
                    for category, initial, slot in number_slots:
                        if has_category(category, res):
                            r = first(initial, res)
                            eagle = r['eagle'][:slot] + 's' + r['eagle'][slot + 1:]
                            break
                    else:
                        # No inflecting category found: search with the tag unchanged.
                        r = res[0]
                        eagle = r['eagle']
                    s = db.es_lexicon.find_one({'lemma': r['lemma'], 'eagle': eagle})
                    if s:
                        sing = s['flexion']
        singular.append(sing)
    return ' '.join(singular)

In [25]:
# with open('data/ingredients_lemmas.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for ingredient in graph_syn.nodes_iter():
#         lemma = singularize_first(ingredient)
#         writer.writerow([ingredient, lemma])

In [26]:
# Load the cached singular forms: first CSV column -> second CSV column.
lemmas = {}
with open('data/ingredients_lemmas.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        lemmas[row[0]] = row[1]

In [27]:
def lemmatize(x):
    """Return the singular form of ``x``.

    The ``lemmas`` cache is used when it has ``x``; otherwise the form is
    computed with ``singularize_first``.
    """
    return lemmas[x] if x in lemmas else singularize_first(x)

# apicultur synonyms

In [28]:
graph_syn = nx.read_gexf('data/spanish_lexicon_1.gexf')

In [29]:
len(graph_syn)

3322

In [30]:
graph_syn.number_of_edges()

101

In [31]:
nx.number_connected_components(graph_syn)

3221

In [32]:
# Collect every token tagged 'noun' across all node names of the graph.
nouns = set()
for ingr in graph_syn.nodes_iter():
    tags = ingredient_tagger(ingr)
    for token, tag in tags:
        if tag == 'noun':
            nouns.add(token)

In [33]:
len(nouns)

1691

In [34]:
# with open('data/apicultur_synonyms.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     base_url = 'https://store.apicultur.com/api/sinonimosporpalabra/1.0.0/'
#     headers = {'Authorization': 'Bearer ' + os.environ['APICULTUR_TOKEN']}  # keep the API token out of the notebook
#     count = 0
#     for noun in nouns:
#         if noun in graph_syn:
#             url = base_url + noun
#             response = requests.get(url, headers=headers)
#             if response.text:
#                 js = response.json()
#                 row = [noun]
#                 for d in js:
#                     row.append(d['valor'])
#                 writer.writerow(row)
#             time.sleep(1)
#         count += 1
#         if count % 50 == 0:
#             time.sleep(10)

In [35]:
# Load the cached synonyms: first CSV column -> list of the remaining columns.
apicultur_syns = {}
with open('data/apicultur_synonyms.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        apicultur_syns[row[0]] = row[1:]

In [36]:
def synonyms(x):
    """Return the cached synonym list for ``x``, or an empty list if none is cached."""
    if x in apicultur_syns:
        return apicultur_syns[x]
    return []

In [37]:
# Link each noun to those of its cached synonyms that are already graph nodes.
# ``syns_found`` counts every (noun, synonym) pair whose synonym is in the graph,
# whether or not add_edge actually created a new edge for it.
syns_found = 0
for noun in nouns:
    syns = synonyms(noun)
    for syn in syns:
        if syn in graph_syn:
            add_edge(graph_syn, noun, syn)
            syns_found += 1
syns_found

290

In [38]:
len(graph_syn)

3322

In [39]:
graph_syn.number_of_edges()

190

In [40]:
nx.number_connected_components(graph_syn)

3132

In [41]:
nx.write_gexf(graph_syn, 'data/spanish_lexicon_2.gexf')

# Normalization

In [42]:
# Numbers
def numbers(x):
    """Spell out the stand-alone digits 1-9 of ``x`` as Spanish words.

    A digit is replaced only when it is a whole space-delimited token, so
    multi-digit numbers such as ``'12'`` or ``'000'`` are left alone. Unlike a
    chain of ``str.replace(' 1 ', ' uno ')`` calls, this also converts digits
    at the start or end of the string and runs of adjacent digits
    (``'a 1 1 b'``), while producing the same result for every digit the chain
    did convert. Spacing is preserved exactly.
    """
    digit_words = {
        '1': 'uno',
        '2': 'dos',
        '3': 'tres',
        '4': 'cuatro',
        '5': 'cinco',
        '6': 'seis',
        '7': 'siete',
        '8': 'ocho',
        '9': 'nueve',
    }
    # Splitting on a single space (not on arbitrary whitespace) keeps repeated
    # spaces intact when the pieces are joined back.
    return ' '.join(digit_words.get(token, token) for token in x.split(' '))

# Accent marks on vowels - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def accent_marks(x):
    """Replace the accented lowercase vowels handled here with plain vowels."""
    plain_vowels = str.maketrans('áãèéêíòóōúü', 'aaeeeiooouu')
    return x.translate(plain_vowels)

# Non-ascii consonants - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def nonascii_consonants(x):
    """Replace lowercase ``ç`` with ``c`` and ``ñ`` with ``n``."""
    plain_consonants = str.maketrans('çñ', 'cn')
    return x.translate(plain_consonants)
    
# Dashes (-)
def dashes1(x):
    """Delete every ``-`` from ``x``."""
    pieces = x.split('-')
    return ''.join(pieces)

def dashes2(x):
    """Turn every ``-`` in ``x`` into a single space."""
    pieces = x.split('-')
    return ' '.join(pieces)

# POS tags (EAGLES category, initial letter, coarse tag; X = kept by pos_tags)
# ADJECTIVES ..... A ADJ ...... X
# ADVERBS ........ R ADV
# DETERMINERS .... D DET
# NOUNS .......... N NOUN ..... X
# VERBS .......... V VERB ..... X
# PRONOUNS ....... P PRON
# CONJUNCTIONS ... C CONJ
# INTERJECTIONS .. I INTERJ
# PREPOSITIONS ... S PREP
# PUNCTUATION .... F PUNCTUATION
# NUMERALS ....... Z NUM ...... X
# DATES AND TIMES  W DATE-TIME
def pos_tags(x):
    """Keep only the tokens of ``x`` tagged num, verb, adj or noun.

    Returns the surviving tokens joined with single spaces.
    """
    kept_tags = ['num', 'verb', 'adj', 'noun']
    kept_tokens = []
    for token, tag in ingredient_tagger(x):
        if tag in kept_tags:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def singular(x):
    """Normalization step: the singular form of ``x`` as given by ``lemmatize``."""
    singular_form = lemmatize(x)
    return singular_form

def itself(x):
    """Identity normalization step: hand ``x`` back unchanged."""
    unchanged = x
    return unchanged

# Normalization steps, in the order in which they are applied within a combination.
funcs = [itself, singular, pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2]
# Every non-empty subset of ``funcs`` (tuples that keep the order of ``funcs``),
# first grouped by subset size and then flattened into a single list.
combinations = []
for i in range(1, len(funcs) + 1):
    combinations.append(list(itertools.combinations(funcs, i)))
combinations = [c for comb in combinations for c in comb]

# def normalize(ingredient): # time consuming
#     result = set()
#     for c in combinations:
#         x = ingredient
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(ingredient):
    """Return the set of strings produced by every pipeline in ``combinations``.

    Each pipeline is applied to ``ingredient`` step by step. Results are
    memoized per (function, input) for the duration of the call, so a step is
    never recomputed for an input it has already seen.
    """
    memo = {step: {} for step in funcs}
    variants = set()
    for pipeline in combinations:
        current = ingredient
        for step in pipeline:
            seen = memo[step]
            if current not in seen:
                seen[current] = step(current)
            current = seen[current]
        variants.add(current)
    return variants

In [43]:
len([list(map(lambda x: x.__name__, c)) for c in combinations])

255

In [58]:
graph_syn = nx.read_gexf('data/spanish_lexicon_2.gexf')

In [59]:
len(graph_syn)

3322

In [60]:
graph_syn.number_of_edges()

190

In [61]:
nx.number_connected_components(graph_syn)

3132

In [81]:
# for ingr in graph_syn.nodes():
#     norms = normalize(ingr)
#     for norm in norms:
#         if not norm in graph_syn:
#             graph_syn.add_node(norm, count=1)
#         else:
#             graph_syn.node[norm]['count'] += 1
#         add_edge(graph_syn, ingr, norm)

In [65]:
len(graph_syn)

6996

In [66]:
graph_syn.number_of_edges()

3964

In [67]:
nx.number_connected_components(graph_syn)

3032

In [68]:
nx.write_gexf(graph_syn, 'data/spanish_lexicon_3.gexf')

In [61]:
graph_syn = nx.read_gexf('data/spanish_lexicon_3.gexf')

# Synonyms

In [3]:
def my_ngrams(ingredient):
    """List every token n-gram of ``ingredient`` as a space-joined string.

    N-grams are ordered by size (1 up to the number of tokens) and, within a
    size, by position.
    """
    words = nltk.word_tokenize(ingredient)
    joined = []
    for size in range(1, len(words) + 1):
        for gram in ngrams(words, size):
            joined.append(' '.join(gram))
    return joined

In [4]:
# Histogram: number of tokens in a node name -> how many nodes have that many tokens.
lengths = defaultdict(int)
for ingr in graph_syn.nodes_iter():
    lengths[len(nltk.word_tokenize(ingr))] += 1
# Convert to a plain dict.
lengths = dict(lengths)

In [5]:
lengths

{1: 1209, 2: 2392, 3: 2312, 4: 629, 5: 289, 6: 109, 7: 28, 8: 13, 9: 11, 10: 4}

In [6]:
def minimal_syns(ingredient):
    """Return the "minimal" members of ``ingredient``'s synonym component.

    A member of the connected component of ``ingredient`` in ``graph_syn`` is
    kept unless some other member of the component is one of its tokens.

    Returns:
        A set of node names.
    """
    syns = nx.node_connected_component(graph_syn, ingredient)
    result = set()
    for candidate in syns:
        # Tokenize once per candidate instead of once per (candidate, other) pair.
        candidate_tokens = nltk.word_tokenize(candidate)
        contains_other = any(
            other != candidate and other in candidate_tokens
            for other in syns
        )
        if not contains_other:
            result.add(candidate)
    return result

In [156]:
g2.nodes(data=True)

[('salsas', {'count': 1}),
 ('tomate', {'count': 1}),
 ('salsa de tomate natural', {'count': 1}),
 ('salsa de tomates', {}),
 ('salsa', {'count': 1}),
 ('tomates', {'count': 1}),
 ('salsa de tomate', {'count': 1})]

In [157]:
g2.edges(data=True)

[('salsa de tomates', 'salsa de tomate', {}),
 ('salsa', 'salsa', {}),
 ('tomates', 'tomates', {})]

In [17]:
def ngram_combinations(ingredient):
    """List every way of cutting ``ingredient`` into contiguous n-grams.

    Each result is a list of space-joined n-grams whose concatenation (with
    single spaces) is exactly ``ingredient``. Results are ordered by number of
    parts, then by the lengths of the parts from left to right, e.g. for
    ``'salsa de tomate'``::

        [['salsa de tomate'], ['salsa', 'de tomate'],
         ['salsa de', 'tomate'], ['salsa', 'de', 'tomate']]

    The segmentations are built directly from the cut positions between
    tokens (2**(k-1) results for k tokens) instead of filtering all
    permutations of all n-grams, which grows factorially. One visible
    difference: when a token is repeated, the permutation-based version
    returned the same segmentation several times; each one is now returned
    once.

    Returns an empty list when the tokens joined by single spaces do not
    reproduce ``ingredient`` (no combination of n-grams can match then).
    """
    tokens = nltk.word_tokenize(ingredient)
    if not tokens or ' '.join(tokens) != ingredient:
        return []
    n_tokens = len(tokens)
    combs = []
    for n_cuts in range(n_tokens):
        # Choose where to cut between consecutive tokens.
        for cuts in itertools.combinations(range(1, n_tokens), n_cuts):
            bounds = (0,) + cuts + (n_tokens,)
            combs.append([
                ' '.join(tokens[start:stop])
                for start, stop in zip(bounds, bounds[1:])
            ])
    return combs

In [18]:
%%time
ngram_combinations('salsa de tomate')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.49 ms


[['salsa de tomate'],
 ['salsa', 'de tomate'],
 ['salsa de', 'tomate'],
 ['salsa', 'de', 'tomate']]

In [53]:
def comb_syns(expr, syn_dict):
    """Expand ``expr`` with the alternatives given for its n-grams.

    For every segmentation returned by ``ngram_combinations(expr)``, each
    piece is replaced by each of its alternatives in ``syn_dict`` and the
    pieces are joined with single spaces.

    Returns the distinct resulting strings as a list (in no particular order).
    """
    expansions = set()
    for segmentation in ngram_combinations(expr):
        alternatives = [syn_dict[piece] for piece in segmentation]
        for choice in product(*alternatives):
            expansions.add(' '.join(choice))
    return list(expansions)
    
expr= 'salsa de tomate'
syn_dict = {
    'salsa': ['salsa', 'salsas'],
    'de': ['de'],
    'tomate': ['tomate', 'tomates'],
    'salsa de': ['salsa de'],
    'de tomate': ['de tomate', 'tomatil'],
    'salsa de tomate': ['salsa de tomate'],
}

comb_syns(expr, syn_dict)

['salsas de tomates',
 'salsa tomatil',
 'salsa de tomates',
 'salsas de tomate',
 'salsa de tomate',
 'salsas tomatil']

In [54]:
def create_syn_dict(ngrms):
    """Map each n-gram to a fresh set containing only that n-gram."""
    return {ngrm: {ngrm} for ngrm in ngrms}

create_syn_dict(my_ngrams('salsa de tomate'))

{'de': {'de'},
 'de tomate': {'de tomate'},
 'salsa': {'salsa'},
 'salsa de': {'salsa de'},
 'salsa de tomate': {'salsa de tomate'},
 'tomate': {'tomate'}}

In [55]:
# g2 = nx.Graph()
# g2.add_node('tomate', count=1)
# g2.add_node('tomates', count=1)
# g2.add_node('salsa', count=1)
# g2.add_node('salsas', count=1)
# g2.add_node('de tomate', count=1)
# g2.add_node('tomatil', count=1)
# g2.add_node('salsa de tomate', count=1)
# g2.add_node('salsas de tomate', count=1)
# g2.add_edge('tomate', 'tomates')
# g2.add_edge('de tomate', 'tomatil')
# g2.add_edge('salsa', 'salsas')
# graph_syn = g2
# # g3 = nx.Graph(g2)

In [67]:
# %%time

# for ingr in list(graph_syn.nodes()):
#     if 1 < len(nltk.word_tokenize(ingr)) < 5:
#         syns1 = nx.node_connected_component(graph_syn, ingr)
#         ngrms = my_ngrams(ingr)
#         syn_dict = create_syn_dict(ngrms)
#         for ngrm in ngrms:
#             if ngrm in graph_syn and ngrm not in syns1:
#                 syns2 = minimal_syns(ngrm)
#                 syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
#         syn_combs = comb_syns(ingr, syn_dict)
#         for syn_ingr in syn_combs:
#             if not syn_ingr in graph_syn:
#                 graph_syn.add_node(syn_ingr, count=1)
#             else:
#                 graph_syn.node[syn_ingr]['count'] += 1
#             add_edge(graph_syn, ingr, syn_ingr)

print('CPU times: user 50min 25s, sys: 3min 47s, total: 54min 12s')
print('Wall time: 54min 10s')

CPU times: user 50min 25s, sys: 3min 47s, total: 54min 12s
Wall time: 54min 10s


In [63]:
len(graph_syn)

34548

In [64]:
graph_syn.number_of_edges()

31558

In [65]:
nx.number_connected_components(graph_syn)

2990

In [66]:
nx.write_gexf(graph_syn, 'data/spanish_lexicon_4.gexf')

In [70]:
nx.node_connected_component(graph_syn, 'salsa de tomate')

{'bechamel de tomate',
 'bechamel de tomates',
 'bechamel tomate',
 'bechamel tomates',
 'besamel de tomate',
 'besamel de tomates',
 'besamel tomate',
 'besamel tomates',
 'salsa de tomate',
 'salsa de tomates',
 'salsa tomate',
 'salsa tomates'}

In [217]:
# %%time

# print(list(graph_syn.nodes()))
# print(list(graph_syn.edges()))
# graph_aux = nx.Graph(graph_syn)
# One propagation pass: for every node, substitute each of its n-grams that is
# itself a graph node with that n-gram's minimal synonyms, and link the
# resulting expression to the original node.
# ``graph_aux`` is an alias of ``graph_syn`` (same object, not a copy), so nodes
# and edges added below are visible through both names during the loop.
graph_aux = graph_syn
c=1
# for length in lengths:
# while True:
n_nodes_old = graph_syn.number_of_nodes()
n_edges_old = graph_syn.number_of_edges()
print('---')
print(n_nodes_old, 'nodes')
print(n_edges_old, 'edges')
print(nx.number_connected_components(graph_syn), 'connected components')
# Iterate over a snapshot of the nodes because the loop adds nodes to the graph.
for ingr in list(graph_aux.nodes()):
#     if len(nltk.word_tokenize(ingr)) == length:
    syns1 = nx.node_connected_component(graph_aux, ingr)
    ngrms = my_ngrams(ingr)
    for ngrm in ngrms:
#         print('>>>', ngrm)
        # Skip n-grams that are not nodes or that are already synonyms of ``ingr``.
        if ngrm in graph_aux and ngrm not in syns1:
            syns2 = minimal_syns(ngrm)
            for syn in syns2:
                # NOTE(review): str.replace substitutes every occurrence of the
                # n-gram text, including occurrences inside longer words.
                syn_ingr = ingr.replace(ngrm, syn)
                if not syn_ingr in graph_syn:
                    graph_syn.add_node(syn_ingr, count=1)
                else:
                    graph_syn.node[syn_ingr]['count'] += 1
                add_edge(graph_syn, ingr, syn_ingr)
    #             if c>10:
    #                 break
    
    
#     print(list(graph_syn.nodes()))
#     print(list(graph_syn.edges()))
    
n_nodes_new = graph_syn.number_of_nodes()
n_edges_new = graph_syn.number_of_edges()
print('Loop', c)
print(n_nodes_new, 'nodes')
print(n_edges_new, 'edges')
print(nx.number_connected_components(graph_syn), 'connected components')
c+=1
#     if n_nodes_old == n_nodes_new and n_edges_old == n_edges_new:
#         break
    
#     c+=1
#     if c>10:
#         break
print('===')
print('End')
print(n_nodes_new, 'nodes')
print(n_edges_new, 'edges')
print(nx.number_connected_components(graph_syn), 'connected components')

---
8 nodes
3 edges
5 connected components
Loop 1
13 nodes
9 edges
4 connected components
===
End
13 nodes
9 edges
4 connected components


In [218]:
g2.nodes(data=True)

[('salsa tomatil', {'count': 1}),
 ('salsas', {'count': 1}),
 ('salsa', {'count': 1}),
 ('salsas de tomate', {'count': 5}),
 ('tomatil', {'count': 1}),
 ('salsa de tomates', {'count': 2}),
 ('salsas de tomates', {'count': 1}),
 ('salsa de tomate', {'count': 5}),
 ('salsas tomatil', {'count': 1}),
 ('tomate', {'count': 1}),
 ('de tomate', {'count': 2}),
 ('tomates', {'count': 1}),
 ('de tomates', {'count': 1})]

In [171]:
g2.edges(data=True)

[('salsas', 'salsa', {}),
 ('tomate', 'tomates', {}),
 ('salsas de tomate', 'salsa de tomate', {}),
 ('salsa de tomates', 'salsa de tomate', {})]

In [135]:
nx.node_connected_component(graph_syn, 'patata')

{'papa', 'papas', 'patata'}

In [99]:
graph_syn.nodes()

['salsa de tomates', 'los tomates', 'tomates', 'salsa de tomate', 'tomate']

In [180]:
len(graph_syn)

11816

In [181]:
nx.number_connected_components(graph_syn)

3092

In [182]:
graph_syn.number_of_edges()

8724

In [135]:
nx.node_connected_component(graph_syn, 'salsa de soja')

{'salsa de soja', 'shoyu', 'shō-yu', 'shōyu'}

In [46]:
# Link the names found on each line (separated by ' / ' or ' o ') as synonyms.
# NOTE(review): ``all_lines`` is not defined in any cell visible here.
for line in all_lines:
    syn_set = set()
    ingrs1 = line.strip()
    for ingrs2 in ingrs1.split(' / '):
        for ingr in ingrs2.split(' o '):
            syn_set.add(ingr)
    # Connect every other name to one (arbitrary) member of the set.
    # Unlike add_edge, Graph.add_edge does not check for an existing path.
    syn_set = list(syn_set)
    i1 = syn_set[0]
    for i2 in syn_set[1:]:
        graph_syn.add_edge(i1, i2)

In [31]:
# Collect the characters of the node names that ``is_ascii`` rejects.
# NOTE(review): ``is_ascii`` is not defined in any cell visible here.
nonascii = set()
for ingr in graph_syn.nodes_iter():
#     q = ingr.replace(' ', 'xxx')
#     if not q.isalpha():
#         print(q.replace('xxx', ' '))
    if not is_ascii(ingr):
        for c in ingr:
            if not is_ascii(c):
                nonascii.add(c)

In [45]:
# Placeholder stubs that do nothing and return None.
# NOTE(review): running this cell rebinds ``normalize`` and ``lemmatize``,
# replacing the implementations defined earlier in this notebook.
def normalize(expr):
    pass

def lemmatize(expr):
    pass

In [46]:
# Same as the plain synonym-linking loop, but each name is also linked to its
# lemmatized form, which is stored as a node flagged ``is_lemma=True``.
# NOTE(review): ``all_lines`` is not defined in any cell visible here.
for line in all_lines:
    syn_set = set()
    ingrs1 = line.strip()
    for ingrs2 in ingrs1.split(' / '):
        for ingr in ingrs2.split(' o '):
            lemmatized = lemmatize(ingr)
            if not lemmatized in graph_syn:
                graph_syn.add_node(lemmatized, count=1, is_lemma=True, is_repr=False)
            else:
                graph_syn.node[lemmatized]['count'] += 1
                graph_syn.node[lemmatized]['is_lemma'] = True
            syn_set.add(ingr)
            syn_set.add(lemmatized)
    # Connect every other name to one (arbitrary) member of the set.
    syn_set = list(syn_set)
    i1 = syn_set[0]
    for i2 in syn_set[1:]:
        graph_syn.add_edge(i1, i2)

In [151]:
g=nx.Graph()

In [152]:
g.add_nodes_from([1,2,3])

In [153]:
# Experiment: add nodes to ``g`` while iterating over a snapshot of its nodes.
c=0
for x in list(g.nodes()):
    g.add_node(x+1)
    print('adding',x+1)
    c+=1
    # Safety cap on the number of iterations.
    if c>10:
        break

adding 2
adding 3
adding 4


In [150]:
g.nodes()

[1, 2, 3, 4, 5, 6, 7]

In [26]:
g.add_edge(1, 1, b=5)

In [27]:
g.edges(data=True)

[(1, 1, {'b': 5})]

In [28]:
g.add_node(1, {'a':3})

In [29]:
g.nodes(data=True)

[(1, {'a': 3})]

In [26]:
g.node[1]['a'] += 1

In [5]:
type(nltk.FreqDist())

nltk.probability.FreqDist

In [6]:
def lemmatizer(x):
    """Return an ``nltk.FreqDist`` built from ``x``."""
    distribution = nltk.FreqDist(x)
    return distribution

In [12]:
xxx=lemmatizer()

In [None]:
def add_es_ingredient(ingredient):
    """Register a Spanish ingredient expression (not implemented yet).

    Design notes:

    * ``ingredient`` has the form ``"salsa de soja o salsa de soya"``. This
      splitting cannot live in the cleaning functions, because the information
      that the alternatives are synonymous ingredients would be lost.
    * The expression may be singular or plural.
    * The whole expression must be lemmatized, dropping prepositions,
      articles, etc. according to the author's criteria.
    * Every possible variant of the ingredient and its lemmatization must be
      stored.
    * This function is meant to be saved in a pickle object and "exported"
      wherever it is needed.

    Raises:
        NotImplementedError: always; the function body has not been written.
    """
    # The original definition had only comments and no body, which is a syntax error.
    raise NotImplementedError('add_es_ingredient is not implemented yet')

In [28]:
if __name__=='__main__':
    # Serialize ``xxx`` to disk. ``pickle`` comes from the notebook's import cell.
    # open() does not create missing directories, so make sure the target exists.
    os.makedirs('pickle', exist_ok=True)
    with open('pickle/lemmatizer.pickle', 'wb') as f:
        pickle.dump(xxx, f)

In [24]:
class A:
    """Minimal class exposing ``lemmatizer`` as a method."""

    def lemmatizer(self, x):
        """Return an ``nltk.FreqDist`` built from ``x``."""
        distribution = nltk.FreqDist(x)
        return distribution

In [25]:
xxx = A()

In [26]:
xxx.lemmatizer(['a','a','b'])

Counter({'a': 2, 'b': 1})

In [38]:
# Print the Spanish lemma names of every WordNet synset for 'oil'.
# (A pasted ">>> " interpreter prompt used to make this cell a syntax error.)
from nltk.corpus import wordnet as wn
for ss in wn.synsets('oil'):
    print(ss.lemma_names('spa'))

['aceite']
['óleo']
['petróleo']
[]
[]
[]


In [69]:
wn.langs

<bound method WordNetCorpusReader.langs of <WordNetCorpusReader in '/home/antonio/nltk_data/corpora/wordnet'>>

In [70]:
from nltk.corpus import omw

ImportError: cannot import name 'omw'

In [83]:
l=wn.lemmas('cane', lang='ita')[0]

In [92]:
l.synset()

Synset('dog.n.01')

In [93]:
l

Lemma('dog.n.01.cane')

In [25]:
ls = wn.lemmas('amaba', lang='spa')

In [26]:
for x in ls:
    print(x.name())

In [None]:
# TODO: lemmatize using only my own Spanish lexicon.
# If a word is not in the lexicon and cannot be lemmatized, strip its final "s" if it has one.
# This applies to adjectives and nouns; a verb is unlikely to show up.