In [1]:
import csv
import itertools
import json
import os
import pickle
import time
from collections import defaultdict
from itertools import product
from itertools import permutations

import networkx as nx
import nltk
import requests
from nltk.corpus import cess_esp
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from pymongo import MongoClient

# Lexicon

In [2]:
# Connect to MongoDB with the driver defaults and select the `lexicon` database.
# The es_lexicon collection read by later cells lives in this database.
client = MongoClient()
# client.drop_database('lexicon')
db = client.lexicon

In [3]:
# with open('data/es_lexicon.csv') as f:
#     reader = csv.reader(
#         f,
#         delimiter=' ',
#     )
#     docs = []
#     count = 0
#     for row in reader:
#         for i in range(1, len(row[1:]), 2):
#             entry = {}
#             entry['flexion'] = row[0].lower()
#             entry['lemma'] = row[i].lower()
#             entry['eagle'] = row[i+1].lower()
#             docs.append(entry)
#             count += 1
#         if count % 1000 == 0:
#             db.es_lexicon.insert_many(docs)
#             docs = []
#     db.es_lexicon.insert_many(docs)
#     docs = []

In [4]:
db.es_lexicon.count()

668825

# Stopwords

In [5]:
# One stopword per line; surrounding whitespace is stripped.
spanish_stopwords = set()
with open('data/spanish_stopwords.txt') as f:
    spanish_stopwords.update(line.strip() for line in f)

# POS tagger

In [6]:
# tagged_sp_sents = cess_esp.tagged_sents()

In [7]:
# size = int(len(tagged_sp_sents) * 0.1)
# train_sp_sents = tagged_sp_sents[size:]
# test_sp_sents = tagged_sp_sents[:size]

In [8]:
# tagged_sp_words = cess_esp.tagged_words()

In [9]:
# tags = [tag for (word, tag) in tagged_sp_words]
# most_freq_tags = nltk.FreqDist(tags)
# most_freq_tags.most_common()[:10]

# [('sps00', 25272),
#  ('ncms000', 11428),
#  ('Fc', 11420),
#  ('ncfs000', 11008),
#  ('da0fs0', 6838),
#  ('da0ms0', 6012),
#  ('rg', 5937),
#  ('Fp', 5866),
#  ('cc', 5854),
#  ('ncmp000', 5711)]

In [10]:
# default_tag = 'ncms000'

In [11]:
# t0 = nltk.DefaultTagger(None)
# t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
# t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
# sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [12]:
# sp_tagger.evaluate(test_sp_sents)

# 0.8808231173380034

In [13]:
# with open('data/sp_tagger.pickle', 'wb') as f:
#     pickle.dump(sp_tagger, f)

In [14]:
# Restore the tagger that the commented-out training cells above would dump to
# this path.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced locally / come from a trusted source.
with open('data/sp_tagger.pickle', 'rb') as f:
    sp_tagger = pickle.load(f)

# Ingredients

In [15]:
def is_spanish_ingredients_file(filename):
    """Return True when `filename` starts with 'es_' and ends with '_ingredients.txt'."""
    has_prefix = filename.startswith('es_')
    has_suffix = filename.endswith('_ingredients.txt')
    return has_prefix and has_suffix

In [16]:
def add_node(g, n):
    """Add node `n` to graph `g` with count=0 and represent=True.

    Nodes already present are left untouched, so their attributes survive.
    """
    if n in g:
        return
    g.add_node(n, count=0, represent=True)

In [17]:
def add_edge(g, n1, n2):
    """Connect `n1` and `n2` in `g` and refresh the 'represent' flags.

    Nothing happens when the two nodes are equal or already connected by a
    path. Otherwise the edge is added and, within the resulting connected
    component, every node currently flagged 'represent' whose token list
    contains the token list of another flagged node as a contiguous run is
    flagged represent=False.

    Both nodes must already be in `g` (callers use add_node first).
    """
    if n1 == n2 or nx.has_path(g, n1, n2):
        return
    g.add_edge(n1, n2)

    component = nx.node_connected_component(g, n1)
    representatives = [n for n in component if g.node[n]['represent']]
    # Tokenize each candidate once instead of inside the quadratic pair loop.
    tokens = {r: nltk.word_tokenize(r) for r in representatives}
    for r1 in representatives:
        # The candidate list is fixed up front, so demoting a node here does
        # not change which nodes the remaining comparisons consider.
        if any(r1 != r2 and sublist(tokens[r2], tokens[r1]) for r2 in representatives):
            g.node[r1]['represent'] = False

In [18]:
def sublist(a, b):
    """Return True if list `a` appears in list `b` as a contiguous slice."""
    width = len(a)
    last_start = len(b) - width
    return any(b[start:start + width] == a for start in range(last_start + 1))

In [19]:
# Build the synonym graph. Each line of a Spanish ingredients file is split on
# ' / ', then ' o ', then ' - '; every resulting name becomes a node and all
# names from the same line are linked to one arbitrary member of the line.
graph_syn = nx.Graph()
ingredients_root = 'data/ingredients/'
for e in os.listdir(ingredients_root):
    file_path = ingredients_root + e
    if not (os.path.isfile(file_path) and is_spanish_ingredients_file(e)):
        continue
    with open(file_path) as f:
        for line in f:
            ingrs1 = line.strip()
            names = [
                name
                for ingrs2 in ingrs1.split(' / ')
                for ingrs3 in ingrs2.split(' o ')
                for name in ingrs3.split(' - ')
            ]
            syn_set = set()
            for ingr in names:
                syn_set.add(ingr)
                add_node(graph_syn, ingr)
            syn_set = list(syn_set)
            i1, others = syn_set[0], syn_set[1:]
            for i2 in others:
                add_edge(graph_syn, i1, i2)

In [20]:
len(graph_syn)

3327

In [21]:
len([n for n, d in graph_syn.nodes_iter(data=True) if d['represent']])

3319

In [22]:
nx.number_connected_components(graph_syn)

3222

In [23]:
nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_1.gexf')

In [24]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_1.gexf')

# POS tagging

In [25]:
# First letter of a (lower-cased) tag -> coarse category name.
tag_mapping = dict(
    a='adj',
    r='adv',
    d='det',
    n='noun',
    v='verb',
    p='pron',
    c='conj',
    i='interj',
    s='prep',
    f='punt',
    z='num',
    w='date-time',
)

def map_tag(eagle):
    """Return the coarse category for tag `eagle`, keyed on its first character.

    Raises KeyError when the first character is not in tag_mapping.
    """
    first_letter = eagle[0]
    return tag_mapping[first_letter]

def get_category(entry):
    """Return the coarse category of a lexicon entry (a dict with an 'eagle' tag).

    Verb tags whose third character is 'p' are reported as 'adj'
    (presumably participles - confirm against the tag set); every other tag
    goes through map_tag.
    """
    eagle = entry['eagle']
    verb_with_p = eagle[0] == 'v' and eagle[2] == 'p'
    return 'adj' if verb_with_p else map_tag(eagle)

def has_category(category, entries):
    """Return True if any entry in `entries` has coarse category `category`."""
    return any(get_category(entry) == category for entry in entries)

def is_number(x):
    """Return True if `x` is one of the Spanish number words 'dos' through 'nueve'."""
    number_words = ('dos', 'tres', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve')
    return x in number_words

def ingredient_tagger(x):
    """Return a list of (token, category) pairs for ingredient name `x`.

    A name that tokenizes to exactly one token is returned whole as a 'noun'.
    Otherwise each token is tagged by sp_tagger and then adjusted:
      * number words (see is_number) become 'num';
      * a tag from the tagger is mapped with map_tag; a 'verb' result is
        replaced by 'adj' or 'noun' when the lexicon lists the token as such;
      * a token the tagger could not tag is looked up in the lexicon, taking
        the first category found in a fixed preference order, and defaults to
        'noun' when the lexicon has no entry.
    """
    tokens = nltk.word_tokenize(x)
    if len(tokens) == 1:
        return [(x, 'noun')]

    # Preference order for tokens the statistical tagger left untagged.
    untagged_preference = ('adj', 'noun', 'verb', 'det', 'pron', 'prep', 'num')

    result = []
    for token, tag in sp_tagger.tag(tokens):
        if is_number(token):
            tag = 'num'
        elif tag:
            tag = map_tag(tag.lower())
            if tag == 'verb':
                # Prefer an adjective/noun reading from the lexicon over 'verb'.
                entries = list(db.es_lexicon.find({'flexion': token}))
                for categ in ('adj', 'noun'):
                    if has_category(categ, entries):
                        tag = categ
                        break
        else:
            entries = list(db.es_lexicon.find({'flexion': token}))
            if not entries:
                tag = 'noun'
            else:
                for categ in untagged_preference:
                    if has_category(categ, entries):
                        tag = categ
                        break
                else:
                    # None of the preferred categories: use the first entry's.
                    tag = get_category(entries[0])
        result.append((token, tag))
    return result

In [26]:
# %%time

# with open('data/spanish_ingredients_postags.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for ingr, dat in graph_syn.nodes_iter(data=True):
#         if dat['represent']:
#             pos_tag = ' '.join(tag for token, tag in ingredient_tagger(ingr))
#             row = [ingr, pos_tag]
#             writer.writerow(row)

# CPU times: user 6.56 s, sys: 356 ms, total: 6.91 s
# Wall time: 22min 14s

In [27]:
# Cached POS tags: column 0 is the ingredient, column 1 its space-separated tags.
postags = {}
with open('data/spanish_ingredients_postags.csv') as f:
    postags.update((row[0], row[1]) for row in csv.reader(f, delimiter=','))

In [28]:
def get_postags(x):
    """Return [(token, tag), ...] for ingredient `x`, using `postags` as a cache.

    On a cache miss the tags are computed with ingredient_tagger and stored in
    `postags` as a single space-separated string. The cached string and `x`
    are both re-tokenized and zipped together, so the result is cut to the
    shorter of the two token lists if they ever disagree.
    """
    # An explicit membership test replaces the former bare `except:`, which
    # would also have hidden unrelated errors (e.g. KeyboardInterrupt).
    if x not in postags:
        postags[x] = ' '.join(tag for token, tag in ingredient_tagger(x))
    tags = postags[x]
    return list(zip(nltk.word_tokenize(x), nltk.word_tokenize(tags)))

# Example
get_postags('pimientos verdes')

[('pimientos', 'noun'), ('verdes', 'adj')]

# apicultur synonyms

In [29]:
# Every token tagged 'noun' in any ingredient name of the graph.
nouns = {
    token
    for ingr in graph_syn.nodes_iter()
    for token, tag in get_postags(ingr)
    if tag == 'noun'
}

In [30]:
len(nouns)

1620

In [31]:
# with open('data/apicultur_ingredients_synonyms.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     base_url = 'https://store.apicultur.com/api/sinonimosporpalabra/1.0.0/'
#     headers = {'Authorization': 'Bearer ' + os.environ['APICULTUR_TOKEN']}  # token redacted: read it from the environment and revoke the one previously committed here
#     count = 0
#     for noun in nouns:
#         if noun in graph_syn:
#             url = base_url + noun
#             response = requests.get(url, headers=headers)
#             if response.text:
#                 js = response.json()
#                 row = [noun]
#                 for d in js:
#                     row.append(d['valor'])
#                 writer.writerow(row)
#             time.sleep(1)
#         count += 1
#         if count % 50 == 0:
#             time.sleep(10)

In [32]:
# Each CSV row is a noun followed by the synonyms recorded for it.
apicultur_syns = {}
with open('data/apicultur_ingredients_synonyms.csv') as f:
    apicultur_syns.update((row[0], row[1:]) for row in csv.reader(f, delimiter=','))

In [33]:
# Undirected graph with one edge per (noun, synonym) pair.
apicultur_graph = nx.Graph()
for k, syns in apicultur_syns.items():
    for syn in syns:
        apicultur_graph.add_edge(k, syn)

In [34]:
nx.number_connected_components(apicultur_graph)

104

In [35]:
def is_kn_complete(g):
    """Return True if every pair of distinct nodes in `g` is joined by an edge."""
    return all(
        g.has_edge(n1, n2)
        for n1 in g
        for n2 in g
        if n1 != n2
    )

In [36]:
# Keep only the connected components in which all members are mutually linked.
kn_complete_graphs = [
    subg
    for subg in nx.connected_component_subgraphs(apicultur_graph)
    if is_kn_complete(subg)
]

In [37]:
len(kn_complete_graphs)

34

In [38]:
# Merge each complete synonym set into graph_syn by linking every member to
# the set's first node; syns_found counts the add_edge calls made.
syns_found = 0
for g in kn_complete_graphs:
    syn_set = g.nodes()
    i1, others = syn_set[0], syn_set[1:]
    add_node(graph_syn, i1)
    for i2 in others:
        add_node(graph_syn, i2)
        add_edge(graph_syn, i1, i2)
        syns_found = syns_found + 1
syns_found

40

In [39]:
len(graph_syn)

3354

In [40]:
len([n for n, d in graph_syn.nodes_iter(data=True) if d['represent']])

3346

In [41]:
graph_syn.number_of_edges()

144

In [42]:
nx.number_connected_components(graph_syn)

3210

In [43]:
nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_2.gexf')

In [44]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_2.gexf')

# Singular and plural

In [45]:
def is_singular_word(word_tag):
    """Return True for an 'adj' or 'noun' (word, tag) pair whose word does not end in 's'."""
    word, tag = word_tag[0], word_tag[1]
    if tag not in ('adj', 'noun'):
        return False
    return not word.endswith('s')

def is_singular_ingredient(ingredient):
    """Return True if at least one tagged token of `ingredient` is a singular word."""
    return any(is_singular_word(word_tag) for word_tag in get_postags(ingredient))

def naive_pluralize(word):
    """Pluralize by suffix rule only.

    A final a/e/i/o/u gets 's', a final 'z' is replaced by 'ces', and any
    other ending gets 'es'. `word` must be non-empty.
    """
    last = word[-1]
    if last in 'aeiou':
        return word + 's'
    if last == 'z':
        return word[:-1] + 'ces'
    return word + 'es'

def _pluralize_via_lexicon(word, category, number_pos):
    """Look up the plural form of `word` in db.es_lexicon.

    Finds an entry for `word` whose tag starts with `category` and has 's' at
    index `number_pos`, rewrites that character to 'p', and returns the
    flexion stored for the same lemma under the rewritten tag. Returns `word`
    unchanged when either lookup finds nothing.
    """
    # Anchor the pattern: an unanchored $regex may match in the middle of a
    # tag from another category (e.g. 'a...s' matches 'vaip3s0'), after which
    # the tag rewrite below would touch the wrong character.
    pattern = '^' + category + '.' * (number_pos - 1) + 's'
    singular_entry = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': pattern}})
    if not singular_entry:
        return word
    tag = singular_entry['eagle']
    plural_tag = tag[:number_pos] + 'p' + tag[number_pos + 1:]
    plural_entry = db.es_lexicon.find_one({'lemma': singular_entry['lemma'], 'eagle': plural_tag})
    return plural_entry['flexion'] if plural_entry else word

def pluralize_adj(word):
    """Pluralize an adjective.

    Words without an acute-accented vowel use naive_pluralize; accented words
    are resolved through the lexicon and returned unchanged if not found.
    """
    if not set('áéíóú').intersection(word):
        return naive_pluralize(word)
    return _pluralize_via_lexicon(word, 'a', 4)

def pluralize_noun(word):
    """Pluralize a noun; same strategy as pluralize_adj with the noun tag layout."""
    if not set('áéíóú').intersection(word):
        return naive_pluralize(word)
    return _pluralize_via_lexicon(word, 'n', 3)

def pluralize_verb(word):
    """Return the plural verb form from the lexicon, or `word` if none is found."""
    return _pluralize_via_lexicon(word, 'v', 5)

def pluralize_det(word):
    """Return the plural determiner from the lexicon, or `word` if none is found."""
    return _pluralize_via_lexicon(word, 'd', 4)

def pluralize_word(word_tag):
    """Pluralize one (word, tag) pair and return the resulting word.

    Only purely alphabetic words tagged 'adj', 'noun', 'verb' or 'det' are
    changed; anything else is returned as is.
    """
    word, tag = word_tag[0], word_tag[1]
    if not word.isalpha():
        return word
    if tag == 'adj':
        return pluralize_adj(word)
    if tag == 'noun':
        return pluralize_noun(word)
    # The verb and det results used to be stored in an unused local named
    # `singular`, so those words always came back unpluralized.
    if tag == 'verb':
        return pluralize_verb(word)
    if tag == 'det':
        return pluralize_det(word)
    return word

def pluralize_ingredient(ingredient):
    """Pluralize every tagged token of `ingredient` and re-join them with spaces."""
    return ' '.join(pluralize_word(word_tag) for word_tag in get_postags(ingredient))

# Example
pluralize_ingredient('pimiento verde')

'pimientos verdes'

In [46]:
def is_plural_word(word_tag):
    """Return True for an 'adj' or 'noun' (word, tag) pair whose word ends in 's'."""
    word, tag = word_tag[0], word_tag[1]
    if tag not in ('adj', 'noun'):
        return False
    return word.endswith('s')

def is_plural_ingredient(ingredient):
    """Return True if at least one tagged token of `ingredient` is a plural word."""
    return any(is_plural_word(word_tag) for word_tag in get_postags(ingredient))

def naive_singularize(word):
    """Drop the last character of `word` (the trailing 's' of a regular plural)."""
    without_last = word[0:-1]
    return without_last

def _singularize_via_lexicon(word, category, number_pos):
    """Look up the singular form of `word` in db.es_lexicon.

    Finds an entry for `word` whose tag starts with `category` and has 'p' at
    index `number_pos`, rewrites that character to 's', and returns the
    flexion stored for the same lemma under the rewritten tag. Returns `word`
    unchanged when either lookup finds nothing.
    """
    # Anchor the pattern: an unanchored $regex may match in the middle of a
    # tag from another category (e.g. 'a...p' matches 'vaip3p0'), after which
    # the tag rewrite below would touch the wrong character.
    pattern = '^' + category + '.' * (number_pos - 1) + 'p'
    plural_entry = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': pattern}})
    if not plural_entry:
        return word
    tag = plural_entry['eagle']
    singular_tag = tag[:number_pos] + 's' + tag[number_pos + 1:]
    singular_entry = db.es_lexicon.find_one({'lemma': plural_entry['lemma'], 'eagle': singular_tag})
    return singular_entry['flexion'] if singular_entry else word

def singularize_adj(word):
    """Singularize an adjective.

    Unaccented words whose second-to-last character is a/i/o/u just lose
    their final character; everything else is resolved through the lexicon
    and returned unchanged if not found.
    """
    if not set('áéíóú').intersection(word) and len(word) > 1 and word[-2] in 'aiou':
        return naive_singularize(word)
    return _singularize_via_lexicon(word, 'a', 4)

def singularize_noun(word):
    """Singularize a noun; same strategy as singularize_adj with the noun tag layout."""
    if not set('áéíóú').intersection(word) and len(word) > 1 and word[-2] in 'aiou':
        return naive_singularize(word)
    return _singularize_via_lexicon(word, 'n', 3)

def singularize_verb(word):
    """Return the singular verb form from the lexicon, or `word` if none is found."""
    return _singularize_via_lexicon(word, 'v', 5)

def singularize_det(word):
    """Return the singular determiner from the lexicon, or `word` if none is found."""
    return _singularize_via_lexicon(word, 'd', 4)

def singularize_word(word_tag):
    """Singularize one (word, tag) pair and return the resulting word.

    Only purely alphabetic words tagged 'adj', 'noun', 'verb' or 'det' are
    changed; anything else is returned as is.
    """
    word, tag = word_tag[0], word_tag[1]
    if not word.isalpha():
        return word
    if tag == 'adj':
        return singularize_adj(word)
    if tag == 'noun':
        return singularize_noun(word)
    if tag == 'verb':
        return singularize_verb(word)
    if tag == 'det':
        return singularize_det(word)
    return word

def singularize_ingredient(ingredient):
    """Singularize every tagged token of `ingredient` and re-join them with spaces."""
    return ' '.join(singularize_word(word_tag) for word_tag in get_postags(ingredient))

# Example
singularize_ingredient('pimientos verdes')

'pimiento verde'

In [48]:
# %%time

# for ingr, dat in graph_syn.nodes(data=True):
#     if dat['represent']:
#         if is_singular_ingredient(ingr):
#             plural = pluralize_ingredient(ingr)
#             add_node(graph_syn, plural)
#             add_edge(graph_syn, ingr, plural)
#         if is_plural_ingredient(ingr):
#             singular = singularize_ingredient(ingr)
#             add_node(graph_syn, singular)
#             add_edge(graph_syn, ingr, singular)

# CPU times: user 8.66 s, sys: 120 ms, total: 8.78 s
# Wall time: 8min 5s

In [50]:
# len(graph_syn)

# 6807

In [52]:
# len([n for n, d in graph_syn.nodes_iter(data=True) if d['represent']])

# 6796

In [54]:
# graph_syn.number_of_edges()

# 3695

In [55]:
# nx.number_connected_components(graph_syn)

# 3112

3112

In [56]:
nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_3.gexf')

In [100]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_3.gexf')

# Normalization

In [58]:
# Numbers
def numbers(x):
    """Spell out the digits 1-9 as Spanish words.

    Only digits with a space on both sides are replaced, so a digit at the
    very start or end of `x` is left alone.
    """
    words = ('uno', 'dos', 'tres', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve')
    for digit, word in enumerate(words, start=1):
        x = x.replace(' %d ' % digit, ' %s ' % word)
    return x

# Accent marks on vowels - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def accent_marks(x):
    """Replace each of the accented vowels á ã è é ê í ò ó ō ú ü with its plain vowel."""
    table = str.maketrans('áãèéêíòóōúü', 'aaeeeiooouu')
    return x.translate(table)

# Non-ascii consonants - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def nonascii_consonants(x):
    """Replace 'ç' with 'c' and 'ñ' with 'n'."""
    table = {ord('ç'): 'c', ord('ñ'): 'n'}
    return x.translate(table)
    
# Dashes (-)
def dashes1(x):
    """Turn every '-' in `x` into a space."""
    return ' '.join(x.split('-'))

def dashes2(x):
    """Delete every '-' from `x`."""
    return ''.join(x.split('-'))

# POS tags
# ADJETIVOS .... A ADJ ...... X
# ADVERBIOS .... R ADV
# DETERMINANTES  D DET
# NOMBRES ...... N NOUN ..... X
# VERBOS ....... V VERB ..... X
# PRONOMBRES ... P PRON
# CONJUNCIONES . C CONJ
# INTERJECCIONES I INTERJ
# PREPOSICIONES  S PREP
# PUNTUACIÓN ... F PUNTUATION
# NUMERALES .... Z NUM ...... X
# FECHAS Y HORAS W DATE-TIME
def pos_tags(x):
    """Keep only the tokens of `x` tagged 'adj', 'noun', 'verb' or 'num', space-joined."""
    kept_tags = ['adj', 'noun', 'verb', 'num']
    return ' '.join(token for token, tag in get_postags(x) if tag in kept_tags)

def itself(x):
    """Identity normalizer: return `x` unchanged."""
    return x

funcs = [itself, pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2]
# Every non-empty subset of funcs as a tuple, in list order, smallest subsets first.
combinations = [
    subset
    for size in range(1, len(funcs) + 1)
    for subset in itertools.combinations(funcs, size)
]

# def normalize(ingredient): # original time consuming version
#     result = set()
#     for c in combinations:
#         x = ingredient
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(ingredient): # dynamic programming version
    """Return the set of normalized variants of `ingredient`.

    Each tuple in the module-level `combinations` is applied to `ingredient`
    as a pipeline, left to right, and every final string is collected.
    Intermediate results are memoized in the module-level mapping `d`, keyed
    first by function name and then by input string.
    """
    result = set()
    for c in combinations:
        x = ingredient
        for f in c:
            # setdefault keeps this working when `d` is a plain dict (the
            # notebook pickles `dict(d)`) that has no entry for this function
            # yet; indexing d[f.__name__] directly raised KeyError there.
            memo = d.setdefault(f.__name__, {})
            if x not in memo:
                memo[x] = f(x)
            x = memo[x]
        result.add(x)
    return result

In [59]:
len([list(map(lambda x: x.__name__, c)) for c in combinations])

127

In [60]:
# d = defaultdict(dict)

# or

# Restore the memo table `d` that normalize() reads and updates.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced locally / come from a trusted source.
with open('data/spanish_ingredients_normalization.pickle', 'rb') as f:
    d = pickle.load(f)

In [104]:
# %%time
# c=0
# for ingr, dat in graph_syn.nodes(data=True):
#     if len(nltk.word_tokenize(ingr)) < 4:
#         if dat['represent']:
#             norms = normalize(ingr)
#             for norm in norms:
#                 add_node(graph_syn, norm)
#                 add_edge(graph_syn, ingr, norm)
#     c+=1
#     if c%100==0:
#         print(c)

# d = dict(d)

# CPU times: user 12.4 s, sys: 324 ms, total: 12.8 s
# Wall time: 18min 55s

In [102]:
with open('data/spanish_ingredients_normalization.pickle', 'wb') as f:
    pickle.dump(d, f)

In [105]:
# len(graph_syn)

# 10899

10899

In [106]:
# len([n for n, d in graph_syn.nodes_iter(data=True) if d['represent']])

# 10867

10867

In [107]:
# graph_syn.number_of_edges()

# 7813

7813

In [108]:
# nx.number_connected_components(graph_syn)

# 3086

3086

In [109]:
# nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_4.gexf')

In [125]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_4.gexf')

# Synonyms

In [69]:
def my_ngrams(ingredient):
    """Return every n-gram of `ingredient` as a space-joined string.

    All unigrams come first, then all bigrams, and so on up to the full
    token sequence.
    """
    tokens = nltk.word_tokenize(ingredient)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [70]:
# Histogram: number of tokens in a node name -> how many nodes have that length.
lengths = defaultdict(int)
for ingr in graph_syn.nodes_iter():
    n_tokens = len(nltk.word_tokenize(ingr))
    lengths[n_tokens] = lengths[n_tokens] + 1
lengths = dict(lengths)

In [71]:
lengths

{1: 2103,
 2: 4502,
 3: 4467,
 4: 1210,
 5: 578,
 6: 205,
 7: 51,
 8: 25,
 9: 23,
 10: 8}

In [None]:
# representative_syns_dict = {}

# or

# Restore the memo of representative_syns() results from an earlier run.
# NOTE(review): these cached sets are not checked against the graph currently
# loaded in graph_syn - confirm the pickle matches the graph before relying on it.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced locally / come from a trusted source.
with open('data/spanish_representative_ingredients.pickle', 'rb') as f:
    representative_syns_dict = pickle.load(f)

In [72]:
def representative_syns(ingredient):
    """Return the 'represent'-flagged nodes in `ingredient`'s connected component.

    Results are memoized in the module-level `representative_syns_dict`; a
    cached set is returned as is and is not refreshed when graph_syn changes
    afterwards.
    """
    # An explicit membership test replaces the former bare `except:`, which
    # would also have hidden unrelated errors (e.g. KeyboardInterrupt).
    if ingredient in representative_syns_dict:
        return representative_syns_dict[ingredient]
    component = nx.node_connected_component(graph_syn, ingredient)
    result = {syn for syn in component if graph_syn.node[syn]['represent']}
    representative_syns_dict[ingredient] = result
    return result

representative_syns('salsa de tomate')

{'salsa de tomate', 'salsa tomate', 'salsas de tomates', 'salsas tomates'}

In [73]:
def ngram_combinations(ingredient):
    """Return every way to cut `ingredient` into consecutive n-grams.

    Each result is a list of space-joined n-grams whose concatenation (with
    single spaces) equals `ingredient`. Results are ordered by number of
    pieces, then by earliest cut position. An input whose tokens do not
    re-join to the original string yields an empty list.

    The previous version filtered every permutation of every n-gram, which
    grows factorially with the token count; this enumerates the 2**(n-1) cut
    sets directly. For names with repeated tokens the old code returned the
    same segmentation several times; each one now appears once.
    """
    tokens = nltk.word_tokenize(ingredient)
    if ' '.join(tokens) != ingredient:
        return []
    n = len(tokens)
    combs = []
    for n_cuts in range(n):
        # itertools.combinations is spelled out in full because the notebook
        # rebinds the bare name `combinations` to a list of normalizers.
        for cuts in itertools.combinations(range(1, n), n_cuts):
            bounds = (0,) + cuts + (n,)
            combs.append([' '.join(tokens[a:b]) for a, b in zip(bounds, bounds[1:])])
    return combs

# Example
ngram_combinations('salsa de tomate')

[['salsa de tomate'],
 ['salsa', 'de tomate'],
 ['salsa de', 'tomate'],
 ['salsa', 'de', 'tomate']]

In [74]:
def comb_syns(expr, syn_dict):
    """Return the distinct phrases obtained by swapping n-grams of `expr` for synonyms.

    For each n-gram segmentation of `expr`, every piece is replaced by each
    of its alternatives in `syn_dict` (which must have a key for every
    n-gram) and the pieces are space-joined. The order of the returned list
    is unspecified.
    """
    res = {
        ' '.join(choice)
        for ngrms in ngram_combinations(expr)
        for choice in product(*[syn_dict[ngrm] for ngrm in ngrms])
    }
    return list(res)

# Example
expr= 'salsa de tomate'
syn_dict = {
    'salsa': ['salsa', 'salsas'],
    'de': ['de'],
    'tomate': ['tomate', 'tomates'],
    'salsa de': ['salsa de'],
    'de tomate': ['de tomate', 'tomatil'],
    'salsa de tomate': ['salsa de tomate'],
}
comb_syns(expr, syn_dict)

['salsa de tomates',
 'salsa de tomate',
 'salsas de tomate',
 'salsas de tomates',
 'salsas tomatil',
 'salsa tomatil']

In [122]:
# Scratch check of the synonym expansion for a single ingredient.
# NOTE(review): this cell only runs after later cells have executed: it needs
# create_syn_dict (defined further down) and `syns1`, which is only bound as
# the loop variable of the expansion loop below.
ingr='puré de erizo'
ngrms = my_ngrams(ingr)
syn_dict = create_syn_dict(ngrms)
for ngrm in ngrms:
    # Only pull in synonyms for n-grams that are graph nodes outside syns1.
    if ngrm in graph_syn and ngrm not in syns1:
        syns2 = representative_syns(ngrm)
        syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
syn_combs = comb_syns(ingr, syn_dict)

In [124]:
syn_dict

{'de': {'de'},
 'de erizo': {'de erizo'},
 'erizo': {'cerdo espin',
  'cerdo espines',
  'cerdo espín',
  'cerdos espin',
  'cerdos espines',
  'cerdos espín',
  'chancho espin',
  'chancho espines',
  'chancho espín',
  'chanchos espin',
  'chanchos espines',
  'chanchos espín',
  'erizo',
  'erizos',
  'puerco espin',
  'puerco espines',
  'puerco espín',
  'puercos espin',
  'puercos espines',
  'puercos espín'},
 'puré': {'pure', 'pures', 'puré', 'purés'},
 'puré de': {'puré de'},
 'puré de erizo': {'puré de erizo'}}

In [123]:
syn_combs

['purés de puerco espín',
 'pures de cerdos espín',
 'purés de puercos espin',
 'puré de cerdo espin',
 'purés de cerdo espín',
 'pures de cerdo espín',
 'purés de cerdo espines',
 'pure de chancho espines',
 'pures de chanchos espín',
 'puré de erizos',
 'puré de puerco espines',
 'pure de erizos',
 'pures de cerdo espin',
 'purés de puerco espines',
 'purés de puerco espin',
 'purés de chanchos espín',
 'purés de cerdo espin',
 'pure de chanchos espin',
 'puré de puercos espin',
 'purés de erizo',
 'puré de puerco espín',
 'pure de chancho espín',
 'purés de erizos',
 'pure de puercos espín',
 'pures de cerdos espines',
 'pure de cerdos espín',
 'puré de chanchos espin',
 'pure de erizo',
 'puré de cerdo espines',
 'pure de chancho espin',
 'pure de puerco espín',
 'pures de puercos espín',
 'pures de chanchos espin',
 'pures de puercos espin',
 'puré de chancho espines',
 'purés de puercos espín',
 'purés de chanchos espin',
 'pure de puerco espin',
 'pures de puercos espines',
 'pu

In [75]:
def create_syn_dict(ngrms):
    """Map each n-gram to a fresh one-element set containing just itself."""
    return {ngrm: {ngrm} for ngrm in ngrms}

# Example
create_syn_dict(my_ngrams('salsa de tomate'))

{'de': {'de'},
 'de tomate': {'de tomate'},
 'salsa': {'salsa'},
 'salsa de': {'salsa de'},
 'salsa de tomate': {'salsa de tomate'},
 'tomate': {'tomate'}}

In [96]:
# for ingr in ['infusión piña verde pino']:
#     if 1 < len(nltk.word_tokenize(ingr)) < 4:
#         if graph_syn.node[ingr]['represent']:
#             ngrms = my_ngrams(ingr)
#             syn_dict = create_syn_dict(ngrms)
#             for ngrm in ngrms:
#                 if ngrm in graph_syn and ngrm not in syns1:
#                     syns2 = representative_syns(ngrm)
#                     syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
#             syn_combs = comb_syns(ingr, syn_dict)
#             for syn_ingr in syn_combs:
#                 add_node(graph_syn, syn_ingr)
#                 add_edge(graph_syn, ingr, syn_ingr)
#                 print('Adding edge', ingr, syn_ingr)

In [127]:
%%time

# Expand the graph with compositional synonyms: for every representative
# ingredient of 2 or 3 tokens, substitute each of its n-grams by that n-gram's
# representative synonyms and link the ingredient to every resulting phrase.
c=0

# Snapshot the components up front: the loop body adds nodes and edges to
# graph_syn, so `syns1` reflects the graph as it was before this cell ran.
ccs = list(nx.connected_components(graph_syn))
for syns1 in ccs:
    for ingr in syns1:
        if 1 < len(nltk.word_tokenize(ingr)) < 4:
            if graph_syn.node[ingr]['represent']:
                ngrms = my_ngrams(ingr)
                syn_dict = create_syn_dict(ngrms)
                for ngrm in ngrms:
                    # Skip n-grams that are not graph nodes or that already
                    # belong to this ingredient's own component.
                    if ngrm in graph_syn and ngrm not in syns1:
                        # NOTE(review): representative_syns is memoized in
                        # representative_syns_dict, so entries cached earlier
                        # (or loaded from the pickle) may not match the graph
                        # as it is being modified here - confirm.
                        syns2 = representative_syns(ngrm)
                        syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
                syn_combs = comb_syns(ingr, syn_dict)
                for syn_ingr in syn_combs:
                    add_node(graph_syn, syn_ingr)
                    add_edge(graph_syn, ingr, syn_ingr)
        # Progress counter: print after every 100 ingredients visited.
        c+=1
        if c%100==0:
            print(c)

# CPU times: user 50min 25s, sys: 3min 47s, total: 54min 12s
# Wall time: 54min 10s

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
CPU times: user 28min 20s, sys: 292 ms, total: 28min 20s
Wall time: 28min 19s


In [128]:
with open('data/spanish_representative_ingredients.pickle', 'wb') as f:
    pickle.dump(representative_syns_dict, f)

In [112]:
ingr

'pures erizos'

In [119]:
nx.node_connected_component(graph_syn, 'erizo')

{'erizo', 'erizos', 'puerco espin', 'puerco espín', 'puercos espines'}

In [115]:
representative_syns('pures erizo')

{'pure cerdo espin',
 'pure cerdo espines',
 'pure cerdo espín',
 'pure cerdos espin',
 'pure cerdos espines',
 'pure cerdos espín',
 'pure chancho espines',
 'pure chancho espín',
 'pure chanchos espin',
 'pure chanchos espín',
 'pure de cerdo espin',
 'pure de cerdo espines',
 'pure de cerdo espín',
 'pure de cerdos espin',
 'pure de cerdos espines',
 'pure de cerdos espín',
 'pure de chancho espin',
 'pure de chancho espines',
 'pure de chancho espín',
 'pure de chanchos espin',
 'pure de chanchos espines',
 'pure de chanchos espín',
 'pure de erizo',
 'pure de erizos',
 'pure de puerco espin',
 'pure de puerco espines',
 'pure de puerco espín',
 'pure de puercos espin',
 'pure de puercos espines',
 'pure de puercos espín',
 'pure erizo',
 'pure erizos',
 'pure puerco espines',
 'pure puerco espín',
 'pure puercos espines',
 'pure puercos espín',
 'pures cerdo espin',
 'pures cerdo espines',
 'pures cerdos espin',
 'pures chancho espines',
 'pures chancho espín',
 'pures chanchos es

In [88]:
qqq=list(representative_syns('infusión piña verde pino'))
for i in range(len(qqq)-1):
    for j in range(i+1,len(qqq)):
        if sublist(qqq[i].split(), qqq[j].split()):
            print(qqq[i], '<', qqq[j])

In [89]:
representative_syns('infusión piña verde pino')

{'infusion anana verde pino',
 'infusion anana verde pinos',
 'infusion anana verdes pino',
 'infusion anana verdes pinos',
 'infusion ananas verde pino',
 'infusion ananas verde pinos',
 'infusion ananas verdes pino',
 'infusion ananas verdes pinos',
 'infusion ananá verde pino',
 'infusion ananá verde pinos',
 'infusion ananá verdes pino',
 'infusion ananá verdes pinos',
 'infusion ananás verde pino',
 'infusion ananás verde pinos',
 'infusion ananás verdes pino',
 'infusion ananás verdes pinos',
 'infusion de pina verde de pino',
 'infusion de piña verde de pino',
 'infusion pina verde de pino',
 'infusion pina verde pino',
 'infusion pina verde pinos',
 'infusion pina verdes pino',
 'infusion pina verdes pinos',
 'infusion pinas verde pino',
 'infusion pinas verde pinos',
 'infusion pinas verdes de pinos',
 'infusion pinas verdes pino',
 'infusion pinas verdes pinos',
 'infusion piña verde de pino',
 'infusion piña verde pino',
 'infusion piña verde pinos',
 'infusion piña verdes p

In [64]:
#OLD

# %%time

# c=0
# for ingr in list(graph_syn.nodes()):
#     if 1 < len(nltk.word_tokenize(ingr)) < 5:
#         syns1 = nx.node_connected_component(graph_syn, ingr)
#         ngrms = my_ngrams(ingr)
#         syn_dict = create_syn_dict(ngrms)
#         for ngrm in ngrms:
#             if ngrm in graph_syn and ngrm not in syns1:
#                 syns2 = minimal_syns(ngrm)
#                 syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
#         syn_combs = comb_syns(ingr, syn_dict)
#         for syn_ingr in syn_combs:
#             add_node(graph_syn, syn_ingr)
#             add_edge(graph_syn, ingr, syn_ingr)
#     c+=1
#     if c%100==0:
#         print(c)
# # CPU times: user 50min 25s, sys: 3min 47s, total: 54min 12s'
# # Wall time: 54min 10s

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100


KeyboardInterrupt: 

In [129]:
# len(graph_syn)

# 21713

21713

In [130]:
graph_syn.number_of_edges()

# 18635

18635

In [131]:
# nx.number_connected_components(graph_syn)

# 2990

3078

In [132]:
nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_5.gexf')

In [None]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_5.gexf')