In [37]:
import csv
import itertools
import json
import os
import time
from collections import defaultdict
from itertools import product
from itertools import permutations

import networkx as nx
import nltk
import requests
from nltk.corpus import cess_esp
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from pymongo import MongoClient

# Ingredients

In [17]:
def is_english_ingredients_file(filename):
    """Tell whether ``filename`` looks like ``en_*_ingredients.txt``.

    Only the name is inspected; the file itself is never touched.
    """
    has_english_prefix = filename.startswith('en_')
    has_ingredients_suffix = filename.endswith('_ingredients.txt')
    return has_english_prefix and has_ingredients_suffix

In [18]:
def add_node(g, n):
    """Insert node ``n`` into graph ``g`` with ``count=0`` unless it already exists.

    An existing node (and whatever attributes it carries) is left untouched.
    """
    if n in g:
        return
    g.add_node(n, count=0)

In [19]:
def add_edge(g, n1, n2):
    """Connect ``n1`` and ``n2`` in ``g`` only if they are not connected yet.

    Nothing happens when both arguments are the same node, or when
    ``nx.has_path`` already finds a path between them.
    """
    if n1 == n2:
        return
    if nx.has_path(g, n1, n2):
        return
    g.add_edge(n1, n2)

In [20]:
# Build the initial synonym graph from every English ingredients file.
graph_syn = nx.Graph()
ingredients_root = 'data/ingredients/'
for entry in os.listdir(ingredients_root):
    file_path = ingredients_root + entry
    if not os.path.isfile(file_path) or not is_english_ingredients_file(entry):
        continue
    with open(file_path) as f:
        for line in f:
            # One line holds one group of alternatives separated by ' or '.
            names = line.strip().split(' or ')
            for name in names:
                add_node(graph_syn, name)
            # Link every distinct alternative to a single anchor name; which
            # name becomes the anchor follows set iteration order.
            anchor, *others = set(names)
            for other in others:
                add_edge(graph_syn, anchor, other)

In [21]:
len(graph_syn)

4417

In [22]:
graph_syn.number_of_edges()

10

In [23]:
nx.number_connected_components(graph_syn)

4407

In [24]:
nx.write_gexf(graph_syn, 'data/english_ingredients_lexicon_1.gexf')

# Lexicon

In [26]:
# No English lexicon

# POS tagging

In [31]:
def pos_tagger(tokens):
    """Tag ``tokens`` with ``nltk.pos_tag`` using the ``'universal'`` tagset."""
    tagged = nltk.pos_tag(tokens, tagset='universal')
    return tagged

In [32]:
def is_number(x):
    """Return True when ``x`` is 'a' or one of the words 'one' through 'nine'."""
    number_words = (
        'a', 'one', 'two', 'three', 'four',
        'five', 'six', 'seven', 'eight', 'nine',
    )
    return x in number_words

def ingredient_tagger_first(x):
    """Tokenize ``x`` and pair each token with a lower-cased POS tag.

    A string that tokenizes to exactly one token is returned whole and
    labelled 'noun' without calling the tagger; anything else goes through
    ``pos_tagger``.
    """
    tokens = nltk.word_tokenize(x)
    if len(tokens) == 1:
        return [(x, 'noun')]
    return [(token, tag.lower()) for token, tag in pos_tagger(tokens)]

In [33]:
# with open('data/english_ingredients_postags.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for ingredient in graph_syn.nodes_iter():
#         pos_tag = ' '.join(tag for token, tag in ingredient_tagger_first(ingredient))
#         row = [ingredient, pos_tag]
#         writer.writerow(row)

In [34]:
# Load the cached ingredient -> POS-tag-string table (column 0 -> column 1).
postags = {}
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open('data/english_ingredients_postags.csv', newline='') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            continue
        postags[row[0]] = row[1]

In [35]:
def ingredient_tagger(x):
    """Return ``(token, tag)`` pairs for the ingredient string ``x``.

    The space-separated tag string is taken from the ``postags`` cache when
    ``x`` is a key there; otherwise it is computed with
    ``ingredient_tagger_first``. Tokens and tags are then paired positionally
    with ``zip``, so any surplus on either side is dropped.
    """
    try:
        tags = postags[x]
    except KeyError:
        # Only a cache miss triggers the fallback; any other error
        # (e.g. an unhashable argument) is a real problem and propagates.
        tags = ' '.join(tag for _token, tag in ingredient_tagger_first(x))
    return list(zip(nltk.word_tokenize(x), nltk.word_tokenize(tags)))

# Lemmatization

In [41]:
wordnet_lemmatizer = WordNetLemmatizer()

def singularize_first(x):
    """Lemmatize every token of ``x`` and join the lemmas with single spaces.

    Tokens come from ``nltk.word_tokenize``; each one is passed to the
    module-level ``wordnet_lemmatizer``.
    """
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(x)
    )

In [46]:
# with open('data/english_ingredients_lemmas.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for ingredient in graph_syn.nodes_iter():
#         lemma = singularize_first(ingredient)
#         writer.writerow([ingredient, lemma])

In [47]:
# Load the cached ingredient -> lemma table (column 0 -> column 1).
lemmas = {}
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open('data/english_ingredients_lemmas.csv', newline='') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            continue
        lemmas[row[0]] = row[1]

In [48]:
def lemmatize(x):
    """Return the lemma of ``x``.

    Uses the ``lemmas`` lookup table when ``x`` is a key there and falls back
    to ``singularize_first`` otherwise.
    """
    return lemmas[x] if x in lemmas else singularize_first(x)

# wordnet synonyms

In [49]:
graph_syn = nx.read_gexf('data/english_ingredients_lexicon_1.gexf')

In [50]:
len(graph_syn)

4417

In [51]:
graph_syn.number_of_edges()

10

In [52]:
nx.number_connected_components(graph_syn)

4407

In [53]:
# Collect every token tagged 'noun' across all ingredient names in the graph.
nouns = set()
# Iterating the graph yields its nodes; Graph.nodes_iter() no longer exists
# in networkx >= 2.0, while plain iteration works on every release.
for ingr in graph_syn:
    tags = ingredient_tagger(ingr)
    for token, tag in tags:
        if tag == 'noun':
            nouns.add(token)

In [54]:
len(nouns)

2388

In [34]:
# with open('data/apicultur_ingredients_synonyms.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     base_url = 'https://store.apicultur.com/api/sinonimosporpalabra/1.0.0/'
#     headers = {'Authorization': 'Bearer ' + os.environ['APICULTUR_TOKEN']}  # token redacted: never commit credentials
#     count = 0
#     for noun in nouns:
#         if noun in graph_syn:
#             url = base_url + noun
#             response = requests.get(url, headers=headers)
#             if response.text:
#                 js = response.json()
#                 row = [noun]
#                 for d in js:
#                     row.append(d['valor'])
#                 writer.writerow(row)
#             time.sleep(1)
#         count += 1
#         if count % 50 == 0:
#             time.sleep(10)

In [35]:
# Load the cached synonym table: column 0 is the word, the remaining columns
# are its synonyms.
apicultur_syns = {}
# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open('data/apicultur_ingredients_synonyms.csv', newline='') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; row[0] would raise IndexError.
            continue
        apicultur_syns[row[0]] = row[1:]

In [36]:
def synonyms(x):
    """Return the synonym list stored for ``x`` in ``apicultur_syns``.

    A word with no entry yields a fresh empty list.
    """
    return apicultur_syns[x] if x in apicultur_syns else []

In [37]:
# Link each noun to every one of its synonyms that is already a graph node.
# syns_found counts the (noun, synonym) pairs considered, whether or not
# add_edge actually inserted a new edge.
syns_found = 0
for noun in nouns:
    if noun not in graph_syn:
        # `nouns` holds single tokens, which are not necessarily nodes;
        # add_edge -> nx.has_path raises for a node missing from the graph.
        continue
    for syn in synonyms(noun):
        if syn in graph_syn:
            add_edge(graph_syn, noun, syn)
            syns_found += 1
syns_found

290

In [38]:
len(graph_syn)

3322

In [39]:
graph_syn.number_of_edges()

190

In [40]:
nx.number_connected_components(graph_syn)

3132

In [41]:
nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_2.gexf')

# Normalization

In [42]:
# Numbers
def numbers(x):
    """Spell out the digits 1-9 in ``x`` as Spanish number words.

    Only a digit with a space on both sides is replaced; the substitutions are
    applied one after another, from 1 up to 9.
    """
    substitutions = (
        (' 1 ', ' uno '),
        (' 2 ', ' dos '),
        (' 3 ', ' tres '),
        (' 4 ', ' cuatro '),
        (' 5 ', ' cinco '),
        (' 6 ', ' seis '),
        (' 7 ', ' siete '),
        (' 8 ', ' ocho '),
        (' 9 ', ' nueve '),
    )
    for digit, word in substitutions:
        x = x.replace(digit, word)
    return x

# Accent marks on vowels - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def accent_marks(x):
    """Replace the accented vowels listed below with their plain ASCII vowel.

    Characters outside the table (including 'ñ' and 'ç') pass through
    unchanged.
    """
    plain_vowel = {
        'á': 'a',
        'ã': 'a',
        'è': 'e',
        'é': 'e',
        'ê': 'e',
        'í': 'i',
        'ò': 'o',
        'ó': 'o',
        'ō': 'o',
        'ú': 'u',
        'ü': 'u',
    }
    return x.translate(str.maketrans(plain_vowel))

# Non-ascii consonants - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def nonascii_consonants(x):
    """Replace 'ç' with 'c' and 'ñ' with 'n'; everything else is kept."""
    ascii_consonant = {'ç': 'c', 'ñ': 'n'}
    return x.translate(str.maketrans(ascii_consonant))
    
# Dashes (-)
def dashes1(x):
    """Delete every '-' from ``x``, gluing the neighbouring parts together."""
    return ''.join(x.split('-'))

def dashes2(x):
    """Turn every '-' in ``x`` into a single space."""
    return ' '.join(x.split('-'))

# POS tags (X marks the categories that pos_tags keeps)
# ADJECTIVES .... A ADJ ...... X
# ADVERBS ....... R ADV
# DETERMINERS ... D DET
# NOUNS ......... N NOUN ..... X
# VERBS ......... V VERB ..... X
# PRONOUNS ...... P PRON
# CONJUNCTIONS .. C CONJ
# INTERJECTIONS . I INTERJ
# PREPOSITIONS .. S PREP
# PUNCTUATION ... F PUNCTUATION
# NUMERALS ...... Z NUM ...... X
# DATES AND TIMES W DATE-TIME
def pos_tags(x):
    """Keep only the tokens of ``x`` tagged num, verb, adj or noun.

    Tags come from ``ingredient_tagger``; the surviving tokens are joined
    with single spaces in their original order.
    """
    kept_tags = ('num', 'verb', 'adj', 'noun')
    return ' '.join(
        token
        for token, tag in ingredient_tagger(x)
        if tag in kept_tags
    )

def singular(x):
    """Normalization step: return ``lemmatize(x)``."""
    lemma = lemmatize(x)
    return lemma

def itself(x):
    """Identity normalization step: hand ``x`` back unchanged."""
    return x

# Every normalization step, in the order in which steps are chained.
funcs = [itself, singular, pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2]
# All non-empty subsets of `funcs` as tuples (order within a tuple follows
# `funcs`), grouped by size from 1 up to len(funcs).
combinations = [
    pipeline
    for size in range(1, len(funcs) + 1)
    for pipeline in itertools.combinations(funcs, size)
]

# def normalize(ingredient): # time consuming
#     result = set()
#     for c in combinations:
#         x = ingredient
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(ingredient):
    """Return the set of strings obtained by running ``ingredient`` through
    every pipeline in ``combinations``.

    Results are memoized per (function, input) pair for the duration of one
    call, so a step shared by several pipelines is evaluated only once for a
    given input.
    """
    cache = {func: {} for func in funcs}
    variants = set()
    for pipeline in combinations:
        value = ingredient
        for step in pipeline:
            memo = cache[step]
            if value not in memo:
                memo[value] = step(value)
            value = memo[value]
        variants.add(value)
    return variants

In [43]:
len([list(map(lambda x: x.__name__, c)) for c in combinations])

255

In [44]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_2.gexf')

In [45]:
len(graph_syn)

3322

In [46]:
graph_syn.number_of_edges()

190

In [47]:
nx.number_connected_components(graph_syn)

3132

In [48]:
# for ingr in graph_syn.nodes():
#     norms = normalize(ingr)
#     for norm in norms:
#         add_node(graph_syn, norm)
#         add_edge(graph_syn, ingr, norm)

In [49]:
# len(graph_syn)
# 6996

In [50]:
# graph_syn.number_of_edges()
# 3964

In [51]:
# nx.number_connected_components(graph_syn)
# 3032

In [52]:
# nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_3.gexf')

In [53]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_3.gexf')

# Synonyms

In [54]:
def my_ngrams(ingredient):
    """List every contiguous n-gram of ``ingredient`` as a space-joined string.

    N-grams are ordered by size (1 up to the number of tokens) and, within a
    size, by starting position.
    """
    tokens = nltk.word_tokenize(ingredient)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [55]:
# Histogram: number of tokens in an ingredient name -> how many names have it.
lengths = defaultdict(int)
# Iterating the graph yields its nodes; Graph.nodes_iter() no longer exists
# in networkx >= 2.0, while plain iteration works on every release.
for ingr in graph_syn:
    lengths[len(nltk.word_tokenize(ingr))] += 1
lengths = dict(lengths)

In [56]:
lengths

{1: 1209, 2: 2392, 3: 2312, 4: 629, 5: 289, 6: 109, 7: 28, 8: 13, 9: 11, 10: 4}

In [57]:
def minimal_syns(ingredient):
    """Return the "minimal" members of ``ingredient``'s connected component.

    A member of the component (as computed on ``graph_syn``) is dropped when
    some *other* member appears as one of its tokens; the remaining members
    are returned as a set.
    """
    syns = nx.node_connected_component(graph_syn, ingredient)
    result = set()
    for candidate in syns:
        # Tokenize once per candidate instead of once per (candidate, other) pair.
        tokens = set(nltk.word_tokenize(candidate))
        contains_other = any(
            other != candidate and other in tokens
            for other in syns
        )
        if not contains_other:
            result.add(candidate)
    return result

minimal_syns('salsa de tomate')

{'salsa de tomate', 'salsa tomate'}

In [58]:
def ngram_combinations(ingredient):
    """Return every way of splitting ``ingredient`` into contiguous n-grams.

    Each result is a list of space-joined n-grams which, joined with single
    spaces, reproduces ``ingredient``. Results are ordered by number of
    pieces, then by where the cuts fall (earlier cuts first).

    An empty list is returned when the tokens joined with single spaces do
    not reproduce ``ingredient`` exactly, since no split can then match.

    The splits are enumerated directly by choosing which of the n-1 gaps
    between tokens to cut (2**(n-1) results), instead of generating every
    permutation of every n-gram and filtering, which grows factorially.
    Each split is listed once, also when the ingredient repeats a token.
    """
    tokens = nltk.word_tokenize(ingredient)
    if not tokens or ' '.join(tokens) != ingredient:
        return []
    n_tokens = len(tokens)
    combs = []
    for n_cuts in range(n_tokens):
        # A tuple of cut positions 0 < c1 < ... < ck < n_tokens is one split.
        for cuts in itertools.combinations(range(1, n_tokens), n_cuts):
            bounds = (0,) + cuts + (n_tokens,)
            combs.append([
                ' '.join(tokens[start:stop])
                for start, stop in zip(bounds, bounds[1:])
            ])
    return combs

ngram_combinations('salsa de tomate')

[['salsa de tomate'],
 ['salsa', 'de tomate'],
 ['salsa de', 'tomate'],
 ['salsa', 'de', 'tomate']]

In [59]:
def comb_syns(expr, syn_dict):
    """Expand ``expr`` into all phrases obtainable by swapping n-grams for synonyms.

    For every split returned by ``ngram_combinations(expr)``, each piece is
    replaced by each of its alternatives in ``syn_dict`` (which must have an
    entry for every piece) and the choices are joined with spaces. The
    distinct phrases are returned as a list in no particular order.
    """
    variants = {
        ' '.join(choice)
        for pieces in ngram_combinations(expr)
        for choice in product(*(syn_dict[piece] for piece in pieces))
    }
    return list(variants)
    
expr= 'salsa de tomate'
syn_dict = {
    'salsa': ['salsa', 'salsas'],
    'de': ['de'],
    'tomate': ['tomate', 'tomates'],
    'salsa de': ['salsa de'],
    'de tomate': ['de tomate', 'tomatil'],
    'salsa de tomate': ['salsa de tomate'],
}

comb_syns(expr, syn_dict) #example

['salsa tomatil',
 'salsa de tomate',
 'salsas tomatil',
 'salsa de tomates',
 'salsas de tomates',
 'salsas de tomate']

In [60]:
def create_syn_dict(ngrms):
    """Map every n-gram in ``ngrms`` to a one-element set containing itself."""
    return {ngrm: {ngrm} for ngrm in ngrms}

create_syn_dict(my_ngrams('salsa de tomate'))

{'de': {'de'},
 'de tomate': {'de tomate'},
 'salsa': {'salsa'},
 'salsa de': {'salsa de'},
 'salsa de tomate': {'salsa de tomate'},
 'tomate': {'tomate'}}

In [61]:
# %%time

# for ingr in list(graph_syn.nodes()):
#     if 1 < len(nltk.word_tokenize(ingr)) < 5:
#         syns1 = nx.node_connected_component(graph_syn, ingr)
#         ngrms = my_ngrams(ingr)
#         syn_dict = create_syn_dict(ngrms)
#         for ngrm in ngrms:
#             if ngrm in graph_syn and ngrm not in syns1:
#                 syns2 = minimal_syns(ngrm)
#                 syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
#         syn_combs = comb_syns(ingr, syn_dict)
#         for syn_ingr in syn_combs:
#             add_node(graph_syn, syn_ingr)
#             add_edge(graph_syn, ingr, syn_ingr)

print('CPU times: user 50min 25s, sys: 3min 47s, total: 54min 12s')
print('Wall time: 54min 10s')

CPU times: user 50min 25s, sys: 3min 47s, total: 54min 12s
Wall time: 54min 10s


In [62]:
# len(graph_syn)
# 34548

In [63]:
# graph_syn.number_of_edges()
# 31558

In [64]:
# nx.number_connected_components(graph_syn)
# 2990

In [65]:
# nx.write_gexf(graph_syn, 'data/spanish_ingredients_lexicon_4.gexf')

In [66]:
graph_syn = nx.read_gexf('data/spanish_ingredients_lexicon_4.gexf')

In [67]:
len(graph_syn)

34548

In [68]:
graph_syn.number_of_edges()

31558

In [69]:
nx.number_connected_components(graph_syn)

2990