In [11]:
import networkx as nx

In [68]:
# Load the ingredient lexicon graph from its GEXF file.
gi = nx.read_gexf('data/spanish_ingredients_lexicon_4.gexf')

In [13]:
# Load the technique lexicon graph from its GEXF file.
gt = nx.read_gexf('data/spanish_techniques_lexicon_4.gexf')

In [19]:
# Show every node that shares a connected component with this ingredient.
i = 'salsa de tomate'
nx.node_connected_component(gi, i)

{'bechamel de tomate',
 'bechamel de tomates',
 'bechamel tomate',
 'bechamel tomates',
 'besamel de tomate',
 'besamel de tomates',
 'besamel tomate',
 'besamel tomates',
 'salsa de tomate',
 'salsa de tomates',
 'salsa tomate',
 'salsa tomates'}

In [15]:
# Show every node that shares a connected component with this technique.
t = 'estofar'
nx.node_connected_component(gt, t)

{'dun',
 'estofado',
 'estofados',
 'estofar',
 'etouffee',
 'etouffée',
 'guisado',
 'guisar',
 'guiso',
 'nimono'}

In [5]:
# Number of nodes in the ingredient graph.
len(gi)

34548

# Techniques

In [2]:
def is_spanish_techniques_file(filename):
    """Return True when ``filename`` is named like a Spanish techniques list.

    A matching name begins with ``'es_'`` and ends with ``'_techniques.txt'``.
    """
    has_spanish_prefix = filename.startswith('es_')
    has_techniques_suffix = filename.endswith('_techniques.txt')
    return has_spanish_prefix and has_techniques_suffix

In [3]:
def add_edge(g, n1, n2):
    """Connect ``n1`` and ``n2`` in ``g`` unless they are already linked.

    Nothing happens when both arguments are the same node, or when
    ``nx.has_path`` already reports a path between them; an edge added
    here therefore never closes a cycle.
    """
    if n1 == n2:
        return
    if nx.has_path(g, n1, n2):
        return
    g.add_edge(n1, n2)

In [4]:
# Build the technique synonym graph from every Spanish techniques file in
# `techniques_root`.  Each line of a file is one synonym group whose members
# are separated by ' / ', ' o ' or '/'.  Every member becomes a node carrying
# a `count` attribute (number of times it was seen), and the members of a
# line are joined in a star around the first member of that line.
#
# Files are visited in sorted order and the hub of each star is the first
# technique on the line.  The previous version took the hub from a `set`,
# whose iteration order changes between interpreter runs, and visited files
# in os.listdir order; the connected components were the same but the exact
# edges written to the GEXF file were not reproducible.
#
# NOTE(review): a blank line yields the empty string as a technique, which
# is added as a node like any other; members are not stripped individually.
graph_syn = nx.Graph()
techniques_root = 'data/techniques/'
for e in sorted(os.listdir(techniques_root)):
    file_path = techniques_root + e
    if os.path.isfile(file_path):
        if is_spanish_techniques_file(e):
            with open(file_path) as f:
                for line in f:
                    # Unique techniques of this line, in order of appearance.
                    syn_set = []
                    techs1 = line.strip()
                    for techs2 in techs1.split(' / '):
                        for techs3 in techs2.split(' o '):
                            for tech in techs3.split('/'):
                                if tech not in syn_set:
                                    syn_set.append(tech)
                                if tech not in graph_syn:
                                    graph_syn.add_node(tech, count=1)
                                else:
                                    graph_syn.node[tech]['count'] += 1
                    # str.split always returns at least one item, so the
                    # list is never empty here.
                    i1 = syn_set[0]
                    for i2 in syn_set[1:]:
                        add_edge(graph_syn, i1, i2)

In [5]:
# Number of nodes (distinct techniques) after reading the files.
len(graph_syn)

339

In [6]:
# Number of synonym edges after reading the files.
graph_syn.number_of_edges()

128

In [7]:
# Number of connected components, i.e. synonym groups.
nx.number_connected_components(graph_syn)

211

In [8]:
# Persist the stage-1 graph; it is read back in the "apicultur synonyms" section.
nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_1.gexf')

# Lexicon

In [9]:
# Connect to MongoDB with MongoClient's default arguments and select the
# 'lexicon' database.  The drop_database call is kept commented out so that
# re-running the cell does not wipe the loaded lexicon.
client = MongoClient()
# client.drop_database('lexicon')
db = client.lexicon

In [10]:
# with open('data/es_lexicon.csv') as f:
#     reader = csv.reader(
#         f,
#         delimiter=' ',
#     )
#     docs = []
#     count = 0
#     for row in reader:
#         for i in range(1, len(row[1:]), 2):
#             entry = {}
#             entry['flexion'] = row[0].lower()
#             entry['lemma'] = row[i].lower()
#             entry['eagle'] = row[i+1].lower()
#             docs.append(entry)
#             count += 1
#         if count % 1000 == 0:
#             db.es_lexicon.insert_many(docs)
#             docs = []
#     db.es_lexicon.insert_many(docs)
#     docs = []

In [11]:
# Number of documents in the es_lexicon collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases
# (count_documents({}) is the replacement) - confirm the installed version.
db.es_lexicon.count()

668825

# POS tagging

In [12]:
# POS-tagged sentences from the cess_esp corpus reader.
tagged_sp_sents = cess_esp.tagged_sents()

In [13]:
# Hold out the first 10% of the sentences for evaluation and train on the rest.
size = int(len(tagged_sp_sents) * 0.1)
train_sp_sents = tagged_sp_sents[size:]
test_sp_sents = tagged_sp_sents[:size]

In [14]:
# Sanity check: the two slices together cover every sentence.
len(tagged_sp_sents) == len(train_sp_sents) + len(test_sp_sents)

True

In [15]:
# Flat (word, tag) pairs from the same corpus.
tagged_sp_words = cess_esp.tagged_words()

In [16]:
# Frequency distribution over all tags in the corpus; show the ten most common.
tags = [tag for (word, tag) in tagged_sp_words]
most_freq_tags = nltk.FreqDist(tags)
most_freq_tags.most_common()[:10]

[('sps00', 25272),
 ('ncms000', 11428),
 ('Fc', 11420),
 ('ncfs000', 11008),
 ('da0fs0', 6838),
 ('da0ms0', 6012),
 ('rg', 5937),
 ('Fp', 5866),
 ('cc', 5854),
 ('ncmp000', 5711)]

In [17]:
# Second most frequent tag in the list above.
# NOTE(review): not referenced by any cell visible in this notebook; the
# tagger chain below is built with DefaultTagger(None) instead.
default_tag = 'ncms000'

In [18]:
# Backoff chain: trigram -> bigram -> unigram -> default.
# The default tagger assigns None, so a token the chain cannot tag comes back
# with a falsy tag; technique_tagger_first tests for that with `if not tag`.
t0 = nltk.DefaultTagger(None)
t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [19]:
# Score the tagger chain on the held-out sentences.
sp_tagger.evaluate(test_sp_sents)

0.8809106830122592

In [20]:
# Maps the first (lower-cased) letter of a POS tag to a coarse category name.
# Used for corpus tags in technique_tagger_first and for the 'eagle' field of
# lexicon entries in get_category.
tag_mapping = {
    'a': 'adj',
    'r': 'adv',
    'd': 'det',
    'n': 'noun',
    'v': 'verb',
    'p': 'pron',
    'c': 'conj',
    'i': 'interj',
    's': 'prep',
    'f': 'punt',
    'z': 'num',
    'w': 'date-time',
}

def get_category(entry):
    """Return the coarse category of a lexicon entry.

    The category is looked up in ``tag_mapping`` using the first character
    of ``entry['eagle']``; an unmapped character raises ``KeyError``.
    """
    initial = entry['eagle'][0]
    return tag_mapping[initial]

def has_category(category, entries):
    """Return True when at least one of ``entries`` has coarse category ``category``.

    Entries are examined in order and the scan stops at the first match.
    """
    return any(get_category(entry) == category for entry in entries)

def is_number(x):
    """Return True when ``x`` is a Spanish number word for one through nine.

    All three forms of "one" are accepted ('un', 'una', 'uno') together with
    'dos' through 'nueve'.  'uno' is included because ``numbers`` rewrites
    the digit 1 as 'uno', so the two helpers agree on the vocabulary.
    The comparison is exact: whole string, case-sensitive.
    """
    return x in ['un', 'una', 'uno', 'dos', 'tres', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve']

def technique_tagger_first(x):
    """Tag each token of technique ``x`` with a coarse category.

    Returns a list of ``(token, category)`` pairs.

    * A technique that tokenizes to exactly one token is returned as
      ``[(x, 'noun')]`` without consulting the tagger.
    * Otherwise ``sp_tagger`` tags the tokens.  A token that received a tag
      is 'num' when ``is_number`` accepts it, else the category mapped from
      the first letter of the lower-cased tag.
    * A token the tagger left untagged is looked up in ``db.es_lexicon`` by
      its 'flexion'.  Among the entries found, the first available of
      noun, adj, verb, num wins; failing those, the category of the first
      entry is used.  With no entries at all the token defaults to 'noun'.
    """
    tokens = nltk.word_tokenize(x)
    if len(tokens) == 1:
        return [(x, 'noun')]

    result = []
    for token, tag in sp_tagger.tag(tokens):
        if tag:
            if is_number(token):
                category = 'num'
            else:
                category = tag_mapping[tag.lower()[0]]
        else:
            res = list(db.es_lexicon.find({'flexion': token}))
            if not res:
                category = 'noun'
            else:
                # Preference order when the lexicon lists several readings.
                for preferred in ('noun', 'adj', 'verb', 'num'):
                    if has_category(preferred, res):
                        category = preferred
                        break
                else:
                    category = get_category(res[0])
        result.append((token, category))
    return result

In [21]:
# with open('data/spanish_techniques_postags.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for technique in graph_syn.nodes_iter():
#         pos_tag = ' '.join(tag for token, tag in technique_tagger_first(technique))
#         row = [technique, pos_tag]
#         writer.writerow(row)

In [22]:
# Load the cached tagging results: column 0 is the technique, column 1 its
# space-separated tag string (the layout written by the commented-out cell above).
postags = {}
with open('data/spanish_techniques_postags.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        postags[row[0]] = row[1]

In [23]:
def technique_tagger(x):
    """Return ``(token, category)`` pairs for technique ``x``.

    The tag string is taken from the ``postags`` cache when ``x`` is present;
    otherwise it is computed with ``technique_tagger_first``.  Tokens and
    tags are then paired positionally with ``zip``, so the result is as long
    as the shorter of the two tokenizations.

    Only a cache miss (``KeyError``) triggers the fallback; any other error
    propagates instead of being silently swallowed.
    """
    try:
        tags = postags[x]
    except KeyError:
        # Not in the cached CSV: tag it from scratch.
        tags = ' '.join(tag for token, tag in technique_tagger_first(x))
    return list(zip(nltk.word_tokenize(x), nltk.word_tokenize(tags)))

# Lemmatization

In [24]:
def first(cat, entries):
    """Return the first entry whose 'eagle' tag starts with ``cat``.

    When no entry matches, the last entry examined is returned instead.
    With an empty ``entries`` the loop variable is never bound and the
    final ``return`` raises ``UnboundLocalError``.
    """
    for e in entries:
        if e['eagle'][0] == cat:
            return e
    # No match: fall through with whatever entry the loop ended on.
    return e

def singularize_first(x):
    """Return phrase ``x`` with each plural-looking token put in the singular.

    The phrase is tokenized and every token is handled on its own:

    * 'los' becomes 'el'; 'dos' is kept as it is.
    * Any other token ending in 's' is looked up in ``db.es_lexicon`` by its
      'flexion'.  One entry is chosen by category preference (noun, adj,
      verb, det, pron, prep, else the first entry), one character of its
      'eagle' tag is overwritten with 's', and the lexicon is searched for a
      form with the same lemma and that modified tag.  If one exists, its
      'flexion' replaces the token.
    * Every other token is copied unchanged.

    The 's' test is applied per token.  It used to be applied to the whole
    phrase, so a phrase such as "salsas de tomate" (which does not end in
    's') was returned untouched, while every token of a phrase that did end
    in 's' was sent to the database regardless of its own ending.
    """
    # (category, first letter of the tag, index of the tag character that is
    # overwritten with 's').
    # NOTE(review): the index is presumably the number attribute of the
    # tagset for that category - confirm, in particular for 'prep'.
    number_slots = (
        ('noun', 'n', 3),
        ('adj', 'a', 4),
        ('verb', 'v', 5),
        ('det', 'd', 4),
        ('pron', 'p', 4),
        ('prep', 's', 3),
    )
    singular = []
    for token in nltk.word_tokenize(x):
        sing = token
        if token == 'los':
            sing = 'el'
        elif token.endswith('s') and token != 'dos':
            res = list(db.es_lexicon.find({'flexion': token}))
            if res:
                for category, letter, slot in number_slots:
                    if has_category(category, res):
                        r = first(letter, res)
                        eagle = r['eagle'][:slot] + 's' + r['eagle'][slot + 1:]
                        break
                else:
                    # None of the preferred categories: keep the tag as is.
                    r = res[0]
                    eagle = r['eagle']
                s = db.es_lexicon.find_one({'lemma': r['lemma'], 'eagle': eagle})
                if s:
                    sing = s['flexion']
        singular.append(sing)
    return ' '.join(singular)

In [25]:
# with open('data/spanish_techniques_lemmas.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for technique in graph_syn.nodes_iter():
#         lemma = singularize_first(technique)
#         writer.writerow([technique, lemma])

In [26]:
# Load the cached singular forms: column 0 is the technique, column 1 its
# lemma (the layout written by the commented-out cell above).
lemmas = {}
with open('data/spanish_techniques_lemmas.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        lemmas[row[0]] = row[1]

In [27]:
def lemmatize(x):
    """Return the singular form of technique ``x``.

    The ``lemmas`` cache is consulted first; on a miss the form is computed
    with ``singularize_first``.  Only a cache miss (``KeyError``) triggers
    the fallback; any other error propagates instead of being swallowed.
    """
    try:
        lemma = lemmas[x]
    except KeyError:
        lemma = singularize_first(x)
    return lemma

# apicultur synonyms

In [28]:
# Reload the stage-1 graph written at the end of the "Techniques" section.
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_1.gexf')

In [29]:
# Node count of the reloaded graph.
len(graph_syn)

339

In [30]:
# Edge count of the reloaded graph.
graph_syn.number_of_edges()

128

In [31]:
# Connected-component count of the reloaded graph.
nx.number_connected_components(graph_syn)

211

In [32]:
# Collect every distinct token tagged 'noun' across all techniques in the graph.
nouns = set()
for tech in graph_syn.nodes_iter():
    tags = technique_tagger(tech)
    for token, tag in tags:
        if tag == 'noun':
            nouns.add(token)

In [33]:
# Number of distinct noun tokens collected.
len(nouns)

313

In [34]:
# with open('data/apicultur_techniques_synonyms.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     base_url = 'https://store.apicultur.com/api/sinonimosporpalabra/1.0.0/'
#     headers = {'Authorization': 'Bearer ' + os.environ['APICULTUR_TOKEN']}  # token redacted: read it from the environment, never commit credentials
#     count = 0
#     for noun in nouns:
#         if noun in graph_syn:
#             url = base_url + noun
#             response = requests.get(url, headers=headers)
#             if response.text:
#                 js = response.json()
#                 row = [noun]
#                 for d in js:
#                     row.append(d['valor'])
#                 writer.writerow(row)
#             time.sleep(1)
#         count += 1
#         if count % 20 == 0:
#             time.sleep(65)

In [35]:
# Load the cached Apicultur answers: column 0 is the queried noun, the
# remaining columns are its synonyms.
apicultur_syns = {}
with open('data/apicultur_techniques_synonyms.csv') as f:
    reader = csv.reader(
        f,
        delimiter=',',
    )
    for row in reader:
        apicultur_syns[row[0]] = row[1:]

In [36]:
def synonyms(x):
    """Return the cached Apicultur synonyms of ``x``, or an empty list if none are cached."""
    if x in apicultur_syns:
        return apicultur_syns[x]
    return []

In [37]:
# Link each noun to those of its Apicultur synonyms that are already nodes.
# syns_found counts every such synonym, whether or not add_edge actually
# inserted an edge (it skips pairs that are already connected).
# NOTE(review): add_edge is called with `noun` without checking that the noun
# itself is a node; this relies on the CSV only holding nouns that are nodes.
syns_found = 0
for noun in nouns:
    syns = synonyms(noun)
    for syn in syns:
        if syn in graph_syn:
            add_edge(graph_syn, noun, syn)
            syns_found += 1
syns_found

59

In [38]:
# Node count after adding Apicultur synonym edges.
len(graph_syn)

339

In [39]:
# Edge count after adding Apicultur synonym edges.
graph_syn.number_of_edges()

141

In [40]:
# Connected-component count after adding Apicultur synonym edges.
nx.number_connected_components(graph_syn)

198

In [41]:
# Persist the stage-2 graph; it is read back in the "Normalization" section.
nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_2.gexf')

# Normalization

In [42]:
# Numbers
def numbers(x):
    """Spell out every stand-alone digit 1-9 in ``x`` as its Spanish word.

    A digit counts as stand-alone when each side of it is either a space or
    the edge of the string, so '2 huevos', 'paso 1' and 'a 1 1 b' are all
    handled.  Digits embedded in a longer run of non-space characters
    ('10', 'h2o') and the digit 0 are left alone.

    The previous chain of ``str.replace(' d ', ...)`` calls missed digits at
    either end of the string and the second of two equal digits that share
    a separating space.
    """
    import re

    words = {
        '1': 'uno', '2': 'dos', '3': 'tres', '4': 'cuatro', '5': 'cinco',
        '6': 'seis', '7': 'siete', '8': 'ocho', '9': 'nueve',
    }
    # Lookarounds keep the surrounding spaces out of the match, so adjacent
    # digits separated by a single space are each replaced.
    return re.sub(r'(?<![^ ])[1-9](?![^ ])', lambda m: words[m.group(0)], x)

# Accented vowels. Non-ASCII characters considered by the normalisation:
# {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def accent_marks(x):
    """Replace the lower-case accented vowels á ã è é ê í ò ó ō ú ü with the plain vowel.

    Upper-case letters and every other character are left unchanged.
    """
    plain_vowels = str.maketrans('áãèéêíòóōúü', 'aaeeeiooouu')
    return x.translate(plain_vowels)

# Non-ASCII consonants. Non-ASCII characters considered by the normalisation:
# {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def nonascii_consonants(x):
    """Replace lower-case 'ç' with 'c' and 'ñ' with 'n'; everything else is kept."""
    plain_consonants = str.maketrans('çñ', 'cn')
    return x.translate(plain_consonants)
    
# Dashes (-)
def dashes1(x):
    """Delete every '-' from ``x``, joining the pieces directly."""
    pieces = x.split('-')
    return ''.join(pieces)

def dashes2(x):
    """Turn every '-' in ``x`` into a single space."""
    pieces = x.split('-')
    return ' '.join(pieces)

# POS tags (first tag letter, category name; X marks the categories kept)
# ADJECTIVES ..... A ADJ ...... X
# ADVERBS ........ R ADV
# DETERMINERS .... D DET
# NOUNS .......... N NOUN ..... X
# VERBS .......... V VERB ..... X
# PRONOUNS ....... P PRON
# CONJUNCTIONS ... C CONJ
# INTERJECTIONS .. I INTERJ
# PREPOSITIONS ... S PREP
# PUNCTUATION .... F PUNCTUATION
# NUMERALS ....... Z NUM ...... X
# DATES AND TIMES  W DATE-TIME
def pos_tags(x):
    """Keep only the tokens of ``x`` tagged num, verb, adj or noun.

    The surviving tokens are returned joined by single spaces, in their
    original order.
    """
    kept_categories = ('num', 'verb', 'adj', 'noun')
    return ' '.join(
        token
        for token, tag in technique_tagger(x)
        if tag in kept_categories
    )

def singular(x):
    """Normalisation step: the singular form of ``x`` as given by ``lemmatize``."""
    lemma = lemmatize(x)
    return lemma

def itself(x):
    """Identity normalisation step: hand ``x`` back untouched."""
    unchanged = x
    return unchanged

# Every non-empty subset of the normalisation functions.  Within a subset the
# functions keep the order they have in `funcs`, because
# itertools.combinations preserves input order.  The nested per-size lists are
# then flattened into one list of tuples.
funcs = [itself, singular, pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2]
combinations = []
for i in range(1, len(funcs) + 1):
    combinations.append(list(itertools.combinations(funcs, i)))
combinations = [c for comb in combinations for c in comb]

# def normalize(technique): # time consuming
#     result = set()
#     for c in combinations:
#         x = technique
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(technique): # dynamic programming version
    """Return the set of all normalised variants of ``technique``.

    Each tuple in ``combinations`` is applied as a pipeline, left to right,
    starting from ``technique``; the final value of every pipeline is
    collected.  Within one call, the result of applying a given function to
    a given string is memoised so it is computed only once.
    """
    memo = {f: {} for f in funcs}
    variants = set()
    for pipeline in combinations:
        value = technique
        for f in pipeline:
            seen = memo[f]
            if value not in seen:
                seen[value] = f(value)
            value = seen[value]
        variants.add(value)
    return variants

In [43]:
# Count the function combinations (each inner list holds the function names
# of one combination).
len([list(map(lambda x: x.__name__, c)) for c in combinations])

255

In [44]:
# Reload the stage-2 graph written at the end of the "apicultur synonyms" section.
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_2.gexf')

In [45]:
# Node count of the reloaded stage-2 graph.
len(graph_syn)

339

In [46]:
# Edge count of the reloaded stage-2 graph.
graph_syn.number_of_edges()

141

In [47]:
# Connected-component count of the reloaded stage-2 graph.
nx.number_connected_components(graph_syn)

198

In [48]:
# for tech in graph_syn.nodes():
#     norms = normalize(tech)
#     for norm in norms:
#         if not norm in graph_syn:
#             graph_syn.add_node(norm, count=1)
#         else:
#             graph_syn.node[norm]['count'] += 1
#         add_edge(graph_syn, tech, norm)

In [49]:
# len(graph_syn)
# 500

In [50]:
# graph_syn.number_of_edges()
# 316

In [51]:
# nx.number_connected_components(graph_syn)
# 184

In [52]:
# nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_3.gexf')

In [53]:
# Load the stage-3 graph from disk; the cells that would build and write it
# are commented out above.
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_3.gexf')

# Synonyms

In [54]:
def my_ngrams(technique):
    """Return every contiguous n-gram of ``technique`` as a space-joined string.

    N-grams are listed by increasing size (1 up to the number of tokens) and,
    within a size, in the order ``ngrams`` yields them.
    """
    tokens = nltk.word_tokenize(technique)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [55]:
# Histogram: number of tokens in a technique -> number of graph nodes with
# that many tokens.  Converted to a plain dict at the end.
lengths = defaultdict(int)
for tech in graph_syn.nodes_iter():
    lengths[len(nltk.word_tokenize(tech))] += 1
lengths = dict(lengths)

In [56]:
# Display the token-count histogram.
lengths

{1: 310, 2: 100, 3: 73, 4: 14, 5: 3}

In [57]:
def minimal_syns(technique):
    """Return the synonyms of ``technique`` that do not contain another synonym as a token.

    The synonyms are the nodes of ``technique``'s connected component in
    ``graph_syn``.  A synonym is dropped when some *other* synonym of the
    same component appears among its tokens.
    """
    component = nx.node_connected_component(graph_syn, technique)
    return {
        candidate
        for candidate in component
        if not any(
            other != candidate and other in nltk.word_tokenize(candidate)
            for other in component
        )
    }

In [58]:
def ngram_combinations(technique):
    """Return every way of cutting ``technique`` into consecutive n-grams.

    Each result is a list of space-joined n-grams which, joined by single
    spaces, reproduce ``technique``.  For 'salsa de tomate' the results are
    ['salsa de tomate'], ['salsa', 'de tomate'], ['salsa de', 'tomate'] and
    ['salsa', 'de', 'tomate'].  Results are ordered by number of pieces.

    An empty list is returned when there are no tokens, or when joining the
    tokens with single spaces does not give back ``technique``; no sequence
    of n-grams can match in that case.

    The segmentations are enumerated directly: with n tokens there are n - 1
    gaps and each subset of gaps is one segmentation (2 ** (n - 1) in all).
    The earlier implementation generated every permutation of every n-gram
    and filtered by string equality, which is millions of candidates for a
    four-token technique.  It could also return the same segmentation more
    than once when a token was repeated; each segmentation now appears once.
    """
    tokens = nltk.word_tokenize(technique)
    if not tokens or ' '.join(tokens) != technique:
        return []

    gaps = len(tokens) - 1
    combs = []
    for mask in range(1 << gaps):
        pieces = []
        start = 0
        for gap in range(gaps):
            # Bit `gap` set means: cut between tokens[gap] and tokens[gap + 1].
            if mask & (1 << gap):
                pieces.append(' '.join(tokens[start:gap + 1]))
                start = gap + 1
        pieces.append(' '.join(tokens[start:]))
        combs.append(pieces)
    # Stable sort: fewest pieces first, as the permutation-based version did.
    combs.sort(key=len)
    return combs

In [59]:
def comb_syns(expr, syn_dict):
    """Return every phrase obtained by substituting synonyms into ``expr``.

    ``expr`` is cut into n-grams in each way ``ngram_combinations`` reports.
    For each cut, every n-gram is replaced by each of its alternatives from
    ``syn_dict`` (all n-grams of the cut must be keys), and the Cartesian
    product of those alternatives is joined with spaces.  The distinct
    phrases are returned as a list in no particular order.
    """
    res = set()
    for ngrms in ngram_combinations(expr):
        alternatives = [syn_dict[ngrm] for ngrm in ngrms]
        res.update(' '.join(choice) for choice in product(*alternatives))
    return list(res)
    
# Worked example: a hand-written synonym dictionary with one entry for every
# n-gram of the expression.
expr= 'salsa de tomate'
syn_dict = {
    'salsa': ['salsa', 'salsas'],
    'de': ['de'],
    'tomate': ['tomate', 'tomates'],
    'salsa de': ['salsa de'],
    'de tomate': ['de tomate', 'tomatil'],
    'salsa de tomate': ['salsa de tomate'],
}

comb_syns(expr, syn_dict) # example

['salsas de tomate',
 'salsas de tomates',
 'salsa de tomates',
 'salsas tomatil',
 'salsa tomatil',
 'salsa de tomate']

In [60]:
def create_syn_dict(ngrms):
    """Return a dict mapping each n-gram to a one-element set holding itself."""
    return {ngrm: {ngrm} for ngrm in ngrms}

# Example: the initial synonym dictionary for a three-token expression.
create_syn_dict(my_ngrams('salsa de tomate'))

{'de': {'de'},
 'de tomate': {'de tomate'},
 'salsa': {'salsa'},
 'salsa de': {'salsa de'},
 'salsa de tomate': {'salsa de tomate'},
 'tomate': {'tomate'}}

In [61]:
# %%time

# for tech in list(graph_syn.nodes()):
#     if 1 < len(nltk.word_tokenize(tech)) < 5:
#         syns1 = nx.node_connected_component(graph_syn, tech)
#         ngrms = my_ngrams(tech)
#         syn_dict = create_syn_dict(ngrms)
#         for ngrm in ngrms:
#             if ngrm in graph_syn and ngrm not in syns1:
#                 syns2 = minimal_syns(ngrm)
#                 syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
#         syn_combs = comb_syns(tech, syn_dict)
#         for syn_tech in syn_combs:
#             if not syn_tech in graph_syn:
#                 graph_syn.add_node(syn_tech, count=1)
#             else:
#                 graph_syn.node[syn_tech]['count'] += 1
#             add_edge(graph_syn, tech, syn_tech)

# These two lines print fixed text and measure nothing.
# NOTE(review): presumably the %%time output of an earlier run of the
# commented-out cell above, kept here for reference.
print('CPU times: user 1min 7s, sys: 5.31 s, total: 1min 12s')
print('Wall time: 1min 12s')

CPU times: user 1min 7s, sys: 5.31 s, total: 1min 12s
Wall time: 1min 12s


In [62]:
# len(graph_syn)
# 1133

In [63]:
# graph_syn.number_of_edges()
# 949

In [64]:
# nx.number_connected_components(graph_syn)
# 184

In [65]:
# nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_4.gexf')

In [66]:
# Load the final (stage-4) graph from disk; the cell that would write it is
# commented out above.
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_4.gexf')

In [67]:
# Node count of the final graph.
len(graph_syn)

1133

In [68]:
# Edge count of the final graph.
graph_syn.number_of_edges()

949

In [69]:
# Connected-component count of the final graph.
nx.number_connected_components(graph_syn)

184