In [1]:
import csv
import itertools
import json
import os
import pickle
import time
from collections import defaultdict
from itertools import product
from itertools import permutations

import networkx as nx
import nltk
import requests
from nltk.corpus import cess_esp
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from pymongo import MongoClient

# Lexicon

In [2]:
# Open a MongoDB connection (no arguments, so pymongo's default connection
# settings are used) and select the `lexicon` database used by every lookup below.
client = MongoClient()
# client.drop_database('lexicon')
db = client.lexicon

In [3]:
# with open('data/es_lexicon.csv') as f:
#     reader = csv.reader(
#         f,
#         delimiter=' ',
#     )
#     docs = []
#     count = 0
#     for row in reader:
#         for i in range(1, len(row[1:]), 2):
#             entry = {}
#             entry['flexion'] = row[0].lower()
#             entry['lemma'] = row[i].lower()
#             entry['eagle'] = row[i+1].lower()
#             docs.append(entry)
#             count += 1
#         if count % 1000 == 0:
#             db.es_lexicon.insert_many(docs)
#             docs = []
#     db.es_lexicon.insert_many(docs)
#     docs = []

In [4]:
db.es_lexicon.count()

668825

# POS tagger

In [5]:
# tagged_sp_sents = cess_esp.tagged_sents()

In [6]:
# size = int(len(tagged_sp_sents) * 0.1)
# train_sp_sents = tagged_sp_sents[size:]
# test_sp_sents = tagged_sp_sents[:size]

In [7]:
# tagged_sp_words = cess_esp.tagged_words()

In [8]:
# tags = [tag for (word, tag) in tagged_sp_words]
# most_freq_tags = nltk.FreqDist(tags)
# most_freq_tags.most_common()[:10]

# [('sps00', 25272),
#  ('ncms000', 11428),
#  ('Fc', 11420),
#  ('ncfs000', 11008),
#  ('da0fs0', 6838),
#  ('da0ms0', 6012),
#  ('rg', 5937),
#  ('Fp', 5866),
#  ('cc', 5854),
#  ('ncmp000', 5711)]

In [9]:
# default_tag = 'ncms000'

In [10]:
# t0 = nltk.DefaultTagger(None)
# t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
# t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
# sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [11]:
# sp_tagger.evaluate(test_sp_sents)

# 0.8815674255691769

In [12]:
# with open('data/sp_tagger.pickle', 'wb') as f:
#     pickle.dump(sp_tagger, f)

In [13]:
# Load the tagger that the (commented-out) cells above trained and pickled,
# instead of retraining it on every run.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a pickle that was produced locally by this notebook.
with open('data/sp_tagger.pickle', 'rb') as f:
    sp_tagger = pickle.load(f)

# Techniques

In [14]:
def is_spanish_techniques_file(filename):
    """Return True when `filename` starts with 'es_' and ends with '_techniques.txt'."""
    if not filename.startswith('es_'):
        return False
    return filename.endswith('_techniques.txt')

In [15]:
def add_node(g, n):
    """Add node `n` to graph `g` with `count=0`, leaving an existing node untouched."""
    if n in g:
        return
    g.add_node(n, count=0)

In [16]:
def add_edge(g, n1, n2):
    """Connect `n1` and `n2` in `g` unless they are equal or a path already joins them."""
    if n1 == n2:
        return
    if nx.has_path(g, n1, n2):
        return
    g.add_edge(n1, n2)

In [17]:
# Build the synonym graph from the Spanish techniques files. Each line lists
# alternative names for one technique, separated by ' / ', ' o ' or '/'.
graph_syn = nx.Graph()
techniques_root = 'data/techniques/'
# sorted(): os.listdir order is arbitrary, and add_edge skips edges between
# already-connected nodes, so file order influences which edges get created.
for e in sorted(os.listdir(techniques_root)):
    file_path = os.path.join(techniques_root, e)
    if not (os.path.isfile(file_path) and is_spanish_techniques_file(e)):
        continue
    with open(file_path) as f:
        for line in f:
            syn_set = set()
            for techs2 in line.strip().split(' / '):
                for techs3 in techs2.split(' o '):
                    for tech in techs3.split('/'):
                        tech = tech.strip()
                        # Blank lines and stray separators must not create an
                        # empty-string node in the graph.
                        if tech:
                            syn_set.add(tech)
            if not syn_set:
                continue
            # Sort so the hub node (first element) is the same on every run;
            # plain set order changes with string hash randomisation.
            syn_set = sorted(syn_set)
            for tech in syn_set:
                add_node(graph_syn, tech)
            t1 = syn_set[0]
            for t2 in syn_set[1:]:
                add_edge(graph_syn, t1, t2)

In [18]:
len(graph_syn)

339

In [19]:
graph_syn.number_of_edges()

128

In [20]:
nx.number_connected_components(graph_syn)

211

In [21]:
nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_1.gexf')

In [22]:
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_1.gexf')

# POS tagging

In [23]:
# Maps the first character of a lowercased `eagle` tag to the coarse
# category name used throughout this notebook (see map_tag below).
tag_mapping = {
    'a': 'adj',
    'r': 'adv',
    'd': 'det',
    'n': 'noun',
    'v': 'verb',
    'p': 'pron',
    'c': 'conj',
    'i': 'interj',
    's': 'prep',
    'f': 'punt',
    'z': 'num',
    'w': 'date-time',
}

def map_tag(eagle):
    """Return the coarse category for `eagle`, looked up by its first character.

    Raises KeyError when that character is not a key of `tag_mapping`.
    """
    initial = eagle[0]
    return tag_mapping[initial]

def get_category(entry):
    """Return the coarse category of a lexicon entry, derived from its 'eagle' field."""
    eagle = entry['eagle']
    return map_tag(eagle)

def has_category(category, entries):
    """Return True when at least one entry in `entries` maps to `category`."""
    for entry in entries:
        if get_category(entry) == category:
            return True
    return False

def is_number(x):
    """Return True when `x` is one of the Spanish number words 'dos' through 'nueve'."""
    number_words = ('dos', 'tres', 'cuatro', 'cinco', 'seis', 'siete', 'ocho', 'nueve')
    return x in number_words

def technique_tagger(x):
    """Tokenise `x` and return a list of (token, coarse_tag) pairs.

    For each token, in order of precedence:
      * number words recognised by is_number get 'num';
      * a tag produced by `sp_tagger` is lowercased and mapped with map_tag;
      * otherwise the lexicon is queried by 'flexion' and the first category
        found in the priority order adj, noun, verb, det, pron, prep, num is
        used (falling back to the category of the first entry);
      * tokens absent from the lexicon default to 'noun'.
    """
    priority = ('adj', 'noun', 'verb', 'det', 'pron', 'prep', 'num')
    result = []
    for token, tag in sp_tagger.tag(nltk.word_tokenize(x)):
        if is_number(token):
            tag = 'num'
        elif tag:
            tag = map_tag(tag.lower())
        else:
            res = list(db.es_lexicon.find({'flexion': token}))
            if not res:
                tag = 'noun'
            else:
                for category in priority:
                    if has_category(category, res):
                        tag = category
                        break
                else:
                    # None of the preferred categories is present.
                    tag = get_category(res[0])
        result.append((token, tag))
    return result

In [24]:
# %%time

# with open('data/spanish_techniques_postags.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for tech in graph_syn.nodes_iter():
#         pos_tag = ' '.join(tag for token, tag in technique_tagger(tech))
#         row = [tech, pos_tag]
#         writer.writerow(row)
        
# CPU times: user 220 ms, sys: 4 ms, total: 224 ms
# Wall time: 1min 22s

In [25]:
# Load the cached technique -> space-joined POS tags table
# (first CSV column is the key, second column the value).
postags = {}
with open('data/spanish_techniques_postags.csv') as f:
    postags.update(
        (row[0], row[1])
        for row in csv.reader(f, delimiter=',')
    )

In [26]:
def get_postags(x):
    """Return the (token, tag) pairs of `x`, using the `postags` cache.

    On a cache miss the tags are computed with technique_tagger and stored in
    `postags` as a single space-joined string, the same format the CSV holds.
    """
    try:
        tags = postags[x]
    except KeyError:
        # Only a cache miss triggers tagging; any other error must surface
        # instead of being swallowed by a bare `except`.
        postags[x] = ' '.join(tag for token, tag in technique_tagger(x))
        tags = postags[x]
    # The tags were joined with single spaces, so a plain split recovers them.
    return list(zip(nltk.word_tokenize(x), tags.split()))

# Example
get_postags('cocción al vacío')

[('cocción', 'noun'), ('al', 'prep'), ('vacío', 'noun')]

# apicultur synonyms

In [27]:
# Collect every token tagged as a noun or a verb in any technique of the graph.
nouns_and_verbs = set()
# Iterating the graph directly yields its nodes on every networkx version;
# Graph.nodes_iter() was removed in networkx 2.0.
for tech in graph_syn:
    tags = get_postags(tech)
    for token, tag in tags:
        if tag in ('noun', 'verb'):
            nouns_and_verbs.add(token)

In [28]:
len(nouns_and_verbs)

310

In [29]:
# with open('data/apicultur_techniques_synonyms.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     base_url = 'https://store.apicultur.com/api/sinonimosporpalabra/1.0.0/'
#     headers = {'Authorization': 'Bearer ' + os.environ['APICULTUR_TOKEN']}  # token redacted: read it from the environment, never commit it
#     count = 0
#     for x in nouns_and_verbs:
#         if x in graph_syn:
#             url = base_url + x
#             response = requests.get(url, headers=headers)
#             if response.text:
#                 js = response.json()
#                 row = [x]
#                 for d in js:
#                     row.append(d['valor'])
#                 writer.writerow(row)
#             count += 1
#             if count % 20 == 0:
#                 time.sleep(65)

In [30]:
# Load the cached apicultur answers: first CSV column is the queried word,
# the remaining columns are the synonyms returned for it.
apicultur_syns = {}
with open('data/apicultur_techniques_synonyms.csv') as f:
    apicultur_syns.update(
        (row[0], row[1:])
        for row in csv.reader(f, delimiter=',')
    )

In [31]:
# One edge per (word, synonym) pair; words with no synonyms add no node.
apicultur_graph = nx.Graph()
for k, syns in apicultur_syns.items():
    for syn in syns:
        apicultur_graph.add_edge(k, syn)

In [32]:
nx.number_connected_components(apicultur_graph)

33

In [33]:
def is_kn_complete(g):
    """Return True when every pair of distinct nodes of `g` is joined by an edge."""
    return all(
        g.has_edge(a, b)
        for a in g
        for b in g
        if a != b
    )

In [34]:
# Keep only the connected components of the apicultur graph that are complete
# graphs, i.e. groups in which every word is listed as a synonym of every other.
kn_complete_graphs = []
# connected_components() + subgraph() works on every networkx version;
# connected_component_subgraphs() was deprecated and later removed.
for component in nx.connected_components(apicultur_graph):
    subg = apicultur_graph.subgraph(component)
    if is_kn_complete(subg):
        kn_complete_graphs.append(subg)

In [35]:
len(kn_complete_graphs)

4

In [36]:
# Merge each complete synonym group into the main graph, linking every member
# to the first one. syns_found counts the (first, other) pairs visited, whether
# or not add_edge actually created an edge.
syns_found = 0
for g in kn_complete_graphs:
    # list(...) because the nodes() result can only be indexed and sliced by
    # position on networkx 1.x.
    syn_set = list(g.nodes())
    i1 = syn_set[0]
    add_node(graph_syn, i1)
    for i2 in syn_set[1:]:
        add_node(graph_syn, i2)
        add_edge(graph_syn, i1, i2)
        syns_found += 1
syns_found

4

In [37]:
len(graph_syn)

343

In [38]:
graph_syn.number_of_edges()

132

In [39]:
nx.number_connected_components(graph_syn)

211

In [40]:
nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_2.gexf')

In [41]:
# Reload the stage-2 *techniques* graph saved by the previous cell; the
# original path pointed at the ingredients lexicon instead.
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_2.gexf')

# Infinitive, gerund, and participle

In [42]:
def is_verb_word(word_tag):
    """Return True when the tag of the (word, tag) pair is 'verb'."""
    return word_tag[1] == 'verb'

def is_verb_technique(technique):
    """Return True when at least one word of `technique` is tagged as a verb."""
    # The original body read an undefined name (`ingredient`) and raised
    # NameError on every call; the parameter is `technique`.
    return any(map(is_verb_word, get_postags(technique)))

def infinitive_verb(word):
    """Return the infinitive of the verb form `word`.

    Words already ending in -ar/-er/-ir are returned unchanged without a
    lookup. Otherwise the lemma of the first lexicon entry for this form
    whose tag starts with 'v' is returned, or `word` itself when none exists.
    """
    if word.endswith(('ar', 'er', 'ir')):
        return word

    # MongoDB's $regex is unanchored: without '^' the pattern would match a
    # 'v' anywhere in the tag, not only verb tags.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^v'}})
    return r1['lemma'] if r1 else word

def geround_verb(word):
    """Return the gerund of the verb form `word`.

    Words ending in -iendo/-yendo are returned unchanged. Otherwise the verb's
    lemma is looked up and the form tagged as its gerund is returned; `word`
    is returned unchanged when either lookup fails.
    """
    if word.endswith(('iendo', 'yendo')):  # -ando is ambiguous
        return word

    ger = word
    # '^' anchors the pattern: MongoDB's $regex would otherwise match a 'v'
    # anywhere in the tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^v'}})
    if r1:
        lemma = r1['lemma']
        # Keep the two leading tag characters and request the gerund form.
        eagle = r1['eagle'][:2] + 'g0000'
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            ger = r2['flexion']
    return ger

def participle_verb(word):
    """Return the masculine singular participle of the verb form `word`.

    The verb's lemma is looked up and the form tagged as its participle is
    returned; `word` is returned unchanged when either lookup fails.
    """
    # -ado and -ido are ambiguous, so there is no suffix shortcut here.

    par = word
    # '^' anchors the pattern: MongoDB's $regex would otherwise match a 'v'
    # anywhere in the tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^v'}})
    if r1:
        lemma = r1['lemma']
        # EAGLES verb tags are laid out as category, type, mood, tense,
        # person, number, gender; a participle is mood 'p' with no tense or
        # person (e.g. 'vmp00sm'). The original built a gerund-mood tag with
        # gender and number swapped, which can never match an entry.
        # NOTE(review): assumes the lexicon follows the standard EAGLES verb
        # layout, as the gerund lookup in geround_verb already does.
        eagle = r1['eagle'][:2] + 'p00sm'
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            par = r2['flexion']
    return par

def infinitive_ingredient(ingredient):
    """Return the infinitives of the verb-tagged words of `ingredient`, space-joined."""
    verbs = (word for word, tag in get_postags(ingredient) if tag == 'verb')
    return ' '.join(map(infinitive_verb, verbs))

def geround_ingredient(ingredient):
    """Return the gerunds of the verb-tagged words of `ingredient`, space-joined."""
    verbs = (word for word, tag in get_postags(ingredient) if tag == 'verb')
    return ' '.join(map(geround_verb, verbs))

def participle_ingredient(ingredient):
    """Return the participles of the verb-tagged words of `ingredient`, space-joined."""
    verbs = (word for word, tag in get_postags(ingredient) if tag == 'verb')
    return ' '.join(map(participle_verb, verbs))

# Example
print(infinitive_ingredient('guiso'))
print(geround_ingredient('guiso'))
print(participle_ingredient('guiso'))






In [43]:
def is_infinitivable_word(word_tag):
    """Return True when the tag of the (word, tag) pair is 'noun' or 'verb'."""
    return word_tag[1] in ('noun', 'verb')

def is_infinitivable_ingredient(technique):
    """Return True when `technique` contains at least one noun or verb."""
    return any(is_infinitivable_word(pair) for pair in get_postags(technique))

def infinitive_noun(word):
    """Return an infinitive verb whose lemma starts with the stem of the noun `word`.

    Returns `word` unchanged when the noun or a matching infinitive is not
    found in the lexicon.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    inf = word
    # '^' anchors the pattern: MongoDB's $regex would otherwise match an 'n'
    # anywhere in the tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^n'}})
    if r1:
        # NOTE(review): CALCULAR_STEM is not defined in the visible part of
        # this notebook; a stemming helper with that name must be provided
        # before this function can run.
        stem = CALCULAR_STEM(r1['flexion'])
        eagle = 'vmn0000'
        # find_one, not find: find() returns a cursor, which is always truthy
        # and cannot be indexed by field name. The stem is escaped so it is
        # matched literally, and anchored so it is a prefix of the lemma.
        r2 = db.es_lexicon.find_one({'lemma': {'$regex': '^' + re.escape(stem)}, 'eagle': eagle})
        if r2:
            inf = r2['flexion']
    return inf

def infinitive_verb(word):
    """Return the infinitive of the verb form `word`.

    Words already ending in -ar/-er/-ir are returned unchanged without a
    lookup. Otherwise the lemma of the first lexicon entry for this form
    whose tag starts with 'v' is returned, or `word` itself when none exists.
    """
    if word.endswith(('ar', 'er', 'ir')):
        return word

    # MongoDB's $regex is unanchored: without '^' the pattern would match a
    # 'v' anywhere in the tag, not only verb tags.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^v'}})
    return r1['lemma'] if r1 else word

def infinitive_word(word_tag):
    """Return the infinitive for a (word, tag) pair.

    Only purely alphabetic nouns and verbs are converted; every other word is
    returned unchanged.
    """
    word, tag = word_tag[0], word_tag[1]
    if not word.isalpha():
        return word
    if tag == 'noun':
        return infinitive_noun(word)
    if tag == 'verb':
        return infinitive_verb(word)
    return word

def infinitive_ingredient(ingredient):
    """Return `ingredient` with each word replaced by infinitive_word's result, space-joined."""
    return ' '.join(infinitive_word(pair) for pair in get_postags(ingredient))

# Example
infinitive_ingredient('cocción al vacío')

''

In [43]:
def is_singular_word(word_tag):
    """Return True for an adjective or noun whose word does not end in 's'."""
    word, tag = word_tag[0], word_tag[1]
    if tag not in ('adj', 'noun'):
        return False
    return not word.endswith('s')

def is_singular_ingredient(ingredient):
    """Return True when `ingredient` contains at least one singular adjective or noun."""
    return any(is_singular_word(pair) for pair in get_postags(ingredient))

def naive_pluralize(word):
    """Pluralise `word` by suffix rules only.

    A final unaccented vowel takes 's', a final 'z' becomes 'ces', and
    anything else takes 'es'.
    """
    last = word[-1]
    if last == 'z':
        return word[:-1] + 'ces'
    suffix = 's' if last in 'aeiou' else 'es'
    return word + suffix

def pluralize_adj(word):
    """Return the plural of the adjective `word`.

    Words without an accented vowel are pluralised by naive_pluralize. Words
    with one are looked up in the lexicon, since the accent may change;
    `word` is returned unchanged when a lookup fails.
    """
    if not set('áéíóú').intersection(word):
        return naive_pluralize(word)

    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'a...s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^a...s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 4) switched to plural.
        eagle = r1['eagle'][:4] + 'p' + r1['eagle'][5:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_noun(word):
    """Return the plural of the noun `word`.

    Words without an accented vowel are pluralised by naive_pluralize. Words
    with one are looked up in the lexicon, since the accent may change;
    `word` is returned unchanged when a lookup fails.
    """
    if not set('áéíóú').intersection(word):
        return naive_pluralize(word)

    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'n..s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^n..s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 3) switched to plural.
        eagle = r1['eagle'][:3] + 'p' + r1['eagle'][4:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_verb(word):
    """Return the plural counterpart of the singular verb form `word`.

    `word` is returned unchanged when either lexicon lookup fails.
    """
    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'v....s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^v....s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 5) switched to plural.
        eagle = r1['eagle'][:5] + 'p' + r1['eagle'][6:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_det(word):
    """Return the plural counterpart of the singular determiner `word`.

    `word` is returned unchanged when either lexicon lookup fails.
    """
    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'd...s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^d...s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 4) switched to plural.
        eagle = r1['eagle'][:4] + 'p' + r1['eagle'][5:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_word(word_tag):
    """Return the plural for a (word, tag) pair.

    Purely alphabetic adjectives, nouns, verbs and determiners are pluralised
    by their dedicated helper; every other word is returned unchanged.
    """
    word, tag = word_tag[0], word_tag[1]
    if not word.isalpha():
        return word
    if tag == 'adj':
        return pluralize_adj(word)
    if tag == 'noun':
        return pluralize_noun(word)
    # The original stored the next two results in an unused `singular`
    # variable, so verbs and determiners were never actually pluralised.
    if tag == 'verb':
        return pluralize_verb(word)
    if tag == 'det':
        return pluralize_det(word)
    return word

def pluralize_ingredient(ingredient):
    """Return `ingredient` with each word replaced by pluralize_word's result, space-joined."""
    return ' '.join(pluralize_word(pair) for pair in get_postags(ingredient))

# Example
pluralize_ingredient('pimiento verde')

''

In [43]:
def is_singular_word(word_tag):
    """Return True for an adjective or noun whose word does not end in 's'."""
    word, tag = word_tag[0], word_tag[1]
    if tag not in ('adj', 'noun'):
        return False
    return not word.endswith('s')

def is_singular_ingredient(ingredient):
    """Return True when `ingredient` contains at least one singular adjective or noun."""
    return any(is_singular_word(pair) for pair in get_postags(ingredient))

def naive_pluralize(word):
    """Pluralise `word` by suffix rules only.

    A final unaccented vowel takes 's', a final 'z' becomes 'ces', and
    anything else takes 'es'.
    """
    last = word[-1]
    if last == 'z':
        return word[:-1] + 'ces'
    suffix = 's' if last in 'aeiou' else 'es'
    return word + suffix

def pluralize_adj(word):
    """Return the plural of the adjective `word`.

    Words without an accented vowel are pluralised by naive_pluralize. Words
    with one are looked up in the lexicon, since the accent may change;
    `word` is returned unchanged when a lookup fails.
    """
    if not set('áéíóú').intersection(word):
        return naive_pluralize(word)

    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'a...s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^a...s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 4) switched to plural.
        eagle = r1['eagle'][:4] + 'p' + r1['eagle'][5:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_noun(word):
    """Return the plural of the noun `word`.

    Words without an accented vowel are pluralised by naive_pluralize. Words
    with one are looked up in the lexicon, since the accent may change;
    `word` is returned unchanged when a lookup fails.
    """
    if not set('áéíóú').intersection(word):
        return naive_pluralize(word)

    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'n..s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^n..s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 3) switched to plural.
        eagle = r1['eagle'][:3] + 'p' + r1['eagle'][4:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_verb(word):
    """Return the plural counterpart of the singular verb form `word`.

    `word` is returned unchanged when either lexicon lookup fails.
    """
    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'v....s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^v....s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 5) switched to plural.
        eagle = r1['eagle'][:5] + 'p' + r1['eagle'][6:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_det(word):
    """Return the plural counterpart of the singular determiner `word`.

    `word` is returned unchanged when either lexicon lookup fails.
    """
    plural = word
    # '^' anchors the pattern: MongoDB's $regex is unanchored, so 'd...s'
    # could otherwise match in the middle of an unrelated tag.
    r1 = db.es_lexicon.find_one({'flexion': word, 'eagle': {'$regex': '^d...s'}})
    if r1:
        lemma = r1['lemma']
        # Same tag with the number character (index 4) switched to plural.
        eagle = r1['eagle'][:4] + 'p' + r1['eagle'][5:]
        r2 = db.es_lexicon.find_one({'lemma': lemma, 'eagle': eagle})
        if r2:
            plural = r2['flexion']
    return plural

def pluralize_word(word_tag):
    """Return the plural for a (word, tag) pair.

    Purely alphabetic adjectives, nouns, verbs and determiners are pluralised
    by their dedicated helper; every other word is returned unchanged.
    """
    word, tag = word_tag[0], word_tag[1]
    if not word.isalpha():
        return word
    if tag == 'adj':
        return pluralize_adj(word)
    if tag == 'noun':
        return pluralize_noun(word)
    # The original stored the next two results in an unused `singular`
    # variable, so verbs and determiners were never actually pluralised.
    if tag == 'verb':
        return pluralize_verb(word)
    if tag == 'det':
        return pluralize_det(word)
    return word

def pluralize_ingredient(ingredient):
    """Return `ingredient` with each word replaced by pluralize_word's result, space-joined."""
    return ' '.join(pluralize_word(pair) for pair in get_postags(ingredient))

# Example
pluralize_ingredient('pimiento verde')

''

In [26]:
# with open('data/spanish_techniques_lemmas.csv', 'w') as f:
#     writer = csv.writer(
#         f,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for technique in graph_syn.nodes_iter():
#         lemma = infinitive_first(singularize_first(technique))
#         writer.writerow([technique, lemma])

In [27]:
# Load the cached technique -> lemma table
# (first CSV column is the key, second column the value).
lemmas = {}
with open('data/spanish_techniques_lemmas.csv') as f:
    lemmas.update(
        (row[0], row[1])
        for row in csv.reader(f, delimiter=',')
    )

In [28]:
def lemmatize(x):
    """Return the cached lemma of `x`, computing it when `x` is not in `lemmas`."""
    try:
        lemma = lemmas[x]
    except KeyError:
        # Only a cache miss falls back to computation; a bare `except` would
        # also hide unrelated errors.
        # NOTE(review): infinitive_first and singularize_first are not defined
        # in the visible part of this notebook — confirm where they come from.
        lemma = infinitive_first(singularize_first(x))
    return lemma

In [29]:
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_1.gexf')

In [30]:
len(graph_syn)

339

In [31]:
graph_syn.number_of_edges()

128

In [32]:
nx.number_connected_components(graph_syn)

211

# Normalization

In [43]:
# Numbers
def numbers(x):
    """Replace stand-alone digits 1-9 in `x` with their Spanish number words.

    A digit counts as stand-alone when it is delimited by a space or by the
    start or end of the string. The original only matched digits with a space
    on both sides, so it missed digits at either end of the string and the
    second of two consecutive digits.
    """
    import re  # local import: `re` is not imported at the top of the notebook

    words = {
        '1': 'uno', '2': 'dos', '3': 'tres', '4': 'cuatro', '5': 'cinco',
        '6': 'seis', '7': 'siete', '8': 'ocho', '9': 'nueve',
    }
    # (?<![^ ]) / (?![^ ]) mean "preceded / followed by a space or nothing".
    return re.sub(r'(?<![^ ])[1-9](?![^ ])', lambda m: words[m.group()], x)

# Accent marks on vowels - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def accent_marks(x):
    """Return `x` with the accented vowels listed below replaced by plain vowels."""
    replacements = (
        ('á', 'a'), ('ã', 'a'),
        ('è', 'e'), ('é', 'e'), ('ê', 'e'),
        ('í', 'i'),
        ('ò', 'o'), ('ó', 'o'), ('ō', 'o'),
        ('ú', 'u'), ('ü', 'u'),
    )
    for accented, plain in replacements:
        x = x.replace(accented, plain)
    return x

# Non-ascii consonants - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def nonascii_consonants(x):
    """Return `x` with 'ç' replaced by 'c' and 'ñ' replaced by 'n'."""
    for special, plain in (('ç', 'c'), ('ñ', 'n')):
        x = x.replace(special, plain)
    return x
    
# Dashes (-)
def dashes1(x):
    """Return `x` with every '-' removed."""
    return ''.join(x.split('-'))

def dashes2(x):
    """Return `x` with every '-' replaced by a single space."""
    return ' '.join(x.split('-'))

# POS tags
# ADJETIVOS .... A ADJ ...... X
# ADVERBIOS .... R ADV
# DETERMINANTES  D DET
# NOMBRES ...... N NOUN ..... X
# VERBOS ....... V VERB ..... X
# PRONOMBRES ... P PRON
# CONJUNCIONES . C CONJ
# INTERJECCIONES I INTERJ
# PREPOSICIONES  S PREP
# PUNTUACIÓN ... F PUNCTUATION
# NUMERALES .... Z NUM ...... X
# FECHAS Y HORAS W DATE-TIME
def pos_tags(x):
    """Return `x` reduced to its numerals, verbs, adjectives and nouns, space-joined."""
    kept = ('num', 'verb', 'adj', 'noun')
    return ' '.join(
        token
        for token, tag in technique_tagger(x)
        if tag in kept
    )

def singular(x):
    """Normalisation step that delegates to lemmatize."""
    lemma = lemmatize(x)
    return lemma

def itself(x):
    """Identity normalisation step: return `x` unchanged."""
    return x

# Every non-empty subset of the normalisation steps, each kept in the order
# the steps appear in `funcs` (smallest subsets first).
funcs = [itself, singular, pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2]
combinations = [
    c
    for i in range(1, len(funcs) + 1)
    for c in itertools.combinations(funcs, i)
]

# def normalize(technique): # time consuming
#     result = set()
#     for c in combinations:
#         x = technique
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(technique):  # memoised version
    """Return the set of variants obtained by applying every step combination to `technique`.

    Each step's results are cached per input string for the duration of the
    call, so a step runs at most once for any given intermediate string.
    """
    cache = {f: {} for f in funcs}
    variants = set()
    for pipeline in combinations:
        value = technique
        for step in pipeline:
            memo = cache[step]
            if value not in memo:
                memo[value] = step(value)
            value = memo[value]
        variants.add(value)
    return variants

In [44]:
len([list(map(lambda x: x.__name__, c)) for c in combinations])

255

In [45]:
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_2.gexf')

In [46]:
len(graph_syn)

339

In [47]:
graph_syn.number_of_edges()

141

In [48]:
nx.number_connected_components(graph_syn)

198

In [49]:
# for tech in graph_syn.nodes():
#     norms = normalize(tech)
#     for norm in norms:
#         if len(nltk.word_tokenize(norm)) == 0:
#             print(tech)
#         add_node(graph_syn, norm)
#         add_edge(graph_syn, tech, norm)

In [50]:
# len(graph_syn)
# 589

589

In [51]:
# graph_syn.number_of_edges()
# 410

410

In [52]:
# nx.number_connected_components(graph_syn)
# 179

179

In [53]:
# nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_3.gexf')

In [54]:
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_3.gexf')

# Synonyms

In [55]:
def my_ngrams(technique):
    """Return every n-gram of `technique` (n from 1 to its token count) as space-joined strings."""
    tokens = nltk.word_tokenize(technique)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [56]:
# Histogram: number of tokens -> how many techniques have that many tokens.
lengths = defaultdict(int)
# Iterating the graph directly yields its nodes on every networkx version;
# Graph.nodes_iter() was removed in networkx 2.0.
for tech in graph_syn:
    lengths[len(nltk.word_tokenize(tech))] += 1
lengths = dict(lengths)

In [57]:
lengths

{1: 325, 2: 130, 3: 105, 4: 23, 5: 6}

In [58]:
def minimal_syns(technique):
    """Return the synonyms of `technique` that do not contain another synonym as a token.

    Synonyms are the nodes of `technique`'s connected component in `graph_syn`.
    """
    syns = nx.node_connected_component(graph_syn, technique)
    result = set()
    for candidate in syns:
        words = nltk.word_tokenize(candidate)
        contains_other = any(
            other != candidate and other in words
            for other in syns
        )
        if not contains_other:
            result.add(candidate)
    return result

In [59]:
def ngram_combinations(technique):
    """Return every way of splitting `technique` into consecutive n-grams.

    Each result is a list of space-joined n-grams that, joined with spaces,
    reproduces `technique`. Results are ordered by number of segments, then
    by segment lengths.

    The original filtered every permutation of every n-gram, which grows
    factorially; only the 2**(tokens - 1) contiguous segmentations can ever
    pass that filter, so they are generated directly. For ordinary input the
    result is the same, except that a technique with repeated tokens no
    longer yields duplicate entries.
    """
    tokens = nltk.word_tokenize(technique)
    # Every candidate is a space-joined run of the tokens, so nothing can
    # match when the technique itself is not exactly that.
    if not tokens or ' '.join(tokens) != technique:
        return []

    size = len(tokens)
    combs = []
    for n_cuts in range(size):
        # `itertools.combinations`, not the bare name: this notebook rebinds
        # `combinations` to a list of normalisation pipelines.
        for cuts in itertools.combinations(range(1, size), n_cuts):
            bounds = (0,) + cuts + (size,)
            combs.append([
                ' '.join(tokens[start:stop])
                for start, stop in zip(bounds, bounds[1:])
            ])
    return combs

In [60]:
def comb_syns(expr, syn_dict):
    """Return every expression obtained by swapping n-grams of `expr` for their synonyms.

    `syn_dict` maps each n-gram of `expr` to an iterable of alternatives. The
    result is a list without duplicates, in no particular order.
    """
    found = set()
    for segmentation in ngram_combinations(expr):
        options = [syn_dict[ngrm] for ngrm in segmentation]
        found.update(' '.join(choice) for choice in product(*options))
    return list(found)
    
expr= 'salsa de tomate'
syn_dict = {
    'salsa': ['salsa', 'salsas'],
    'de': ['de'],
    'tomate': ['tomate', 'tomates'],
    'salsa de': ['salsa de'],
    'de tomate': ['de tomate', 'tomatil'],
    'salsa de tomate': ['salsa de tomate'],
}

comb_syns(expr, syn_dict) #example

['salsas de tomate',
 'salsa tomatil',
 'salsas tomatil',
 'salsa de tomate',
 'salsas de tomates',
 'salsa de tomates']

In [61]:
def create_syn_dict(ngrms):
    """Return a dict mapping each n-gram to a fresh set containing only itself."""
    return {ngrm: {ngrm} for ngrm in ngrms}

create_syn_dict(my_ngrams('salsa de tomate'))

{'de': {'de'},
 'de tomate': {'de tomate'},
 'salsa': {'salsa'},
 'salsa de': {'salsa de'},
 'salsa de tomate': {'salsa de tomate'},
 'tomate': {'tomate'}}

In [62]:
# %%time

# for tech in list(graph_syn.nodes()):
#     if 1 < len(nltk.word_tokenize(tech)) < 5:
#         syns1 = nx.node_connected_component(graph_syn, tech)
#         ngrms = my_ngrams(tech)
#         syn_dict = create_syn_dict(ngrms)
#         for ngrm in ngrms:
#             if ngrm in graph_syn and ngrm not in syns1:
#                 syns2 = minimal_syns(ngrm)
#                 syn_dict[ngrm] = syn_dict[ngrm].union(syns2)
#         syn_combs = comb_syns(tech, syn_dict)
#         for syn_tech in syn_combs:
#             add_node(graph_syn, syn_tech)
#             add_edge(graph_syn, tech, syn_tech)

print('CPU times: user 1min 7s, sys: 5.31 s, total: 1min 12s')
print('Wall time: 1min 12s')

CPU times: user 1min 47s, sys: 8.34 s, total: 1min 55s
Wall time: 1min 55s


In [63]:
# len(graph_syn)
# 1403

1403

In [64]:
# graph_syn.number_of_edges()
# 1224

1224

In [65]:
# nx.number_connected_components(graph_syn)
# 179

179

In [66]:
# nx.write_gexf(graph_syn, 'data/spanish_techniques_lexicon_4.gexf')

In [67]:
graph_syn = nx.read_gexf('data/spanish_techniques_lexicon_4.gexf')

In [68]:
len(graph_syn)

1403

In [69]:
graph_syn.number_of_edges()

1224

In [70]:
nx.number_connected_components(graph_syn)

179