In [1]:
import itertools
import os
import pickle
import re
from collections import defaultdict

import networkx as nx
import nltk
from nltk.corpus import cess_esp
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from pymongo import MongoClient

In [2]:
# MongoClient() is called without arguments, so pymongo's default connection
# settings are used.
client = MongoClient()
# Uncomment to wipe the `lexicon` database before reloading it.
# client.drop_database('lexicon')
# Handle on the `lexicon` database; the cells below read and write its
# `es_lexicon` collection.
db = client.lexicon

# Lexicon

In [3]:
%%time

import csv

# Load the space-delimited lexicon file into the `es_lexicon` collection.
# Each row is read as: inflected form, then alternating (lemma, tag) pairs.
# One document is stored per pair, with all three fields lower-cased.
with open('data/sp_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    docs = []
    count = 0  # total number of documents produced so far
    for row in reader:
        # Pairs start at index 1; a trailing unpaired field is ignored.
        for i in range(1, len(row) - 1, 2):
            docs.append({
                'flexion': row[0].lower(),
                'lemma': row[i].lower(),
                'eagle': row[i + 1].lower(),
            })
            count += 1
        # Flush on buffer size. Testing `count % 1000 == 0` here is unreliable
        # because `count` advances by several units per row and can jump over
        # a multiple of 1000, and it can also fire with an empty buffer.
        if len(docs) >= 1000:
            db.es_lexicon.insert_many(docs)
            docs = []
    # insert_many() rejects an empty list, so only flush a non-empty remainder.
    if docs:
        db.es_lexicon.insert_many(docs)
        docs = []

CPU times: user 12.3 s, sys: 88 ms, total: 12.4 s
Wall time: 18.2 s


In [4]:
db.es_lexicon.count()

668825

# POS tagger

In [5]:
tagged_sp_sents = cess_esp.tagged_sents()

In [6]:
size = int(len(tagged_sp_sents) * 0.1)
train_sp_sents = tagged_sp_sents[size:]
test_sp_sents = tagged_sp_sents[:size]

In [7]:
len(tagged_sp_sents) == len(train_sp_sents) + len(test_sp_sents)

True

In [8]:
tagged_sp_words = cess_esp.tagged_words()

In [9]:
tags = [tag for (word, tag) in tagged_sp_words]
most_freq_tags = nltk.FreqDist(tags)
most_freq_tags.most_common()[:10]

[('sps00', 25272),
 ('ncms000', 11428),
 ('Fc', 11420),
 ('ncfs000', 11008),
 ('da0fs0', 6838),
 ('da0ms0', 6012),
 ('rg', 5937),
 ('Fp', 5866),
 ('cc', 5854),
 ('ncmp000', 5711)]

In [10]:
default_tag = 'ncms000'

In [11]:
# Backoff chain: trigram -> bigram -> unigram -> default, all trained on
# `train_sp_sents`.
# The default tagger assigns None, so a token that no n-gram model can tag
# comes back with a falsy tag; ingredient_tagger() relies on that (`if not tag`)
# to fall back to the lexicon.
t0 = nltk.DefaultTagger(None)
t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [12]:
sp_tagger.evaluate(test_sp_sents)

0.8809982486865149

# Normalization

In [100]:
# Coarse category for a tag, keyed by the tag's first character in lower case.
# It is indexed with `entry['eagle'][0]` (lexicon entries) and with
# `tag.lower()[0]` (tagger output) elsewhere in this notebook.
tag_mapping = {
    'a': 'adj',
    'r': 'adv',
    'd': 'det',
    'n': 'noun',
    'v': 'verb',
    'p': 'pron',
    'c': 'conj',
    'i': 'interj',
    's': 'prep',
    'f': 'punt',
    'z': 'num',
    'w': 'date-time',
}

def has_category(category, entries):
    """Return True if at least one lexicon entry maps to `category`.

    Each entry is a mapping with an 'eagle' tag; the tag's first character is
    translated through `tag_mapping` and compared with `category`.
    Raises KeyError when that character has no entry in `tag_mapping`.
    """
    return any(tag_mapping[entry['eagle'][0]] == category for entry in entries)

def is_number(x):
    """Return True if `x` is one of the Spanish number words for 1 to 9.

    'uno' is accepted together with 'un'/'una' so that the word produced by
    numbers() for the digit 1 is recognised here as well.
    """
    number_words = (
        'un', 'uno', 'una', 'dos', 'tres', 'cuatro',
        'cinco', 'seis', 'siete', 'ocho', 'nueve',
    )
    return x in number_words

def ingredient_tagger(x):
    """Tag each token of the ingredient string `x` with a coarse category.

    Returns a list of (token, category) tuples, where category is a value of
    `tag_mapping` (or 'noun' as a fallback).

    * A string that tokenizes to exactly one token is returned whole as a noun.
    * Otherwise tokens are tagged with `sp_tagger`. When the tagger yields no
      tag, the token is looked up in the `es_lexicon` collection and the first
      available category among noun, adj, verb, num is used; failing those, the
      category of the first lexicon entry. Tokens absent from the lexicon
      default to 'noun'.
    * A tagged token that is a number word (see is_number) is forced to 'num'.

    Raises KeyError if a tag starts with a character missing from `tag_mapping`.
    """
    tokens = nltk.word_tokenize(x)
    if len(tokens) == 1:
        # Single-word ingredients are assumed to be nouns; the tagger is skipped.
        return [(x, 'noun')]

    # Order of preference among lexicon categories for untagged tokens.
    preferred = ('noun', 'adj', 'verb', 'num')
    result = []
    for token, tag in sp_tagger.tag(tokens):
        if tag:
            if is_number(token):
                category = 'num'
            else:
                category = tag_mapping[tag.lower()[0]]
        else:
            # The lexicon stores flexions lower-cased, so the lookup key must be
            # lower-cased too; otherwise capitalised tokens never match.
            entries = list(db.es_lexicon.find({'flexion': token.lower()}))
            if entries:
                for candidate in preferred:
                    if has_category(candidate, entries):
                        category = candidate
                        break
                else:
                    category = tag_mapping[entries[0]['eagle'][0]]
            else:
                category = 'noun'
        result.append((token, category))
    return result

In [126]:
def first(cat, entries):
    """Return the first entry whose 'eagle' tag starts with the character `cat`.

    Returns None when no entry matches (including when `entries` is empty).
    The previous loop fell through and returned the last entry in that case,
    or raised UnboundLocalError on empty input.
    """
    return next((e for e in entries if e['eagle'][:1] == cat), None)

def singularize(x):
    """Return the singular form of the word `x`, or `x` itself if none is found.

    Only words ending in 's' are considered. 'los' maps to 'el' and 'dos' is
    left alone. For any other word the `es_lexicon` collection is queried for
    its entries; the tag of the preferred entry is rewritten with 's' in one
    position and the lexicon is searched for a form with the same lemma and
    the rewritten tag.

    The comparison and the lexicon lookup are case-insensitive, because the
    lexicon stores flexions lower-cased; previously a capitalised plural was
    never found and was returned unchanged.
    """
    # (category, first tag character, index of the tag character replaced by 's'),
    # in order of preference.
    # NOTE(review): the index is presumably the "number" slot of the EAGLES tag
    # for that category; the 'prep' index is kept as it was - confirm it.
    number_slots = (
        ('noun', 'n', 3),
        ('adj', 'a', 4),
        ('verb', 'v', 5),
        ('det', 'd', 4),
        ('pron', 'p', 4),
        ('prep', 's', 3),
    )

    word = x.lower()
    if not word.endswith('s'):
        return x
    if word == 'los':
        return 'el'
    if word == 'dos':
        return x

    res = list(db.es_lexicon.find({'flexion': word}))
    if not res:
        return x

    for category, letter, slot in number_slots:
        if has_category(category, res):
            r = first(letter, res)
            eagle = r['eagle'][:slot] + 's' + r['eagle'][slot + 1:]
            break
    else:
        # None of the inflecting categories: search with the tag unchanged.
        r = res[0]
        eagle = r['eagle']

    s = db.es_lexicon.find_one({'lemma': r['lemma'], 'eagle': eagle})
    return s['flexion'] if s else x

def lemmatize(ingredient):
    """Tokenize `ingredient`, singularize every token and re-join with spaces."""
    singular_tokens = [singularize(token) for token in nltk.word_tokenize(ingredient)]
    return ' '.join(singular_tokens)

In [118]:
# Numbers
def numbers(x):
    """Spell out stand-alone digits 1-9 in `x` as Spanish words.

    A digit counts as stand-alone when it is delimited by single spaces or by
    the start/end of the string. All other text, including the exact spacing,
    is returned unchanged.

    The previous chained ``replace(' 1 ', ' uno ')`` calls missed digits at the
    start or end of the string and every second digit in runs such as ' 1 1 '.
    """
    digit_words = {
        '1': 'uno', '2': 'dos', '3': 'tres',
        '4': 'cuatro', '5': 'cinco', '6': 'seis',
        '7': 'siete', '8': 'ocho', '9': 'nueve',
    }
    # Splitting and joining on the same single space keeps the spacing intact.
    return ' '.join(digit_words.get(token, token) for token in x.split(' '))

# Accented vowels seen in the data: á ã è é ê í ò ó ō ú ü
def accent_marks(x):
    """Replace the lower-case accented vowels listed above with plain vowels.

    Other characters (including ñ and ç) are left untouched.
    """
    plain_vowels = str.maketrans('áãèéêíòóōúü', 'aaeeeiooouu')
    return x.translate(plain_vowels)

# Non-ASCII consonants handled here: ç ñ
def nonascii_consonants(x):
    """Replace lower-case 'ç' with 'c' and 'ñ' with 'n'; leave the rest as is."""
    plain_consonants = str.maketrans('çñ', 'cn')
    return x.translate(plain_consonants)
    
# Dashes (-)
def dashes1(x):
    """Remove every '-' from `x`, gluing the surrounding pieces together."""
    return ''.join(x.split('-'))

def dashes2(x):
    """Turn every '-' in `x` into a single space."""
    return ' '.join(x.split('-'))

# POS tags (first tag letter -> category); X marks the categories that are kept
# ADJECTIVES .... A ADJ ...... X
# ADVERBS ....... R ADV
# DETERMINERS ... D DET
# NOUNS ......... N NOUN ..... X
# VERBS ......... V VERB ..... X
# PRONOUNS ...... P PRON
# CONJUNCTIONS .. C CONJ
# INTERJECTIONS . I INTERJ
# PREPOSITIONS .. S PREP
# PUNCTUATION ... F PUNCTUATION
# NUMERALS ...... Z NUM ...... X
# DATES AND TIMES W DATE-TIME
def pos_tags(x):
    """Keep only the tokens of `x` tagged as num, verb, adj or noun.

    Tokens come from ingredient_tagger(x); the survivors are joined with a
    single space, in their original order.
    """
    kept_categories = ('num', 'verb', 'adj', 'noun')
    content_tokens = []
    for token, category in ingredient_tagger(x):
        if category in kept_categories:
            content_tokens.append(token)
    return ' '.join(content_tokens)

def singular(x):
    """Return lemmatize(x); gives the singularization step its own name in `funcs`."""
    return lemmatize(x)

# Normalisation steps, in the order in which they are applied within a pipeline.
funcs = [singular, pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2]

# Every non-empty subset of `funcs` (as a tuple that keeps the order above),
# listed by increasing size.
combinations = []
for i in range(1, len(funcs) + 1):
    combinations.extend(itertools.combinations(funcs, i))

# def normalize(ingredient): # time consuming
#     result = set()
#     for c in combinations:
#         x = ingredient
#         for f in c:
#             x = f(x)
#         result.add(x)
#     return result

def normalize(ingredient): # dynamic programming version
    """Return the set of strings obtained by running `ingredient` through every
    pipeline in `combinations`.

    Each step's results are memoised per input string for the duration of the
    call, so a step is evaluated at most once for any given intermediate value.
    """
    memo = {step: {} for step in funcs}
    variants = set()
    for pipeline in combinations:
        current = ingredient
        for step in pipeline:
            cache = memo[step]
            if current not in cache:
                cache[current] = step(current)
            current = cache[current]
        variants.add(current)
    return variants

# Ingredients

In [13]:
def is_spanish_ingredients_file(filename):
    """Return True for names of the form 'es_*_ingredients.txt'.

    Only the 'es_' prefix and the '_ingredients.txt' suffix are checked.
    """
    prefix, suffix = 'es_', '_ingredients.txt'
    if not filename.startswith(prefix):
        return False
    return filename.endswith(suffix)

In [212]:
# Build the synonym graph from the Spanish ingredient files.
# Each line of a file lists alternative names for one ingredient, separated by
# ' / ', ' o ' or ' - '. Every name becomes a node whose `count` attribute is
# the number of times it was seen; all names on a line are linked to one
# (arbitrary) representative of that line.
graph_syn = nx.Graph()
ingredients_root = 'data/ingredients/'
for e in os.listdir(ingredients_root):
    file_path = os.path.join(ingredients_root, e)
    if not (os.path.isfile(file_path) and is_spanish_ingredients_file(e)):
        continue
    with open(file_path) as f:
        for line in f:
            syn_set = set()
            for ingrs2 in line.strip().split(' / '):
                for ingrs3 in ingrs2.split(' o '):
                    for ingr in ingrs3.split(' - '):
                        if not ingr:
                            # Blank lines and empty alternatives would otherwise
                            # create an empty-string node.
                            continue
                        syn_set.add(ingr)
                        # TODO: optionally record whether the name comes from
                        # 'es_elbulli_ingredients.txt' (an 'elbulli' node flag).
                        if ingr not in graph_syn:
                            graph_syn.add_node(ingr, count=1)
                        else:
                            graph_syn.node[ingr]['count'] += 1
            if not syn_set:
                continue
            syn_set = list(syn_set)
            i1 = syn_set[0]
            for i2 in syn_set[1:]:
                graph_syn.add_edge(i1, i2)

In [216]:
len(graph_syn)

3322

In [215]:
nx.number_connected_components(graph_syn)

3221

In [243]:
%%time

# Link every ingredient currently in the graph to each of its normal forms.
# The node list is copied first because nodes are added during the loop.
count = 0
for ingr in list(graph_syn.nodes()):
    for norm in normalize(ingr):
        # Skip the empty string (pos_tags() can filter out every token), which
        # would tie unrelated ingredients together, and the ingredient itself,
        # which would only add a self-loop and inflate its count.
        if not norm or norm == ingr:
            continue
        if norm not in graph_syn:
            graph_syn.add_node(norm, count=1)
        else:
            graph_syn.node[norm]['count'] += 1
        graph_syn.add_edge(ingr, norm)

    count += 1
    if count % 1000 == 0:
        print(count, 'nodes processed.')
# Report the final total unless the loop has just printed it.
if count % 1000 != 0:
    print(count, 'nodes processed.')

KeyboardInterrupt: 

In [222]:
len(graph_syn)

7523

In [223]:
nx.number_connected_components(graph_syn)

3094

In [206]:
def my_ngrams(ingredient):
    """Return all word n-grams of `ingredient` as space-joined strings.

    Sizes run from 1 up to the number of tokens, shortest first; within a size
    the n-grams appear in the order produced by nltk's ngrams().
    """
    tokens = nltk.word_tokenize(ingredient)
    return [
        ' '.join(gram)
        for size in range(1, len(tokens) + 1)
        for gram in ngrams(tokens, size)
    ]

In [230]:
# Histogram: number of tokens in a node name -> how many nodes have that length.
lengths = {}
for ingr in graph_syn.nodes_iter():
    n_tokens = len(nltk.word_tokenize(ingr))
    lengths[n_tokens] = lengths.get(n_tokens, 0) + 1

In [231]:
lengths

{1: 1209, 2: 2547, 3: 2552, 4: 699, 5: 334, 6: 119, 7: 28, 8: 16, 9: 15, 10: 4}

In [241]:
nx.node_connected_component(graph_syn, 'arroz')

{'arroz', 'kome'}

In [242]:
%%time

# For every ingredient, substitute each of its n-grams that is itself a node
# by every member of that n-gram's connected component, and link the
# ingredient to each resulting variant.
count = 0
for ingr in list(graph_syn.nodes()):
    for ngrm in my_ngrams(ingr):
        if ngrm not in graph_syn:
            continue
        # Match the n-gram only where it is not glued to other word characters;
        # a plain str.replace() would also rewrite it inside longer words.
        pattern = re.compile(r'(?<!\w)' + re.escape(ngrm) + r'(?!\w)')
        for syn in nx.node_connected_component(graph_syn, ngrm):
            # A callable replacement keeps backslashes in `syn` literal.
            syn_ingr = pattern.sub(lambda _match, syn=syn: syn, ingr)
            if syn_ingr == ingr:
                # Nothing changed: linking would only add a self-loop and
                # inflate the node's count.
                continue
            if syn_ingr not in graph_syn:
                graph_syn.add_node(syn_ingr, count=1)
            else:
                graph_syn.node[syn_ingr]['count'] += 1
            graph_syn.add_edge(ingr, syn_ingr)

    count += 1
    if count % 100 == 0:
        print(count, 'nodes processed.')

100 nodes processed.
200 nodes processed.
300 nodes processed.
400 nodes processed.
500 nodes processed.
600 nodes processed.
700 nodes processed.
800 nodes processed.
900 nodes processed.
1000 nodes processed.
1100 nodes processed.
1200 nodes processed.
1300 nodes processed.
1400 nodes processed.
1500 nodes processed.
1600 nodes processed.
1700 nodes processed.
1800 nodes processed.
1900 nodes processed.
2000 nodes processed.
2100 nodes processed.
2200 nodes processed.
2300 nodes processed.
2400 nodes processed.
2500 nodes processed.
2600 nodes processed.
2700 nodes processed.
2800 nodes processed.
2900 nodes processed.
3000 nodes processed.
3100 nodes processed.
3200 nodes processed.
3300 nodes processed.
3400 nodes processed.
3500 nodes processed.
3600 nodes processed.
3700 nodes processed.
3800 nodes processed.
3900 nodes processed.
4000 nodes processed.
4100 nodes processed.
4200 nodes processed.
4300 nodes processed.
4400 nodes processed.
4500 nodes processed.
4600 nodes processe

In [69]:
# Repeat the normalisation pass until a full pass adds no node and no edge.
# The earlier draft of this cell did not parse (a `for` with no body) and
# referred to undefined names; the unfinished n-gram step is not included here
# and stays in its own cell.
while True:
    n_nodes_old = graph_syn.number_of_nodes()
    n_edges_old = graph_syn.number_of_edges()
    for ingr in list(graph_syn.nodes()):
        for norm in normalize(ingr):
            # Ignore empty and identity forms, and links that already exist,
            # so that counts are not inflated on later passes.
            if not norm or norm == ingr or graph_syn.has_edge(ingr, norm):
                continue
            if norm not in graph_syn:
                graph_syn.add_node(norm, count=1)
            else:
                graph_syn.node[norm]['count'] += 1
            graph_syn.add_edge(ingr, norm)
    if (graph_syn.number_of_nodes() == n_nodes_old
            and graph_syn.number_of_edges() == n_edges_old):
        break

3238

In [135]:
nx.node_connected_component(graph_syn, 'salsa de soja')

{'salsa de soja', 'shoyu', 'shō-yu', 'shōyu'}

In [46]:
# Earlier version of the graph construction: names on a line are separated by
# ' / ' or ' o ' and are all linked to one representative of the line.
# NOTE(review): `all_lines` is not defined in the visible cells.
for line in all_lines:
    syn_set = set()
    for ingrs2 in line.strip().split(' / '):
        syn_set.update(ingrs2.split(' o '))
    syn_set = list(syn_set)
    i1 = syn_set[0]
    for i2 in syn_set[1:]:
        graph_syn.add_edge(i1, i2)

In [31]:
# Collect the distinct characters for which is_ascii() is false, over all node
# names in the graph.
# NOTE(review): `is_ascii` is not defined in the visible cells - confirm where
# it comes from.
nonascii = set()
for ingr in graph_syn.nodes_iter():
#     q = ingr.replace(' ', 'xxx')
#     if not q.isalpha():
#         print(q.replace('xxx', ' '))
    if not is_ascii(ingr):
        for c in ingr:
            if not is_ascii(c):
                nonascii.add(c)

In [45]:
def normalize(expr):
    """Placeholder: does nothing and returns None.

    NOTE(review): this notebook also contains a full `normalize(ingredient)`;
    whichever of the two cells runs last is the definition in effect.
    """
    pass

def lemmatize(expr):
    """Placeholder: does nothing and returns None.

    NOTE(review): this notebook also contains a full `lemmatize(ingredient)`;
    whichever of the two cells runs last is the definition in effect.
    """
    pass

In [46]:
# Earlier version of the graph construction that also adds each name's
# lemmatized form as a node flagged `is_lemma`, and links the raw names and
# their lemmas found on one line to a single representative.
# NOTE(review): `all_lines` is not defined in the visible cells.
# NOTE(review): raw names enter the graph only through add_edge(), i.e. without
# a 'count' attribute; if a lemma later equals such a name, the `+= 1` below
# looks like it would raise KeyError - confirm.
for line in all_lines:
    syn_set = set()
    ingrs1 = line.strip()
    for ingrs2 in ingrs1.split(' / '):
        for ingr in ingrs2.split(' o '):
            lemmatized = lemmatize(ingr)
            if not lemmatized in graph_syn:
                graph_syn.add_node(lemmatized, count=1, is_lemma=True, is_repr=False)
            else:
                graph_syn.node[lemmatized]['count'] += 1
                graph_syn.node[lemmatized]['is_lemma'] = True
            syn_set.add(ingr)
            syn_set.add(lemmatized)
    # Link every name of the line to the first element of the (unordered) set.
    syn_set = list(syn_set)
    i1 = syn_set[0]
    for i2 in syn_set[1:]:
        graph_syn.add_edge(i1, i2)

In [151]:
g=nx.Graph()

In [152]:
g.add_nodes_from([1,2,3])

In [153]:
c=0
for x in list(g.nodes()):
    g.add_node(x+1)
    print('adding',x+1)
    c+=1
    if c>10:
        break

adding 2
adding 3
adding 4


In [150]:
g.nodes()

[1, 2, 3, 4, 5, 6, 7]

In [26]:
g.add_edge(1, 1, b=5)

In [27]:
g.edges(data=True)

[(1, 1, {'b': 5})]

In [28]:
g.add_node(1, {'a':3})

In [29]:
g.nodes(data=True)

[(1, {'a': 3})]

In [26]:
g.node[1]['a'] += 1

In [5]:
type(nltk.FreqDist())

nltk.probability.FreqDist

In [6]:
def lemmatizer(x):
    """Return nltk.FreqDist(x).

    NOTE(review): despite the name, no lemmatization happens here; this looks
    like a stand-in used to try out pickling.
    """
    return nltk.FreqDist(x)

In [12]:
xxx=lemmatizer()

In [None]:
def add_es_ingredient(ingredient):
    """Register a Spanish ingredient expression (not implemented yet).

    Design notes:

    * `ingredient` has the format "salsa de soja o salsa de soya". Splitting
      the alternatives cannot live in the cleaning functions, because the
      information that they are synonyms would be lost there.
    * The expression may be in singular or plural.
    * The whole expression must be lemmatized, dropping prepositions,
      articles, etc. according to the author's own criteria.
    * Every possible variant of the ingredient and its lemmatization must be
      stored.
    * This function is meant to be saved as a pickle object and "exported"
      wherever it is needed.
    """
    # The definition previously had no body at all, which is a SyntaxError.
    raise NotImplementedError('add_es_ingredient is not implemented yet')

In [28]:
# Serialise the object bound to `xxx` to pickle/lemmatizer.pickle.
# NOTE(review): `xxx` is assigned in more than one cell of this notebook; which
# object gets pickled depends on the order in which the cells were run.
if __name__=='__main__':
    with open('pickle/lemmatizer.pickle', 'wb') as f:
        pickle.dump(xxx, f)

In [24]:
class A:
    """Scratch class wrapping the stand-in `lemmatizer` as a method."""

    def lemmatizer(self, x):
        """Return nltk.FreqDist(x); no instance state is used."""
        return nltk.FreqDist(x)

In [25]:
xxx = A()

In [26]:
xxx.lemmatizer(['a','a','b'])

Counter({'a': 2, 'b': 1})

In [38]:
# Print the Spanish ('spa') lemma names of every WordNet synset for "oil".
# The stray interactive prompt (">>>") that made this cell a SyntaxError has
# been removed.
from nltk.corpus import wordnet as wn
for ss in wn.synsets('oil'):
    print(ss.lemma_names('spa'))

['aceite']
['óleo']
['petróleo']
[]
[]
[]


In [69]:
wn.langs

<bound method WordNetCorpusReader.langs of <WordNetCorpusReader in '/home/antonio/nltk_data/corpora/wordnet'>>

In [70]:
from nltk.corpus import omw

ImportError: cannot import name 'omw'

In [83]:
l=wn.lemmas('cane', lang='ita')[0]

In [92]:
l.synset()

Synset('dog.n.01')

In [93]:
l

Lemma('dog.n.01.cane')

In [25]:
ls = wn.lemmas('amaba', lang='spa')

In [26]:
for x in ls:
    print(x.name())

In [None]:
lematizar solamente usando mi lexicon en español
si no existe la palabra y no se puede lematizar, quitar s final si existe
esto aplica a adj y noun, poco probable encontrar un verbo