In [1]:
import itertools
import os
import pickle
from collections import defaultdict

import networkx as nx
import nltk
from nltk.corpus import cess_esp
from nltk.corpus import wordnet as wn
from pymongo import MongoClient

In [65]:
client = MongoClient()
client.drop_database('lexicon')
db = client.lexicon

# Lexicon

In [66]:
%%time

import csv

with open('data/sp_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    docs = []
    count = 0
    for row in reader:
        for i in range(1, len(row[1:]), 2):
            entry = {}
            entry['flexion'] = row[0].lower()
            entry['lemma'] = row[i].lower()
            entry['eagle'] = row[i+1].lower()
            docs.append(entry)
            count += 1
        if count % 1000 == 0:
            db.es_lexicon.insert_many(docs)
            docs = []
    db.es_lexicon.insert_many(docs)
    docs = []

CPU times: user 12.6 s, sys: 72 ms, total: 12.7 s
Wall time: 19 s


In [15]:
db.es_lexicon.count()

668825

In [56]:
'Å'.lower()

'å'

# POS tagger

In [4]:
tagged_sp_sents = cess_esp.tagged_sents()

In [5]:
size = int(len(tagged_sp_sents) * 0.1)
train_sp_sents = tagged_sp_sents[size:]
test_sp_sents = tagged_sp_sents[:size]

In [6]:
len(tagged_sp_sents) == len(train_sp_sents) + len(test_sp_sents)

True

In [7]:
tagged_sp_words = cess_esp.tagged_words()

In [8]:
tags = [tag for (word, tag) in tagged_sp_words]
most_freq_tags = nltk.FreqDist(tags)
most_freq_tags.most_common()[:10]

[('sps00', 25272),
 ('ncms000', 11428),
 ('Fc', 11420),
 ('ncfs000', 11008),
 ('da0fs0', 6838),
 ('da0ms0', 6012),
 ('rg', 5937),
 ('Fp', 5866),
 ('cc', 5854),
 ('ncmp000', 5711)]

In [9]:
default_tag = 'ncms000'

In [10]:
t0 = nltk.DefaultTagger(None)
t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [11]:
sp_tagger.evaluate(test_sp_sents)

0.8813485113835376

In [73]:
tagger('del')

[('del', 'noun')]

In [44]:
list(db.es_lexicon.find({'flexion': 'salsa'}))

[{'_id': ObjectId('5701d778a688eb3304f2b5f5'),
  'eagle': 'NCFS000',
  'flexion': 'salsa',
  'lemma': 'salsa'}]

In [46]:
nonascii = ['á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō']

In [48]:
nonascii2 = list(map(lambda x: x.upper(), nonascii))

In [49]:
nonascii2

['Á', 'Ã', 'Ç', 'È', 'É', 'Ê', 'Í', 'Ñ', 'Ò', 'Ó', 'Ú', 'Ü', 'Ō']

In [50]:
list(map(lambda x: x.lower(), nonascii2))

['á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō']

# Ingredients

In [2]:
def is_spanish_ingredients_file(filename):
    return filename.startswith('es_') and filename.endswith('_ingredients.txt')

In [17]:
graph_syn = nx.Graph()
ingredients_root = 'data/ingredients/'
for e in os.listdir(ingredients_root):
    file_path = ingredients_root + e
    if os.path.isfile(file_path):
        if is_spanish_ingredients_file(e):
            with open(file_path) as f:
                for line in f:
                    syn_set = set()
                    ingrs1 = line.strip()
                    for ingrs2 in ingrs1.split(' / '):
                        for ingrs3 in ingrs2.split(' o '):
                            for ingr in ingrs3.split(' - '):
                                syn_set.add(ingr)
                                if not ingr in graph_syn:
                                    graph_syn.add_node(ingr, count=1)
                                else:
                                    graph_syn.node[ingr]['count'] += 1
                    syn_set = list(syn_set)
                    i1 = syn_set[0]
                    for i2 in syn_set[1:]:
                        graph_syn.add_edge(i1, i2)

In [21]:
len(graph_syn)

3322

In [22]:
nx.number_connected_components(graph_syn)

3221

In [19]:
nltk.word_tokenize('crema de foie-gras de pato')

['crema', 'de', 'foie-gras', 'de', 'pato']

In [75]:
ingredient_tagger('crema de foie-gras de pato')

[('crema', 'noun'),
 ('de', 'prep'),
 ('foie-gras', 'noun'),
 ('de', 'prep'),
 ('pato', 'noun')]

In [31]:
nonascii = set()
for ingr in graph_syn.nodes_iter():
#     q = ingr.replace(' ', 'xxx')
#     if not q.isalpha():
#         print(q.replace('xxx', ' '))
    if not is_ascii(ingr):
        for c in ingr:
            if not is_ascii(c):
                nonascii.add(c)

In [23]:
lengths = set()
for ingr in graph_syn.nodes_iter():
    tokens = nltk.word_tokenize(ingr)
    lengths.add(len(tokens))

In [24]:
lengths

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [89]:
tag_mapping = {
    'a': 'adj',
    'r': 'adv',
    'd': 'det',
    'n': 'noun',
    'v': 'verb',
    'p': 'pron',
    'c': 'conj',
    'i': 'interj',
    's': 'prep',
    'f': 'punt',
    'z': 'num',
    'w': 'date-time',
}

def ingredient_tagger(x):
    result = []
    tokens = nltk.word_tokenize(x)
    if len(tokens) == 1:
        result.append((x, 'noun'))
    else:
        tags = sp_tagger.tag(tokens)
        for token, tag in tags:
            if not tag:
                res = db.es_lexicon.find({'flexion': token})
                for r in res:
                    categ = tag_mapping[r['eagle'][0]]
                    if not tag:
                        tag = categ
                    elif categ == 'num' and tag not in ['verb', 'adj', 'noun']:
                        tag = categ
                    elif categ == 'verb' and tag not in ['adj', 'noun']:
                        tag = categ
                    elif categ == 'adj' and tag not in ['noun']:
                        tag = categ
                    elif categ == 'noun':
                        tag = categ
                        break
                if not tag:
                    tag = 'noun'
                result.append((token, tag))
            else:
                result.append((token, tag_mapping[tag.lower()[0]]))
    return result

In [137]:
# Numbers
def numbers(x):
    return x.replace(' 1 ', ' uno ') \
            .replace(' 2 ', ' dos ') \
            .replace(' 3 ', ' tres ') \
            .replace(' 4 ', ' cuatro ') \
            .replace(' 5 ', ' cinco ') \
            .replace(' 6 ', ' seis ') \
            .replace(' 7 ', ' siete ') \
            .replace(' 8 ', ' ocho ') \
            .replace(' 9 ', ' nueve ')

# Accent marks on vowels - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def accent_marks(x):
    return x.replace('á', 'a') \
            .replace('ã', 'a') \
            .replace('è', 'e') \
            .replace('é', 'e') \
            .replace('ê', 'e') \
            .replace('í', 'i') \
            .replace('ò', 'o') \
            .replace('ó', 'o') \
            .replace('ō', 'o') \
            .replace('ú', 'u') \
            .replace('ü', 'u')

# Non-ascii consonants - {'á', 'ã', 'ç', 'è', 'é', 'ê', 'í', 'ñ', 'ò', 'ó', 'ú', 'ü', 'ō'}
def nonascii_consonants(x):
    return x.replace('ç', 'c') \
            .replace('ñ', 'n')
    
# Dashes (-)
def dashes1(x):
    return x.replace('-', '')

def dashes2(x):
    return x.replace('-', ' ')

# POS tags
# ADJETIVOS .... A ADJ ...... X
# ADVERBIOS .... R ADV
# DETERMINANTES  D DET
# NOMBRES ...... N NOUN ..... X
# VERBOS ....... V VERB ..... X
# PRONOMBRES ... P PRON
# CONJUNCIONES . C CONJ
# INTERJECCIONES I INTERJ
# PREPOSICIONES  S PREP
# PUNTUACIÓN ... F PUNTUATION
# NUMERALES .... Z NUM ...... X
# FECHAS Y HORAS W DATE-TIME
def pos_tags(x):
    tags = ingredient_tagger(x)
    filtered = [token
                for token, tag in tags
                if tag in ['num', 'verb', 'adj', 'noun']]
    return ' '.join(filtered)

funcs = [pos_tags, numbers, accent_marks, nonascii_consonants, dashes1, dashes2] # pos_tags first
combinations = []
for i in range(1, len(funcs) + 1):
    combinations.append(list(itertools.combinations(funcs, i)))
combinations = [c for comb in combinations for c in comb]

def normalize_old(ingredient):
    result = set()
    for c in combinations:
        x = ingredient
        for f in c:
            x = f(x)
        result.add(x)
    return result

def normalize(ingredient): # dynamic programming version
    d = {}
    for f in funcs:
        d[f] = {}
    result = set()
    for c in combinations:
        x = ingredient
        for f in c:
            if not x in d[f]:
                d[f][x] = f(x)
            x = d[f][x]
        result.add(x)
    return result

In [138]:
ingredient_tagger('ácido de 5 piñas')

[('ácido', 'noun'), ('de', 'prep'), ('5', 'num'), ('piñas', 'noun')]

In [139]:
ingredient_tagger('ácido de cinco piñas')

[('ácido', 'noun'), ('de', 'prep'), ('cinco', 'det'), ('piñas', 'noun')]

In [141]:
%%time
normalize('ácido de 5 piñas')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 288 ms


{'acido 5 pinas',
 'acido 5 piñas',
 'acido cinco pinas',
 'acido cinco piñas',
 'acido de 5 pinas',
 'acido de 5 piñas',
 'acido de cinco pinas',
 'acido de cinco piñas',
 'ácido 5 pinas',
 'ácido 5 piñas',
 'ácido cinco pinas',
 'ácido cinco piñas',
 'ácido de 5 pinas',
 'ácido de 5 piñas',
 'ácido de cinco pinas',
 'ácido de cinco piñas'}

In [142]:
%%time
normalize_old('ácido de 5 piñas')

CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 8.02 s


{'acido 5 pinas',
 'acido 5 piñas',
 'acido cinco pinas',
 'acido cinco piñas',
 'acido de 5 pinas',
 'acido de 5 piñas',
 'acido de cinco pinas',
 'acido de cinco piñas',
 'ácido 5 pinas',
 'ácido 5 piñas',
 'ácido cinco pinas',
 'ácido cinco piñas',
 'ácido de 5 pinas',
 'ácido de 5 piñas',
 'ácido de cinco pinas',
 'ácido de cinco piñas'}

In [19]:
pos_tags('ácido de 5 piñas')

[('ácido', 'ncms000'), ('de', 'sps00'), ('5', 'Z'), ('piñas', None)]

In [93]:
res = set()
for c in combinations:
    x = 'ácido de 5 piñas'
    for f in c:
        x = f(x)
    res.add(x)
res

TypeError: unhashable type: 'list'

In [82]:
res = set()
for c in combinations:
    x = 'ácido de 5 piñas'
    for f in c:
        x = f(x)
    res.add(x)
res

{'acido de 5 pinas',
 'acido de 5 piñas',
 'acido de cinco pinas',
 'acido de cinco piñas',
 'ácido de 5 pinas',
 'ácido de 5 piñas',
 'ácido de cinco pinas',
 'ácido de cinco piñas'}

In [47]:
for f in funcs:
    print(f('ácido de 5 piñas'))

['helado de cinco piñas']
['helado de 5 piñas']
['helado de 5 pinas']
['helado de 5 piñas', 'helado de 5 piñas']


In [56]:
import itertools
r = []
for i in range(1, len([1,2,3,4]) + 1):
    r.append(list(itertools.combinations([1,2,3,4], i)))
[y for x in r for y in x]

[(1,),
 (2,),
 (3,),
 (4,),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 3),
 (2, 4),
 (3, 4),
 (1, 2, 3),
 (1, 2, 4),
 (1, 3, 4),
 (2, 3, 4),
 (1, 2, 3, 4)]

In [28]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

In [69]:
for length in lengths:
    for ingr in graph_syn.nodes_iter():
        tokens = nltk.word_tokenize(ingr)
        if len(tokens) == length:
            lemmatized = lemmatize(tokens)
            if not lemmatized in graph_syn:
                graph_syn.add_node(lemmatized, count=1)
            else:
                graph_syn.node[lemmatized]['count'] += 1
            graph_syn.add_edge(ingr, lemmatized)

3238

In [71]:
nx.node_connected_component(graph_syn, 'algas')

{'algas'}

In [46]:
for line in all_lines:
    syn_set = set()
    ingrs1 = line.strip()
    for ingrs2 in ingrs1.split(' / '):
        for ingr in ingrs2.split(' o '):
            syn_set.add(ingr)
    syn_set = list(syn_set)
    i1 = syn_set[0]
    for i2 in syn_set[1:]:
        graph_syn.add_edge(i1, i2)

In [45]:
def normalize(expr):
    pass

def lemmatize(expr):
    pass

In [46]:
for line in all_lines:
    syn_set = set()
    ingrs1 = line.strip()
    for ingrs2 in ingrs1.split(' / '):
        for ingr in ingrs2.split(' o '):
            lemmatized = lemmatize(ingr)
            if not lemmatized in graph_syn:
                graph_syn.add_node(lemmatized, count=1, is_lemma=True, is_repr=False)
            else:
                graph_syn.node[lemmatized]['count'] += 1
                graph_syn.node[lemmatized]['is_lemma'] = True
            syn_set.add(ingr)
            syn_set.add(lemmatized)
    syn_set = list(syn_set)
    i1 = syn_set[0]
    for i2 in syn_set[1:]:
        graph_syn.add_edge(i1, i2)

In [25]:
g=nx.Graph()

In [26]:
g.add_edge(1, 1, b=5)

In [27]:
g.edges(data=True)

[(1, 1, {'b': 5})]

In [28]:
g.add_node(1, {'a':3})

In [29]:
g.nodes(data=True)

[(1, {'a': 3})]

In [26]:
g.node[1]['a'] += 1

In [5]:
type(nltk.FreqDist())

nltk.probability.FreqDist

In [6]:
def lemmatizer(x):
    return nltk.FreqDist(x)

In [12]:
xxx=lemmatizer()

In [None]:
def add_es_ingredient(ingredient):
    #ingredient tiene format "salsa de soja o salsa de soya" -> esto no puede ir en las funciones clean
    #pq entonces se pierde la info de que son ingredientes sinonimos
    #puede estar en singular o plurar
    #hay que lematizar la expresion (ingredient) completa, eliminar preps, arts... segun mi criterio
    #hay que guardar todas las posibles variantes del ingredient y su lematizacion
    #guardar esta funcion en un obj pickle y "exportarla" donde sea necesario

In [28]:
if __name__=='__main__':
    with open('pickle/lemmatizer.pickle', 'wb') as f:
        pickle.dump(xxx, f)

In [24]:
class A:
    def lemmatizer(self, x):
        return nltk.FreqDist(x)

In [25]:
xxx = A()

In [26]:
xxx.lemmatizer(['a','a','b'])

Counter({'a': 2, 'b': 1})

In [38]:
>>> from nltk.corpus import wordnet as wn
for ss in wn.synsets('oil'):
    print(ss.lemma_names('spa'))

['aceite']
['óleo']
['petróleo']
[]
[]
[]


In [69]:
wn.langs

<bound method WordNetCorpusReader.langs of <WordNetCorpusReader in '/home/antonio/nltk_data/corpora/wordnet'>>

In [70]:
from nltk.corpus import omw

ImportError: cannot import name 'omw'

In [83]:
l=wn.lemmas('cane', lang='ita')[0]

In [92]:
l.synset()

Synset('dog.n.01')

In [93]:
l

Lemma('dog.n.01.cane')

In [25]:
ls = wn.lemmas('amaba', lang='spa')

In [26]:
for x in ls:
    print(x.name())

In [None]:
lematizar solamente usando mi lexicon en español
si no existe la palabra y no se puede lematizar, quitar s final si existe
esto aplica a adj y noun, poco probable encontrar un verbo