In [1]:
import shlex

import nltk
from nltk.corpus import cess_esp
from pymongo import MongoClient

# Tagger

In [2]:
# Load the CESS-ESP corpus as a sequence of POS-tagged sentences.
tagged_sp_sents = cess_esp.tagged_sents()

In [3]:
# Hold out the first 10% of the tagged sentences for evaluation and train
# the taggers on the remaining 90%.
size = int(0.1 * len(tagged_sp_sents))
test_sp_sents = tagged_sp_sents[:size]
train_sp_sents = tagged_sp_sents[size:]

In [4]:
# Sanity check: the train and test splits together account for every sentence.
len(tagged_sp_sents) == len(train_sp_sents) + len(test_sp_sents)

True

In [5]:
# Flat sequence of (word, tag) pairs for the whole corpus.
tagged_sp_words = cess_esp.tagged_words()

In [6]:
# Count how often each POS tag occurs in the corpus and show the ten most
# frequent ones; the result is used to pick the default tag for unknown words.
tags = [pos for _, pos in tagged_sp_words]
most_freq_tags = nltk.FreqDist(tags)
most_freq_tags.most_common(10)

[('sps00', 25272),
 ('ncms000', 11428),
 ('Fc', 11420),
 ('ncfs000', 11008),
 ('da0fs0', 6838),
 ('da0ms0', 6012),
 ('rg', 5937),
 ('Fp', 5866),
 ('cc', 5854),
 ('ncmp000', 5711)]

In [7]:
# Tag given to the DefaultTagger at the bottom of the back-off chain.
# 'ncms000' is the most frequent tag starting with 'n' in the counts above.
default_tag = 'ncms000'

In [8]:
# Back-off chain: the trigram tagger falls back to the bigram tagger, which
# falls back to the unigram tagger, which finally falls back to a tagger that
# always answers `default_tag`. All n-gram taggers are trained on the same
# training split.
t0 = nltk.DefaultTagger(default_tag)
t1 = nltk.UnigramTagger(train_sp_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sp_sents, backoff=t1)
sp_tagger = nltk.TrigramTagger(train_sp_sents, backoff=t2)

In [9]:
# Score the full back-off chain on the held-out test sentences.
# NOTE(review): newer NLTK releases deprecate evaluate() in favour of
# accuracy() -- confirm against the installed version.
sp_tagger.evaluate(test_sp_sents)

0.8843257443082312

# Parser

In [10]:
# Parser states (the value of `current_st` while reading the .nlg file):
# 'NOTHING', 'NODETYPES', 'EDGETYPES', 'NODEFIELDS', 'NODE', 'EDGEFIELDS', 'EDGE'
# The parser starts outside of any section.
current_st = 'NOTHING'

In [11]:
# Accumulates the distinct ingredient names found by the parser cell below.
ingredients = set()

In [12]:
path_db = 'data/BulliCompletoEditado.nlg'

# Section delimiter lines of the .nlg file:
#   (delimiter line, state in which it is honoured) -> next state.
# A delimiter met in any other state is treated like an ordinary line.
_SECTION_TRANSITIONS = {
    ('"<NodesTypes>"', 'NOTHING'): 'NODETYPES',
    ('"<EndNodesTypes>"', 'NODETYPES'): 'NOTHING',
    ('"<EdgesTypes>"', 'NOTHING'): 'EDGETYPES',
    ('"<EndEdgesTypes>"', 'EDGETYPES'): 'NOTHING',
    ('"<Nodes>"', 'NOTHING'): 'NODEFIELDS',
    ('"<EndNodes>"', 'NODE'): 'NOTHING',
    ('"<Edges>"', 'NOTHING'): 'EDGEFIELDS',
    ('"<EndEdges>"', 'EDGE'): 'NOTHING',
}


def _contains_noun(phrase):
    """Return True if the tagger gives any token of `phrase` a tag starting with 'n'."""
    tagged = sp_tagger.tag(nltk.word_tokenize(phrase))
    return any(pos.startswith('n') for _, pos in tagged)


def _add_ingredients(exprs, found):
    """Split each expression into ingredient names and add them to the set `found`.

    Every expression is split on ' / ' and each alternative on ' y '.
    An alternative without ' y ' is added as it is; when ' y ' splits it into
    several fragments, only the fragments containing a noun are kept.
    """
    for expr in exprs:
        for alternative in expr.split(' / '):
            parts = alternative.split(' y ')
            if len(parts) == 1:
                found.add(parts[0])
            else:
                found.update(part for part in parts if _contains_noun(part))


# NOTE(review): the file is opened with the platform default encoding --
# confirm that it matches the encoding of the .nlg export.
with open(path_db) as f:
    for line in f:
        stripped_line = line.strip()
        next_st = _SECTION_TRANSITIONS.get((stripped_line, current_st))
        if next_st is not None:
            current_st = next_st
        elif current_st == 'NODEFIELDS':
            # The first line after "<Nodes>" is consumed without being parsed.
            current_st = 'NODE'
        elif current_st == 'EDGEFIELDS':
            # The first line after "<Edges>" is consumed without being parsed.
            current_st = 'EDGE'
        elif current_st == 'NODE':
            if stripped_line.endswith('"sabor"'):
                # The first field is the node id; drop its leading 'sabor:' part.
                ide = shlex.split(stripped_line)[0]
                _add_ingredients([ide[len('sabor:'):]], ingredients)
            elif stripped_line.endswith('"ingrediente"'):
                # The first field may list several comma-separated names.
                ide = shlex.split(stripped_line)[0]
                _add_ingredients([n.strip() for n in ide.split(',')], ingredients)
        # Lines inside the type sections, the edge section, or outside of any
        # section are ignored.

In [13]:
len(ingredients)

1922

# Lemmatizer

In [14]:
# Connect to MongoDB with MongoClient's default settings.
# WARNING: drop_database discards any existing 'sp_lexicon' database before
# `db` is bound to it.
client = MongoClient()
client.drop_database('sp_lexicon')
db = client.sp_lexicon

In [15]:
%%time

import csv

with open('data/sp_lexicon.csv') as f:
    reader = csv.reader(
        f,
        delimiter=' ',
    )
    docs = []
    count = 0
    for row in reader:
        for i in range(1, len(row[1:]), 2):
            entry = {}
            entry['flexion'] = row[0]
            entry['lemma'] = row[i]
            entry['eagle'] = row[i+1]
            docs.append(entry)
            count += 1
        if count % 1000 == 0:
            db.lexicon.insert_many(docs)
            docs = []

CPU times: user 12 s, sys: 64 ms, total: 12.1 s
Wall time: 18.2 s


In [16]:
# Number of documents currently stored in the lexicon collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases in
# favour of count_documents({}) -- confirm against the installed version.
db.lexicon.count()

668000

In [15]:
def singularize(word):
    """Return the singular form of a Spanish word (not implemented yet).

    Design note, based on lexicon rows of the form "flexion lemma eagle":

        amarilla  amarillo AQ0FS0
        amarillas amarillo AQ0FP0

    The singular of "amarillas" is the flexion whose lemma is the same as the
    lemma of "amarillas" but whose EAGLE tag has S instead of P.

    TODO: decide whether this applies only to adjectives and nouns or to
    articles as well.

    Raises:
        NotImplementedError: always, until the lookup described above exists.
    """
    raise NotImplementedError('singularize() is not implemented yet')


¿Extraer los ingredientes del texto de la preparación o sacarlos del grafo que ya tengo?

In [None]:
# Singularize every word of every ingredient name. The results go into a
# separate set so that `ingredients` is not modified while it is iterated.
# NOTE(review): the original loop header was incomplete; iterating over
# `ingredients` is assumed -- confirm.
singularized_ingredients = set()
for ingredient in ingredients:
    singularized = ' '.join(map(singularize, ingredient.split()))
    singularized_ingredients.add(singularized)

In [13]:
# Write the collected ingredient names to disk, one per line, in sorted order.
ingredients_path = 'data/ingredients/elbulli_ingredients.txt'
with open(ingredients_path, 'w') as out_file:
    out_file.write('\n'.join(sorted(ingredients)))