# In [None]:
# -*- coding: utf-8 -*-
import os
import config
import codecs
import chardet
import string
import freeling

# FreeLing initialization: builds the module-level analyzer objects that the
# functions defined below rely on (tk, sp, sid, mf, tg, sen, parser, dep).

# Installation prefix and language data directory of FreeLing.
FREELINGDIR = "/usr/local"
DATA = FREELINGDIR + "/share/freeling/"
LANG="es"

freeling.util_init_locale("default")

# Create the language identifier.
# NOTE(review): `la` is not referenced anywhere else in the visible code.
la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")
# Create the option set for the maco analyzer. Default values are OK, except for data files.
# NOTE(review): the language is hard-coded as "es" here instead of reusing LANG;
# keep both in sync if the language is ever changed.
op = freeling.maco_options("es")
# Data files are passed positionally; the empty strings presumably mean
# "no file for this slot" -- TODO confirm the argument order against the
# installed FreeLing version.
op.set_data_files("",
                  DATA + "common/punct.dat",
                  DATA + LANG + "/dicc.src",
                  DATA + LANG + "/afixos.dat",
                  "",
                  DATA + LANG + "/locucions.dat",
                  DATA + LANG + "/np.dat",
                  DATA + LANG + "/quantities.dat",
                  DATA + LANG + "/probabilitats.dat")


# Create the analyzers: tokenizer, sentence splitter (plus the session handle
# that is passed to sp.split() later on) and the maco analyzer.
tk=freeling.tokenizer(DATA+LANG+"/tokenizer.dat")
sp=freeling.splitter(DATA+LANG+"/splitter.dat")
sid=sp.open_session()
mf=freeling.maco(op)

# Activate the morphological submodules to be used in the next call.
# The flags are positional; which submodule each one controls depends on the
# FreeLing version -- TODO confirm before changing any of them.
mf.set_active_options(False, True, True, True,  # select which among the created
                      True, True, False, True,  # submodules are to be used.
                      True, True, True, True ) # default: all created submodules are used

# Create the tagger, sense annotator, and parsers.
tg=freeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2)
sen=freeling.senses(DATA+LANG+"/senses.dat")
parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat")
dep=freeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol())

def generate_freeling_annotations(input_text):
    """Run the FreeLing pipeline over ``input_text`` and return its output.

    The text is tokenized and split with the module-level tokenizer and
    splitter, then handed in order to the maco analyzer, the tagger, the
    sense annotator, the chunk parser and the dependency parser.

    Returns the value produced by the last stage; callers in this file
    iterate it sentence by sentence.
    """
    token_list = tk.tokenize(input_text)
    # NOTE(review): the third argument is presumably the splitter's "flush"
    # flag; with False, trailing text may stay buffered in session `sid`
    # between calls -- confirm this is intended for independent documents.
    sentences = sp.split(sid, token_list, False)

    # Each stage consumes the previous stage's output; the order matters.
    for stage in (mf, tg, sen, parser, dep):
        sentences = stage.analyze(sentences)

    return sentences

def process_line(input_line):
    """Analyze ``input_line`` and describe every word found in it.

    Returns a flat list with one ``[form, lemma, tag]`` entry per word, in
    the order the sentences and their words are produced by
    ``generate_freeling_annotations``.
    """
    return [
        [word.get_form(), word.get_lemma(), word.get_tag()]
        for sentence in generate_freeling_annotations(input_line)
        for word in sentence.get_words()
    ]

def process_line_filtered(input_line, filters):
    """Analyze ``input_line`` and keep only the words whose tag is accepted.

    A word is kept when ``pass_filter`` accepts its tag for the given
    ``filters``. Returns a flat list of ``[form, lemma, tag]`` entries for
    the kept words, in analysis order.
    """
    kept = []

    for sentence in generate_freeling_annotations(input_line):
        for word in sentence.get_words():
            tag = word.get_tag()
            # Guard clause: skip words whose tag matches none of the filters.
            if not pass_filter(tag, filters):
                continue
            kept.append([word.get_form(), word.get_lemma(), tag])

    return kept

def pass_filter(word, filters):
    """Return True when ``word`` starts with at least one of ``filters``.

    ``filters`` is an iterable of prefixes; an empty iterable accepts
    nothing. Within this file the function is called with a word's tag as
    ``word``.
    """
    return any(word.startswith(prefix) for prefix in filters)

def get_lemmatized_text(input_line, filters):
    """Return the lemmas of the accepted words of ``input_line`` as one string.

    The line is analyzed with ``process_line_filtered`` using ``filters``;
    the lemma (second field) of every kept word is converted with ``str``
    and the lemmas are joined with single spaces. Leading and trailing
    whitespace is stripped, so a line with no kept words yields ``""``.
    """
    tuples = process_line_filtered(input_line, filters)
    # join() builds the string in one pass instead of re-copying the
    # accumulated result on every iteration.
    return " ".join(str(t[1]) for t in tuples).strip()

def parse_file(file_path):
    """Read a UTF-8 document file and return its first three lines as a dict.

    The file content is split on ``'\\n'``; line 1 is stored under
    ``'titulo'``, line 2 under ``'resumen'`` and line 3 under ``'texto'``.
    Any further lines are ignored.

    Raises:
        IndexError: if the file has fewer than three lines.
        Whatever ``codecs.open`` / ``read`` raise when the file cannot be
        opened or is not valid UTF-8.
    """
    # The context manager closes the handle even if reading/decoding fails.
    with codecs.open(file_path, encoding='utf-8') as f:
        content = f.read()

    lines = content.split('\n')
    return {
        'titulo': lines[0],
        'resumen': lines[1],
        'texto': lines[2],
    }

# In [None]:
# Parsed documents (dicts returned by parse_file) and, in the same order,
# the lemmatized version of each document's 'texto' field.
documents = []
documents_lemmatized = []

# Parse every file of the raw TED dataset directory. A file that cannot be
# parsed is reported and skipped so that one bad file does not abort the run.
for file_name in os.listdir(config.DATASET_TED_RAW):
    try:
        documents.append(parse_file(os.path.join(config.DATASET_TED_RAW, file_name)))
    except Exception as e:
        print(file_name)
        # Print the exception itself: `e.message` is deprecated and is empty
        # for IOError / UnicodeDecodeError, which hid the actual cause.
        print(e)

# Keep only nouns, verbs and qualifying adjectives (tag prefixes N, V, AQ).
for doc in documents:
    # parse_file returns decoded (unicode) text, so encode it directly.
    # Wrapping it in str() first performs an implicit ASCII encode in
    # Python 2 and raises UnicodeEncodeError on any accented character.
    documents_lemmatized.append(get_lemmatized_text(doc['texto'].encode('utf-8'), ["N", "V", "AQ"]))

# Show up to the first three results; slicing avoids an IndexError when
# fewer than three documents were parsed successfully.
for lemmatized_text in documents_lemmatized[:3]:
    print(lemmatized_text)