In [119]:
%load_ext watermark
%watermark

2017-05-22T18:07:35-04:00

CPython 3.6.0
IPython 5.3.0

compiler   : GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)
system     : Darwin
release    : 16.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


In [186]:
from tf_idf import preprocess_fundamentos, remove_accents
from sklearn import model_selection
from tqdm import tqdm, trange
from collections import Counter, defaultdict
from operator import itemgetter

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from models import PNUD
import logging
import random
import pandas as pd
import numpy as np
import unicodedata
import re
import itertools

from pathlib import Path
from pprint import pprint

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC  # support vector machine classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB  # naive bayes
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize
import string
import sys

import subprocess

logger = logging.getLogger(__name__)

# engine = create_engine('mysql://root@localhost/constabierta')
# _session = sessionmaker(bind=engine, autocommit=True, expire_on_commit=True)
# session = _session()

data_dir = Path('data/')
temas = list(range(1, 5))
stop_words = stopwords.words('spanish')
stemmer = SnowballStemmer('spanish')

In [166]:
def remove_accents(s):
    return unicodedata.normalize('NFKD', s) \
        .encode('ASCII', 'ignore') \
        .decode('utf-8')
            
def cleaner(token):
    if token in stop_words or token in string.punctuation:
        return None
    return remove_accents(token)

# Feature Extraction con FreeLing

Freeling no se lleva bien con strings de sólo mayúsculas. Los convertiremos a lowercase:

In [72]:
# CONVERTIR ARCHIVOS

### convierte los archivos x_*_tema_*_categorias_pnud_0.txt 
### en lowercase x_*_tema_*_categorias_pnud_0_lower.txt

### ¿BUG? en freeling: si hay una frase del tipo "hola. ¿esto es una pregunta?",
### freeling considera esa frase como dos documentos, separados por el ". ¿"
### al pasar a lowercase, se elimina el "." de ese tipo de frases

### hay más casos parecidos con la puntuación, se reemplaza toda punto por coma

x_files = [d.as_posix() for d in data_dir.glob('x_*_tema_*_categorias_pnud_0.txt')]
x_files_lower = list(map(lambda f: f.split('.')[0] + '_lower.txt', x_files))

for x_file, x_file_lower in tqdm(zip(x_files, x_files_lower), total=len(x_files)):
    with open(x_file, 'r') as f, open(x_file_lower, 'w') as g:
        for line in f:
            to_write = line.lower().replace('.', ',')
            to_write = re.sub('\?+ *¿', '?, ¿', to_write)
            
            #g.write(re.sub('\.+ *¿', ' ¿', line.lower()))
            g.write(to_write)

100%|██████████| 12/12 [00:01<00:00,  4.75it/s]


In [73]:
# GENERAR ARCHIVOS CON FREELING

# freeling
for tema in tqdm(temas):
    x_files = [d.as_posix() for d in data_dir.glob(f'x_*_tema_{tema}_categorias_pnud_0_lower.txt')]
    for input_file in x_files:
        output_file = input_file.split('.')[0] + '_fl.txt'
        subprocess.getoutput(f'analyze -f freeling_es.cfg < {input_file} > {output_file}')

100%|██████████| 4/4 [07:52<00:00, 117.64s/it]


In [125]:
# CARGAR ARCHIVOS

temas = range(1, 5)

# load freeling processed files
docs_train = defaultdict(list)
docs_dev = defaultdict(list)
docs_test = defaultdict(list)

for tema in tqdm(temas, desc="temas"):
    logger.info("train")
    with Path('data', f'x_train_tema_{tema}_categorias_pnud_0_lower_fl.txt').open('r') as f:
        doc = []
        for line in f:
            if not line.strip():
                docs_train[tema].append(doc)
                doc = []
                continue
            doc.append(line[:-1].split()[1:6])
            
    logger.info("dev")
    with Path('data', f'x_dev_tema_{tema}_categorias_pnud_0_lower_fl.txt').open('r') as g:
        doc = []
        for line in g:
            if not line.strip():
                docs_dev[tema].append(doc)
                doc = []
                continue
            doc.append(line[:-1].split()[1:6])
            
    logger.info("test")
    with Path('data', f'x_test_tema_{tema}_categorias_pnud_0_lower_fl.txt').open('r') as h:
        doc = []
        for line in h:
            if not line.strip():
                docs_test[tema].append(doc)
                doc = []
                continue
            doc.append(line[:-1].split()[1:6])

temas: 100%|██████████| 4/4 [00:13<00:00,  3.37s/it]


In [168]:
# prepare train & test data

temas = range(1, 5)
train = defaultdict(list)
dev = defaultdict(list)
test = defaultdict(list)

# para cada tokenset (dado por freeling):
# ejemplo:
# ['los',   <--- token original
# 'el',     <--- lema
# 'DA0MP0', <--- POS
# 'DA',     <--- POS2
# 'pos=determiner|type=article|gen=masculine|num=plural']  <--- detalle de POS
# usamos el lema sin caracteres no-ascii y filtramos stopwords y puntuacion

for tema in temas:
    for doc in tqdm(docs_train[tema], f"train (tema {tema})\t"):
        tokens = map(itemgetter(1), doc)
        tokens = map(cleaner, tokens)
        tokens = filter(lambda t: t is not None, tokens)
        train[tema].append(list(tokens))
        
    for doc in tqdm(docs_dev[tema], f"dev (tema {tema})\t"):
        tokens = map(itemgetter(1), doc)
        tokens = map(cleaner, tokens)
        tokens = filter(lambda t: t is not None, tokens)
        dev[tema].append(list(tokens))
        
    for doc in tqdm(docs_test[tema], f"test (tema {tema})\t"):
        tokens = map(itemgetter(1), doc)
        tokens = map(cleaner, tokens)
        tokens = filter(lambda t: t is not None, tokens)
        test[tema].append(list(tokens))

train (tema 1)	: 100%|██████████| 36886/36886 [00:04<00:00, 8978.49it/s]
dev (tema 1)	: 100%|██████████| 4611/4611 [00:00<00:00, 8061.77it/s]
test (tema 1)	: 100%|██████████| 4611/4611 [00:00<00:00, 8141.04it/s]
train (tema 2)	: 100%|██████████| 34963/34963 [00:04<00:00, 8733.44it/s]
dev (tema 2)	: 100%|██████████| 4371/4371 [00:00<00:00, 7937.89it/s]
test (tema 2)	: 100%|██████████| 4370/4370 [00:00<00:00, 7613.67it/s]
train (tema 3)	: 100%|██████████| 32104/32104 [00:03<00:00, 9823.79it/s] 
dev (tema 3)	: 100%|██████████| 4013/4013 [00:00<00:00, 7780.63it/s]
test (tema 3)	: 100%|██████████| 4013/4013 [00:00<00:00, 8574.49it/s]
train (tema 4)	: 100%|██████████| 31440/31440 [00:03<00:00, 10038.14it/s]
dev (tema 4)	: 100%|██████████| 3930/3930 [00:00<00:00, 8681.61it/s] 
test (tema 4)	: 100%|██████████| 3930/3930 [00:00<00:00, 9362.14it/s]


In [128]:
y_train = defaultdict(list)
y_dev = defaultdict(list)
y_test = defaultdict(list)
target_names = defaultdict(list)

for tema in tqdm(temas):
    # train
    with Path('data', f'y_train_tema_{tema}_categorias_pnud_0.txt').open('r') as f:
        y_train[tema] = [int(line[:-1]) for line in f]

    # dev
    with Path('data', f'y_dev_tema_{tema}_categorias_pnud_0.txt').open('r') as f:
        y_dev[tema] = [int(line[:-1]) for line in f]

    # test
    with Path('data', f'y_test_tema_{tema}_categorias_pnud_0.txt').open('r') as f:
        y_test[tema] = [int(line[:-1]) for line in f]
    
    # labels
    with Path('data', f'categorias_tema_{tema}_pnud_0.txt').open('r') as f:
        target_names[tema] = list(map(lambda l: l[:-1], f.readlines()))

100%|██████████| 4/4 [00:00<00:00, 14.04it/s]


In [None]:
# DEBUG

# para encontrar mismatchs entre datos originales y producidos por freeling:
lines = []
with open('data/x_train_tema_1_categorias_pnud_0_lower_fl.txt', 'r') as f:
    with open('data/x_train_tema_1_categorias_pnud_0_lower.txt', 'r') as g:
        g_lines = g.readlines()
        doc_p = set()
        i = 0
        for line in tqdm(f.readlines()):
            if not line.strip():
                doc_o = set(g_lines[i].split())
                jacc = len(doc_o & doc_p) / len(doc_o | doc_p)
                if jacc < 0.001:
                    lines.append((i, ' '.join(sorted(list(doc_o))), ' '.join(sorted(list(doc_p)))))
                i += 1
                doc_p = set()
                continue
            doc_p.add(line.split()[1])

# Sanity checks

Verificar si las dimensiones de los datasets son correctas

In [171]:
for tema in temas:
    print(f'tema:\t{tema}')
    print(f'train:', ('✅' if len(train[tema]) == len(y_train[tema]) else '❌'), sep='\t')
    print(f'dev:', ('✅' if len(dev[tema]) == len(y_dev[tema]) else '❌'), sep='\t')
    print(f'test', ('✅' if len(test[tema]) == len(y_test[tema]) else '❌'), sep='\t')
    print()

tema:	1
train:	✅
dev:	✅
test	✅

tema:	2
train:	✅
dev:	✅
test	✅

tema:	3
train:	✅
dev:	✅
test	✅

tema:	4
train:	✅
dev:	✅
test	✅



In [173]:
train[1][:3]

[['gobierno',
  'deber',
  'ser',
  'trasparente',
  'informar',
  'antes_de',
  'gastar',
  'dinero',
  'informar',
  'publicitar',
  'proyecto',
  'ejecutar'],
 ['responsable', 'amplio', 'libre', 'participativo', 'sano'],
 ['mejor',
  'justicia',
  'falta',
  'pueblo',
  'ciudad',
  'persona',
  'mejor',
  'manera',
  'mantener',
  'pais',
  'contento',
  'convivencia',
  'respeto',
  'institucion',
  'deber',
  'ser',
  'justo',
  'origen',
  'aplicacion']]

In [167]:
docs_train[1][0][0]

['los',
 'el',
 'DA0MP0',
 'DA',
 'pos=determiner|type=article|gen=masculine|num=plural']

In [184]:
cnt = CountVectorizer(ngram_range=(1, 1))
data = ['gato perro conejo', 'hola gato', 'chao perro']
m = cnt.fit_transform(data)
print(sorted(cnt.vocabulary_.items(), key=itemgetter(1)))
m.todense()

[('chao', 0), ('conejo', 1), ('gato', 2), ('hola', 3), ('perro', 4)]


matrix([[0, 1, 1, 0, 1],
        [0, 0, 1, 1, 0],
        [1, 0, 0, 0, 1]], dtype=int64)

In [146]:
tf = TfidfTransformer(use_idf=False)
m2 = tf.transform(m)
m2.todense()

matrix([[ 0.        ,  0.57735027,  0.57735027,  0.        ,  0.57735027],
        [ 0.        ,  0.        ,  0.70710678,  0.70710678,  0.        ],
        [ 0.70710678,  0.        ,  0.        ,  0.        ,  0.70710678]])

----

# Definir experimentos

1. Features
    1. tokens
    2. tokens + POS
    3. tokens + POS + estructural (largo frase, nº de oraciones, ?)
2. Representación
    2. Conteo de unigramas
    2. Conteo de bigramas
    3. Unigramas normalizado por documento (tf)
    3. Bigramas normalizado por documento (tf)
    4. Unigramas normalizado por corpus (tf-idf)
    4. Bigramas normalizado por corpus (tf-idf)    
3. Clasificadores
    1. Dummy stratified
    2. Multinomial NB
    3. SVC (linear)
    6. Random forest
    5. KNN
4. Medidas de rendimiento
    1. Precision, recall, f1, accuracy @1
    2. Precision, recall, f1, accuracy @5

## 1. Features

In [176]:
# train, test, & dev contienen sólo los tokens por ahora (19:00 may 22)

## 2. Representación

In [200]:
count_vectorizer_1g = CountVectorizer(analyzer='word', tokenizer=lambda x: x, lowercase=False)
count_vectorizer_2g = CountVectorizer(analyzer='word', tokenizer=lambda x: x, lowercase=False, ngram_range=(2, 2))
count_vectorizer_all= CountVectorizer(analyzer='word', tokenizer=lambda x: x, lowercase=False, ngram_range=(1, 2))

tf = TfidfTransformer(use_idf=False)
tfidf = TfidfTransformer()

counters = [('unigram', count_vectorizer_1g), 
            ('bigram', count_vectorizer_2g),
            ('ngram', count_vectorizer_all)]

norm = [('tf', tf), ('tf-idf', tfidf)]

In [201]:
X_train = dict()
X_dev = dict()
for tema in tqdm(temas):
    for name, cnt in counters:
        X = cnt.fit_transform(train[tema] + dev[tema])
        X_train[(tema, name)], X_dev[(tema, name)] = X[:len(train[tema])], X[len(train[tema]):]

100%|██████████| 4/4 [00:13<00:00,  3.30s/it]


In [202]:
X_train

{(1, 'bigram'): <36886x165980 sparse matrix of type '<class 'numpy.int64'>'
 	with 368637 stored elements in Compressed Sparse Row format>,
 (1, 'ngram'): <36886x176565 sparse matrix of type '<class 'numpy.int64'>'
 	with 750538 stored elements in Compressed Sparse Row format>,
 (1, 'unigram'): <36886x10586 sparse matrix of type '<class 'numpy.int64'>'
 	with 381901 stored elements in Compressed Sparse Row format>,
 (2, 'bigram'): <34963x151152 sparse matrix of type '<class 'numpy.int64'>'
 	with 346652 stored elements in Compressed Sparse Row format>,
 (2, 'ngram'): <34963x161391 sparse matrix of type '<class 'numpy.int64'>'
 	with 704960 stored elements in Compressed Sparse Row format>,
 (2, 'unigram'): <34963x10240 sparse matrix of type '<class 'numpy.int64'>'
 	with 358308 stored elements in Compressed Sparse Row format>,
 (3, 'bigram'): <32104x117642 sparse matrix of type '<class 'numpy.int64'>'
 	with 275687 stored elements in Compressed Sparse Row format>,
 (3, 'ngram'): <32104x

# Baseline 1: Conteo de tokens

In [105]:
cnt = CountVectorizer(analyzer='word', tokenizer=lambda x: x, lowercase=False)
X = cnt.fit_transform(train + test)
X_train, X_test = X[:len(train)], X[len(train):]

X_train_dense, X_test_dense = X_train.toarray(), X_test.toarray()

In [102]:
# sanity checks

print(f"X_train: {X_train.shape}\ny_train: {len(y_train)}\nX_test: {X_test.shape}\ny_test: {len(y_test)}")
print(len(docs_train[1]), len(y_train))
print(len(docs_test[1]), len(y_test))

X_train: (36886, 8656)
y_train: 36886
X_test: (4611, 8656)
y_test: 4611
36886 36886
4611 4611


## KNN@5

In [97]:
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
print("Accuracy:", np.mean(predicted == y_test))

0.45304706137497291

In [101]:
print(metrics.classification_report(y_test, predicted, target_names=target_names))

                                                          precision    recall  f1-score   support

                                          Amistad cívica       0.06      0.11      0.08        27
                                    Autonomía / Libertad       0.37      0.52      0.44       168
                                  Bien Común / Comunidad       0.36      0.60      0.45       276
                                              Ciudadanía       0.00      0.00      0.00        16
                                              Democracia       0.39      0.60      0.47       380
                                              Desarrollo       0.25      0.33      0.29        52
                                       Descentralización       0.70      0.76      0.73       307
                                                Dignidad       0.41      0.42      0.42       193
                                              Diversidad       0.24      0.32      0.28        72
                   

## GaussianNB

In [106]:
clf = GaussianNB()
clf.fit(X_train_dense, y_train)
predicted = clf.predict(X_test_dense)
print("Accuracy:", np.mean(predicted == y_test))

Accuracy: 0.147256560399


In [108]:
print(metrics.classification_report(y_test, predicted, target_names=target_names))

                                                          precision    recall  f1-score   support

                                          Amistad cívica       0.02      0.11      0.03        27
                                    Autonomía / Libertad       0.28      0.17      0.21       168
                                  Bien Común / Comunidad       0.17      0.06      0.09       276
                                              Ciudadanía       0.01      0.12      0.02        16
                                              Democracia       0.32      0.08      0.13       380
                                              Desarrollo       0.04      0.12      0.06        52
                                       Descentralización       0.55      0.26      0.35       307
                                                Dignidad       0.24      0.13      0.17       193
                                              Diversidad       0.05      0.08      0.06        72
                   

## Multinomial NB

In [109]:
clf = MultinomialNB()
clf.fit(X_train_dense, y_train)
predicted = clf.predict(X_test_dense)
print("Accuracy:", np.mean(predicted == y_test))
print(metrics.classification_report(y_test, predicted, target_names=target_names))

Accuracy: 0.615267837779
                                                          precision    recall  f1-score   support

                                          Amistad cívica       1.00      0.04      0.07        27
                                    Autonomía / Libertad       0.65      0.63      0.64       168
                                  Bien Común / Comunidad       0.52      0.75      0.61       276
                                              Ciudadanía       0.00      0.00      0.00        16
                                              Democracia       0.50      0.81      0.62       380
                                              Desarrollo       0.78      0.13      0.23        52
                                       Descentralización       0.82      0.94      0.88       307
                                                Dignidad       0.60      0.64      0.62       193
                                              Diversidad       0.42      0.11      0.18     

  'precision', 'predicted', average, warn_for)


## Dummy stratified

In [112]:
clf = DummyClassifier(strategy='stratified')
clf.fit(X_train_dense, y_train)
predicted = clf.predict(X_test_dense)
print("Accuracy:", np.mean(predicted == y_test))
print(metrics.classification_report(y_test, predicted, target_names=target_names))

Accuracy: 0.0483626111473
                                                          precision    recall  f1-score   support

                                          Amistad cívica       0.00      0.00      0.00        27
                                    Autonomía / Libertad       0.04      0.04      0.04       168
                                  Bien Común / Comunidad       0.05      0.05      0.05       276
                                              Ciudadanía       0.00      0.00      0.00        16
                                              Democracia       0.06      0.06      0.06       380
                                              Desarrollo       0.02      0.02      0.02        52
                                       Descentralización       0.07      0.07      0.07       307
                                                Dignidad       0.05      0.05      0.05       193
                                              Diversidad       0.00      0.00      0.00    

# Baseline 2: Conteo + TF

In [117]:
text_clf = Pipeline([('vect', CountVectorizer(analyzer='word', tokenizer=lambda x: x, lowercase=False)),
                     ('tf', TfidfTransformer(use_idf=False)),
                     ('clf', MultinomialNB())])

%time clf = text_clf.fit(X_train_dense, y_train)

CPU times: user 1min 35s, sys: 2.53 s, total: 1min 37s
Wall time: 1min 44s


In [118]:
predicted = clf.predict(X_test_dense)
print("Accuracy:", np.mean(predicted == y_test))
print(metrics.classification_report(y_test, predicted, target_names=target_names))

Accuracy: 0.0852309694209
                                                          precision    recall  f1-score   support

                                          Amistad cívica       0.00      0.00      0.00        27
                                    Autonomía / Libertad       0.00      0.00      0.00       168
                                  Bien Común / Comunidad       0.00      0.00      0.00       276
                                              Ciudadanía       0.00      0.00      0.00        16
                                              Democracia       0.00      0.00      0.00       380
                                              Desarrollo       0.00      0.00      0.00        52
                                       Descentralización       0.00      0.00      0.00       307
                                                Dignidad       0.00      0.00      0.00       193
                                              Diversidad       0.00      0.00      0.00    

  'precision', 'predicted', average, warn_for)
