In [1]:
%load_ext watermark

In [2]:
%watermark

2017-05-15T17:15:32-04:00

CPython 3.6.0
IPython 5.3.0

compiler   : GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)
system     : Darwin
release    : 16.5.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit


In [87]:
from tf_idf import preprocess_fundamentos
from sklearn import model_selection
from tqdm import tqdm, trange
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter, defaultdict
from operator import itemgetter

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from models import UDP
import logging
import random
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC  # support vector machine classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB  # naive bayes
from sklearn.neighbors import KNeighborsClassifier

In [35]:
logger = logging.getLogger(__name__)

engine = create_engine('mysql://root@localhost/constabierta')
_session = sessionmaker(bind=engine, autocommit=True, expire_on_commit=True)
session = _session()

documents = session.query(UDP).all()
SEED = 1

In [45]:
# esto demora 5 minutos aprox
X, processed_texts, y, le = preprocess_fundamentos(documents)

100%|██████████| 51867/51867 [06:34<00:00, 131.60it/s] 


In [62]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=.33, random_state=SEED)

docs_df = pd.DataFrame({'fundamento': [doc.fundamento for doc in documents],
                        'fund_proc': processed_texts,
                        'label': le.inverse_transform(y), 
                        'y': y})

# Exploración

In [83]:
print(f"total rows (fundamento): {X.shape[0]}")
print(f"total cols (vocabulary): {X.shape[1]}")

total rows (fundamento): 51867
total cols (vocabulary): 13222


## Muestra de fundamentos

In [84]:
k = 5
pd.options.display.max_colwidth = 0

rows = np.random.choice(docs_df.index.values, 5)
docs_df.ix[rows]

Unnamed: 0,fund_proc,fundamento,label,y
36977,"[derech, cad, ciudadan, vist, respet, igual, independient, gener, exist, iguald, salari, derech, reproduct, trabaj, salud, etc]","DERECHO A QUE CADA CIUDADANO SEA VISTO Y RESPETADO COMO IGUAL, INDEPENDIENTE DE SU GENERO, QUE EXISTA IGUALDAD EN SALARIOS, DERECHOS REPRODUCTIVOS, TRABAJO, SALUD, ETC.",propositivo,7
15635,"[deb, ten, cuent, condicion, pod, ejerc, derech]",se debe tener en cuenta condiciones para poder ejercer el derecho,propositivo,7
22369,"[respet, lug, dond, sent, bas, vid, republ]",Respetar el lugar dónde sentamos las bases de nuestra vida y nuestra república.,indeterminado,6
1471,"[derech, fundamental, garantiz, pais, exclus, econom, deb, ten, acces, dign, integral]",- Es un derecho fundamental no garantizado en el país y exclusivo desde lo económico. - Debería tener un acceso digno e integral.,factico.definicion,2
22192,"[mas, derech, respet, naturalez, deb, consagr, derech, naturalez, ser, respet, derech, pers, aunqu, vec, impliqu, carec, ciert, cos, deb, cambi, cultur, antropocentr, ecocentr]","Mas que el derecho de ""respetar la naturaleza"", se debería consagrar el ""derecho de la naturaleza a ser respetada"", ya que esta tiene derechos perse, aunque a veces implique carecer de ciertas cosas. Se debe cambiar la cultura antropocentrica por una ecocentrica",propositivo,7


## Distribución de clases

In [71]:
docs_df['label'].value_counts()

propositivo              34501
factico.definicion       8999 
indeterminado            3974 
valorativo.pragmatico    2066 
blanco                   1578 
factico.causalidad       388  
factico.prediccion       228  
valorativo.negativo      55   
valorativo.positivo      45   
factico.pasado           31   
valorativa.pragmatica    1    
factico.negativo         1    
Name: label, dtype: int64

## Vocabulario

In [75]:
vocab = Counter()
for tokens in tqdm(processed_texts):
    for token in tokens:
        vocab[token] += 1

100%|██████████| 51867/51867 [00:00<00:00, 109933.33it/s]


In [82]:
k = 10
tmp = pd.DataFrame({'word': list(map(itemgetter(0), vocab.most_common(k))),
                    'freq': list(map(itemgetter(1), vocab.most_common(k)))})
tmp

Unnamed: 0,freq,word
0,25495,derech
1,19617,deb
2,9281,ser
3,7857,person
4,7695,salud
5,7320,calid
6,6718,garantiz
7,6709,educ
8,5959,vid
9,5876,tod


# Clasificación

In [98]:
c0 = ("Base", DummyClassifier(strategy='stratified'))
c1 = ("SVM", SVC(kernel='rbf'))
c1b = ("SVM", SVC(kernel='linear'))
c2 = ("DT", DecisionTreeClassifier())
c3 = ("NB", GaussianNB())
c4 = ("KNN", KNeighborsClassifier(n_neighbors=5))

# Dummy Classifier (Baseline)

In [91]:
print(c0[0])
clf = c0[1]

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

Base
                       precision    recall  f1-score   support

               blanco       0.03      0.03      0.03       527
   factico.causalidad       0.00      0.00      0.00       110
   factico.definicion       0.18      0.18      0.18      2981
     factico.negativo       0.00      0.00      0.00         9
       factico.pasado       0.01      0.02      0.01        66
   factico.prediccion       0.08      0.07      0.07      1370
        indeterminado       0.66      0.67      0.66     11350
          propositivo       0.00      0.00      0.00         0
valorativa.pragmatica       0.00      0.00      0.00        20
  valorativo.negativo       0.00      0.00      0.00        13
  valorativo.positivo       0.04      0.04      0.04       671

          avg / total       0.48      0.48      0.48     17117



  'recall', 'true', average, warn_for)


## SVM (RBF)

In [92]:
print(c1[0])
clf = c1[1]

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

SVM
                       precision    recall  f1-score   support

               blanco       0.00      0.00      0.00       527
   factico.causalidad       0.00      0.00      0.00       110
   factico.definicion       0.00      0.00      0.00      2981
     factico.negativo       0.00      0.00      0.00         9
       factico.pasado       0.00      0.00      0.00        66
   factico.prediccion       0.00      0.00      0.00      1370
        indeterminado       0.66      1.00      0.80     11350
          propositivo       0.00      0.00      0.00        20
valorativa.pragmatica       0.00      0.00      0.00        13
  valorativo.negativo       0.00      0.00      0.00       671

          avg / total       0.44      0.66      0.53     17117



  'precision', 'predicted', average, warn_for)


## SVM (linear)

In [100]:
print(c1b[0])
clf = c1b[1]

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

SVM
                       precision    recall  f1-score   support

               blanco       0.59      0.44      0.50       527
   factico.causalidad       0.00      0.00      0.00       110
   factico.definicion       0.55      0.24      0.34      2981
     factico.negativo       0.00      0.00      0.00         9
       factico.pasado       0.00      0.00      0.00        66
   factico.prediccion       0.40      0.03      0.05      1370
        indeterminado       0.72      0.97      0.83     11350
          propositivo       0.00      0.00      0.00        20
valorativa.pragmatica       0.00      0.00      0.00        13
  valorativo.negativo       0.39      0.03      0.05       671

          avg / total       0.64      0.70      0.63     17117



  'precision', 'predicted', average, warn_for)


## Decision Tree

In [93]:
print(c2[0])
clf = c2[1]

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

DT
                       precision    recall  f1-score   support

               blanco       0.47      0.71      0.56       527
   factico.causalidad       0.03      0.02      0.02       110
   factico.definicion       0.37      0.34      0.36      2981
     factico.negativo       0.00      0.00      0.00         0
       factico.pasado       0.12      0.11      0.12         9
   factico.prediccion       0.02      0.02      0.02        66
        indeterminado       0.20      0.12      0.15      1370
          propositivo       0.77      0.82      0.79     11350
valorativa.pragmatica       0.00      0.00      0.00        20
  valorativo.negativo       0.00      0.00      0.00        13
  valorativo.positivo       0.20      0.14      0.16       671

          avg / total       0.61      0.64      0.62     17117



  'recall', 'true', average, warn_for)


## Gaussian Naive Bayes

In [97]:
print(c3[0])
clf = c3[1]

clf.fit(X_train.toarray(), y_train)
y_pred = clf.predict(X_test.toarray())
print(classification_report(y_test, y_pred, target_names=target_names))

NB
                       precision    recall  f1-score   support

               blanco       0.15      0.27      0.19       527
   factico.causalidad       0.01      0.19      0.02       110
   factico.definicion       0.17      0.12      0.14      2981
     factico.negativo       0.00      0.11      0.00         9
       factico.pasado       0.00      0.20      0.01        66
   factico.prediccion       0.09      0.19      0.12      1370
        indeterminado       0.72      0.12      0.20     11350
          propositivo       0.00      0.00      0.00        20
valorativa.pragmatica       0.00      0.15      0.01        13
  valorativo.negativo       0.05      0.28      0.09       671

          avg / total       0.52      0.13      0.18     17117



## KNN (k=5)

In [7]:
print(c4[0])
clf = c4[1]

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=target_names))

                       precision    recall  f1-score   support

               blanco       0.03      0.53      0.06       507
   factico.causalidad       0.00      0.00      0.00       128
   factico.definicion       0.41      0.17      0.24      3004
     factico.negativo       0.00      0.00      0.00         1
       factico.pasado       0.00      0.00      0.00         8
   factico.prediccion       0.25      0.02      0.03        63
        indeterminado       0.12      0.26      0.17      1306
          propositivo       0.78      0.31      0.44     11399
valorativa.pragmatica       0.00      0.00      0.00        21
  valorativo.negativo       0.00      0.00      0.00        10
  valorativo.positivo       0.49      0.05      0.09       670

          avg / total       0.62      0.27      0.35     17117



  'precision', 'predicted', average, warn_for)
