Load data first...

In [1]:
# load data to predict
_dataDir = "../../data/tarea2/"
import string

def read_text_file_for_ft_input(filename):
    """Read a text file and return one cleaned string per line.

    Each line has its trailing newline removed, every character found in
    ``string.punctuation`` dropped, and the result lower-cased. Tabs are
    kept because they separate the original concept from its justification.
    A line that ends up empty is returned as a single space so that the
    output keeps exactly one entry per input line.

    # Arguments
        filename: path of the text file to read
    # Returns
        A list of cleaned strings, one per line of the file.
    """
    with open(filename) as f:
        out = []
        for line in f:
            # Strip only the newline: slicing with line[:-1] would eat a real
            # character when the last line of the file has no trailing newline.
            text = line.rstrip('\n')
            # keep tab to separate original concepts from justifications
            strdata = "".join([c for c in text if c not in string.punctuation or c == '\t']).lower()
            if strdata == '':
                strdata = ' '
            out.append(strdata)
    return out

def read_numbers_file_for_ft_input(filename):
    """Read a file holding one integer per line and return them as a list.

    # Arguments
        filename: path of the file to read
    # Returns
        A list with ``int(line)`` for every line, in file order.
    """
    with open(filename) as handle:
        # int() tolerates the surrounding whitespace/newline of each line.
        return [int(raw_line) for raw_line in handle]


temas = [1,2,3,4]

# Per-topic containers, keyed by topic number.
test_x_concepto = {}
test_x_fundamento = {}
test_x_concepto_fundamento = {}
test_y = {}

# Load the raw "concept<TAB>justification" lines and the labels of every topic.
for i in temas:
    test_x_concepto_fundamento[i] = read_text_file_for_ft_input(
        _dataDir + "x_test_tema_" + str(i) + "_categorias_pnud_1.txt")
    test_y[i] = read_numbers_file_for_ft_input(
        _dataDir + "y_test_tema_" + str(i) + "_categorias_pnud_1.txt")

# Split every line into its concept (first field) and justification (second field).
for i in temas:
    test_x_concepto[i] = []
    test_x_fundamento[i] = []
    for texto in test_x_concepto_fundamento[i]:
        campos = texto.split('\t')
        test_x_concepto[i].append(campos[0])
        # The reader returns ' ' (no tab) for blank lines; use an empty
        # justification there instead of failing with IndexError, so the
        # lists stay aligned with test_y.
        test_x_fundamento[i].append(campos[1] if len(campos) > 1 else '')

# categories[i]: category names of topic i as written in the file.
# clean_categories[i]: same names without punctuation and lower-cased.
categories = {}
clean_categories = {}
for i in temas:
    categories[i] = []
    clean_categories[i] = []
    # load categories first
    categoriesFile = _dataDir + "categorias_tema_" + str(i) + "_pnud_0.txt"
    with open(categoriesFile) as f:
        for line in f:
            # Remove only the newline; line[:-1] would truncate the last
            # category name when the file does not end with a newline.
            nombre = line.rstrip('\n')
            categories[i].append(nombre)
            clean_categories[i].append("".join([c for c in nombre if c not in string.punctuation]).lower())



## Word  Embeddings

In [2]:
from gensim.models.wrappers import FastText
### the following stopped working for no apparent reason (possibly after an OS update)
### similar issue here: https://github.com/RaRe-Technologies/gensim/issues/1196
# model = FastText.load_fasttext_format('../../word_vectors/wiki.es')

### loading it in word2vec format instead
### this only loads the word embeddings of words that appear in the constitution texts
model = FastText.load_word2vec_format('../../word_vectors/ca-vectors.vec')

In [3]:
### this is how to do it with a FastText model
#index2word_set = set(model.wv.index2word)

### while the word2vec format is in use it has to be done like this
### (a set gives fast membership tests in avg_feature_vector)
index2word_set = set(model.index2word)

In [4]:
import numpy as np
from nltk.metrics import distance

def avg_feature_vector(words, model, num_features, index2word_set):
    """Average the vectors of the in-vocabulary words of a paragraph.

    # Arguments
        words: iterable of tokens
        model: mapping from a word to its vector (indexed as ``model[word]``)
        num_features: length of the word vectors
        index2word_set: set of words available in ``model``; passed in
            (instead of being rebuilt here) for performance reasons
    # Returns
        The mean vector of the known words, or an all-zero float32 vector
        of length ``num_features`` when none of the words is known.
    """
    total = np.zeros((num_features,), dtype="float32")

    # Words outside the vocabulary are ignored entirely.
    known_words = [w for w in words if w in index2word_set]
    for w in known_words:
        total = total + model[w]

    if known_words:
        total = total / len(known_words)
    return total

In [5]:
from scipy import spatial

def similarity(string1,string2):
    """Cosine similarity between the averaged word vectors of two strings.

    Both strings are split on whitespace and embedded with
    ``avg_feature_vector`` using the global ``model`` and
    ``index2word_set`` (300-dimensional vectors).

    # Arguments
        string1, string2: the texts to compare
    # Returns
        ``1 - cosine distance`` of the two averaged vectors, or ``-1.0``
        (the lowest possible cosine similarity) when either vector is all
        zeros.
    """
    vec1 = avg_feature_vector(string1.split(), model=model, num_features=300, index2word_set=index2word_set)
    vec2 = avg_feature_vector(string2.split(), model=model, num_features=300, index2word_set=index2word_set)
    # avg_feature_vector returns an all-zero vector when no word is in the
    # vocabulary. The cosine is undefined there (NaN), and NaN keys make the
    # rankings built with sorted() arbitrary, so rank such pairs last.
    if not vec1.any() or not vec2.any():
        return -1.0
    return 1 - spatial.distance.cosine(vec1,vec2)

### Word embeddings con los strings de los conceptos

In [6]:
we_prediction = {}

for i in temas:
    category_ids = range(len(categories[i]))
    rankings = []
    for case in test_x_concepto[i]:
        # Score every category once, then rank ids from most to least similar.
        scores = [similarity(case, clean_categories[i][k]) for k in category_ids]
        rankings.append(sorted(category_ids, key=lambda k: scores[k], reverse=True))
    we_prediction[i] = rankings

In [7]:
t_ex = 3
c_ex = 1650

# Show one example: the text, its manual label and the top-ranked category.
example_rows = [
    ("To clasify:\t", test_x_concepto[t_ex][c_ex]),
    ("Manual class.:\t", clean_categories[t_ex][test_y[t_ex][c_ex]]),
    ("Word_embedding:\t", clean_categories[t_ex][we_prediction[t_ex][c_ex][0]]),
]
for label, value in example_rows:
    print(label + value)

To clasify:	hacerse cargo de las propuestas
Manual class.:	responsabilidad
Word_embedding:	de satisfacer cargas públicas


### Word embeddings con los strings de los conceptos más los fundamentos

In [8]:
we_ca_prediction = {}

for i in temas:
    category_ids = range(len(categories[i]))
    rankings = []
    for case_concepto, case_fundamento in zip(test_x_concepto[i], test_x_fundamento[i]):
        # The text to classify is the concept followed by its justification.
        full_case = case_concepto + " " + case_fundamento
        scores = [similarity(full_case, clean_categories[i][k]) for k in category_ids]
        # Rank category ids from most to least similar.
        rankings.append(sorted(category_ids, key=lambda k: scores[k], reverse=True))
    we_ca_prediction[i] = rankings

In [40]:
t_ex = 1
c_ex = 700

# Show one example using concept + justification as the text to classify.
example_rows = [
    ("To clasify:\t", test_x_concepto[t_ex][c_ex] + " " + test_x_fundamento[t_ex][c_ex]),
    ("Manual class.:\t", clean_categories[t_ex][test_y[t_ex][c_ex]]),
    ("Word_embedding:\t", clean_categories[t_ex][we_ca_prediction[t_ex][c_ex][0]]),
]
for label, value in example_rows:
    print(label + value)

To clasify:	democracia paritaria y proporcional en la toma de poder y decisiones debe existir igualdad de género para la discusión publica la legislación y las políticas públicas
Manual class.:	equidad de género
Word_embedding:	respeto  conservación de la naturaleza o medio ambiente


### Word embeddings con los strings de conceptos con peso por frecuencia inversa y quitando componente principal

La componente principal se quita para los conjuntos de oraciones de cada tema después de normalizar por frecuencias. La constante de weighting es considerada (en general) como 0.001

#### Calcula probabilidades y frecuencias usando el texto completo de los argumentos constitucionales

In [11]:
# primero computa tokens, vocabulario y frecuencias

from gensim.utils import simple_preprocess
from collections import Counter

_text_data_file = '../../data/texto_ca.txt'

all_text = read_text_file_for_ft_input(_text_data_file)

# Treat the whole corpus as a single whitespace-tokenised string.
long_text = " ".join(all_text)
tokens = long_text.split()
frequencies = Counter(tokens)

# Relative frequency of every distinct token in the corpus.
total_tokens = len(tokens)
probabilities = {word: count / total_tokens for word, count in frequencies.items()}

In [12]:
#print(len((tokens)))
#print(len(set(tokens)))
#print(len(index2word_set))
#print(len(frequencies))

#not_in_voc = [w for w in set(tokens) if w not in index2word_set]
#in_voc = [w for w in set(tokens) if w in index2word_set]

#total_not_in_voc = len(not_in_voc)
#total_in_voc = len(in_voc)

#print(total_not_in_voc)
#print(total_in_voc)

In [13]:
#voc_out_file = '../../word_vectors/ca-vectors.voc'
#with open(voc_out_file,'w') as f:
#    for word,_ in frequencies.most_common():
#        f.write(word)
#        f.write("\n")

In [14]:
###### almacena vectores en formato word2vec solo para las palabras que aparecen en los textos de la constitución
###### el formato de los archivos comienza con el tamaño del vocabulario más la dimensión de los vectores
#vectors_out_file = '../../word_vectors/ca-vectors.vec'

#first_line = str(len(frequencies)) + " " + str(300) + "\n"

#with open(vectors_out_file) as f:
#    f.write(first_line)
#    for word,_ in frequencies.most_common():
#        f.write(word + " " + str())
#return out

#### Computa los embeddings con peso

In [15]:
# funcion auxiliar para calcular el embedding normalizado por frecuencia para cada posible palabra

import numpy as np

def avg_feature_vector_weighted(words, model, num_features, index2word_set, word_probabilities, weighting_parameter=0.001):
    """Frequency-weighted average of the word vectors of a paragraph.

    Every in-vocabulary word contributes ``a / (a + p(word)) * model[word]``,
    where ``a`` is ``weighting_parameter`` and ``p(word)`` comes from
    ``word_probabilities``. The sum is divided by the number of
    in-vocabulary words.

    # Arguments
        words: iterable of tokens
        model: mapping from a word to its vector (indexed as ``model[word]``)
        num_features: length of the word vectors
        index2word_set: set of words available in ``model``
        word_probabilities: dict mapping a word to its probability
        weighting_parameter: the constant ``a`` of the weighting formula
    # Returns
        The weighted mean vector, or an all-zero float32 vector of length
        ``num_features`` when none of the words is in the vocabulary.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in index2word_set:
            nwords = nwords+1
            # A word can be in the embedding vocabulary without appearing in
            # the corpus the probabilities came from; treat it as p = 0
            # (weight 1) instead of raising KeyError.
            probability = word_probabilities.get(word, 0.0)
            to_add = np.multiply(
                weighting_parameter / (weighting_parameter + probability),
                model[word])
            featureVec = np.add(featureVec, to_add)

    if(nwords>0):
        featureVec = np.divide(featureVec, nwords)
    return featureVec

In [16]:
def compute_predictions_from_embeddings(classes, to_classify_set, distance):
    """Rank the classes for every case by increasing distance.

    # Arguments
        classes: list with one representation (e.g. embedding) per class
        to_classify_set: iterable of case representations to classify
        distance: callable ``distance(case, class_repr)``; smaller is closer
    # Returns
        A list with, for every case, the class indices sorted from the
        closest to the farthest class. Classes whose distance is NaN are
        placed last.
    """
    def _sortable(value):
        # NaN is the only value different from itself. NaN sort keys make
        # sorted() produce an arbitrary order (an undefined distance can even
        # end up ranked first), so push them to the end.
        return float("inf") if value != value else value

    class_ids = range(len(classes))
    predictions = []

    for case in to_classify_set:
        distances = [_sortable(distance(case, classes[k])) for k in class_ids]
        ranking_weighted = sorted(class_ids, key=lambda k: distances[k])
        predictions.append(ranking_weighted)
    return predictions

In [17]:
temas = [1,2,3,4]
alpha = 0.001

# Plain averaged embeddings of the test concepts, per topic.
embeddings = {
    i: [avg_feature_vector(sentence.split(), model, 300, index2word_set)
        for sentence in test_x_concepto[i]]
    for i in temas
}

# Frequency-weighted embeddings of the test concepts, per topic.
normalized_embeddings = {
    i: [avg_feature_vector_weighted(sentence.split(), model, 300, index2word_set, probabilities, alpha)
        for sentence in test_x_concepto[i]]
    for i in temas
}

# Plain averaged embeddings of the category names, per topic.
original_concepts_embeddings = {
    i: [avg_feature_vector(sentence.split(), model, 300, index2word_set)
        for sentence in clean_categories[i]]
    for i in temas
}

# Frequency-weighted embeddings of the category names, per topic.
original_concepts_normalized_embeddings = {
    i: [avg_feature_vector_weighted(sentence.split(), model, 300, index2word_set, probabilities, alpha)
        for sentence in clean_categories[i]]
    for i in temas
}

### Calcula predicciones usando solo el weighting

In [18]:
# Rank the categories of every topic using only the frequency-weighted embeddings.
weighted_prediction = {
    i: compute_predictions_from_embeddings(
        original_concepts_normalized_embeddings[i],
        normalized_embeddings[i],
        spatial.distance.cosine,
    )
    for i in temas
}

In [30]:
t_ex = 3
c_ex = 1650

# Show one example with the prediction of the frequency-weighted embeddings.
example_rows = [
    ("To clasify:\t", test_x_concepto[t_ex][c_ex]),
    ("Manual class.:\t", clean_categories[t_ex][test_y[t_ex][c_ex]]),
    ("Word_embedding_weighted:\t", clean_categories[t_ex][weighted_prediction[t_ex][c_ex][0]]),
]
for label, value in example_rows:
    print(label + value)

To clasify:	hacerse cargo de las propuestas
Manual class.:	responsabilidad
Word_embedding_weighted:	de satisfacer cargas públicas


### Computa los PCA

In [20]:
from sklearn.decomposition import PCA

# First principal component, per topic, of the test-concept and category
# embeddings taken together: pca for the plain averages, pca_weight for the
# frequency-weighted ones.
pca = {}
pca_weight = {}

for i in temas:
    # compute PCAs for embeddings
    # svd_solver="full" forces the exact, deterministic solver. With the
    # default "auto" scikit-learn may choose the randomized solver, and with
    # no random_state set the component would then change between runs.
    pca[i] = PCA(n_components=1, svd_solver="full").fit(
        np.concatenate((embeddings[i], original_concepts_embeddings[i]))
    ).components_[0]

    pca_weight[i] = PCA(n_components=1, svd_solver="full").fit(
        np.concatenate((normalized_embeddings[i], original_concepts_normalized_embeddings[i]))
    ).components_[0]
    

In [21]:
# Shape check on topic 1: u is the component as a column vector and
# M = u u^T is the matrix later used to subtract that component.
u = pca[1][:, None]
M = u @ u.T
R = M @ u
R.shape

(300, 1)

### Calcula predicciones usando el PCA

In [22]:
# primero computa los nuevos embeddings
pca_embeddings = {}
pca_original_concepts_embeddings = {}
pca_weighted_embeddings = {}
pca_original_concepts_normalized_embeddings = {}

for i in temas:
    
    u = pca[i][None].T
    M = u @ u.T
    
    pca_embeddings[i] = [
        v - M @ v 
        for v in embeddings[i]
    ]
    
    pca_original_concepts_embeddings[i] = [
        v - M @ v
        for v in original_concepts_embeddings[i]
    ]
    
    u = pca_weight[i][None].T
    M = u @ u.T

    pca_weighted_embeddings[i] = [
        v - M @ v
        for v in normalized_embeddings[i]
    ]
    
    pca_original_concepts_normalized_embeddings[i] = [
        v - M @ v
        for v in original_concepts_normalized_embeddings[i]
    ]

# compute the predictions
pca_prediction = {
    i: compute_predictions_from_embeddings(
        pca_original_concepts_embeddings[i],
        pca_embeddings[i],
        spatial.distance.cosine,
    )
    for i in temas
}

pca_weighted_prediction = {
    i: compute_predictions_from_embeddings(
        pca_original_concepts_normalized_embeddings[i],
        pca_weighted_embeddings[i],
        spatial.distance.cosine,
    )
    for i in temas
}

In [39]:
t_ex = 1
c_ex = 700

# Show one example with the prediction of the weighted + PCA embeddings.
example_rows = [
    ("To clasify:\t", test_x_concepto[t_ex][c_ex]),
    ("Manual class.:\t", clean_categories[t_ex][test_y[t_ex][c_ex]]),
    #("Word_embedding_PCA:\t", clean_categories[t_ex][pca_prediction[t_ex][c_ex][0]]),
    ("Word_embedding_PCA_Weight:\t", clean_categories[t_ex][pca_weighted_prediction[t_ex][c_ex][0]]),
]
for label, value in example_rows:
    print(label + value)

To clasify:	democracia paritaria y proporcional
Manual class.:	equidad de género
Word_embedding_PCA_Weight:	democracia


##  Edit Distance baseline

In [24]:
import editdistance

edit_prediction = {}

for i in temas:
    category_ids = range(len(categories[i]))
    rankings = []
    for case in test_x_concepto[i]:
        # Edit distance to every category name; the closest category goes first.
        distances = [editdistance.eval(case, clean_categories[i][k]) for k in category_ids]
        rankings.append(sorted(category_ids, key=lambda k: distances[k]))
    edit_prediction[i] = rankings

In [32]:
t_ex = 3
c_ex = 1650

# Show one example with the prediction of the edit-distance baseline.
example_rows = [
    ("To clasify:\t", test_x_concepto[t_ex][c_ex]),
    ("Manual class.:\t", clean_categories[t_ex][test_y[t_ex][c_ex]]),
    ("Edit distance:\t", clean_categories[t_ex][edit_prediction[t_ex][c_ex][0]]),
]
for label, value in example_rows:
    print(label + value)

To clasify:	hacerse cargo de las propuestas
Manual class.:	responsabilidad
Edit distance:	servicio a la comunidad


## Calcula métricas

En las listas iniciales (predictions y model_names) se deben indicar las predicciones que se utilizarán para calcular las métricas

In [26]:
# utility function to compute top k accuracy

def first_predictions(predictions_list):
    """Return the first (top-ranked) element of every ranking in the list."""
    top_ranked = []
    for ranking in predictions_list:
        top_ranked.append(ranking[0])
    return top_ranked
    

def top_k_accuracy(gold,predicted,k):
    '''
    #Arguments
        gold: the true labels of the test cases (size N = number of test cases)
        predicted: ranked list of label predictions for every test case (size N x L, where L is assumed to be >= k)
        k: the number of elements in the predicted lists that should be considered to compute the metric
    #Returns
        The portion of cases (between 0 and 1) in which the true label value was among the first k predicted labels.
        Returns 0.0 when there are no test cases.
    '''
    total = len(gold)
    # Without test cases there is nothing to score; avoid dividing by zero.
    if total == 0:
        return 0.0
    hits = sum(1 for g, pred_labels in zip(gold, predicted) if g in pred_labels[:k])
    return hits/total

In [27]:
import sys
from sklearn import metrics

# lista para promedios
temas = [1,2,3,4]

# lista de predicciones
predictions = [
    edit_prediction,
    we_prediction,
    we_ca_prediction,
    weighted_prediction,
    pca_prediction,
    pca_weighted_prediction
]

# nombres de los modelos (solo para efectos de presentacion)
model_names = [
  'Edit-distance $(c)$',
  'Word-embeddings $(c)$',
  'Word-embeddings $(c,a)$',
  'Weighted $(c)$',
  'PCA $(c)$',
  'Weighted-PCA $(c)$',
]

print("acc, top-5")
for prediction,model_name in zip(predictions,model_names):
    sys.stdout.write(model_name + " & ")
    avga = 0
    avgt = 0
    for i in temas:
        gold = test_y[i]
        acc = round(100*metrics.accuracy_score(gold,first_predictions(prediction[i])),1)
        avga += acc/len(temas)
        top_k = round(100*top_k_accuracy(gold,prediction[i],5),1)
        avgt += top_k/len(temas)
        sys.stdout.write(format(acc,"04.1f") + " & " + format(top_k,"04.1f") + " & ")
    avga = round(avga,1)
    avgt = round(avgt,1)
    sys.stdout.write(" & " + "%04.1f" %(avga) + " & " + "%04.1f" %(avgt) + " \\\\ \n")        
  

acc, top-5
Edit-distance $(c)$ & 41.2 & 60.6 & 30.9 & 46.7 & 41.6 & 64.9 & 22.7 & 38.6 &  & 34.1 & 52.7 \\ 
Word-embeddings $(c)$ & 60.2 & 86.3 & 58.8 & 79.1 & 60.4 & 80.8 & 45.5 & 86.1 &  & 56.2 & 83.1 \\ 
Word-embeddings $(c,a)$ & 18.4 & 62.6 & 29.7 & 64.7 & 47.5 & 67.8 & 46.1 & 76.3 &  & 35.4 & 67.9 \\ 
Weighted $(c)$ & 62.2 & 88.0 & 58.3 & 82.6 & 62.9 & 84.1 & 39.0 & 84.0 &  & 55.6 & 84.7 \\ 
PCA $(c)$ & 61.5 & 86.2 & 57.9 & 77.9 & 52.9 & 79.4 & 44.2 & 85.3 &  & 54.1 & 82.2 \\ 
Weighted-PCA $(c)$ & 67.2 & 89.4 & 60.6 & 83.1 & 58.6 & 81.5 & 40.0 & 86.6 &  & 56.6 & 85.2 \\ 


In [28]:
#print("top-5 per topic")
#print(temas)
#for prediction,model_name in zip(predictions,model_names):
#    sys.stdout.write(model_name + " & ")
#    avgt = 0
#    for i in temas:
#        gold = test_y[i]
#        top_k = round(100*top_k_accuracy(gold,prediction[i],5),1)
#        avgt += top_k/4
#        sys.stdout.write(str(top_k) + " & ")
#    avgt = round(avgt,1)
#    sys.stdout.write(str(avgt) + " \\\\ \n")