## Objetivo: 

- Obtener una conversión única de cada palabra de los argumentos a un número entero (índice).
- Crear la matriz de pesos (embedding layer) con los vectores de fastText, ubicando el vector de cada palabra en la fila que corresponde a su índice en la conversión anterior.

In [10]:
import pandas as pd
import numpy as np
import logging
from itertools import product
import unicodedata
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.utils import np_utils
from keras import regularizers
from keras.models import Sequential
from keras.optimizers import SGD
from keras import regularizers
from keras.models import load_model, Model
from keras.layers import Dense, Dropout, Input, Embedding, Lambda
from keras import backend as K
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [11]:
# Load the fastText model through gensim's fastText wrapper and keep it in
# `model` for the vector lookups done later in the notebook.
# 'wiki.es' is a relative path, so it is resolved against the notebook's
# working directory.
# NOTE(review): presumably the Spanish Wikipedia fastText vectors -- confirm.
from gensim.models.wrappers import FastText
model = FastText.load_fasttext_format('wiki.es')

In [12]:
# Number of columns of the embedding matrix built later in this notebook.
# Each fastText vector is copied whole into one row of that matrix, so this
# value has to equal the vector length of the loaded model.
EMBEDDING_DIM = 300

### Load data

In [2]:
def _read_lines(path):
    """Return the lines of the text file at `path` without their newline.

    Only the line terminator is stripped, so a last line that is not followed
    by a newline keeps all of its characters (slicing with [:-1] would cut a
    real character off such a line).
    """
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


# df[<task>] is a list with one entry per topic (tema 1..4, stored at
# positions 0..3); each entry maps a split name to its list of raw lines.
df = {'tarea1': [], 'tarea2': []}

# Task 1: train / test / dev splits, all read from '..._pnud_0.txt' files.
for num in ['1', '2', '3', '4']:
    parts = {}
    for set_data in ['train', 'test', 'dev']:
        filename = '../../data/x_'+set_data+'_tema_'+num+'_categorias_pnud_0.txt'
        parts[set_data] = _read_lines(filename)
    df['tarea1'].append(parts)

# Task 2: only train / test; train files end in '0.txt', test files in '1.txt'.
for num in ['1', '2', '3', '4']:
    parts = {}
    for set_data in ['train', 'test']:
        filename = '../../data/tarea2/x_'+set_data+'_tema_'+num+'_categorias_pnud_'
        filename += '0.txt' if set_data == 'train' else '1.txt'
        parts[set_data] = _read_lines(filename)
    df['tarea2'].append(parts)

In [3]:
# Show the first line of a few task-1 splits to confirm the files were read.
for tema_idx, split in [(0, 'train'), (0, 'dev'), (1, 'train')]:
    print(df['tarea1'][tema_idx][split][0])

los gobiernos deben ser trasparentes informando antes de gastar los dineros, informar publicitar los proyectos que se ejecutaran
La base de la democracia para que esta sea activa.
esencial en cualquier régimen democrático


In [26]:
# Concatenate every split of every topic and task into one flat list.
# `indices` records, under the key '<tarea><tema><set>' (e.g. 'tarea10dev'),
# the position in df_x where that split starts.
df_x = []
indices = {}
for tarea, tema, set_ in product(['tarea1', 'tarea2'], range(0, 4), ['train', 'dev', 'test']):
    # tarea2 has no dev split, so that combination is skipped
    if (tarea, set_) == ('tarea2', 'dev'):
        continue
    indices[tarea + str(tema) + set_] = len(df_x)
    df_x.extend(df[tarea][tema][set_])

In [27]:
# Total number of arguments, then the first argument of the task-1 / topic-0
# dev split, looked up through the start offset stored in `indices`.
print(len(df_x))
print(df_x[indices['tarea10dev']])

347919
La base de la democracia para que esta sea activa.


In [28]:
# Fold every argument down to plain ASCII: decode the UTF-8 bytes, decompose
# with NFKD, then encode to ASCII dropping whatever cannot be represented.
# Accented letters and the n-tilde lose their diacritics; ASCII punctuation is
# kept as-is at this stage.
arguments = [
    unicodedata.normalize('NFKD', raw.decode('utf-8')).encode('ascii', 'ignore')
    for raw in df_x
]
print(arguments[indices['tarea10dev']])

La base de la democracia para que esta sea activa.


In [29]:
# Build the word -> integer id vocabulary over all arguments (ids start at 1)
# and convert each argument into its list of ids. The Tokenizer, used with
# its default settings, takes care of splitting on whitespace and discarding
# punctuation such as commas and periods.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(arguments)
X = tokenizer.texts_to_sequences(arguments)
print(X[0])

[7, 316, 39, 23, 11058, 8498, 941, 1, 6421, 7, 1909, 1529, 8545, 7, 578, 4, 11, 18731]


In [30]:
import pickle

# Persist the tokenizer's word -> id mapping. The file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes,
# instead of being left open for the garbage collector to deal with.
with open("dan_preprocessing_data/dict_word_to_num.p", "wb") as dict_file:
    pickle.dump(tokenizer.word_index, dict_file)

In [31]:
# Embedding matrix: row i holds the fastText vector of the word whose
# tokenizer id is i. Row 0 is never assigned (tokenizer ids start at 1), and
# words that are not in the model vocabulary keep an all-zeros row.
# NOTE(review): the tokenizer was fit on ASCII-folded text, so words whose
# vocabulary entry in the model carries accents will not match here -- confirm
# this is intended.
index2word_set = set(model.wv.index2word)
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in index2word_set:
        # Look the vector up through `model.wv`, the same keyed-vectors object
        # used for the vocabulary above, rather than the deprecated
        # model-level indexing.
        embedding_matrix[i] = model.wv[word]

In [34]:
# Write the weight matrix to disk as plain text, one row per line, with each
# value formatted through '%f'.
# It can be read back with np.loadtxt on the same path (float values, so the
# default dtype applies).
np.savetxt(
    fname='dan_preprocessing_data/embedding_matrix.txt',
    X=embedding_matrix,
    fmt='%f',
)

In [33]:
# Vocabulary size followed by the matrix shape; the matrix has one row more
# than the vocabulary because it is allocated with len(word_index) + 1 rows.
print(len(word_index))
print(embedding_matrix.shape)

38224
(38225, 300)
