# Tarea 2

In [1]:
import logging
import os
import pickle
import re
import unicodedata
from itertools import product

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.utils import np_utils
from keras import regularizers
from keras.models import Sequential
from keras.optimizers import SGD
from keras import regularizers
from keras.models import load_model, Model
from keras.layers import Dense, Dropout, Input, Embedding, Lambda
from keras import backend as K
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


## Load data

In [6]:
# Load the raw arguments (x) and their labels (y) for the four themes.
# Resulting layout: df[theme_index][column][split] -> list of text lines,
# with column in {'x', 'y'} and split in {'train', 'test'}.
df = []
for tema in range(0, 4):
    dict_column = {}
    for column in ['x', 'y']:
        dict_set = {}
        for set_ in ['train', 'test']:
            # Train files end in "_0.txt" and test files in "_1.txt".
            filename = '../../data/tarea2/'+column+'_'+set_+'_tema_'+str(tema+1)+'_categorias_pnud_'
            filename += '0.txt' if set_=='train' else '1.txt'
            with open(filename) as f:
                # Strip only the line terminator: slicing off the last
                # character would truncate a final line that has no
                # trailing newline.
                dict_set[set_] = [row.rstrip('\n') for row in f]
        dict_column[column] = dict_set
    df.append(dict_column)

In [7]:
# Sanity check of the nested layout: columns of the first theme, then the
# splits available for its 'x' column.
print df[0].keys()
print df[0]['x'].keys()

['y', 'x']
['test', 'train']


In [8]:
# Matrix with fasttext vectors for all words in the data set.
# It is used below as the fixed weights of the Embedding layer.
# NOTE(review): presumably row k holds the vector of the word mapped to
# index k in word_to_num - confirm against the task1 preprocessing.
embedding_matrix = np.loadtxt('../../task1/DAN/dan_preprocessing_data/embedding_matrix.txt', dtype=float)

In [9]:
# Conversion from string to int for all words in the data set.
# NOTE: unpickling can execute arbitrary code, so only load a trusted file.
# The context manager closes the file handle once the dictionary is loaded.
with open("../../task1/DAN/dan_preprocessing_data/dict_word_to_num.p", "rb") as f:
    word_to_num = pickle.load(f)

## Prepare data

In [10]:
EMBEDDING_DIM = 300  # size fasttext vectors
# NOTE: a `global` statement at module level has no effect; `encoder` is
# actually created (and re-created) inside get_y_sets().
global encoder       # to detect number of classes

In [11]:
def get_y_sets(NUM_DF):
    '''
    Builds the one-hot label matrices of one theme.

    The label encoder is fitted on the train and test labels together, so
    both splits share the same class indices. The fitted encoder is stored
    in the global ``encoder`` (its ``classes_`` give the number of classes).

    Arguments:
        NUM_DF: theme index into the global ``df`` list
    Returns:
        y_train, y_test: one-hot encoded labels of the train and test sets
    '''
    global encoder
    train_labels = df[NUM_DF]['y']['train']
    test_labels = df[NUM_DF]['y']['test']
    df_y = np.array(train_labels + test_labels)

    # one hot vector label for classification
    encoder = LabelEncoder()
    encoder.fit(df_y)  # to know how many classes
    labels = encoder.transform(df_y)
    Y = to_categorical(np.asarray(labels))

    # Split at the length of the train *labels*: this is the list that was
    # concatenated above, so the boundary is right even if the x and y
    # files were to disagree in length.
    train_size = len(train_labels)
    y_train = Y[:train_size]
    y_test = Y[train_size:]
    return y_train, y_test

In [12]:
# Module-level state filled in by preprocess_x() and read by get_x_sets().
# (No `global` statement is needed here: these are module-level names.)
df_x = [None] * 4            # per theme: train + test arguments as one array
sequences = [None] * 4       # per theme: arguments as lists of words
MAX_SEQUENCE_LENGTH = 0      # maximum size of the input vector

In [13]:
def preprocess_x(num_df):
    '''
    Converts the arguments of one theme into word sequences.

    The train and test arguments are concatenated (train first) into the
    global ``df_x[num_df]``. Each argument is reduced to ASCII and split
    with ``text_to_word_sequence``; the resulting lists are stored in the
    global ``sequences[num_df]``. The global ``MAX_SEQUENCE_LENGTH`` is
    raised when a longer sequence is found.

    Arguments:
        num_df: theme number to process
    '''
    global MAX_SEQUENCE_LENGTH, df_x, sequences
    df_x[num_df] = np.array(df[num_df]['x']['train'] + df[num_df]['x']['test'])

    # to list of words
    theme_sequences = []
    for raw_argument in df_x[num_df]:
        in_unicode = raw_argument.decode('utf-8')
        # NFKD separates accented letters into base letter + combining mark;
        # encoding to ASCII with 'ignore' then drops the marks.
        in_string = unicodedata.normalize('NFKD', in_unicode).encode('ascii', 'ignore')
        theme_sequences.append(text_to_word_sequence(in_string))
    sequences[num_df] = theme_sequences

    # search for the biggest (nothing to update when the theme is empty)
    if theme_sequences:
        longest = max(len(sequence) for sequence in theme_sequences)
        MAX_SEQUENCE_LENGTH = max(MAX_SEQUENCE_LENGTH, longest)

def get_x_sets(num_df):
    '''
    Replaces the words of each sequence by their corresponding numbers.

    Every row has MAX_SEQUENCE_LENGTH columns and is padded with zeros on
    the left, so the word indices occupy the rightmost positions.
    preprocess_x(num_df) must have been called before, since this function
    reads the global ``df_x[num_df]`` and ``sequences[num_df]``.

    Arguments:
        num_df: theme number from which to get the sets
    Returns:
        X_train, X_test: integer matrices for the train and test sets
    Raises:
        KeyError: if a word is not present in ``word_to_num``
        IndexError: if a sequence is longer than MAX_SEQUENCE_LENGTH
    '''
    # every X[i] with max size
    X = np.zeros((df_x[num_df].shape[0], MAX_SEQUENCE_LENGTH)).astype(int)
    # replace words by numbers with word_to_num
    for i, sequence in enumerate(sequences[num_df]):
        length = len(sequence)
        if length > MAX_SEQUENCE_LENGTH:
            raise IndexError('sequence %d has %d words, more than MAX_SEQUENCE_LENGTH=%d'
                             % (i, length, MAX_SEQUENCE_LENGTH))
        if length:
            # `-0:` would select the whole row, hence the guard above
            X[i, -length:] = [word_to_num[word] for word in sequence]
    # divide sets for answer
    train_size = len(df[num_df]['x']['train'])
    X_train = X[:train_size]
    X_test = X[train_size:]
    return X_train, X_test

In [14]:
# Tokenize the arguments of the four themes; this fills df_x / sequences
# and updates MAX_SEQUENCE_LENGTH.
for i in range(4):
    preprocess_x(i)

In [15]:
# Spot check: word sequence of the third argument of the fourth theme
sequences[3][2]

['la',
 'tecnologia',
 'hoy',
 'nos',
 'permite',
 'pensar',
 'en',
 'instrumentos',
 'efectivos',
 'de',
 'participacion',
 'directa']

In [16]:
# Longest word sequence found while preprocessing the four themes
MAX_SEQUENCE_LENGTH

357

In [17]:
# Override the measured maximum with a fixed, larger input length.
# NOTE(review): presumably 500 matches the input size of the task1 models
# loaded later in this notebook - TODO confirm.
MAX_SEQUENCE_LENGTH = 500

## Train best configuration models

Obs: 
- DAN ocupa los sets globales  
- como no hay set de validación, el de entrenamiento se divide (80% entrenamiento, 20% validación).

In [19]:
def dan(relu_layers=3, hidden_units=300, p_dropout=0.3, dropout_input=False, 
        my_regularizer=regularizers.l2(1e-5), my_optimizer='adam', 
        epochs=150, batch_size=200):
    '''
    Creates, compiles and fits the DAN neural network, then returns it.

    The training data is not passed as an argument: the function reads the
    global variables X_train and y_train, which must be assigned before
    calling it. The globals MAX_SEQUENCE_LENGTH, word_to_num,
    embedding_matrix, EMBEDDING_DIM and encoder determine the input,
    embedding and output sizes.

    NN architecture:
        Input: vector of MAX_SEQUENCE_LENGTH word indices
        Embedding layer: non-trainable lookup initialised with
                         embedding_matrix, one vector per index
        Dropout: optional dropout on the embedded words
        Mean: averages the embedding output along the sequence axis,
              returning one averaged vector per sample
        Fully connected: relu_layers blocks of Dropout + Dense(relu)
        Fully connected: output layer with softmax, one unit per class
                         in encoder.classes_

    Arguments:
        relu_layers: number of fully connected layers with relu
        hidden_units: number of neurons on the relu layers
        p_dropout: dropout probability applied before each relu layer
        dropout_input: flag; when truthy a Dropout with a fixed rate of 0.5
                       is applied to the embedded words (the value itself
                       is not used as a probability)
        my_regularizer: kernel_regularizer for the fully connected layers
                        (None disables it)
        my_optimizer: optimizer for back propagation
        epochs: number of epochs to train
        batch_size: batch size for training
    Returns:
        The fitted Keras Model.
    '''
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
    # Frozen embedding: the fasttext weights are not updated during training.
    x = Embedding(len(word_to_num) + 1, EMBEDDING_DIM, mask_zero=True,
                  weights=[embedding_matrix], trainable=False)(sequence_input)
    if dropout_input:
        x = Dropout(0.5)(x)
    # Average over axis 1 (the sequence positions).
    # NOTE(review): mask_zero=True is set above, but K.mean averages every
    # position; confirm whether padded positions are meant to be counted.
    x = Lambda(lambda x: K.mean(x, axis=1), 
               output_shape=(embedding_matrix.shape[1],))(x)
    for i in range(0, relu_layers):
        x = Dropout(p_dropout)(x)
        x = Dense(units=hidden_units, activation='relu', kernel_regularizer=my_regularizer)(x)
    # One output unit per class seen by the global label encoder.
    preds = Dense(units=len(encoder.classes_), activation='softmax', 
                  kernel_regularizer=my_regularizer)(x)
    
    m = Model(sequence_input, preds)
    m.compile(loss='categorical_crossentropy', optimizer=my_optimizer, 
              metrics=['accuracy', 'top_k_categorical_accuracy'])
    print "Starting trainning"
    # use 20% of the (global) training set as validation
    m.fit(X_train, y_train, validation_split=0.2,
          shuffle=True, 
          epochs=epochs, batch_size=batch_size,
          verbose=1)
    return m

## Train models for each theme

In [None]:
for i in range(0, 1):
    print "----------------------- TEMA "+str(i+1)+"--------------------------"
    y_train, y_test = get_y_sets(i)
    X_train, X_test = get_x_sets(i)
    batch_size = 30
    epochs = 80
    l2 = None if i != 1 else regularizers.l2(1e-5) # the best has not regularizator in theme 2
    m = dan(relu_layers=2, hidden_units=200, 
            p_dropout=0.2, my_regularizer=l2,
            epochs=epochs, batch_size=batch_size)
    m.save('models/tema'+str(i+1)+'.h5')
    score = m.evaluate(X_train, y_train)
    print "\nTRAIN:"
    print "-", m.metrics_names[0], score[0]
    print "-", m.metrics_names[1], score[1]
    print "-", m.metrics_names[2], score[2]
    score = m.evaluate(X_test, y_test)
    print "\nTEST:"
    print "-", m.metrics_names[0], score[0]
    print "-", m.metrics_names[1], score[1]
    print "-", m.metrics_names[2], score[2], '\n'

----------------------- TEMA 1--------------------------
Starting trainning
Train on 36886 samples, validate on 9222 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80

## Use trained models

In [20]:
for i in range(0, 1):
    print "----------------------- TEMA "+str(i+1)+"--------------------------"
    y_train, y_test = get_y_sets(i)
    X_train, X_test = get_x_sets(i)
    m = load_model("../../task1/DAN/models/tema"+str(i+1)+'.h5')
    score = m.evaluate(X_train, y_train)
    print "\nTRAIN:"
    print "-", m.metrics_names[0], score[0]
    print "-", m.metrics_names[1], score[1]
    print "-", m.metrics_names[2], score[2]
    score = m.evaluate(X_test, y_test)
    print "\nTEST:"
    print "-", m.metrics_names[0], score[0]
    print "-", m.metrics_names[1], score[1]
    print "-", m.metrics_names[2], score[2], '\n'

----------------------- TEMA 1--------------------------
TRAIN:
- loss 1.1321175075
- acc 0.672724906746
- top_k_categorical_accuracy 0.915741303012
TEST:
- loss 1.38201149446
- acc 0.609875203602
- top_k_categorical_accuracy 0.895279435703 

