In [1]:
import pandas as pd
import numpy as np
import logging
from itertools import product
import unicodedata
import re
import pickle
import itertools

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.utils import np_utils
from keras import regularizers
from keras.models import Sequential
from keras.optimizers import SGD
from keras import regularizers
from keras.models import load_model, Model
from keras.layers import Dense, Dropout, Input, Embedding, Lambda
from keras import backend as K
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
# Load the corpus into a nested dict: df[column][split] -> list of lines,
# with column in {'x', 'y'} and split in {'train', 'dev', 'test'}.
df = {}
for column in ['x', 'y']:
    dict_set = {}
    for set_ in ['train', 'dev', 'test']:
        filename = '../../data/tarea3/'+column+'_'+set_+'_modo.txt'
        with open(filename) as f:
            data = f.readlines()
        # Strip only the line terminator. Slicing with row[:-1] would also eat
        # the last real character of a final line that has no trailing newline
        # (e.g. turning the label '1' into '' and creating a spurious class).
        dict_set[set_] = [row.rstrip('\n') for row in data]
    df[column] = dict_set

In [3]:
# Sanity check: display the first training text to confirm the files loaded.
df['x']['train'][0]

'Que se respete el espíritu de la ley y esta no se pueda interpretar de forma fraudulenta o malintencionada. Implica leyes bien redactadas.'

### Vectors and Embedding layer

In [4]:
# Load the pre-trained word vectors through gensim's FastText wrapper.
# Note: in this notebook `model` is the word-vector model, not the Keras
# network (the network built later is called `m`).
from gensim.models.wrappers import FastText
model = FastText.load_word2vec_format('../../word_vectors/ca-vectors.vec')

In [5]:
# Build the weight matrix handed to the Keras Embedding layer.
# Row 0 is left as zeros; the word with gensim index i is stored in row i+1,
# so the matrix has vocab_size+1 rows.
EMBEDDING_DIM = 300  # NOTE(review): assumed to equal the dimensionality of the loaded vectors - confirm
vocab_size = len(model.vocab)
embedding_matrix = np.zeros((vocab_size+1, EMBEDDING_DIM))
for word in model.vocab:
    index = model.vocab[word].index
    # shift by one so that row 0 stays reserved (all zeros)
    embedding_matrix[index+1, :] = model[word]

## Prepare data

In [6]:
global encoder       # fitted LabelEncoder; its classes_ give the number of classes
def get_y_sets():
    '''
    Builds one-hot label matrices for the train, dev and test splits.

    The labels of the three splits are concatenated, encoded with a
    LabelEncoder (stored in the global `encoder`) and converted to one-hot
    rows with to_categorical. The result is cut back into the three splits
    using the sizes of the corresponding df['x'] lists.

    Returns:
        y_train, y_dev, y_test: one-hot label arrays, one row per example
    '''
    global encoder
    # NOTE(review): split sizes come from df['x'], so x and y files are
    # assumed to have the same number of lines per split.
    n_train = len(df['x']['train'])
    n_dev = len(df['x']['dev'])
    dev_end = n_train + n_dev

    all_labels = np.array(df['y']['train'] + df['y']['dev'] + df['y']['test'])

    # Fit on every split so all classes are known, then map labels to ids.
    encoder = LabelEncoder()
    label_ids = encoder.fit(all_labels).transform(all_labels)
    one_hot = to_categorical(np.asarray(label_ids))

    return one_hot[:n_train], one_hot[n_train:dev_end], one_hot[dev_end:]

In [7]:
# Module-level placeholders; preprocess_x() overwrites all three.
global df_x, sequences, MAX_SEQUENCE_LENGTH  # has no effect at module level
df_x = [None, None, None, None]              # train+dev+test arguments for each theme
sequences = []                               # arguments as lists of words for each theme
MAX_SEQUENCE_LENGTH = 0                      # maximum size of the input vector

In [8]:
def preprocess_x():
    '''
    Tokenizes every text (train + dev + test) into a list of words.

    Takes no arguments; results are published through module globals:
        df_x: numpy array with the raw texts in train, dev, test order
        sequences: one word list per text, produced by text_to_word_sequence
        MAX_SEQUENCE_LENGTH: raised to the length of the longest word list
                             (it is never lowered by this function)
    '''
    global MAX_SEQUENCE_LENGTH, df_x, sequences
    all_texts = df['x']['train'] + df['x']['dev'] + df['x']['test']
    df_x = np.array(all_texts)

    # one list of words per text
    sequences = [text_to_word_sequence(text) for text in df_x]

    # keep the running maximum; the current value takes part in the
    # comparison so it can only grow
    lengths = [len(words) for words in sequences]
    MAX_SEQUENCE_LENGTH = max(lengths + [MAX_SEQUENCE_LENGTH])

In [18]:
def get_x_sets():
    '''
    Converts the tokenized texts into fixed-width integer matrices.

    Every word found in the word-vector vocabulary is replaced by its
    vocabulary index + 1; unknown words become 0. Each row is
    MAX_SEQUENCE_LENGTH wide and the words are right-aligned, so the unused
    leading positions are also 0.

    Takes no arguments; reads the globals df_x, sequences,
    MAX_SEQUENCE_LENGTH, model and df.

    Returns:
        X_train, X_dev, X_test: integer index matrices for each split
    '''
    global df_x, sequences
    known_words = set(model.index2word)
    X = np.zeros((df_x.shape[0], MAX_SEQUENCE_LENGTH), dtype=int)
    for row, words in enumerate(sequences):
        # negative start index right-aligns the words inside the row
        start = -len(words)
        for offset, word in enumerate(words):
            X[row, start + offset] = (
                model.vocab[word].index + 1 if word in known_words else 0
            )

    # cut the stacked matrix back into the three splits
    n_train = len(df['x']['train'])
    dev_end = n_train + len(df['x']['dev'])
    return X[:n_train], X[n_train:dev_end], X[dev_end:]

In [10]:
# Tokenize all texts; fills the globals df_x, sequences and MAX_SEQUENCE_LENGTH.
preprocess_x()

In [11]:
# Inspect the tokenization of the first text and one of its words.
print(sequences[0])
print(sequences[0][4])

['que', 'se', 'respete', 'el', 'espíritu', 'de', 'la', 'ley', 'y', 'esta', 'no', 'se', 'pueda', 'interpretar', 'de', 'forma', 'fraudulenta', 'o', 'malintencionada', 'implica', 'leyes', 'bien', 'redactadas']
espíritu


In [12]:
# Length (in words) of the longest tokenized text; used as the network input width.
MAX_SEQUENCE_LENGTH

311

In [13]:
# Build the one-hot labels and compare two raw labels with their encodings.
y_train, y_dev, y_test = get_y_sets()
print(df['y']['train'][0], df['y']['train'][10])
print(y_train[0], y_train[10])

1 0
[ 0.  1.  0.] [ 1.  0.  0.]


## Train best configuration models

### F1 macro
The F1 metric is no longer available in Keras.  
The functions below were taken from old code.  
A callback was created to compute macro-F1 and to save a checkpoint according to it.

In [30]:
from keras.callbacks import Callback
from sklearn import metrics

class ModelCheckpointF1Macro(Callback):
    '''
    Checkpoint callback driven by macro-averaged F1 computed with scikit-learn.

    With save_best_only=True, at the end of every epoch the model predicts on
    the dev data, macro-F1 is computed, and the model is written to filepath
    only when the score beats the best one seen so far. With
    save_best_only=False the model is written after every epoch and no score
    is computed.

    Arguments:
        filepath: path where the model (or its weights) is written
        monitor: label used in the progress messages only
        verbose: values > 0 print one line per epoch
        save_best_only: save only when the dev macro-F1 improves
        save_weights_only: write only the weights (save_weights) instead of
                           the full model (save); honoured in both modes
        dev_data: optional (X, y_one_hot) pair to score on; when None the
                  notebook globals X_dev / y_dev are looked up at epoch end
    '''
    def __init__(self, filepath, monitor='f1-macro', verbose=0, 
                 save_best_only=False, save_weights_only=False,
                 dev_data=None):
        # super(Callback, self) would skip Callback.__init__ entirely, so the
        # base class must be resolved starting from this class.
        super(ModelCheckpointF1Macro, self).__init__()
        self.monitor = monitor
        self.verbose = verbose
        self.filepath = filepath
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.dev_data = dev_data
        # np.inf (lowercase) exists in every NumPy release; np.Inf was removed in 2.0
        self.best = -np.inf

    def _write_checkpoint(self):
        '''Writes the weights or the full model to filepath, per save_weights_only.'''
        if self.save_weights_only:
            self.model.save_weights(self.filepath, overwrite=True)
        else:
            self.model.save(self.filepath, overwrite=True)

    def _dev_f1_macro(self):
        '''Returns macro-F1 of the current model on the dev data.'''
        if self.dev_data is not None:
            x_eval, y_eval = self.dev_data
        else:
            # backward-compatible fallback: notebook-level dev split
            x_eval, y_eval = X_dev, y_dev
        scores_pred = self.model.predict(x_eval, batch_size=30)  # scores for each class
        index_pred = np.argmax(scores_pred, axis=1)              # index with max score
        y_true = np.argmax(y_eval, axis=1)                       # one-hot -> class index
        return metrics.f1_score(y_true, index_pred, average='macro')

    def on_epoch_end(self, epoch, logs=None):
        '''Scores the model (when save_best_only) and writes the checkpoint if due.'''
        if not self.save_best_only:
            if self.verbose > 0:
                print("Epoch %05d: saving model to %s" % (epoch, self.filepath))
            self._write_checkpoint()
            return

        current = self._dev_f1_macro()
        if current > self.best:
            if self.verbose > 0:
                print("Epoch %05d: %s improved from %0.5f to %0.5f, saving model to %s"
                    % (epoch, self.monitor, self.best, current, self.filepath))
            self.best = current
            self._write_checkpoint()
        elif self.verbose > 0:
            print("Epoch %05d: %s did not improve" % (epoch, self.monitor))

In [15]:
def dan(relu_layers=3, hidden_units=300, p_dropout=0.3, dropout_input=False, 
        my_regularizer=regularizers.l2(1e-5), my_optimizer='adam', 
        epochs=150, batch_size=200, trainable=False, filepath=''):
    '''
    Builds, compiles and fits a deep averaging network (DAN) classifier.

    NN architecture:
        Input: vector of integers, each an index into the embedding layer
        Embedding layer: looks up one vector per index in the input
        Dropout: optional dropout applied to the embedding output
        Mean: averages the embedding vectors along the sequence axis,
              returning one averaged vector per example
        Fully connected: relu layers, each preceded by dropout
        Fully connected: output layer with softmax

    Reads the notebook globals MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, model,
    embedding_matrix, encoder, X_train, y_train, X_dev and y_dev.

    Arguments:
        relu_layers: number of fully connected layers with relu
        hidden_units: number of neurons in each relu layer
        p_dropout: dropout probability applied before each relu layer
        dropout_input: falsy disables dropout on the embedding output;
                       a float is used as the dropout probability;
                       any other truthy value (e.g. True) uses 0.2
        my_regularizer: kernel_regularizer for the fully connected layers
        my_optimizer: optimizer for back propagation
        epochs: number of epochs to train
        batch_size: batch size for training
        trainable: whether the embedding weights are updated during training
        filepath: where the best (dev macro-F1) checkpoint is written

    Returns:
        The fitted Keras Model as it stands after the last epoch. The best
        model according to dev macro-F1 is the one saved at filepath.

    Raises:
        ValueError: if filepath is empty
    '''
    # Fail before training: an empty path can only fail at the first checkpoint.
    if not filepath:
        raise ValueError("filepath must be a non-empty path for the checkpoint file")

    # Honour a float as the documented dropout probability; keep the historic
    # 0.2 for plain flags such as True.
    if isinstance(dropout_input, (float, np.floating)):
        word_dropout_rate = float(dropout_input)
    else:
        word_dropout_rate = 0.2

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
    x = Embedding(len(model.vocab)+ 1, EMBEDDING_DIM, mask_zero=True,
                  weights=[embedding_matrix], trainable=trainable)(sequence_input)
    if dropout_input:
        x = Dropout(word_dropout_rate)(x)
    # NOTE(review): K.mean averages over all MAX_SEQUENCE_LENGTH positions; the
    # mask from mask_zero=True does not appear to be applied here, so padded
    # positions (zero vectors) count in the denominator. Confirm against the
    # installed Keras version before changing - reported scores depend on it.
    x = Lambda(lambda x: K.mean(x, axis=1), 
               output_shape=(embedding_matrix.shape[1],))(x)
    for i in range(0, relu_layers):
        x = Dropout(p_dropout)(x)
        x = Dense(units=hidden_units, activation='relu', kernel_regularizer=my_regularizer)(x)
    # one output unit per class known to the fitted label encoder
    preds = Dense(units=len(encoder.classes_), activation='softmax', 
                  kernel_regularizer=my_regularizer)(x)
    
    m = Model(sequence_input, preds)
    m.compile(loss='categorical_crossentropy', optimizer=my_optimizer, 
              metrics=['accuracy'])
    print ("Starting trainning")
    
    # checkpoint on dev macro-F1 instead of a built-in Keras metric
    checkpoint = ModelCheckpointF1Macro(filepath, verbose=1, save_best_only=True)
    callbacks_list = [checkpoint]
    m.fit(X_train, y_train,
          validation_data=(X_dev, y_dev),
          shuffle=True, 
          epochs=epochs, batch_size=batch_size,
          verbose=0,
          callbacks=callbacks_list)
    return m

In [16]:
# Hyper-parameter combinations to train, one tuple per run:
# (relu_layers, hidden_units, l2 regularizer, dropout, batch_size, epochs)
final_hyperparam = [(2, 200, None, 0.2, 30, 130)]

In [19]:
# Materialize the label and input matrices used by dan() and the checkpoint callback.
y_train, y_dev, y_test = get_y_sets()
X_train, X_dev, X_test = get_x_sets()
best = 0  # best dev macro-F1 seen so far across hyper-parameter combinations

In [32]:
# Train one model per hyper-parameter tuple and keep the one with the best
# dev macro-F1 in models/best.h5.
for relu_layers, hidden_units, l2, dropout, batch_size, epochs in final_hyperparam:
    print(">>Checking combination: "+str((relu_layers, hidden_units, l2, dropout)))
    # NOTE(review): epochs is hard-coded to 120 here; the `epochs` value
    # unpacked from the tuple is not used - confirm which one is intended.
    m = dan(relu_layers=relu_layers, hidden_units=hidden_units, 
            p_dropout=dropout, my_regularizer=l2,
            epochs=120, batch_size=batch_size,
            filepath='models/checkpoint.h5')
    
    # Reload the checkpoint so the score below is for the best epoch
    # (by dev macro-F1), not for the state after the last epoch.
    m = load_model('models/checkpoint.h5')
    scores_pred = m.predict(X_dev, batch_size=batch_size) # get scores for each class
    index_pred = np.argmax(scores_pred, axis=1) # get index with max score
    y_true = np.argmax(y_dev, axis=1)
    val = metrics.f1_score(y_true, index_pred, average='macro')
    print('f1-macro dev: '+str(val))
    if val > best:
        print("UPDATING BEST MODEL "+str(val))
        best = val
        m.save('models/best.h5')
print("DONE")

>>Checking combination: (2, 200, None, 0.2)
Starting trainning
Epoch 00000: f1-macro improved from -inf to 0.49888, saving model to models/checkpoint.h5
Epoch 00001: f1-macro improved from 0.49888 to 0.51435, saving model to models/checkpoint.h5
Epoch 00002: f1-macro improved from 0.51435 to 0.55985, saving model to models/checkpoint.h5
Epoch 00003: f1-macro did not improve
Epoch 00004: f1-macro improved from 0.55985 to 0.56404, saving model to models/checkpoint.h5
Epoch 00005: f1-macro improved from 0.56404 to 0.59040, saving model to models/checkpoint.h5
Epoch 00006: f1-macro did not improve
Epoch 00007: f1-macro did not improve
Epoch 00008: f1-macro did not improve
Epoch 00009: f1-macro improved from 0.59040 to 0.59316, saving model to models/checkpoint.h5
Epoch 00010: f1-macro did not improve
Epoch 00011: f1-macro did not improve
Epoch 00012: f1-macro improved from 0.59316 to 0.59741, saving model to models/checkpoint.h5
Epoch 00013: f1-macro did not improve
Epoch 00014: f1-macro d

In [33]:
# Evaluate on the test split with macro-averaged precision, recall and F1.
# NOTE(review): this loads models/checkpoint.h5 (the last combination trained),
# not models/best.h5; the two only coincide when a single combination is run.
m_check = load_model('models/checkpoint.h5')
scores = m_check.predict(X_test, batch_size=30)
y_pred = np.argmax(scores, axis=1)   # predicted class index
y_true = np.argmax(y_test, axis=1)   # one-hot -> class index
# precision recall f1 (percentages, one decimal)
print("Precision: ", round(100*metrics.precision_score(y_true, y_pred, average='macro'), 1))
print("Recall: ", round(100*metrics.recall_score(y_true, y_pred, average='macro'), 1))
print("F1: ", round(100*metrics.f1_score(y_true, y_pred, average='macro') , 1))

Precision:  67.4
Recall:  58.6
F1:  61.9
