# Creación paso a paso de un modelo simple.

In [51]:
from __future__ import print_function

import json
import random

import h5py
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *

from util.preprocessing import perpareDataset, loadDatasetPickle

## Carga de dataset desde pkl (preproceso)

In [18]:
# Load the pre-processed embeddings, the word index and the datasets from the pickle file.
embeddings, word2Idx, datasets = loadDatasetPickle("pkl/am_levy_deps.pkl")
# Only the 'am' dataset is used in this notebook.
dataset = datasets['am']
casing2Idx = dataset['mappings']['casing']
# Identity matrix with one row per casing entry; used further down as the
# fixed weights of the casing embedding layer.
caseMatrix = np.identity(len(casing2Idx), dtype='float32')

## Capas de embeddings

In [19]:
# Token branch: embedding layer initialised with the loaded embedding matrix
# and frozen (trainable=False).
tokens = Sequential()
tokens.add(Embedding(input_dim=embeddings.shape[0], output_dim=embeddings.shape[1],  weights=[embeddings], trainable=False, name='token_emd'))

# Casing branch: embedding layer initialised with the identity matrix, also frozen.
casing = Sequential()
casing.add(Embedding(input_dim=caseMatrix.shape[0], output_dim=caseMatrix.shape[1], weights=[caseMatrix], trainable=False, name='casing_emd')) 

# Concatenate both branches as the first layer of the model.
# NOTE(review): Merge(..., mode='concat') looks like the legacy Keras 1.x API —
# confirm against the installed Keras version.
mergeLayers = [tokens, casing]
model = Sequential()
model.add(Merge(mergeLayers, mode='concat'))

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
token_emd (Embedding)            (None, None, 300)     52205700    embedding_input_3[0][0]          
____________________________________________________________________________________________________
casing_emd (Embedding)           (None, None, 8)       64          embedding_input_4[0][0]          
Total params: 52,205,764
Trainable params: 0
Non-trainable params: 52,205,764
____________________________________________________________________________________________________


## Capa de LSTM Bidireccional

In [20]:
# Number of units passed to the wrapped LSTM.
size = 150
# Bidirectional LSTM on top of the merged embeddings; return_sequences=True so
# the following layer receives one output per position.
# NOTE(review): dropout_W / dropout_U are presumably the Keras 1.x names for
# input and recurrent dropout — confirm against the installed Keras version.
model.add(Bidirectional(LSTM(size, return_sequences=True, dropout_W=0.25, dropout_U=0.25), name="varLSTM_"))
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
token_emd (Embedding)            (None, None, 300)     52205700    embedding_input_3[0][0]          
____________________________________________________________________________________________________
casing_emd (Embedding)           (None, None, 8)       64          embedding_input_4[0][0]          
____________________________________________________________________________________________________
varLSTM_ (Bidirectional)         (None, None, 300)     550800      merge_2[0][0]                    
Total params: 52,756,564
Trainable params: 550,800
Non-trainable params: 52,205,764
____________________________________________________________________________________________________


## Capa de clasificación, función de pérdida y optimizador.

In [11]:
# Softmax classifier applied at every position, with one output unit per
# entry of the AM_TAG mapping.
model.add(TimeDistributed(Dense(len(dataset['mappings']['AM_TAG']), activation='softmax'), name='softmax_output'))
# Loss and optimizer identifiers handed to model.compile in the next cell.
lossFct = 'sparse_categorical_crossentropy'
optimizer = 'nadam'

In [21]:
# Compile the model with the chosen loss and optimizer, then print its layer summary.
model.compile(loss=lossFct, optimizer=optimizer)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
token_emd (Embedding)            (None, None, 300)     52205700                                     
____________________________________________________________________________________________________
casing_emd (Embedding)           (None, None, 8)       64                                           
____________________________________________________________________________________________________
varLSTM_ (Bidirectional)         (None, None, 300)     550800      merge_2[0][0]                    
Total params: 52,756,564
Trainable params: 550,800
Non-trainable params: 52,205,764
____________________________________________________________________________________________________


## Entrenamiento por lotes

In [42]:
def online_iterate_dataset(dataset):
    """Yield the examples of *dataset* one at a time, in random order.

    Every example is wrapped into a batch of size one, in the layout the
    training loop of this notebook consumes (labels first, then the inputs).

    Args:
        dataset: Indexable, sized collection of mappings. Each item must
            provide the keys ``"AM_TAG"``, ``"tokens"`` and ``"casing"``.

    Yields:
        A list ``[labels, tokens, casing]`` of NumPy arrays. ``labels`` is the
        item's ``"AM_TAG"`` sequence with a leading batch axis and a trailing
        axis of size 1; ``tokens`` and ``casing`` are the matching sequences
        with a leading batch axis.
    """
    # Input names, in the same order as the merged model branches.
    features = ('tokens', 'casing')

    # Shuffle a list of indices instead of the data so the caller's dataset
    # is left untouched.
    order = list(range(len(dataset)))
    random.shuffle(order)

    for idx in order:
        example = dataset[idx]
        # Batch axis in front, extra trailing axis of size 1 for the labels.
        labels = np.expand_dims([example["AM_TAG"]], -1)
        inputs = [np.asarray([example[name]]) for name in features]
        yield [labels] + inputs

In [43]:
# Create the shuffled, one-example-at-a-time generator over dataset['trainMatrix'].
trainMatrix = dataset['trainMatrix']
iterator = online_iterate_dataset(trainMatrix)

In [49]:
# Train on the examples produced by the iterator, one train_on_batch call each.
for n, batch in enumerate(iterator):
    labels = batch[0]  # first element of the batch holds the labels
    nnInput = batch[1:]  # remaining elements are the network inputs
    #print(nnInput, labels)
    model.train_on_batch(nnInput, labels)
    # Stop after four updates (n = 0, 1, 2, 3).
    if n > 2:
        break

## Guardar modelo entrenado

In [50]:
# Target HDF5 file for the trained model.
savePath = "./testModel.h5"
# NOTE(review): the positional False is presumably Keras' `overwrite` flag
# (i.e. do not silently overwrite an existing file) — confirm against the
# installed Keras version and consider passing it by keyword.
model.save(savePath, False)
import json
import h5py
# Serialise the dataset mappings to JSON and attach them to the same HDF5 file
# as a root-level attribute; mode 'a' keeps the content written by model.save.
mappingsJson = json.dumps(dataset['mappings'])
with h5py.File(savePath, 'a') as h5file:
    h5file.attrs['mappings'] = mappingsJson

In [55]:
# Reload the model from disk and read back the JSON mappings stored in the
# file's 'mappings' attribute.
model = keras.models.load_model(savePath)
with h5py.File(savePath, 'r') as f:
    mappings = json.loads(f.attrs['mappings'])

In [68]:
# Predict on the last inputs used in the training loop and take the arg-max
# over the last axis, i.e. the index of the highest-scoring output at each position.
model.predict(nnInput, verbose=False).argmax(axis=-1)

array([[149,   0,   0,   0,   0,   0,   0,   0,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3