# Paso a paso: creación de un modelo simple

In [1]:
from __future__ import print_function
import json
import h5py
import keras
from keras.models import Sequential, Model
from keras.layers import *
from keras.optimizers import *
from util.preprocessing import perpareDataset, loadDatasetPickle
import random

Using TensorFlow backend.


## Carga de dataset desde pkl (preproceso)

In [2]:
# No explicit numpy import is visible in this notebook; `np` presumably only
# reaches this cell through one of the wildcard Keras imports. Import it
# explicitly so the cell does not depend on what those modules happen to export.
import numpy as np

# The pickle yields the embedding matrix, the word -> index map and a dict of
# datasets keyed by name; only the 'am' dataset is used from here on.
embeddings, word2Idx, datasets = loadDatasetPickle("pkl/am_levy_deps.pkl")
dataset = datasets['am']
casing2Idx = dataset['mappings']['casing']

# Identity matrix with one row per casing id: used below as fixed one-hot
# "embedding" weights for the casing feature.
caseMatrix = np.identity(len(casing2Idx), dtype='float32')

## Capas de embeddings

In [9]:
# Frozen token-embedding lookup initialised from the loaded embedding matrix.
tokens = Sequential([
    Embedding(
        input_dim=embeddings.shape[0],
        output_dim=embeddings.shape[1],
        weights=[embeddings],
        trainable=False,
        name='token_emd',
    ),
])

# Frozen casing lookup whose weights are the identity matrix built earlier,
# i.e. each casing id maps to its one-hot vector.
casing = Sequential([
    Embedding(
        input_dim=caseMatrix.shape[0],
        output_dim=caseMatrix.shape[1],
        weights=[caseMatrix],
        trainable=False,
        name='casing_emd',
    ),
])

# Keep the input tensors for the final Model and join both feature
# representations into a single tensor per token.
mergeLayersInput = [tokens.input, casing.input]
mergeLayersOut = [tokens.output, casing.output]
merged = Concatenate()(mergeLayersOut)

## Capa de LSTM Bidireccional

In [10]:
# Number of LSTM units passed to the wrapped LSTM layer.
size = 150

# return_sequences=True keeps one output per time step, which the
# TimeDistributed classifier applied afterwards needs.
lstmLayer = Bidirectional(
    LSTM(
        size,
        return_sequences=True,
        dropout=0.25,
        recurrent_dropout=0.25,
    ),
    name="main_LSTM",
)(merged)

## Capa de clasificación, función de pérdida y optimizador.

In [13]:
# Per-time-step softmax with one output unit per entry in the AM_TAG mapping.
activationLayer = TimeDistributed(
    Dense(len(dataset['mappings']['AM_TAG']), activation='softmax'),
    name='softmax_output',
)(lstmLayer)

# Training configuration, passed to model.compile() in the next cell.
optimizer = 'nadam'
lossFct = 'sparse_categorical_crossentropy'

In [15]:
# Wire the two embedding inputs to the softmax output, then compile and
# print the layer overview.
model = Model(inputs=mergeLayersInput, outputs=activationLayer)
model.compile(optimizer=optimizer, loss=lossFct)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
token_emd_input (InputLayer)    (None, None)         0                                            
__________________________________________________________________________________________________
casing_emd_input (InputLayer)   (None, None)         0                                            
__________________________________________________________________________________________________
token_emd (Embedding)           (None, None, 300)    52205700    token_emd_input[0][0]            
__________________________________________________________________________________________________
casing_emd (Embedding)          (None, None, 8)      64          casing_emd_input[0][0]           
__________________________________________________________________________________________________
concatenat

## Entrenamiento por lotes

In [16]:
def online_iterate_dataset(dataset):
    """Yield one single-sentence batch per dataset entry, in random order.

    Each entry of *dataset* is indexed by position and must provide the keys
    ``"AM_TAG"``, ``"tokens"`` and ``"casing"``.

    Yields:
        A list ``[labels, tokens, casing]`` of numpy arrays. ``labels`` is the
        entry's ``"AM_TAG"`` sequence wrapped in a leading batch axis of size
        one and a trailing axis of size one; ``tokens`` and ``casing`` are the
        corresponding sequences wrapped in a leading batch axis only.
    """
    feature_names = ('tokens', 'casing')

    # Visit every entry exactly once, in a freshly shuffled order.
    order = list(range(len(dataset)))
    random.shuffle(order)

    for position in order:
        sentence = dataset[position]
        targets = np.expand_dims([sentence["AM_TAG"]], -1)
        batch = [targets]
        batch.extend(np.asarray([sentence[name]]) for name in feature_names)
        yield batch

In [17]:
# Training entries of the selected dataset.
trainMatrix = dataset['trainMatrix']
# Generator that yields one shuffled single-sentence batch at a time;
# nothing is computed until it is iterated.
iterator = online_iterate_dataset(trainMatrix)

In [18]:
# Demonstration run: train on at most the first four batches the iterator yields.
for n, batch in enumerate(iterator):
    # The iterator yields [labels, tokens, casing]; everything after the
    # labels is fed to the model as its inputs.
    labels, nnInput = batch[0], batch[1:]
    model.train_on_batch(nnInput, labels)
    if n >= 3:
        break

## Guardar / Cargar modelo entrenado

In [19]:
import json
import h5py

savePath = "./testModel.h5"

# NOTE(review): the second positional argument of Model.save() is
# `overwrite`, so this call passes overwrite=False. If the intent was to skip
# the optimizer state, `include_optimizer=False` would be needed - confirm.
model.save(savePath, overwrite=False)

# Store the dataset mappings as a JSON string attribute in the same HDF5
# file, so they can be read back together with the model.
mappingsJson = json.dumps(dataset['mappings'])
with h5py.File(savePath, mode='a') as h5file:
    h5file.attrs['mappings'] = mappingsJson

In [20]:
# Rebuild the model from the HDF5 file; this rebinds `model` to the loaded copy.
model = keras.models.load_model(savePath)
# Read the JSON-encoded mappings back from the file-level HDF5 attribute.
with h5py.File(savePath, 'r') as f:
    mappings = json.loads(f.attrs['mappings'])

In [21]:
# Run the reloaded model on the inputs of the last training batch and take the
# arg-max over the last axis (presumably the predicted tag index per token).
model.predict(nnInput, verbose=False).argmax(axis=-1)

array([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
        3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 