# Language Modeling. Hemos vuelto.

Ahora que hemos visto un montón de arquitecturas y sus aplicaciones al lenguaje natural, si quisiéramos hacer language modeling... ¿cuál escogeríais?

![](https://cdn-ak.f.st-hatena.com/images/fotolife/a/aki-don/20170929/20170929085607.png)

Hay muchísimas formas de intentarlo.

Implementaremos algunas de ellas, para que veáis que en deep learning hay libertad para intentar prácticamente todo. ¿Predecir palabras? ¿Predecir caracteres? ¿Usar embeddings? ¿No usarlos? ¿Usar Bidirectional LSTMs? ¿Stackearlas?

Básicamente, seguiremos el mismo pipeline que hasta ahora.

Un poco de preproceso, preparar los datos, escoger la arquitectura a usar, y entrenar/predecir.

Esta vez os daré un snippet de código para visualizar en directo lo que está aprendiendo la red, y poder leer ejemplos de aquello que aprende.

¡Lo haremos todo con Keras, como siempre!

## Imports

In [0]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [0]:
import spacy
import numpy as np

import pickle
import json
import os
import csv
import pprint as pp

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

from random import shuffle, choice, sample

from sklearn.model_selection import StratifiedShuffleSplit

from copy import copy

import warnings
warnings.filterwarnings('ignore')

data_path = '../datasets/data/'

nlp = spacy.load('en_core_web_sm')

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pylab as pl
from IPython import display

sns.set(color_codes=True)

import warnings
warnings.filterwarnings('ignore')


%matplotlib inline
%load_ext autoreload
%autoreload 2
%matplotlib notebook

In [0]:
from keras.models import Model, Sequential
from keras.layers import Input, CuDNNLSTM, Dense, LSTM
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.layers import Merge, Dot, Concatenate, Flatten, Permute, Multiply, dot, concatenate
from keras.layers import TimeDistributed
from keras.layers import Activation
from keras.preprocessing import sequence
from keras.callbacks import Callback
from keras.optimizers import SGD
from keras.models import load_model

from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

Using TensorFlow backend.


## Preprocess

In [0]:
# NOTE(review): `files` is not imported anywhere in the visible notebook —
# presumably `google.colab.files`; confirm before running outside Colab.
uploaded = files.upload()

# Report every uploaded file together with its size in bytes.
for fn, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(content)))

747


In [0]:
# `io.StringIO` is needed below, but `io` is not imported anywhere else in
# the notebook, so bring it into scope here.
import io

# The upload is raw bytes: decode it as ISO-8859-1 and parse the text as CSV.
df = pd.read_csv(io.StringIO(uploaded['spam.csv'].decode('ISO-8859-1')))
# Keep only the first two columns; everything from the third onwards is dropped.
drop_col = df.columns[2:]
df = df.drop(columns=drop_col)
df

In [0]:
# Collect the second column (the message text) of every row whose index label
# is greater than 0. Positional access goes through `.iloc`, because integer
# keys on a label-indexed Series are no longer treated as positions by
# recent pandas versions.
# NOTE(review): the row labelled 0 is skipped on purpose by the original
# condition; confirm that the first row really should be excluded.
spam_dataset = df.loc[df.index > 0].iloc[:, 1].tolist()
print(spam_dataset[0])
len(spam_dataset)

In [0]:
# Character-level tokenisation: each message becomes a list of its characters.
tokenized = list(map(list, spam_dataset))

In [0]:
# Seed sequences for text generation: a '<SOS>' marker followed by the opening
# characters of each message, capped at five tokens in total.
init_chars = [(['<SOS>'] + chars[:5])[:5] for chars in tokenized]

In [0]:
# Display the first seed sequence as a quick sanity check.
init_chars[0]

['<SOS>', 'G', 'o', ' ', 'u']

In [0]:
# Second character of every message that has at least two characters.
# NOTE(review): this reads index 1, not index 0, although the name suggests
# the first token — confirm which one is intended.
start_token = [chars[1] for chars in tokenized if len(chars) >= 2]
len(start_token)

747

In [0]:
# Length statistics (in characters) over all tokenised messages.
lengths = [len(chars) for chars in tokenized]
maxlen = max(lengths)
avglen = sum(lengths) / len(tokenized)
print(maxlen, avglen)

223 138.429718875502


In [0]:
from collections import Counter

In [0]:
# Sampletest prints generated text on epochs that are a multiple of this value.
SAMPLE_EVERY = 3
# HistoryDisplay redraws its plot on epochs that are a multiple of this value.
PLOT_EVERY = 1

## Helper functions

In [0]:
def sample_pred(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    The probabilities are re-weighted by ``temperature`` before sampling:
    their logarithms are divided by it and the result is renormalised, so
    values below 1 sharpen the distribution and values above 1 flatten it.

    Args:
        preds: Array-like of probabilities.
        temperature: Divisor applied to the log-probabilities.

    Returns:
        The index selected by a single multinomial draw.
    """
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: a one-hot row marking the chosen index.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [0]:
class Sampletest(Callback):
    """Keras callback that prints text sampled from the model during training.

    On every epoch that is a positive multiple of ``SAMPLE_EVERY`` a seed is
    picked at random from ``init_chars`` and the model is asked to continue it
    once per sampling temperature. Relies on the notebook-level names
    ``maxlen``, ``nb_vocab``, ``w2id``, ``id2w``, ``init_chars`` and
    ``sample_pred``.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Print generated samples at the end of selected epochs.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metrics dict supplied by Keras; not used here.
        """
        # Only sample on positive multiples of SAMPLE_EVERY.
        if epoch % SAMPLE_EVERY != 0 or epoch <= 0:
            return

        nb_samples = 1
        params = {
            'maxlen': maxlen,
            'vocab': nb_vocab,
            'use_embeddings': True,
        }
        for _ in range(nb_samples):
            data_test = choice(init_chars)
            for diversity in [0.2, 0.6, 1.2]:
                print('----- diversity:', diversity)
                sentence = copy(data_test)
                generated = copy(data_test)
                # Keep generating until '<EOS>' is sampled or the sequence
                # (seed included) reaches 400 tokens.
                for _step in range(len(data_test), 400):
                    # One-hot encode the current window; the remaining time
                    # steps of the input stay all-zero.
                    x_pred = np.zeros((1, params['maxlen'], params['vocab']))
                    for t, char in enumerate(sentence):
                        x_pred[0, t, w2id[char] if char in w2id else w2id['<UNK>']] = 1.
                    preds = self.model.predict(x_pred, verbose=0)[0]
                    next_index = sample_pred(preds, diversity)
                    next_char = id2w[next_index]
                    if next_char == '<EOS>':
                        break
                    generated.append(next_char)
                    # Slide the window: drop the oldest token, append the new one.
                    sentence = sentence[1:] + [next_char]
                print(''.join(generated))


In [0]:
class HistoryDisplay(Callback):
    """Keras callback that live-plots training loss and accuracy per epoch."""

    def on_train_begin(self, logs=None):
        """Reset the recorded history and open an interactive figure.

        Args:
            logs: Metrics dict supplied by Keras; not used here.
        """
        self.losses = []
        self.accs = []
        self.epochs = []
        self.fig, self.ax = plt.subplots()

        # Interactive mode lets the figure be redrawn while training runs.
        plt.ion()
        self.fig.show()
        self.fig.canvas.draw()

    def on_epoch_end(self, epoch, logs):
        """Record this epoch's metrics and redraw every ``PLOT_EVERY`` epochs.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metrics dict; must contain ``'loss'`` and either ``'acc'``
                or ``'accuracy'``.

        Raises:
            KeyError: If ``logs`` lacks the loss or both accuracy keys.
        """
        self.epochs.append(epoch)
        self.losses.append(logs['loss'])
        # The accuracy key is 'acc' or 'accuracy' depending on the Keras
        # version and on how the metric was named at compile time.
        self.accs.append(logs['acc'] if 'acc' in logs else logs['accuracy'])
        if epoch % PLOT_EVERY == 0:
            self.ax.clear()
            self.ax.plot(self.epochs, self.accs, 'g', label='acc')
            self.ax.plot(self.epochs, self.losses, 'b', label='loss')
            self.ax.legend(loc='upper right', shadow=True, fontsize='x-large')
            self.fig.canvas.draw()
        

In [0]:
class TimeHistory(Callback):
    """Keras callback that records how long each epoch takes.

    After training, ``self.times`` holds one duration in seconds per epoch.
    """

    def on_train_begin(self, logs=None):
        """Start with an empty list of epoch durations."""
        self.times = []

    def on_epoch_begin(self, batch, logs=None):
        """Remember the time at which the epoch started.

        Args:
            batch: Value Keras passes to the epoch hook (the epoch index);
                not used here.
            logs: Metrics dict supplied by Keras; not used here.
        """
        # Imported locally because the notebook never imports `time` at
        # module level.
        import time
        self.epoch_time_start = time.time()

    def on_epoch_end(self, batch, logs=None):
        """Append the elapsed time since ``on_epoch_begin`` to ``self.times``.

        Args:
            batch: Value Keras passes to the epoch hook (the epoch index);
                not used here.
            logs: Metrics dict supplied by Keras; not used here.
        """
        import time
        self.times.append(time.time() - self.epoch_time_start)

## Decidir arquitectura y preparar el train y el predict

In [0]:
class LM:
    """Skeleton wrapper around a Keras language model.

    ``compile_bidirectional``, ``train`` and ``predict`` are placeholders
    that do not build, fit or query a model yet; ``load`` and
    ``_get_callbacks`` are functional.
    """

    def __init__(self, **kwargs):
        """Store the optional ``params`` keyword argument (``None`` if absent)."""
        self.params = kwargs.get('params')

    def compile_bidirectional(self, params={}):
        """Placeholder for building the model; currently returns ``None``."""
        return None

    def train(self, model, data, params={}):
        """Placeholder for training; for now it only builds the callback list."""
        callbacks = self._get_callbacks()

    def predict(self, model, data, params={}):
        """Placeholder for inference; currently returns ``None``."""
        return None

    def load(self, model_path='seq2seq_attn.h5'):
        """Load a saved Keras model from ``model_path`` and return it."""
        restored = load_model(model_path)
        return restored

    def _get_callbacks(self, model_path='seq2seq_attn.h5'):
        """Build the callbacks used during training.

        Returns:
            A list with the text sampler, the best-model checkpoint writing
            to ``model_path``, and the live history plot.
        """
        early_stop = EarlyStopping(
            monitor='loss', patience=4, mode='auto', verbose=0)
        checkpoint = ModelCheckpoint(
            model_path,
            monitor='loss',
            verbose=0,
            save_best_only=True,
            save_weights_only=False,
            period=2,
        )
        sampler = Sampletest()
        plotter = HistoryDisplay()
        lr_on_plateau = ReduceLROnPlateau(
            monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose=0)
        # early_stop and lr_on_plateau are constructed but not returned, so
        # they are not active during training.
        return [sampler, checkpoint, plotter]

## Hiperparámetros

In [0]:
# When True, the compile cell loads a saved model instead of building a new one.
LOAD_MODEL = False
# When True, the training cell calls lm.train; when False, training is skipped.
bTrain = True

## Compilar

In [0]:
# Either build a fresh model or restore a previously saved one.
# NOTE(review): `compile_params` and `dtype` are not defined anywhere in the
# visible notebook — confirm they are set before running this cell.
lm = LM()
if not LOAD_MODEL:
    lm_model = lm.compile_bidirectional(params=compile_params)
else:
    path = 'final_{}.h5'.format(dtype)
    lm_model = lm.load(model_path=path)
    lm_model.summary()

## Entrenar

In [0]:
# Settings handed to LM.train.
# NOTE(review): `nb_vocab` and `data_train` are not defined anywhere in the
# visible notebook — confirm they are set before running this cell.
train_params = dict(
    epochs=500,
    batch_size=512,
    shuffle=True,
    vocab=nb_vocab,
    maxlen=maxlen,
    use_embeddings=True,
)
pp.pprint(train_params)
if bTrain:
    lm.train(model=lm_model, data=data_train, params=train_params)