# DMA Final Project - Rap Language Model

This notebook: language modeling

In [1]:
import os, json, re
from nltk import word_tokenize, regexp_tokenize
from collections import Counter
import numpy as np

### Load Data

In [588]:
def read_data(file):
    artists = []
    seqs = []
    targets = []
    with open(file, 'r') as f:
        for line in f:
            line = line.strip().split('\t')
            artists.append(int(line[0]))
            seqs.append(line[1].split())
            targets.append(line[2])
           
    return (np.array(artists), seqs, targets)

def read_map(file):
    a2b = {}
    with open(file, 'r') as f:
        for line in f:
            line = line.strip().split('\t')
            a2b[line[0]] = line[1]
    return a2b
    
# train = read_data(os.path.join(_dir, 'train.tsv'))
# val = read_data(os.path.join(_dir, 'val.tsv'))
# test = read_data(os.path.join(_dir, 'test.tsv'))

In [586]:
_dir = 'data/rap_max100_10'
#
# a_{train|val|test} = artist ids
# x_{train|val|test} = token sequences of length seq_length
# y_{train|val|test} = target tokens for each sequence
#
a_train, x_train, y_train = read_data(os.path.join(_dir, 'train.tsv'))
a_val, x_val, y_val = read_data(os.path.join(_dir, 'val.tsv'))
a_test, x_test, y_test = read_data(os.path.join(_dir, 'test.tsv'))

In [589]:
artist2id = read_map(os.path.join(_dir, 'artist2id.tsv'))

In [336]:
rap_vocab = Counter([t for sample in x_train for t in sample])
len(rap_vocab)

30098

In [362]:
# Check distribution of tokens - 
# want to make sure that <BR> and <UNK> are not dominating
ys = Counter()
n = len(y_train)
for word in y_train:
    ys[word] += 100./n

### Use pretrained Glove embeddings

***Selecting Vocabulary Size***

The rap corpus contains 55538 unique tokens (printed out in cell above). Here I load in pretrained Glove embeddings and check how much coveragewe get when using 100k, 200k,... tokens. Corpus coverage takes into account token frequency in our corpus.

| vocab size | token coverage   | corpus coverage |
|------|------|------|
| 50k  | 45.43% | 95.46% | 
| 100k | 56.51% | 96.67% | 
| 200k | 63.60% | 98.05%|
| 300k | % | %|
| 400k | % | %|

Looks like using all 400k tokens does not give us a huge advantage in terms of corpus coverage (87% vs 85%). Limiting the vocabulary size will make the model a little bit easier to train.  Lets start with vocab size of **50k**.

Examining tokens for which we don't have coverage - most are strange spellings or words in other languages. These will be replaced with <UNK> token during model training.
    
**Unknown tokens**: {'irreputable',
 '77777777',
 'heini',
 'wallabees',
 'motherfuckerss',
 'jip',
 'gyrate',
 'lights…',
 'pigsties',
 'hassans',
 'muthaphukka',
 'jiggy',
 "'months",
 'allergenic',
 'девки',
 'nestlé',
 'westbank',
 'boggles',
 'tieing',...}
 
 **Most frequent unknown tokens**: ('nigga', 21525),
 ('niggas', 18183),
 ("'all", 6122),
 ('!)', 4568),
 ("',", 4442),
 ('bitches', 4082),
 ('hoes', 3003),
 ("'ma", 2211),
 ('pussy', 2117),
 ("'bout", 2092),
 ('..', 1893),
 ('motherfucker', 1848),
 ('?)', 1811),
 ('tryna', 1776),
 ('ooh', 1727),
 ("'mma", 1501),
 ('motherfuckers', 1373)


In [78]:
def load_embeddings(filename, max_vocab_size, emb_dim):

    vocab={}
    embeddings=[]
    with open(filename) as file:
        
        cols=file.readline().split(" ")
        num_words=int(cols[0])
        size=int(cols[1])
        embeddings.append(np.zeros(size))  # 0 = 0 padding if needed
        embeddings.append(np.random.uniform(-1,1,emb_dim))  # 1 = UNK
        embeddings.append(np.random.uniform(-1,1,emb_dim))  # 1 = <BR>
        vocab["<PAD>"]=0
        vocab["<UNK>"]=1
        vocab["<BR>"]=2
        
        for idx,line in enumerate(file):

            if idx+3 >= max_vocab_size:
                break

            cols=line.rstrip().split(" ")
            val=np.array(cols[1:])
            word=cols[0]
            
            embeddings.append(val)
            vocab[word]=idx+3

    return np.array(embeddings), vocab, size

def tok_to_id(tok, vocab):
    if tok in vocab:
        return vocab[tok]
    
    if tok[-1]=='n' and tok+'g' in vocab:
        # 'growin' -> 'growing'
        # 'obeyin' -> 'obeying'
        return vocab[tok+'g']
    
    return vocab['<UNK>']
    
def check_glove_coverage(vocab, tokens, vocab_map={}):  
    in_vocab = set()
    out_vocab = set()
    all_vocab = set()
    
    in_count = 0
    out_count = 0
    all_count = 0
    
    out_counter = Counter()
    for tok in tokens:
        
        if tok in vocab_map:
            tok = vocab_map[tok]
        
        if tok in vocab or tok == '<BR>':
            in_vocab.add(tok)
            in_count += tokens[tok]            
        elif tok[-1]=='n' and tok+'g' in vocab:
            # 'growin' -> 'growing'
            # 'obeyin' -> 'obeying'
            in_vocab.add(tok)
            in_count += tokens[tok]
        else:
            out_counter[tok] += tokens[tok]
            out_vocab.add(tok)
            out_count += tokens[tok]
        all_vocab.add(tok)
        all_count += tokens[tok]
        
    print("{:.2%} of unique tokens covered".format(len(in_vocab)/len(all_vocab)))
    print("{:.2%} of corpus covered".format(in_count/all_count))
    
    return in_vocab, out_vocab, all_vocab, out_counter

In [332]:
vocab_size = 50000
vocab_dim = 100
emb, tok2id, size = load_embeddings('data/glove/glove.6B/glove.6B.100d.w2v', vocab_size, vocab_dim)
in_, out_, all_, out_counter = check_glove_coverage(tok2id, rap_vocab, vocab_map)

58.78% of unique tokens covered
95.72% of corpus covered


In [319]:
vocab_map = {
    "'bout": 'about',
    "'ma": "am",
    "'mma": "am",
    "uhh": "uh",
    "'all": "all"
}
in_, out_, all_, out_counter = check_glove_coverage(tok2id, rap_vocab, vocab_map)

58.78% of unique tokens covered
95.72% of corpus covered


In [337]:
in_, out_, all_, out_counter = check_glove_coverage(tok2id, rap_vocab, {})

61.06% of unique tokens covered
95.90% of corpus covered


In [341]:
# DOMAIN SPECIFIC VOABULARY
print(len(out_), len(all_))
len([o for o in out_counter if out_counter[o]>50])

11720 30098


882

In [338]:
out_counter.most_common(100)

[('nigga', 46314),
 ('niggas', 39779),
 ("'all", 13262),
 ('bitches', 7929),
 ('hoes', 5849),
 ("'ma", 5296),
 ("'bout", 4778),
 ('motherfucker', 4221),
 ('pussy', 4111),
 ('ooh', 4034),
 ('tryna', 3719),
 ("'mma", 3298),
 ("'mon", 2989),
 ('uhh', 2853),
 ('motherfuckers', 2786),
 ('homie', 2599),
 ('motherfuckin', 2358),
 ('fucked', 2333),
 ('cuz', 2241),
 ('woah', 2177),
 ('biggie', 1980),
 ('holla', 1863),
 ('bullshit', 1828),
 ('outta', 1770),
 ('yea', 1675),
 ('thang', 1614),
 ('ayo', 1502),
 ('gat', 1362),
 ('ayy', 1341),
 ('homies', 1161),
 ('whatchu', 1110),
 ('motherfucking', 1103),
 ('tw', 1103),
 ('ballin', 1080),
 ('aight', 1041),
 ('busta', 1039),
 ('haters', 1031),
 ('jockin', 1021),
 ('pimpin', 968),
 ('jiggy', 967),
 ('rakim', 936),
 ("'fore", 924),
 ('twerk', 903),
 ("'round", 900),
 ('luda', 884),
 ('shawty', 884),
 ('haha', 829),
 ('tical', 808),
 ('dawg', 793),
 ('izz', 774),
 ('hmm', 755),
 ('gots', 745),
 ('goddamn', 737),
 ('thats', 731),
 ('smalls', 724),
 ('ohh

In [339]:
domain_words = [x[0] for x in out_counter.most_common(1000)]

In [480]:
idx = len(tok2id)
for word in domain_words:
    tok2id[word] = idx
    idx += 1

### Vectorize Tokens

In [77]:
len(tok2id)

50000

In [105]:
x_train_id = np.array([[tok_to_id(t, tok2id) for t in seq] for seq in x_train])
y_train_id = np.array([tok_to_id(t, tok2id) for t in y_train])
x_val_id = np.array([[tok_to_id(t, tok2id) for t in seq] for seq in x_val])
y_val_id = np.array([tok_to_id(t, tok2id) for t in y_val])

In [106]:
print(x_train_id.shape, y_train_id.shape, x_val_id.shape, y_val_id.shape)

(3955008, 10) (3955008,) (494376, 10) (494376,)


### Model

In [573]:
import keras
import numpy as np
from sklearn import preprocessing
from keras.layers import Dense, Input, Embedding, Lambda, Layer, Multiply, \
Dropout, Dot, Bidirectional, LSTM, concatenate, Flatten
from keras.models import Model
from keras import backend as K
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import pandas as pd
from scipy.stats import norm
from math import sqrt

In [92]:
# disable annoying tensorflow "deprecated" messages
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

In [471]:
import functools
top10_acc = functools.partial(keras.metrics.sparse_top_k_categorical_accuracy, k=10)
top10_acc.__name__ = 'top10_acc'
top5_acc = functools.partial(keras.metrics.sparse_top_k_categorical_accuracy, k=5)
top5_acc.__name__ = 'top5_acc'

In [592]:
class AttentionLayerMasking(Layer):

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(AttentionLayerMasking, self).__init__(**kwargs)


    def build(self, input_shape):
        input_embedding_dim=input_shape[-1]
        
        self.kernel = self.add_weight(name='kernel', 
                            shape=(input_embedding_dim,1),
                            initializer='uniform',
                            trainable=True)
        super(AttentionLayerMasking, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        
        # dot product 
        x=K.dot(x, self.kernel)
        # exponentiate
        x=K.exp(x)
        
        # zero out elements that are masked
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, axis=-1)
            x = x * mask
        
        # normalize by sum
        x /= K.sum(x, axis=1, keepdims=True)
        x=K.squeeze(x, axis=2)

        return x

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1])
    
def get_model(embeddings, lstm_size=25, dropout_rate=0.25):

    vocab_size, word_embedding_dim=embeddings.shape 
    word_sequence_input = Input(shape=(None,), dtype='int32')
    
    word_embedding_layer = Embedding(vocab_size,
                                    word_embedding_dim,
                                    weights=[embeddings],
                                    trainable=False)

    
    embedded_sequences = word_embedding_layer(word_sequence_input)
    bilstm_output = Bidirectional(LSTM(lstm_size, 
                                       return_sequences=True, 
                                       activation='tanh', 
                                       dropout=dropout_rate), merge_mode='concat')(embedded_sequences)

    # first let's transform each word embedding into a new vector to use for measuring its importance
    attention_key_dim=300
    attention_input=Dense(attention_key_dim, activation='tanh')(bilstm_output)

    # next we'll pass those transformed inputs through an attention layer, getting back a normalized
    # attention value a_i for each token i; \forall i, 0 <= a_i <= 1; for a document with N words, 
    # \sum_{i=0}^N a_i = 1
    
    attention_output = AttentionLayerMasking(word_embedding_dim, name="attention")(attention_input)
    
    # now let's multiply those attention weights by original inputs to get a weighted average over them
    document_representation = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=1), name='dot')([attention_output,bilstm_output])

    x=Dense(vocab_size, activation="softmax")(document_representation)

    model = Model(inputs=word_sequence_input, outputs=x)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    
    return model

def get_simple_lstm(embeddings, lstm_size=25, dropout_rate=0.1):
    vocab_size, word_embedding_dim=embeddings.shape 
    word_sequence_input = Input(shape=(None,), dtype='int32')
    
    word_embedding_layer = Embedding(vocab_size,
                                    word_embedding_dim,
                                    weights=[embeddings],
                                    trainable=False)

    
    # input - embeddings
    embedded_sequences = word_embedding_layer(word_sequence_input)
    # lstm layer
    lstm_output = LSTM(lstm_size, 
                       return_sequences=False, 
                       activation='tanh', 
                       dropout=dropout_rate)(embedded_sequences)
    # + dense layer
#     dense_output = Dense(128, activation='tanh')(lstm_output)
    # + droupout
#     seq_representation = Dropout(dropout_rate)(dense_output)
    # final output - softmax over all vocabulary
    x=Dense(vocab_size, activation="softmax")(lstm_output)
    
    model = Model(inputs=word_sequence_input, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', 
                  optimizer='adam', 
                  metrics=['sparse_categorical_accuracy', top5_acc, top10_acc, perplexity])
    
    return model


def get_lstm_source(embeddings, lstm_size=25, dropout_rate=0.1, source_n=10, source_dim=50, dense_dim=25):
    
    # word embeddings
    vocab_size, word_embedding_dim=embeddings.shape
    word_embedding_layer = Embedding(vocab_size,
                                    word_embedding_dim,
                                    weights=[embeddings],
                                    trainable=True,
                                    name='word_emb')
    # source embeddings
    source_embedding_layer = Embedding(source_n, 
                                       source_dim, 
                                       input_length=1, 
                                       trainable=True,
                                       name='source_emb')
    
    # inputs
    word_sequence_input = Input(shape=(None,), dtype='int32')
    source_input = Input(shape=(1,), dtype='int32')
    
    # build model
    embedded_sequences = word_embedding_layer(word_sequence_input) # (batch_size x seq_length x embedding_dim)
    embedded_sources = Flatten()(source_embedding_layer(source_input)) # (batch_size x source_dim)
    
    # pass sequences through lstm
    lstm_output = LSTM(lstm_size, 
                       return_sequences=False, 
                       activation='tanh', 
                       dropout=dropout_rate,
                       name='lstm')(embedded_sequences)
    
    # concat with source embeddings
    combined = concatenate([embedded_sources, lstm_output])
    
    # Dense layer over concat -> predict
    combined = Dense(dense_dim, activation="tanh", name='dense')(combined)
    x=Dense(vocab_size, activation="softmax", name='predict')(combined)
    
    # compile model
    model = Model(inputs=[word_sequence_input, source_input], outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', 
                  optimizer='adam', 
                  metrics=['sparse_categorical_accuracy', top5_acc, top10_acc])
    
    return model

lstm_source_model = get_lstm_source(emb, lstm_size=25, source_n=len(artist2id))
print(lstm_source_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_61 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_60 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
source_emb (Embedding)          (None, 1, 50)        1000        input_61[0][0]                   
__________________________________________________________________________________________________
word_emb (Embedding)            (None, None, 100)    5100000     input_60[0][0]                   
__________________________________________________________________________________________________
flatten_6 

In [593]:
batch_size = 256
ntrain = batch_size*1
nval = batch_size*1
lstm_source_model.fit([x_train_id[:ntrain], a_train[:ntrain]], y_train_id[:ntrain], 
            validation_data=([x_val_id[:nval], a_val[:ntrain]], y_val_id[:nval]),
            epochs=10, batch_size=batch_size)

Train on 256 samples, validate on 256 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c4de914a8>

In [579]:
x_source_id

NameError: name 'x_source_id' is not defined

In [234]:
lstm_model = get_simple_lstm(emb, lstm_size=25)
print(lstm_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_14 (InputLayer)        (None, None)              0         
_________________________________________________________________
embedding_14 (Embedding)     (None, None, 100)         5000000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 25)                12600     
_________________________________________________________________
dense_19 (Dense)             (None, 50000)             1300000   
Total params: 6,312,600
Trainable params: 1,312,600
Non-trainable params: 5,000,000
_________________________________________________________________
None


In [235]:
lstm_model.load_weights("lstm_model.hdf5")

In [152]:
modelName="simple_lstm.hdf5"
checkpoint = ModelCheckpoint(modelName, monitor='val_loss', verbose=0, save_best_only=True, mode='min')

batch_size = 1024
ntrain = batch_size*10
nval = batch_size*5
lstm_model.fit(x_train_id[:ntrain], y_train_id[:ntrain], 
            validation_data=(x_val_id[:nval], y_val_id[:nval]),
            epochs=10, batch_size=batch_size,
            callbacks=[checkpoint])

Train on 10240 samples, validate on 5120 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c373074a8>

### Attention Model

In [472]:
import functools
def perplexity(y_true, y_pred):
    """
    The perplexity metric. Why isn't this part of Keras yet?!
    https://stackoverflow.com/questions/41881308/how-to-calculate-perplexity-of-rnn-in-tensorflow
    https://github.com/keras-team/keras/issues/8267
    """
    cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
    perplexity = K.exp(cross_entropy)
    return perplexity

top10_acc = functools.partial(keras.metrics.sparse_top_k_categorical_accuracy, k=10)
top10_acc.__name__ = 'top10_acc'

def simple_attention_model(embeddings):
    vocab_size, word_embedding_dim=embeddings.shape 
    word_sequence_input = Input(shape=(None,), dtype='int32')
    
    word_embedding_layer = Embedding(vocab_size,
                                    word_embedding_dim,
                                    weights=[embeddings],
                                    trainable=False)

    
    # input - embeddings
    embedded_sequences = word_embedding_layer(word_sequence_input)
    
    # reduce embedding dimensionality
    attention_key_dim=25
    attention_input=Dense(attention_key_dim, activation='tanh')(embedded_sequences)
    
    attention_output = AttentionLayerMasking(word_embedding_dim, name="attention")(attention_input)
    
    # now let's multiply those attention weights by original inputs to get a weighted average over them
    document_representation = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=1), 
                                     name='dot')([attention_output,attention_input])
    
    x=Dense(vocab_size, activation="softmax")(document_representation)
    
    model = Model(inputs=word_sequence_input, outputs=x)
    model.compile(loss='sparse_categorical_crossentropy', 
                  optimizer='adam',
                  metrics=['sparse_categorical_accuracy', top10_acc, perplexity])
    
    return model

In [229]:
attn_model = simple_attention_model(emb)
print(attn_model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, None, 100)    5000000     input_13[0][0]                   
__________________________________________________________________________________________________
dense_17 (Dense)                (None, None, 25)     2525        embedding_13[0][0]               
__________________________________________________________________________________________________
attention (AttentionLayerMaskin (None, None)         25          dense_17[0][0]                   
__________________________________________________________________________________________________
dot (Lambd

In [230]:
modelName="simple_attn_lstm.hdf5"
checkpoint = ModelCheckpoint(modelName, monitor='val_perplexity', verbose=0, save_best_only=True, mode='min')

batch_size = 256
ntrain = batch_size*20
nval = batch_size*5
attn_model.fit(x_train_id[:ntrain], y_train_id[:ntrain], 
            validation_data=(x_val_id[:nval], y_val_id[:nval]),
            epochs=10, batch_size=batch_size,
            callbacks=[checkpoint])

Train on 5120 samples, validate on 1280 samples
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

## Predict

In [594]:
# domain_emb = []
# for i in range (1000):
#     domain_emb.append(np.zeros(emb.shape[1]))
    
# emb = np.concatenate((emb, np.array(domain_emb)))

# lstm_model = get_simple_lstm(emb, lstm_size=100)
lstm_model.load_weights("lstm100/lstm100_best_ppx.hdf5")

In [600]:
id2tok = {v:k for k,v in tok2id.items()}
def sample_vocab_topk(probs, k=10):
    
    # the token at index 1 is <UNK>
    # remove it since it will be the most probably token all the time
    probs = np.delete(probs, 1)
    
    idx = np.argsort(probs) # sorts in ascending oder
    probs = probs[idx[-k:]] # look at top k most probable predictions
    probs /= sum(probs) # normalize so probabilities sum up to 1
    
    sample = np.random.choice(idx[-k:], p=probs)
    return sample+1 if sample>0 else 0 # adjust because removed idx 1 above

#     return np.random.choice(idx[-k:], p=probs) # sample from top k predictions

def sample_vocab(probs):
    return np.random.choice(np.arange(len(probs)), p=probs)

def predict_batch(model, seed):
#     n = 100
#     seed = np.array(x_val_id[100:100+n])
    n = len(seed)
    yhat = model.predict_on_batch(seed)
    predictions = [sample_vocab(row) for row in yhat]
    for i in range(n):
        print([id2tok[j] for j in seed[i]], id2tok[predictions[i]])

def generate(model, seed, n):

#     seed = x_train_id[4000]
    # seedStr = '<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> show me where '
    # seed = [tok2id[w] for w in seedStr.split()]
    # seed = [0,0,0,0,0,0,0,0,0]
    l = len(seed)
    # print([id2tok[j] for j in seed])
    s = ' '.join([id2tok[j] for j in seed])
    print(s)
    for i in range(n):
        seed1 = seed[-l:]
#         print(' '.join([id2tok[j] for j in seed1]))
        
        pred = lstm_model.predict(np.array([seed1]))
        
#         next_tok = sample_vocab(pred[0])
        next_tok = sample_vocab_topk(pred[0], 100)
        
        
        tok = id2tok[next_tok]
        if tok == '<PAD>':
            break
        s += ' ' + tok
        seed.append(next_tok)

    print()
    for tok in s.split():
        if tok=='<PAD>':
            continue
        elif tok=='<BR>':
            print()
        else:
            print(tok, end=' ')

seed = "<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> good night"
seed = [tok2id[t] for t in seed.split()]
# generate(lstm_model, list(x_train_id[500]), 200) 
generate(lstm_model, seed, 100) 

<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> good night

good night 
( come from the life of god ) the fucking face 
'cause it 's go , where you wo n't no rap and go 

i 'm gonna change to just 
i take you to my head from the track 
i got my a life in the top 
my girl and do n't stop real 
so i just take it 
from my time to death , you can be where your heart we want ? ) 
we made it 
i seen on you up a nigga 
never should rather 

In [502]:
def sparse_cross_ent(y_true, y_pred):
    n = len(y_true)
    cross_ent = 0
    for i in range(n):
        cross_ent -= np.log(y_pred[i][y_true[i]])
    return cross_ent/n, np.exp(cross_ent/n)
    
        
def perplexity_check(model, test_x, test_y):
    yhat = model.predict_on_batch(seed)
    n = len(yhat)
    
    neg_log_prob = 0
    for i in range(n):
        neg_log_prob -= np.log(yhat[i][test_y[i]])
#         neg_log_prob -= yhat[i][test_y[i]]
    print(neg_log_prob)
    return np.exp(neg_log_prob/n)
    
    
print(perplexity_check(lstm_model, np.array(x_val_id), np.array(y_val_id)))
print(perplexity_check(lstm_model, np.array(x_train_id), np.array(y_train_id)))

# predict_batch(lstm_model, np.array(x_val_id[100:110]))

124.78662014007568
262672.1653450545
122.75528049468994
214384.85130782082


In [520]:
def sparse_cross_ent(y_true, y_pred):
    n = len(y_true)
    cross_ent = 0
    for i in range(n):
        cross_ent -= np.log(y_pred[i][y_true[i]])
    return cross_ent/n, np.exp(cross_ent/n)

y_pred = lstm_model.predict_on_batch(x_val_id[:200])
sparse_cross_ent(y_val_id[:200], y_pred)

(4.941591503880918, 139.99287155951305)

In [522]:
lstm_model.evaluate(x_train_id[:300], y_train_id[:300], verbose=1)



[4.4862969144185385,
 0.24666666686534883,
 0.4633333333333333,
 0.5566666674613953,
 393326.2157340495]

In [539]:
K.eval(K.exp(10.3358))

30816.334

In [541]:
np.exp(4.5188)

91.7254613557728