# Load the predictor model
----
(C) Maxim Gansert, Mindscan

In [None]:
import sys
# Put '../src' at the front of the module search path so that the project package imported
# further down (de.mindscan.fluentgenesis...) is looked up there first.
sys.path.insert(0, '../src')

In [None]:
import os
import random

import pandas as pd
import numpy as np
import tensorflow as tf

from de.mindscan.fluentgenesis.bpe.bpe_model import BPEModel

##  Load the Keras model

A checkpoint path is provided, and the whole model is instantiated from it.

In [None]:

# Path of the saved model, relative to the directory the notebook runs in.
checkpoint = '../data/checkpoints/20200516_1750/predict_m'

# Restore the model from the checkpoint and print its summary.
m_model = tf.keras.models.load_model(checkpoint)
m_model.summary()


In [None]:
# Create the BPE model object and load its hyperparameters.
# NOTE(review): the two constructor arguments look like a model name and the directory that
# holds the model files -- confirm against BPEModel.
bpemodel = BPEModel("16K-full", "../src/de/mindscan/fluentgenesis/bpe/")
bpemodel.load_hparams()

# The dataset directory is the model's data source path plus a fixed suffix.
dataset_directory = bpemodel.get_data_source_path() + "_ml1_bl1_to_bl24"

# Vocabulary as returned by the BPE model, plus the reversed (value -> key) mapping.
# NOTE(review): the reversed mapping is what the prediction cells index with label/token
# ids, so load_tokens() presumably returns token -> id -- confirm.
model_vocabulary = bpemodel.load_tokens()
model_vocabulary_inv = {v: k for k, v in model_vocabulary.items()}

# Add the padding symbol to the model vocabulary.
# NOTE(review): the entry is stored under the key PAD (0) in model_vocabulary only;
# model_vocabulary_inv was built before this line and does not receive it.
# UNK has the same value as PAD and is not used in the visible cells.
PAD = 0
UNK = 0
model_vocabulary[PAD] = '<PAD>'

print (model_vocabulary[PAD])
print (len(model_vocabulary))

# Number of entries in model_vocabulary after the padding entry has been set.
MODEL_VOCABULARY_LENGTH = len(model_vocabulary)

## Load the data

In [None]:
# Number of consecutive samples that are fed to the model in the prediction cells.
LENGTH_TO_PREDICT = 100
# Number of leading rows of the data file that are kept.
TRAINING_DATA_SIZE = 60000


# Where to get the training data.
m_training_data_fullFilename = os.path.join(dataset_directory, 'training_data.jsonl')

# The file is parsed as JSON lines (lines=True: one JSON record per line).
m_dataframe = pd.read_json(m_training_data_fullFilename, lines=True)

# Only use the first TRAINING_DATA_SIZE rows for training and testing (and exploring) the
# model, to see whether it can converge.
# np.stack joins the per-row 'encoded_body' values into one array along axis 0
# (assumes every encoded body has the same length -- TODO confirm).
m_data = np.stack(m_dataframe['encoded_body'][:TRAINING_DATA_SIZE],axis=0)
m_labels = m_dataframe['encoded_class_label'][:TRAINING_DATA_SIZE]

## Predict something


In [None]:
# Choose a random window of LENGTH_TO_PREDICT consecutive samples; the upper bound keeps
# the start index at most TRAINING_DATA_SIZE - LENGTH_TO_PREDICT.
last_valid_start = TRAINING_DATA_SIZE - LENGTH_TO_PREDICT
START = random.randint(0, last_valid_start)
print("Start: " + str(START))

# Cut the window out of the data; this is the input handed to the model.
window_end = START + LENGTH_TO_PREDICT
topredict = np.stack(m_data[START:window_end])


In [None]:
# Number of best-scoring classes reported per sample (passed to top_k below).
K = 5

# Run the model on the selected window; the result is indexed per sample (probs[i]) below.
probs = m_model.predict(topredict)

def top_k(p, k):
    """Return the indexes of the ``k`` highest scores in ``p``, best first.

    The previous implementation repeatedly took ``argmax`` and overwrote the
    winner with ``0.0``. As soon as all remaining scores were ``<= 0`` (for
    example probabilities that are exactly zero, or negative logits) the
    overwritten entry won again and the same index was returned several times.
    Sorting once avoids that: every index appears at most once.

    Args:
        p: One-dimensional sequence or array of scores. It is not modified.
        k: Number of indexes wanted. Values below zero are treated as zero.

    Returns:
        A list of at most ``k`` numpy integer indexes, ordered from the highest
        score to the lowest. Equal scores keep their original order (lower
        index first). If ``k`` exceeds ``len(p)`` only ``len(p)`` indexes are
        returned.
    """
    scores = np.asarray(p, dtype=float)
    # A stable sort on the negated scores yields descending order while
    # resolving ties in favour of the lower index, like argmax does.
    order = np.argsort(-scores, kind="stable")
    return list(order[:max(k, 0)])


def map_token(index, vocab):
    """Look up ``index`` in ``vocab``; return None when the key is missing."""
    return vocab[index] if index in vocab else None

def tokens(indexes, vocab):
    """Translate ``indexes`` through ``vocab``, dropping every index that maps to None.

    The lookup itself is delegated to ``map_token``; the order of the
    remaining entries follows the order of ``indexes``.
    """
    looked_up = (map_token(entry, vocab) for entry in indexes)
    return [token for token in looked_up if token is not None]


# For every sample of the selected window print the true label, the K best predicted
# tokens and the decoded input tokens.
for i in range(LENGTH_TO_PREDICT):
    row = START + i
    truth = model_vocabulary_inv[m_labels[row]]
    best_indexes = top_k(probs[i], K)
    topk = str(tokens(best_indexes, model_vocabulary_inv))
    source = str(tokens(m_data[row], model_vocabulary_inv))
    print("truth: " + truth + " predicted: " + topk)
    print("{ " + source + " }\n")
    