# Model prelim notebook

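Use this notebook to walk through a model end to end: preprocessing the raw Stack Exchange data, training the model, exporting it for serving, and making predictions.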

## Preprocess raw data and save to local

Download from:
https://archive.org/details/stackexchange

- The dumps come as `*.tar.7z` archives.
- On Linux, you can download 7-Zip from its downloads page.
- You can unzip using bzip2.
- You can also run the 7z binary directly, for example: `p7zip_16.02/bin/7z e philosophy.stackexchange.com.7z`

In [None]:
cd ..

In [None]:
import os
import warnings
from random import shuffle
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

from backend.parse_stack_exchange import parse_xml_doc, preprocess_a_text, get_texts, write_data

In [None]:
# Every XML file found in a Stack Exchange dump. The value is what gets passed
# to parse_xml_doc as its second argument (presumably the XML attribute that
# holds the free text -- confirm in backend.parse_stack_exchange); None marks
# files for which no such field is configured.
xmls = dict.fromkeys((
    'Badges.xml',
    'Comments.xml',
    'PostHistory.xml',
    'PostLinks.xml',
    'Posts.xml',
    'Tags.xml',
    'Users.xml',
    'Votes.xml',
))
xmls['Comments.xml'] = 'Text'
xmls['Posts.xml'] = 'Body'

# Root folder of the unpacked dumps and the sites available under it.
stack_dir = os.path.join('data', 'stack_exchange')
stack_list = [
    'monero.stackexchange.com',
    'movies.stackexchange.com',
    'philosophy.stackexchange.com',
    'politics.stackexchange.com',
]

# Which XML file of a dump to parse.
datatype = 'Comments.xml'

# Same values as `xmls`, in the same order, as a plain list.
childs = [None, 'Text', None, None, 'Body', None, None, None]

In [None]:
ls data/

In [None]:
# Parse one file of one site's dump. Index 2 of stack_list is
# 'philosophy.stackexchange.com'.
site = stack_list[2]
text_field = xmls[datatype]
path = os.path.join(stack_dir, site, datatype)
docs = parse_xml_doc(path, text_field)

In [None]:
# Run every parsed document through the project's text preprocessor.
preprocessed_docs = [preprocess_a_text(doc) for doc in docs]

# Split the preprocessed texts into the two classes via get_texts.
questions = get_texts(preprocessed_docs, get_questions=True)
non_questions = get_texts(preprocessed_docs, get_questions=False)

# Shuffle in place so the truncation below keeps a random subset.
# NOTE(review): the shuffle is unseeded, so the selected subset differs
# between runs.
shuffle(questions)
shuffle(non_questions)

# equalize list lengths: truncate BOTH lists to the shorter one. Truncating
# only non_questions leaves the classes unbalanced whenever there are more
# questions than non-questions.
balanced_size = min(len(questions), len(non_questions))
questions = questions[:balanced_size]
non_questions = non_questions[:balanced_size]

In [None]:
# Persist both text lists through the project's write_data helper. The output
# location and format are decided inside that helper and are not visible here.
write_data(questions, non_questions)

## Model - Attention 1

If you take a vector and normalize it so that it sums to 1 (probability-distribution style), you can then compute a measure on it: entropy, $H(p) = -\sum_i p_i \log p_i$.

In [None]:
cd ..

In [1]:
import io
import os
import numpy as np
import tensorflow as tf
import pickle

from models.attention_1.model import convert_prediction_to_sentence
from backend.preprocess import create_features, get_raw_vocab, read_files, process_glove
from backend.data_gather import gather_stack_exchange_from_file, make_model_dir

from models.attention_1.model import model_fn, input_fn, pred_input_fn, serving_input_receiver_fn, convert_prediction_to_sentence

In [None]:
# Text files holding the two classes, read by gather_stack_exchange_from_file.
file_names = ["./data/QUESTIONS.txt", "./data/NON_QUESTIONS.txt"]

# Tunable constants used by the cells below.
BATCH_SIZE = 5
GLOVE = './data/glove.6B.50d.txt'
MAX_SEQ_LEN = 25

# Load both classes of sentences from disk.
questions_file, non_questions_file = file_names
questions, non_questions = gather_stack_exchange_from_file(questions_file,
                                                           non_questions_file)

# Load GloVe and derive the reverse (index -> word) lookup from word2index.
vocab, vectors, word2index = process_glove(glove_file=GLOVE)
index2word = dict((idx, word) for word, idx in word2index.items())

In [None]:
# Set to True to rebuild the features even when cached pickles exist.
OVERWRITE = False

# The cache is only used when both pickles exist and no rebuild was requested.
cache_is_usable = (os.path.exists('features.pkl')
                   and os.path.exists('labels.pkl')
                   and not OVERWRITE)

if cache_is_usable:
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook.
    with open('features.pkl', 'rb') as fe, open('labels.pkl', 'rb') as la:
        features = pickle.load(fe)
        labels = pickle.load(la)
else:
    features, labels = create_features(
        questions,
        non_questions,
        word2index,
        max_seq_length=MAX_SEQ_LEN,
        size=5,
    )
    with open('features.pkl', 'wb+') as fe, open('labels.pkl', 'wb+') as la:
        pickle.dump(features, fe)
        pickle.dump(labels, la)

## Make toy data

In [None]:
# Set to True to re-snapshot the toy data even when cached pickles exist.
OVERWRITEsm = False

# One location per cache file, shared by the existence check, the write and
# the read, so that a cache written by this cell is the one found on the next
# run. (If an older cache lives under notebooks/, move it here or point these
# two constants at it.)
SMALL_FEATURES_PATH = 'small_features.pkl'
SMALL_LABELS_PATH = 'small_labels.pkl'

if not os.path.exists(SMALL_FEATURES_PATH) or not os.path.exists(SMALL_LABELS_PATH) or OVERWRITEsm:
    # Snapshot the current features/labels. They are stored with their
    # original word2index ids; the re-indexing further down is applied after
    # loading, identically on both branches.
    toy_features, toy_labels = features, labels
    with open(SMALL_FEATURES_PATH, 'wb') as fe, open(SMALL_LABELS_PATH, 'wb') as la:
        pickle.dump(toy_features, fe)
        pickle.dump(toy_labels, la)
else:
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook.
    with open(SMALL_FEATURES_PATH, 'rb') as fe, open(SMALL_LABELS_PATH, 'rb') as la:
        toy_features = pickle.load(fe)
        toy_labels = pickle.load(la)

# Every id that occurs in the toy inputs or targets, plus ids 0, 1, 2 and the
# <UNK> id (0/1/2 are presumably reserved special tokens -- confirm against
# backend.preprocess). The iteration order of this set decides the new ids.
idx_set = list(set([x for a in toy_features[0] for x in a]
                   + [0, 1, 2, word2index['<UNK>']]
                   + [z for b in toy_labels[0] for z in b]))

# Restrict the vocabulary to the ids above. index2word is the inverse of
# word2index, so the membership test is a constant-time dict lookup.
smallword2index = dict()
smallvocab = list()
for idx in idx_set:
    if idx in index2word:
        smallword2index[index2word[idx]] = idx
        smallvocab.append(index2word[idx])

s2 = {idx: word for word, idx in smallword2index.items()}

# Dense 0..N-1 ids for the reduced vocabulary, and the matching reverse lookup.
newidx2word = {idx: word for idx, word in enumerate(list(s2.values()))}
newword2idx = {word: idx for idx, word in newidx2word.items()}

# Re-encode every sequence from original ids to the dense toy ids, so the data
# agrees with newword2idx / newidx2word and with len(smallvocab).
newlist = [[newword2idx[index2word[x]] for x in f] for f in toy_features[0]]
newlist2 = [[newword2idx[index2word[x]] for x in g] for g in toy_labels[0]]

# Only the first element of each tuple holds token ids; the remaining elements
# are carried over unchanged.
small_features = (newlist, toy_features[1], toy_features[2], toy_features[3])
small_labels = (newlist2, toy_labels[1], toy_labels[2])

In [None]:
# Hyper-parameters and lookup tables given to the Estimator as `params`.
# The toy (re-indexed) vocabulary is wired in here; swap in len(vocab),
# word2index, index2word and vocab to run against the full vocabulary.
# NOTE(review): 'embedding_vectors' is still the full matrix returned by
# process_glove while the ids come from the toy vocabulary -- confirm that
# model_fn handles this.
graph_params = dict(
    num_classes=4,
    vocab_size=len(smallvocab),
    embed_dim=50,
    num_units=50,
    input_max_length=MAX_SEQ_LEN,
    output_max_length=MAX_SEQ_LEN,
    forget_bias=0.6,
    batch_size=BATCH_SIZE,
    glove_file=GLOVE,
    embedding_vectors=vectors,
    word2index=newword2idx,
    index2word=newidx2word,
    vocab=smallvocab,
    decode_mode=True,
)

# Options forwarded to input_fn when training.
data_params = dict(
    shuffle=1,
    glove_file=GLOVE,
    repeat=-1,
    batch_size=BATCH_SIZE,
)

# Directory for checkpoints and summaries, created by the project helper.
model_dir = make_model_dir(
    name='attention1-1_trainable_embeds_log_finalstate',
    overwrite=True,
)

# Keyword arguments for tf.estimator.RunConfig.
config_dict = dict(
    model_dir=model_dir,
    tf_random_seed=42,
    save_summary_steps=1000,
    save_checkpoints_steps=200,
    keep_checkpoint_max=5,
    keep_checkpoint_every_n_hours=10000,
    log_step_count_steps=2000,
    train_distribute=None,
)
config = tf.estimator.RunConfig(**config_dict)

In [None]:
def train_input_fn():
    """Zero-argument adapter that feeds the toy data to the project's input_fn."""
    return input_fn(small_features, small_labels, data_params)


# Build the Estimator around model_fn and train it on the toy data.
classifier = tf.estimator.Estimator(model_fn=model_fn,
                                    params=graph_params,
                                    config=config)
classifier.train(input_fn=train_input_fn, steps=100000);

### Export the model for serving

In [None]:
def serving_input_receiver_fn1():
    """Describe the inputs the exported model accepts at serving time.

    Two int32 placeholders are created: 'encoder_inputs' with shape
    [None, MAX_SEQ_LEN] and 'encoder_input_lengths' with shape [None].
    The same tensors are used both as the receiver tensors and as the
    features handed to the model.

    Returns:
        A tf.estimator.export.ServingInputReceiver built from those tensors.
    """
    receiver_tensors = {
        'encoder_inputs': tf.placeholder(tf.int32, [None, MAX_SEQ_LEN]),
        'encoder_input_lengths': tf.placeholder(tf.int32, [None]),
    }
    # Separate dict object holding the very same placeholder tensors.
    model_features = dict(receiver_tensors)
    return tf.estimator.export.ServingInputReceiver(model_features,
                                                    receiver_tensors)

In [None]:
# Export the trained model as a SavedModel under this base directory, using
# the serving signature defined by serving_input_receiver_fn1.
export_base = 'TEST_EXPORT'
export_dir = classifier.export_savedmodel(
    export_base,
    serving_input_receiver_fn=serving_input_receiver_fn1,
)

## Reload classifier

In [None]:
# Rebuild the Estimator against the existing model directory. model_dir is
# passed explicitly and is also set inside `config` (config_dict['model_dir']),
# so both refer to the same directory.
classifier = tf.estimator.Estimator(model_fn=model_fn, params=graph_params, model_dir=model_dir, config=config)

## Make predictions

In [None]:
def to_sent(ind, index2word):
    """Turn an array of token ids into a space-separated sentence.

    Args:
        ind: array-like exposing ``.tolist()`` (e.g. a numpy array) of ids.
        index2word: mapping from token id to word.

    Returns:
        The looked-up words joined by single spaces.

    Raises:
        KeyError: if an id is missing from ``index2word``.
    """
    words = (index2word[token_id] for token_id in ind.tolist())
    return " ".join(words)

In [None]:
# Per the TF Estimator docs, predict() yields predictions lazily, so `x` can
# only be iterated once. checkpoint_path=None means the latest checkpoint in
# model_dir is used.
x = classifier.predict(input_fn=lambda: pred_input_fn(small_features),
                       checkpoint_path=None)


In [None]:
# Decode every prediction yielded by `x` into a sentence using the toy vocabulary.
ha = [convert_prediction_to_sentence(i, newidx2word) for i in x]

In [None]:
ha

In [None]:
# Show each toy input sequence as words, followed by one blank line.
for u in small_features[0]:
    sentence = " ".join(newidx2word[token_id] for token_id in u)
    print(sentence, end="\n\n")

In [None]:
# Show each toy target sequence as words, followed by one blank line.
for u in small_labels[0]:
    sentence = " ".join(newidx2word[token_id] for token_id in u)
    print(sentence, end="\n\n")

#### Sandbox

In [2]:
# Scratch check: build a truncated-normal initializer (positional arguments
# 0.0 and .001, float32 dtype) and display its repr as the cell output.
tf.initializers.truncated_normal(0.0, .001, dtype=tf.float32)

<tensorflow.python.ops.init_ops.TruncatedNormal at 0x1a940a88080>

In [None]:
# Scratch lookup of the dynamic_decode API. Calling it with no arguments
# always raises TypeError because `decoder` is a required argument, so show
# its documentation instead of invoking it.
help(tf.contrib.seq2seq.dynamic_decode)