# Environment Preparation.ipynb

# Importing Packages

In [None]:
from tensorflow.python.client import device_lib

import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import os
import re
import sys
import warnings
from slugify import slugify
import textwrap
import unicodedata
from IPython.display import SVG

# Apply matplotlib's 'fivethirtyeight' style sheet.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Print the list of local devices reported by TensorFlow's device_lib.
print(device_lib.list_local_devices())


# Text wrapper limited to 100 columns. The Configuration cell assigns `wrapper`
# again and stores it under config['wrapper'].
wrapper = textwrap.TextWrapper(width=100)

In [None]:
# Root working directory of the notebook; the data and outputs directories are
# built beneath it in the Configuration cell.
current_dir = os.path.abspath('/tmp/var-attention/')

# exist_ok=True keeps the cell safe to re-run and removes the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(current_dir, exist_ok=True)

print('current_dir:', current_dir)

In [1]:
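# Load the data from Kaggle
# TODO: this cell is a placeholder - the download step is not implemented here.
# Later cells expect articles1.csv, articles2.csv, articles3.csv and
# numberbatch-en-17.06.txt to be present in /tmp/var-attention/data/.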

In [None]:
# List the contents of the data directory (`!` runs the command in a shell).

!ls /tmp/var-attention/data

# Configuration

In [None]:
# NOTE(review): `config` must already exist as a mutable mapping with a 'model'
# entry before this cell runs; it is not created in this cell - confirm where it
# is defined so the notebook survives Restart & Run All.

# Wrapper for line-wrapping text to 100 columns (stored in config below).
wrapper = textwrap.TextWrapper(width=100)

# Directory layout. Paths are built by plain string concatenation and keep their
# trailing '/' because the lines below append file names to them directly.
data_dir = current_dir + '/data/'
outputs_dir = current_dir + '/outputs/'
arch_dir = outputs_dir + slugify(config['model']) + '/'
outputs_data_dir = outputs_dir + 'data/'

# Per-architecture locations. These are only recorded here, not created.
logs_dir = arch_dir + 'summary/'
log_str_dir = arch_dir + 'outcome/'
model_checkpoint_dir = arch_dir + 'checkpoints/var-seq2seq-with-atten-'
bleu_path = arch_dir + 'bleu/det-seq2seq-var-attn'
w2v_dir = outputs_data_dir
w2v_path = w2v_dir + 'w2v_model_news.pkl'


# Create the directories this cell is responsible for. exist_ok=True makes the
# cell idempotent and avoids the race between os.path.exists() and os.makedirs().
for directory in (data_dir, outputs_dir, arch_dir, w2v_dir):
    os.makedirs(directory, exist_ok=True)

print('Data dir:', data_dir)
print('Outputs dir:', outputs_dir)
print('Arch dir:', arch_dir)

# Publish the resolved paths (and the wrapper) through the shared config mapping.
config['data_dir'] = data_dir
config['logs_dir'] = logs_dir
config['log_str_dir'] = log_str_dir
config['model_checkpoint_dir'] = model_checkpoint_dir
config['bleu_path'] = bleu_path
config['wrapper'] = wrapper
config['w2v_dir'] = w2v_dir

In [None]:
# Names of the `data` columns used as model input and as target output.
input_type, output_type = 'content', 'title'

# Characters passed as `filters` to the tokenizer; both sides share one set.
encoder_filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'
decoder_filters = encoder_filters

# Dataset Preparation

In [None]:
from src.preparation import create_news_data

print('[INFO] Importing the data')

# The three article CSV files are looked up inside config['data_dir'].
article_files = ('articles1.csv', 'articles2.csv', 'articles3.csv')
data_sources = [os.path.join(config['data_dir'], file_name) for file_name in article_files]

# Sample count and preprocessing switch both come from the shared config.
data = create_news_data(
    data_sources,
    num_samples=config['num_samples'],
    preprocessing=config['preprocessing'],
)

# Word2vec

In [4]:
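# NOTE: everything in this cell is commented out and does not run. It is kept as a
# reference for training a Word2Vec model with gensim; the next cell reads
# pre-trained vectors from numberbatch-en-17.06.txt instead.
#
# from src.preparation import create_news_data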


# def load_data(num_samples=None, preprocessing=True):
#     data_sources = [
#         os.path.join(config['data_dir'], 'articles1.csv'),
#         os.path.join(config['data_dir'], 'articles2.csv'),
#         os.path.join(config['data_dir'], 'articles3.csv'),
#     ]

#     data = create_news_data(data_sources, num_samples=num_samples, preprocessing=preprocessing)

#     return data


# def create_w2v(sentences):
#     np.random.shuffle(sentences)
#     sentences = [WhitespaceTokenizer().tokenize(s) for s in sentences]
#     w2v_model = gensim.models.Word2Vec(
#         sentences,
#         size=300,
#         min_count=1,
#         iter=50)
#     w2v_model.save(config['w2v_dir'] + 'w2v_model_news.pkl')
    

# def init_w2v():
#     data = load_data(num_samples=None, preprocessing=True)
#     data = data['title'] + ' ' + data['content']
#     create_w2v(data)
#     print('Word2Vec created successfully.')

In [None]:
def load_word_vectors(path, encoding='utf-8'):
    """Read a whitespace-separated text embedding file into a dict.

    Each data line is expected to look like ``<word> <v1> <v2> ...``.

    Args:
        path: Location of the text embedding file.
        encoding: Text encoding used to open the file.

    Returns:
        dict mapping each word to a float32 numpy array of its values.

    Notes:
        * Blank lines and lines with a word but no values are skipped; the
          former used to raise IndexError, the latter produced an empty vector.
        * A first line made of exactly two integers is treated as a
          word2vec-style ``<count> <dim>`` header and skipped, so it is not
          stored as a one-element "word vector".
    """
    vectors = {}
    with open(path, encoding=encoding) as infile:
        for line_no, line in enumerate(infile):
            values = line.split()
            if len(values) < 2:
                continue
            if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


print('[INFO] Embeddings vector and matrix')

word2vec = load_word_vectors(os.path.join(config['data_dir'], 'numberbatch-en-17.06.txt'))

print('Found %s word vectors.' % len(word2vec))

# Tokenization

In [None]:
from src.preparation import tokenize_sequence


print('[INFO] Tokenizing input and output sequences')

# Raw sentences come straight from the configured input / output columns.
input_sentences = data[input_type].values
output_sentences = data[output_type].values

print('Inputs:', len(input_sentences))
print('Outputs:', len(output_sentences))

# Source side: limits are taken from the encoder_* config entries.
x, word2idx_inputs, x_sen = tokenize_sequence(
    sentences=input_sentences,
    filters=encoder_filters,
    max_num_words=config['encoder_num_tokens'],
    max_vocab_size=config['encoder_vocab'],
)

# Target side: same call, using the decoder_* config entries.
y, word2idx_outputs, y_sen = tokenize_sequence(
    sentences=output_sentences,
    filters=decoder_filters,
    max_num_words=config['decoder_num_tokens'],
    max_vocab_size=config['decoder_vocab'],
)

# Splitter

In [None]:
from src.preparation import create_data_split


print('[INFO] Split data into train-valid-test sets')

# NOTE(review): how valid_size and test_size combine is decided inside
# create_data_split - confirm the resulting proportions are the intended ones.
train_data, valid_data, test_data = create_data_split(
    x=[x, x_sen], y=[y, y_sen], valid_size=0.3, test_size=0.5, verbose=True)

# Every split carries four aligned parts: x, y, and their sentence counterparts.
x_train, y_train, x_sen_train, y_sen_train = train_data
x_valid, y_valid, x_sen_valid, y_sen_valid = valid_data
x_test, y_test, x_sen_test, y_sen_test = test_data

# Embedding Matrix

In [None]:
from src.preparation import create_embedding_matrix


print('[INFO] Embeddings vector and matrix')


def _build_embedding_matrix(word_index):
    """Build the embedding matrix for one vocabulary from the loaded vectors."""
    # NOTE(review): the in-memory `word2vec` dict is passed through a parameter
    # named `w2v_path` - confirm create_embedding_matrix expects a mapping.
    return create_embedding_matrix(
        word_index=word_index,
        embedding_dim=config['embedding_size'],
        w2v_path=word2vec)


encoder_embeddings_matrix = _build_embedding_matrix(word2idx_inputs)
decoder_embeddings_matrix = _build_embedding_matrix(word2idx_outputs)


print('encoder_embeddings_matrix:', encoder_embeddings_matrix.shape)
print('decoder_embeddings_matrix:', decoder_embeddings_matrix.shape)

# Replace the configured vocab sizes with the sizes of the word-index dictionaries.
config['encoder_vocab'] = len(word2idx_inputs)
config['decoder_vocab'] = len(word2idx_outputs)

# Model

In [None]:
import tensorflow as tf
from src.ved import VarSeq2SeqVarAttnModel


# Clear the default TensorFlow graph before the model is constructed.
tf.reset_default_graph()
# Session options: allow_growth lets GPU memory be allocated on demand.
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
# NOTE(review): `session` is not passed to the model anywhere in this cell -
# confirm whether the model relies on it or opens its own session.
session = tf.Session(config=tf_config)


# Build the model from the shared config, both embedding matrices and both
# word-index dictionaries.
model = VarSeq2SeqVarAttnModel(
    config=config,
    encoder_embeddings_matrix=encoder_embeddings_matrix,
    decoder_embeddings_matrix=decoder_embeddings_matrix,
    encoder_word_index=word2idx_inputs,
    decoder_word_index=word2idx_outputs)

# Train on the training split; the validation arrays and validation target
# sentences are passed alongside.
model.train(x_train, y_train, x_valid, y_valid, y_sen_valid)

# Prediction and Evaluation

In [None]:
# Choose the checkpoint to restore. A non-zero config['load_checkpoint'] selects
# an explicit checkpoint number; otherwise fall back to the checkpoint state
# found in the relative 'models' directory.
if config['load_checkpoint'] != 0:
    checkpoint = config['model_checkpoint_dir'] + str(config['load_checkpoint']) + '.ckpt'
else:
    fallback_dir = os.path.dirname('models/checkpoint')
    checkpoint_state = tf.train.get_checkpoint_state(fallback_dir)
    # get_checkpoint_state returns None when no checkpoint state is available.
    # Fail with a clear message instead of an AttributeError on None.
    if checkpoint_state is None:
        raise FileNotFoundError(
            "No checkpoint state found in %r; train the model first or set "
            "config['load_checkpoint'] to a saved checkpoint number."
            % os.path.abspath(fallback_dir))
    checkpoint = checkpoint_state.model_checkpoint_path

print('checkpoint:', checkpoint)

# Run prediction on the test split using the selected checkpoint.
preds = model.predict(checkpoint, x_test, y_test, y_sen_test)

In [None]:
# How many test examples to display.
count = 5

# Pass the first `count` items of preds, y_test, x_sen_test and y_sen_test,
# in that order, to the model's display helper.
model.show_output_sentences(
    *(sequence[:count] for sequence in (preds, y_test, x_sen_test, y_sen_test)))

# Kept as the cell's final expression so its return value is still displayed.
model.get_diversity_metrics(checkpoint, x_test, y_test)