## Imports + Settings

In [1]:
import sys

import os
from tempfile import TemporaryDirectory
import logging
import papermill as pm
import scrapbook as sb
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.datasets.download_utils import maybe_download
from recommenders.datasets.mind import (download_mind, 
                                     extract_mind, 
                                     read_clickhistory, 
                                     get_train_input, 
                                     get_valid_input, 
                                     get_user_history,
                                     get_words_and_entities,
                                     generate_embeddings) 
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.models.deeprec.models.dkn import DKN
from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator

# Record the interpreter and TensorFlow versions this run was executed with.
print(
    f"System version: {sys.version}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

System version: 3.7.0 (default, Oct  9 2018, 10:31:47) 
[GCC 7.3.0]
Tensorflow version: 1.15.5


In [2]:
# Scratch directory for this run: every path built from `tmpdir.name` below
# lives inside it. A TemporaryDirectory is deleted when `tmpdir.cleanup()` is
# called (or when the object is finalized), so nothing stored here outlives
# the notebook session.
tmpdir = TemporaryDirectory()

In [3]:
# MIND dataset variant to work with
MIND_SIZE = "small"

# DKN settings
epochs = 10
history_size = 50
batch_size = 100

# Every generated artifact is kept in one folder inside the temporary directory
data_path = os.path.join(tmpdir.name, "mind-dkn")
(
    train_file,
    valid_file,
    user_history_file,
    infer_embedding_file,
) = (
    os.path.join(data_path, artifact_name)
    for artifact_name in (
        "train_mind.txt",
        "valid_mind.txt",
        "user_history.txt",
        "infer_embedding.txt",
    )
)

## Data import

In [4]:
# Fetch the train/validation archives for the selected MIND size into data_path ...
train_zip, valid_zip = download_mind(
    size=MIND_SIZE,
    dest_path=data_path,
)
# ... then unpack them; the two returned paths are the per-split folders used below.
train_path, valid_path = extract_mind(train_zip, valid_zip)

100%|██████████| 51.7k/51.7k [00:48<00:00, 1.06kKB/s]
100%|██████████| 30.2k/30.2k [00:20<00:00, 1.50kKB/s]


In [6]:
# Read the behaviors log of each split; each call yields a (session, history) pair.
train_session, train_history = read_clickhistory(train_path, "behaviors.tsv")
valid_session, valid_history = read_clickhistory(valid_path, "behaviors.tsv")
# Hand the sessions and histories to the MIND helpers together with their target
# file paths. Presumably each helper writes its result to the given path --
# TODO confirm against recommenders.datasets.mind.
get_train_input(train_session, train_file)
get_valid_input(valid_session, valid_file)
get_user_history(train_history, valid_history, user_history_file)


<class 'list'>


In [7]:
# Locate the news.tsv file of each split, then collect the words and entities
# found across both files.
train_news, valid_news = (
    os.path.join(split_dir, "news.tsv") for split_dir in (train_path, valid_path)
)
news_words, news_entities = get_words_and_entities(train_news, valid_news)

In [9]:
# Settings forwarded to generate_embeddings, named here so they are easy to
# find and tune.
MAX_SENTENCE = 10
WORD_EMBEDDING_DIM = 100

# Entity embedding vectors shipped with each extracted split
train_entities, valid_entities = (
    os.path.join(split_dir, "entity_embedding.vec")
    for split_dir in (train_path, valid_path)
)

# Build the news feature file plus the word and entity embedding files,
# passing data_path as the first argument.
(
    news_feature_file,
    word_embeddings_file,
    entity_embeddings_file,
) = generate_embeddings(
    data_path,
    news_words,
    news_entities,
    train_entities,
    valid_entities,
    max_sentence=MAX_SENTENCE,
    word_embedding_dim=WORD_EMBEDDING_DIM,
)

100%|██████████| 842k/842k [04:15<00:00, 3.30kKB/s] 
