# DKN: Deep Knowledge-Aware Network for News Recommendation

## Global settings and imports

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported before each cell runs (no kernel restart needed during development).
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import logging
from pathlib import Path
import zipfile
from time import time
import polars as pl

import tensorflow as tf
# Raise the threshold of TensorFlow's Python logger to keep cell output short.
tf.get_logger().setLevel('ERROR') # only show error messages

# Configure logging and expose a notebook-level logger, both at INFO level.
NOTEBOOK_LOG_LEVEL = logging.INFO
logging.basicConfig(level=NOTEBOOK_LOG_LEVEL)
LOG = logging.getLogger(__name__)
LOG.setLevel(NOTEBOOK_LOG_LEVEL)

from recommenders.datasets.download_utils import maybe_download
from recommenders.models.deeprec.deeprec_utils import prepare_hparams
from recommenders.models.deeprec.models.dkn import DKN
from recommenders.models.deeprec.io.dkn_iterator import DKNTextIterator

from group_33.util import train_test_split
from group_33.dkn import transform_behaviors, transform_history, tokenize_articles, create_feature_file, transform_behaviors_test, calculate_rankings

# Report the interpreter and TensorFlow versions used for this run.
print(
    f"System version: {sys.version}",
    f"Tensorflow version: {tf.__version__}",
    sep="\n",
)

2024-07-08 20:16:06.764966: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 20:16:06.765041: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 20:16:06.767604: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 20:16:06.782166: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


System version: 3.11.8 | packaged by conda-forge | (main, Feb 16 2024, 20:53:32) [GCC 12.3.0]
Tensorflow version: 2.15.0


In [3]:
# DKN parameters (forwarded to the hyper-parameter setup further down).
epochs, history_size, batch_size = 10, 50, 1000

# Dataset variant to work on; one of: demo, small, large.
DATASET_NAME = "small"
# Root directory of the group's shared workspace.
GROUP_PATH = Path.home().joinpath("shared/194.035-2024S/groups/Gruppe_33/Group_33")

# Scratch locations: one tree per dataset variant plus a separate test folder.
# (A relative alternative would be Path("..", "tmp", "dkn").)
tmp_path = GROUP_PATH.joinpath("tmp", "dkn")
tmp_data_path = tmp_path.joinpath(DATASET_NAME)
tmp_test_path = tmp_path.joinpath("test")

# Create all working directories up front; existing ones are left untouched.
for _working_dir in (
    tmp_data_path / "validation",
    tmp_data_path / "train",
    tmp_data_path / "evaluation",
    tmp_test_path,
):
    _working_dir.mkdir(exist_ok=True, parents=True)

# Input data of the selected dataset variant.
data_path = GROUP_PATH.joinpath("data", DATASET_NAME)
articles_file = data_path.joinpath("articles.parquet")

# Behaviour files for training, validation and evaluation.
# NOTE(review): the file names mix the spellings "behaviours.txt" and
# "behaviors.txt"; they are kept verbatim so existing cached files stay valid.
train_file = tmp_data_path.joinpath("train", "behaviours.txt")
valid_file = tmp_data_path.joinpath("validation", "behaviours.txt")
evaluation_file = tmp_data_path.joinpath("evaluation", "behaviors.txt")

# Derived artefacts stored next to the behaviour files.
user_history_file = tmp_data_path.joinpath("user_history.txt")
articles_tokenized_file = tmp_data_path.joinpath("articles_tokenized.parquet")
word_embeddings_file = tmp_data_path.joinpath("word_embeddings.npy")
entity_embeddings_file = tmp_data_path.joinpath("entity_embeddings.npy")
context_embeddings_file = tmp_data_path.joinpath("context_embeddings.npy")
news_feature_file = tmp_data_path.joinpath("news_feature.txt")
infer_embedding_file = tmp_data_path.joinpath("infer_embedding.txt")

# Test set: raw inputs and their transformed counterparts.
test_raw_file = data_path.joinpath("..", "ebnerd_testset", "test", "behaviors.parquet")
test_file = tmp_test_path.joinpath("behavior.txt")
test_articles_file = GROUP_PATH.joinpath("data", "test", "articles.parquet")
test_articles_tokenized_file = tmp_test_path.joinpath("articles_tokenized.parquet")

# Outputs of the prediction step.
indexed_behaviors_file = tmp_data_path.joinpath("indexed_behaviors.parquet")
scores_file = tmp_data_path.joinpath("scores.txt")
predictions_file = tmp_data_path.joinpath("predictions.txt")

# Record the resolved input and scratch directories in the log.
LOG.info(data_path)
LOG.info(tmp_path)

# Let polars print up to 100 rows when a table is displayed.
pl.Config.set_tbl_rows(100)

# Training is opt-in through the TRAIN environment variable. Values that
# conventionally mean "off" (unset/empty, 0, false, no, off) disable it, so
# that e.g. TRAIN=0 does not accidentally trigger a full training run.
run_train = os.environ.get("TRAIN", "").strip().lower() not in {"", "0", "false", "no", "off"}
print(run_train)

## Data preparation

In [4]:
# Exploratory lookup of the context embedding for the id "Q35".
# NOTE(review): get_context_embedding is neither imported nor defined in any
# cell shown in this notebook, so this call presumably raises a NameError on a
# fresh kernel — import it explicitly or drop this cell; confirm where it lives.
get_context_embedding("Q35")

array([-1.08683494e+00,  1.00016955e+00,  5.65126327e-01, -1.18931012e-01,
        4.29616674e-01,  6.04400804e-01, -1.76152259e-01,  2.30443105e-01,
        2.69060423e-02,  1.90025512e-01,  5.58039486e-02, -3.05032716e-01,
        7.14906749e-01, -1.30265762e-01,  5.02504432e-01,  1.45054273e+00,
        4.83627211e-02, -3.23556334e-01, -5.30043628e-01, -1.10440839e+00,
       -9.31051620e-01, -6.29733293e-01, -8.72917512e-01,  1.29576482e+00,
        8.90542467e-01,  1.53463695e-01,  4.87522537e-02, -4.03002256e-01,
       -4.29287135e-01, -6.67051489e-01, -1.38137103e-01, -8.26372647e-01,
       -2.51646338e-01,  2.54731861e-02, -3.92829812e-01,  8.01710678e-01,
        4.68979134e-01, -1.37049981e-01, -5.95778783e-01, -6.10334950e-02,
        4.29620545e-01, -9.37379515e-01, -6.25330604e-01, -7.16531807e-01,
       -9.62215591e-02, -1.02356705e-01, -4.54664016e-01,  3.51949342e-01,
       -3.81492264e-01, -1.60995698e-01,  1.22625123e-01,  4.72695653e-01,
       -2.55172412e-01,  

In [5]:
# Fetch and unpack the word2vec archive unless model.bin is already present.
word2vec_model_file = tmp_path / "model.bin"
word2vec_archive = tmp_path / "word2vec.zip"

if not word2vec_model_file.exists():
    maybe_download("http://vectors.nlpl.eu/repository/20/38.zip", word2vec_archive)

    # Unpack the complete archive into the scratch directory.
    with zipfile.ZipFile(word2vec_archive, mode="r") as archive:
        archive.extractall(tmp_path)

In [6]:
# Cache-aware data preparation: each artefact is only (re)built when its output
# file is missing or when force_reload is set.
# train_test_split is already imported in the imports cell at the top.
pl.Config.set_streaming_chunk_size(500_000)
force_reload = False

# Training behaviours, streamed straight into a space-separated text file.
if force_reload or not train_file.exists():
    train = transform_behaviors(pl.scan_parquet(data_path / 'train' / 'behaviors.parquet'))
    train.sink_csv(train_file, separator=' ', quote_style='never', include_header=False)

# The validation parquet is split into two parts: one is used as validation
# data while fitting, the other is held out for the final evaluation.
# Both output files are checked so that a missing validation file is rebuilt
# even when the evaluation file already exists.
if force_reload or not (valid_file.exists() and evaluation_file.exists()):
    validation_behaviors = pl.scan_parquet(data_path / 'validation' / 'behaviors.parquet')
    # The first returned split is written to evaluation_file and the second one
    # to valid_file; the variable names reflect the file each split ends up in.
    evaluation, validation = train_test_split(validation_behaviors, 0.5)

    evaluation_transformed = transform_behaviors(evaluation)
    evaluation_transformed.collect(streaming=True).write_csv(evaluation_file, separator=' ', quote_style='never', include_header=False)

    validation_transformed = transform_behaviors(validation)
    validation_transformed.collect(streaming=True).write_csv(valid_file, separator=' ', quote_style='never', include_header=False)

# A single user-history file built from the train, validation and test histories.
if force_reload or not user_history_file.exists():
    user_history = transform_history(
        data_path / 'train' / 'history.parquet',
        data_path / 'validation' / 'history.parquet',
        data_path / '..' / 'ebnerd_testset' / 'test' / 'history.parquet'
    )
    user_history.sink_csv(user_history_file, separator=' ', quote_style='never', include_header=False)

# Tokenized articles for the selected dataset variant and for the test set.
if force_reload or not articles_tokenized_file.exists():
    tokenize_articles(articles_file, articles_tokenized_file)

if force_reload or not test_articles_tokenized_file.exists():
    tokenize_articles(test_articles_file, test_articles_tokenized_file)

# News feature file together with the word/entity/context embedding files.
# NOTE(review): the meaning of the trailing argument 10 is not visible in this
# notebook — confirm against create_feature_file and consider naming it.
if force_reload or not news_feature_file.exists():
    create_feature_file(
        tmp_path / "model.bin",
        articles_tokenized_file, test_articles_tokenized_file,
        word_embeddings_file, entity_embeddings_file,
        context_embeddings_file, news_feature_file, 10
    )

## Create hyper-parameters

In [7]:
# Download the reference DKN configuration and overlay this run's settings.
DKN_CONFIG_URL = "https://recodatasets.z20.web.core.windows.net/deeprec/deeprec/dkn/dkn_MINDsmall.yaml"
yaml_file = maybe_download(url=DKN_CONFIG_URL, work_directory=data_path)

# Each run gets its own model directory, tagged with the current timestamp,
# the number of epochs and the history size.
model_dir = tmp_path / "model" / f"{int(time())}_e{epochs}_h{history_size}"

# Values that replace or extend the entries of the downloaded YAML file.
hparam_overrides = {
    "seed": 33,
    "show_step": 100,
    "news_feature_file": news_feature_file.as_posix(),
    "user_history_file": user_history_file.as_posix(),
    "wordEmb_file": word_embeddings_file.as_posix(),
    "entityEmb_file": entity_embeddings_file.as_posix(),
    "contextEmb_file": context_embeddings_file.as_posix(),
    "epochs": epochs,
    "save_model": True,
    "MODEL_DIR": model_dir.as_posix(),
    "history_size": history_size,
    "batch_size": batch_size,
}
hparams = prepare_hparams(yaml_file, **hparam_overrides)

## Train the DKN model

In [9]:
# Build the DKN model from the prepared hyper-parameters; DKNTextIterator is
# handed over as the iterator class (presumably used to read the text input files).
model = DKN(hparams, DKNTextIterator)

  curr_attention_layer = tf.compat.v1.layers.batch_normalization(
2024-07-08 20:24:39.260401: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-08 20:24:39.276302: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-08 20:24:39.279703: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torv

In [11]:
# Either train the model (when run_train is set) or restore the group's stored
# checkpoint.
if run_train:
    # File names are passed as plain strings, matching the load_model and
    # run_eval calls in this notebook.
    model.fit(str(train_file), str(valid_file))
else:
    model.load_model(str(GROUP_PATH / "models/DKN/epoch_10"))

## Evaluate the DKN model

In [12]:
# Run the model on the held-out evaluation file and print the returned metrics.
res = model.run_eval(str(evaluation_file))
print(res)

2024-07-08 20:25:43.806092: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-07-08 20:25:43.808961: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-07-08 20:25:44.089851: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


{'auc': 0.6956, 'group_auc': 0.6917, 'mean_mrr': 0.4934, 'ndcg@5': 0.5387, 'ndcg@10': 0.5919}


## Predict for RecSys Challenge Testdata

In [None]:
# Convert the raw test behaviours into the text input used for prediction.
# test_file is shared across dataset variants, whereas indexed_behaviors_file
# lives in the dataset-specific folder and is read by the ranking step below.
# It is presumably written by this same call (nothing else in the notebook
# creates it), so both files must exist for the cached result to be reusable,
# e.g. after switching DATASET_NAME.
if not (test_file.exists() and indexed_behaviors_file.exists()):
    transform_behaviors_test(str(test_raw_file), indexed_behaviors_file, test_file)

In [None]:
# Score the transformed test file and write the scores to scores_file.
# Both file names are passed as plain strings for consistency.
model.predict(str(test_file), str(scores_file))

In [None]:
# Combine the indexed behaviours with the predicted scores into rankings
# (see calculate_rankings for the exact output layout), then persist them as a
# space-separated file without a header row.
rankings = calculate_rankings(indexed_behaviors_file, scores_file)
rankings.write_csv(predictions_file, separator=" ", include_header=False)
print(f"Created predictions file at {predictions_file}")

Sort the rankings by the original order (given by the index) and persist the resulting rankings.



In [None]:
# Check if produced prediction matches with original impression row order
# (pl.scan_parquet(test_raw_file)
#     .select("impression_id", "article_ids_inview")
#     .with_row_index()
#     .with_columns(pl.col("article_ids_inview").map_elements(lambda el: len(el)).alias("len_a_ids"))
#    # .filter(pl.col("impression_id") == 0)
# ).collect().head(1)

In [None]:
# d = (pl.scan_csv(test_file, has_header=False, separator=' ')
#     .filter(pl.col("column_3").str.contains("6451383"))
# 
# )
#     
# d.collect()

## Document embedding inference API

After training, you can obtain document embeddings through this document embedding inference API. The input file has the same format as the document feature file. The output file format is: `[Newsid] [embedding]`

In [None]:
# Infer a document embedding for every entry of the news feature file and write
# the result to infer_embedding_file. File names are passed as plain strings,
# consistent with the load_model and run_eval calls in this notebook.
model.run_get_embedding(str(news_feature_file), str(infer_embedding_file))

## References

\[1\] Hongwei Wang, Fuzheng Zhang, Xing Xie and Minyi Guo, "DKN: Deep Knowledge-Aware Network for News Recommendation", in Proceedings of the 2018 World Wide Web Conference (WWW), 2018, https://arxiv.org/abs/1801.08284. <br>
\[2\] Knowledge Graph Embeddings including TransE, TransH, TransR and PTransE. https://github.com/thunlp/KB2E <br>
\[3\] Fangzhao Wu et al., "MIND: A Large-scale Dataset for News Recommendation", Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 2020, https://msnews.github.io/competition.html. <br>
\[4\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/