In [1]:
# Mount Google Drive at /content/drive; force_remount=True is passed so an existing mount is redone.
# NOTE(review): `_mount` is an underscore-prefixed (private) helper of `google.colab.drive`.
# Prefer the public `drive.mount(...)` unless `_mount` is a deliberate workaround — confirm.
from google.colab import drive
drive._mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Line magic selecting the TensorFlow 1.x runtime (recorded output: "TensorFlow 1.x selected.");
# the import cell later reports TensorFlow 1.15.2.
%tensorflow_version 1.x 

TensorFlow 1.x selected.


In [3]:
# Switch the working directory to the `naml` project folder on the mounted Drive.
%cd /content/drive/MyDrive/20211/rec-sys/mind-recomendation/naml

/content/drive/MyDrive/20211/rec-sys/mind-recomendation/naml


In [4]:
# All imports for the notebook live in this cell.
# NOTE(review): `TemporaryDirectory` is not referenced in any visible cell — candidate for removal.
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm 
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

# Helpers, model and iterator from the `recommenders` package.
# NOTE(review): `naml_lstm` is presumably a customised NAML variant — confirm which
# `recommenders` installation provides it.
from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.naml_lstm import NAMLModel
from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Record the interpreter and TensorFlow versions alongside the results.
print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))


Using TensorFlow backend.


System version: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
Tensorflow version: 1.15.2


## Prepare Parameters

In [5]:
# Which MIND dataset variant to resolve download resources for.
# Options: demo, small, large
MIND_type = 'small'

# Training configuration: `batch_size` and `epochs` are forwarded to prepare_hparams,
# `seed` is forwarded to the model constructor.
batch_size = 32
epochs = 1
seed = 42

## Download and load data

In [6]:
# Root folder on the mounted Drive that holds the dataset splits and utility files.
# NOTE: this path is hard-coded to the `small` folder and does not follow `MIND_type`;
# update it by hand when switching dataset variants.
data_path = "/content/drive/MyDrive/20211/rec-sys/mind-recomendation/data/small"

# Sub-folders for the training split, validation split and shared utility files.
train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, "utils")

train_news_file = os.path.join(train_dir, r'news.tsv')
train_behaviors_file = os.path.join(train_dir, r'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, r'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, r'behaviors.tsv')

wordEmb_file = os.path.join(utils_dir, "embedding_all.npy")
userDict_file = os.path.join(utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(utils_dir, "word_dict_all.pkl")
vertDict_file = os.path.join(utils_dir, "vert_dict.pkl")
subvertDict_file = os.path.join(utils_dir, "subvert_dict.pkl")
yaml_file = os.path.join(utils_dir, r'naml.yaml')

# Resolve the download URL and resource names for the selected dataset variant.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each download is skipped when its marker file already exists, so re-running the cell
# does not fetch data that is already on Drive.
if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, train_dir, mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url, valid_dir, mind_dev_dataset)

if not os.path.exists(yaml_file):
    download_deeprec_resources(
        r'https://recodatasets.z20.web.core.windows.net/newsrec/',
        utils_dir,
        mind_utils,
    )

## Create hyper-parameters

In [7]:
# Keyword arguments passed to prepare_hparams in addition to the YAML config file:
# resource file locations plus the run settings from the parameter cell.
hparam_kwargs = dict(
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    vertDict_file=vertDict_file,
    subvertDict_file=subvertDict_file,
    batch_size=batch_size,
    epochs=epochs,
)
hparams = prepare_hparams(yaml_file, **hparam_kwargs)

# Print the resulting hyper-parameter object for the record.
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 1, 'batch_size': 32, 'show_step': 100000, 'title_size': 30, 'body_size': 50, 'his_size': 50, 'vert_num': 17, 'subvert_num': 249, 'data_format': 'naml', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'naml', 'dense_activation': 'relu', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/content/drive/MyDrive/20211/rec-sys/mind-recomendation/data/small/utils/embedding_all.npy', 'wordDict_file': '/content/drive/MyDrive/20211/rec-sys/mind-recomendation/data/small/utils/word_dict_all.pkl', 'userDict_file': '/content/drive/MyDrive/20211/rec-sys/mind-recomendation/data/small/utils/uid2index.pkl', 'v

In [8]:
# Keep a reference to the iterator class itself (not an instance); it is handed to
# NAMLModel when the model is constructed.
iterator = MINDAllIterator

## Train the NAML model

In [9]:
# Construct the model from the hyper-parameters, the iterator class and the seed.
# Construction prints debug lines and the "news_encoder" summary (see the recorded output).
model = NAMLModel(hparams, iterator, seed=seed)

dau vao new encoder = Tensor("input_13:0", shape=(?, 82), dtype=int32)
shape vao (?, 50, 82)
Model: "news_encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 82)]         0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 30)           0           input_13[0][0]                   
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, 50)           0           input_13[0][0]                   
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 1)            0           input_13[0][0]              

In [10]:
# Print the layer summary of the model object exposed as `model.model`.
model.model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 5, 30)]      0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 5, 50)]      0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 5, 1)]       0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 5, 1)]       0                                            
______________________________________________________________________________________________

In [11]:
# Print the summary of the first object returned by the private builder `_build_naml`.
# NOTE(review): the recorded output shows new layer names (input_31, lambda_4, ...) compared
# with model construction (input_13, lambda, ...), so this call builds a fresh set of layers
# rather than inspecting the ones already held by `model`. Consider inspecting attributes
# of the existing model instead — confirm what NAMLModel exposes.
model._build_naml()[0].summary()

dau vao new encoder = Tensor("input_31:0", shape=(?, 82), dtype=int32)
shape vao (?, 50, 82)
Model: "news_encoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 82)]         0                                            
__________________________________________________________________________________________________
lambda_4 (Lambda)               (None, 30)           0           input_31[0][0]                   
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None, 50)           0           input_31[0][0]                   
__________________________________________________________________________________________________
lambda_6 (Lambda)               (None, 1)            0           input_31[0][0]              

In [12]:
# Optional check before training (left disabled): evaluate the model on the validation split.
# print(model.run_eval(valid_news_file, valid_behaviors_file))

In [13]:
%%time
# Train on the training split and evaluate on the validation split.
# The recorded run (1 epoch) reports a wall time of about 2h 8min.
model.fit(train_news_file, train_behaviors_file,valid_news_file, valid_behaviors_file)

7385it [1:10:34,  1.74it/s]
42386it [03:08, 225.28it/s]
73121it [53:06, 22.94it/s]
73152it [00:15, 4629.00it/s]


at epoch 1
train info: logloss loss:1.3964866371435658
eval info: group_auc:0.5143, mean_mrr:0.2265, ndcg@10:0.291, ndcg@5:0.2303
at epoch 1 , train time: 4234.2 eval time: 3487.8
CPU times: user 2h 5min 43s, sys: 6min 15s, total: 2h 11min 58s
Wall time: 2h 8min 42s


<recommenders.models.newsrec.models.naml_lstm.NAMLModel at 0x7f85e6f5ec50>

In [14]:
%%time
# Disabled: standalone evaluation on the validation split. With both statements commented
# out this cell executes nothing, so the timing reported for it is not meaningful.
# res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
# print(res_syn)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.3 µs


In [15]:
# Disabled: would record `res_syn` via `sb.glue`.
# NOTE(review): `sb` is not imported anywhere in this notebook and `res_syn` is only assigned
# in commented-out code, so enabling this line as-is would raise NameError.
# sb.glue("res_syn", res_syn)

## Save the model

In [16]:
# Directory, next to the data, that receives the trained weights.
model_path = os.path.join(data_path, "model-naml-lstm")
os.makedirs(model_path, exist_ok=True)  # no error if the folder already exists

# Save only the weights of `model.model`, using "naml_ckpt" as the file prefix.
weights_prefix = os.path.join(model_path, "naml_ckpt")
model.model.save_weights(weights_prefix)

## Output Prediction File
This code segment generates the `prediction.zip` file in the format described in the [MIND Competition Submission Tutorial](https://competitions.codalab.org/competitions/24122#learn_the_details-submission-guidelines).

To submit your predictions to the [MIND Competition](https://msnews.github.io/competition.html), change the `MIND_type` parameter to `large`. Note that `data_path` in the download cell is hard-coded to the `small` folder, so it must be updated as well.

In [None]:
# Score the validation split. `group_impr_indexes` and `group_preds` are consumed when the
# prediction file is written; `group_labels` is not used afterwards in this notebook.
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

42386it [03:07, 226.58it/s]
1793it [01:17, 23.14it/s]

In [None]:
# Write one line per impression: "<index + 1> [r1,r2,...]", where r_k is the 1-based rank
# of the k-th candidate when candidates are ordered by descending predicted score.
prediction_txt = os.path.join(data_path, 'prediction.txt')
with open(prediction_txt, 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        # Positions of the candidates sorted from highest to lowest score.
        descending_order = np.argsort(preds)[::-1]
        # Inverting that permutation gives each candidate its rank; +1 makes it 1-based.
        ranks = (np.argsort(descending_order) + 1).tolist()
        rank_text = '[' + ','.join(str(rank) for rank in ranks) + ']'
        # The written impression index is the stored index shifted by one.
        f.write(' '.join((str(impr_index + 1), rank_text)) + '\n')

In [None]:
# Package prediction.txt into prediction.zip (stored at the archive root under the
# name "prediction.txt"). The `with` block guarantees the archive is finalised and
# closed even if writing fails, instead of relying on a manual close().
prediction_txt = os.path.join(data_path, 'prediction.txt')
prediction_zip = os.path.join(data_path, 'prediction.zip')
with zipfile.ZipFile(prediction_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
    archive.write(prediction_txt, arcname='prediction.txt')

## Reference
\[1\] Chuhan Wu, Fangzhao Wu, Mingxiao An, Jianqiang Huang, Yongfeng Huang and Xing Xie: Neural News Recommendation with Attentive Multi-View Learning, IJCAI 2019<br>
\[2\] Wu, Fangzhao, et al. "MIND: A Large-scale Dataset for News Recommendation" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html <br>
\[3\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/