In [1]:
import os 
import torch
import numpy as np
import pandas as pd
import torch.nn as nn 
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from utils.manager import Manager
from utils.util import load_pickle, save_pickle, BM25

In [2]:
command = """
-bs 2 -bse 2 -ef title abs -s demo -plm distilbert
"""
manager = Manager(command=command.strip().split(" "))
loaders = manager.prepare()

[2022-03-30 10:48:15,077] INFO (Manager) Hyper Parameters are:
{'scale': 'demo', 'batch_size': 2, 'batch_size_eval': 2, 'checkpoint': 'none', 'verbose': None, 'his_size': 50, 'impr_size': 20, 'negative_num': 4, 'dropout_p': 0.1, 'learning_rate': 1e-05, 'scheduler': 'none', 'warmup': 0.1, 'title_length': 32, 'abs_length': 64, 'enable_fields': ['title', 'abs'], 'newsEncoder': 'cnn', 'userEncoder': 'rnn', 'hidden_dim': 768, 'head_num': 12, 'k': 4, 'plm': 'distilbert', 'seed': 3407, 'world_size': 1, 'sequence_length': 96}

[2022-03-30 10:48:15,081] INFO (MIND_Train) Loading Cache at MINDdemo_train
[2022-03-30 10:48:15,981] INFO (MIND_Dev) Loading Cache at MINDdemo_dev
[2022-03-30 10:48:16,779] INFO (MIND_News) Loading Cache at MINDdemo_dev


In [4]:
t = AutoTokenizer.from_pretrained(manager.plm_dir)
m = AutoModel.from_pretrained(manager.plm_dir)

In [3]:
loader_train = loaders["train"]
loader_dev = loaders["dev"]
loader_news = loaders["news"]

dataset_train = loader_train.dataset
dataset_dev = loader_dev.dataset
dataset_news = loader_news.dataset

X1 = iter(loader_train)
X2 = iter(loader_dev)
X3 = iter(loader_news)
x = next(X1)
x2 = next(X2)
x3 = next(X3)

In [6]:
# check news
index = 1
cdd_token_id = x3['cdd_token_id'][index]
t.decode(cdd_token_id, skip_special_tokens=True)

"the brands queen elizabeth, prince charles, and prince philip swear by shop the notebooks, jackets, and more that the royals can't live without."

In [11]:
a = torch.rand(2,3)
a.expand(2,3,3)

RuntimeError: The expanded size of the tensor (3) must match the existing size (2) at non-singleton dimension 1.  Target sizes: [2, 3, 3].  Tensor sizes: [2, 3]

In [10]:
# check attention mask
index = (0, 0)
cdd_token_id = x['cdd_token_id'][index]
cdd_attn_mask = x["cdd_attn_mask"][index]
his_token_id = x["his_token_id"][index]
his_attn_mask = x["his_attn_mask"][index]

cdd_token = t.convert_ids_to_tokens(cdd_token_id)
his_token = t.convert_ids_to_tokens(his_token_id)

line = "{:20} a ".format(" "*20)
print(line)
for i in range(manager.sequence_length):
    line = "{:20} {}".format(cdd_token[i], cdd_attn_mask[i])
    print(line)
    if cdd_token[i] == "[PAD]":
        break

                     a 
[CLS]                1
what                 1
you                  1
need                 1
to                   1
know                 1
about                1
the                  1
c                    1
##8                  1
corvette             1
'                    1
s                    1
new                  1
dual                 1
-                    1
clutch               1
transmission         1
[SEP]                1
the                  1
new                  1
corvette             1
has                  1
an                   1
eight                1
-                    1
speed                1
tre                  1
##me                 1
##c                  1
dc                   1
##t                  1
.                    1
we                   1
weren                1
'                    1
t                    1
crazy                1
about                1
it                   1
in                   1
the                  1
pre       

In [None]:
# check train loader result
nid2index = load_pickle("/data/v-pezhang/Code/GateFormer/src/data/cache/MIND/MINDdemo_train/news/nid2index.pkl")
uid2index = load_pickle("/data/v-pezhang/Code/GateFormer/src/data/cache/MIND/uid2index.pkl")
nindex2id = {v:k for k,v in nid2index.items()}
uindex2id = {v:k for k,v in uid2index.items()}

# check behaviors.tsv
print([uindex2id[i] for i in x["user_index"].tolist()], (x["impr_index"] + 1).tolist())
# check news.tsv
print([nindex2id[i] for i in x["cdd_idx"][0][:5].tolist()])
print(t.batch_decode(x["cdd_token_id"][0][:5], skip_special_tokens=True))

print([nindex2id[i] for i in x["his_idx"][0][:5].tolist()])
print(t.batch_decode(x["his_token_id"][0][:5], skip_special_tokens=True))

In [None]:
# check dev loader result
nid2index = load_pickle("/data/v-pezhang/Code/GateFormer/src/data/cache/MIND/MINDdemo_dev/news/nid2index.pkl")
uid2index = load_pickle("/data/v-pezhang/Code/GateFormer/src/data/cache/MIND/uid2index.pkl")
nindex2id = {v:k for k,v in nid2index.items()}
uindex2id = {v:k for k,v in uid2index.items()}

# check behaviors.tsv
print([uindex2id[i] for i in x2["user_index"].tolist()], (x2["impr_index"] + 1).tolist())
# check news.tsv
print([nindex2id[i] for i in x2["cdd_idx"][0][:5].tolist()])
print(t.batch_decode(x2["cdd_token_id"][0][:5], skip_special_tokens=True))

print([nindex2id[i] for i in x2["his_idx"][0][:5].tolist()])
print(t.batch_decode(x2["his_token_id"][0][:5], skip_special_tokens=True))