In [1]:
import os 
import torch
import numpy as np
import pandas as pd
import torch.nn as nn 
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from utils.manager import Manager
from utils.util import load_pickle, save_pickle, BM25

In [5]:
class config:
    """Settings object handed to ``Manager(config, notebook=True)`` right after this class.

    Only plain class attributes are defined here; how each one is interpreted is
    decided by ``utils.manager.Manager``, which is not visible in this notebook.
    The per-field notes below are therefore hedged wherever they go beyond what
    the notebook itself shows.

    NOTE(review): the hyper-parameter dict logged under this cell lists only a
    subset of these fields (e.g. no ``epochs``, ``mode``, ``device`` or paths),
    so Manager apparently filters what it reports.
    """
    epochs = 10
    # With these scale/mode values the recorded run logged
    # "Loading Cache at MINDlarge_test".
    scale = "large"
    mode = "test"
    # presumably a CUDA device index — confirm against Manager
    device = 0
    batch_size = 2
    batch_size_encode = 2
    dropout_p = 0.1
    seed = 3407
    world_size = 1
    checkpoint = "best"

    # Relative paths; presumably resolved against the notebook's working
    # directory — TODO confirm.
    data_root = "../../../Data"
    cache_root = "data/cache"

    # presumably: clicked-history length, candidates per impression and
    # negatives sampled per positive — verify against the dataset code
    his_size = 50
    impr_size = 20
    negative_num = 4

    # The logged 'sequence_length' is 64, which equals abs_length with only the
    # "abs" field enabled below. The role of the max_* values is not shown
    # here (presumably cache-time truncation limits — TODO confirm).
    max_title_length = 64
    max_abs_length = 256
    title_length = 32
    abs_length = 64

    plm = "bert"

    enable_fields = ["abs"]
    enable_gate = "weight"
    # alternative gate setting, kept for quick switching:
    # enable_gate = "bm25"

    # Single-process notebook run: rank 0, not distributed (world_size is 1 above).
    rank = 0
    verbose = None
    distributed = False
    debug = False

# Build the project Manager from the config class above (in notebook mode) and
# let it prepare the data loaders. Later cells index `loaders` by split name
# ("test", "train", "dev", "news") and read manager.plm_dir / manager.sequence_length.
manager = Manager(config, notebook=True)
loaders = manager.prepare()

[2022-02-23 02:43:26,050] INFO (Manager) Hyper Parameters are:
{'scale': 'large', 'batch_size': 2, 'batch_size_encode': 2, 'dropout_p': 0.1, 'seed': 3407, 'world_size': 1, 'checkpoint': 'best', 'his_size': 50, 'impr_size': 20, 'negative_num': 4, 'title_length': 32, 'abs_length': 64, 'plm': 'bert', 'enable_fields': ['abs'], 'enable_gate': 'weight', 'verbose': None, 'sequence_length': 64}

[2022-02-23 02:43:26,053] INFO (MIND_Test) Loading Cache at MINDlarge_test
[2022-02-23 02:44:17,011] INFO (MIND_News) Loading Cache at MINDlarge_test


In [3]:
# Tokenizer loaded from the directory the manager exposes as plm_dir; the
# inspection cells below use it to turn token ids back into tokens/text.
t = AutoTokenizer.from_pretrained(manager.plm_dir)
# Model loading is left disabled: none of the cells visible here use `m`.
# m = AutoModel.from_pretrained(manager.plm_dir).to(0)

In [6]:
# Peek at the first impression record held by the test dataset
# (the recorded output is a tuple of an integer and a list of news ids).
loaders["test"].dataset.imprs[0]

(0,
 ['N101071',
  'N15647',
  'N83400',
  'N124838',
  'N57092',
  'N64623',
  'N62785',
  'N112133',
  'N98744',
  'N55764',
  'N16531',
  'N54103',
  'N128905',
  'N2296',
  'N45689',
  'N87027'])

In [16]:
# Grab the train / dev / news loaders, their datasets, and one batch from each.
# NOTE(review): config.mode is "test" above and the recorded log only shows the
# test and news caches being loaded — confirm that `loaders` really contains
# "train" and "dev" on a fresh Restart & Run All.
loader_train, loader_dev, loader_news = (
    loaders[split] for split in ("train", "dev", "news")
)
dataset_train, dataset_dev, dataset_news = (
    ldr.dataset for ldr in (loader_train, loader_dev, loader_news)
)

# One iterator per loader, then the first batch of each:
# x -> train batch, x2 -> dev batch, x3 -> news batch.
X1, X2, X3 = map(iter, (loader_train, loader_dev, loader_news))
x, x2, x3 = next(X1), next(X2), next(X3)

In [14]:
# check news
# Decode one entry of the first news-loader batch (x3) back to text so the
# tokenization can be eyeballed; the decoded string is the cell's output.
# NOTE(review): the NameError recorded under this cell comes from a run where
# `t` had not been created yet — `t` is defined in the tokenizer cell earlier
# in the notebook, so re-run top to bottom to refresh the output.
index = 1
cdd_token_id = x3['cdd_token_id'][index]
t.decode(cdd_token_id, skip_special_tokens=True)

NameError: name 't' is not defined

In [6]:
# check gate mask
# Select a single token sequence from the train batch `x`.
# (assumes the tensors are laid out as (batch, item, token) — TODO confirm)
index = (0, 0)
cdd_token_id, cdd_attn_mask, cdd_gate_mask = (
    x[key][index] for key in ("cdd_token_id", "cdd_attn_mask", "cdd_gate_mask")
)
his_token_id, his_attn_mask, his_gate_mask = (
    x[key][index] for key in ("his_token_id", "his_attn_mask", "his_gate_mask")
)

cdd_token = t.convert_ids_to_tokens(cdd_token_id)
his_token = t.convert_ids_to_tokens(his_token_id)

# Header row: a blank 15-character token column, then "a" (attention mask)
# and "g" (gate mask).
print(" " * 15 + " a g")
for pos in range(manager.sequence_length):
    token = cdd_token[pos]
    print(f"{token:15} {cdd_attn_mask[pos]} {cdd_gate_mask[pos]}")
    # The first padding token is still printed; nothing after it is.
    if token == "[PAD]":
        break

                a g
[CLS]           1 0
what            1 1
you             1 1
need            1 1
to              1 1
know            1 1
about           1 1
the             1 1
c               1 1
##8             1 1
corvette        1 1
'               1 1
s               1 1
new             1 1
dual            1 1
-               1 1
clutch          1 1
transmission    1 1
[SEP]           1 0
the             1 1
new             1 1
corvette        1 1
has             1 1
an              1 1
eight           1 1
-               1 1
speed           1 1
tre             1 1
##me            1 1
##c             1 1
dc              1 1
##t             1 1
.               1 1
we              1 1
weren           1 1
'               1 1
t               1 1
crazy           1 1
about           1 1
it              1 1
in              1 1
the             1 1
pre             1 1
-               1 1
production      1 1
c               1 1
##8             1 1
we              1 1
drove           1 1


In [None]:
# check train loader result
# Map the indices in the train batch `x` back to raw user / news ids and decode
# a few candidate and history entries, so they can be compared by eye with
# behaviors.tsv and news.tsv.
#
# The cache files are located from the notebook config instead of a
# machine-specific absolute path, and the dataset scale comes from config.scale
# so the id maps match the cache the loaders were built with (the previous
# hard-coded "MINDdemo" disagreed with scale = "large").
# NOTE(review): config.cache_root is relative, so this presumably has to run
# from the project's src directory — confirm.
mind_cache_dir = os.path.join(config.cache_root, "MIND")
nid2index = load_pickle(os.path.join(mind_cache_dir, f"MIND{config.scale}_train", "news", "nid2index.pkl"))
uid2index = load_pickle(os.path.join(mind_cache_dir, "uid2index.pkl"))
# Invert the id -> index maps so batch indices can be looked up as raw ids.
nindex2id = {v:k for k,v in nid2index.items()}
uindex2id = {v:k for k,v in uid2index.items()}

# check behaviors.tsv
# (impr_index is shifted by one for display — presumably behaviors.tsv ids are 1-based)
print([uindex2id[i] for i in x["user_index"].tolist()], (x["impr_index"] + 1).tolist())
# check news.tsv
# First five candidates of the first sample: raw news ids, then decoded text.
print([nindex2id[i] for i in x["cdd_idx"][0][:5].tolist()])
print(t.batch_decode(x["cdd_token_id"][0][:5], skip_special_tokens=True))

# Same for the first five history entries of the first sample.
print([nindex2id[i] for i in x["his_idx"][0][:5].tolist()])
print(t.batch_decode(x["his_token_id"][0][:5], skip_special_tokens=True))

In [None]:
# check dev loader result
# Map the indices in the dev batch `x2` back to raw user / news ids and decode
# a few candidate and history entries, so they can be compared by eye with
# behaviors.tsv and news.tsv.
#
# The cache files are located from the notebook config instead of a
# machine-specific absolute path, and the dataset scale comes from config.scale
# so the id maps match the cache the loaders were built with (the previous
# hard-coded "MINDdemo" disagreed with scale = "large").
# NOTE(review): config.cache_root is relative, so this presumably has to run
# from the project's src directory — confirm.
mind_cache_dir = os.path.join(config.cache_root, "MIND")
nid2index = load_pickle(os.path.join(mind_cache_dir, f"MIND{config.scale}_dev", "news", "nid2index.pkl"))
uid2index = load_pickle(os.path.join(mind_cache_dir, "uid2index.pkl"))
# Invert the id -> index maps so batch indices can be looked up as raw ids.
nindex2id = {v:k for k,v in nid2index.items()}
uindex2id = {v:k for k,v in uid2index.items()}

# check behaviors.tsv
# (impr_index is shifted by one for display — presumably behaviors.tsv ids are 1-based)
print([uindex2id[i] for i in x2["user_index"].tolist()], (x2["impr_index"] + 1).tolist())
# check news.tsv
# First five candidates of the first sample: raw news ids, then decoded text.
print([nindex2id[i] for i in x2["cdd_idx"][0][:5].tolist()])
print(t.batch_decode(x2["cdd_token_id"][0][:5], skip_special_tokens=True))

# Same for the first five history entries of the first sample.
print([nindex2id[i] for i in x2["his_idx"][0][:5].tolist()])
print(t.batch_decode(x2["his_token_id"][0][:5], skip_special_tokens=True))