In [1]:
import os
import sys
# Work from the project checkout and put it on the import path
# (presumably so the `utils.*` imports below resolve — confirm).
project_root = '/home/peitian_zhang/Codes/NR'
os.chdir(project_root)
sys.path.append(project_root)

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from utils.MIND import MIND_iter,MIND_map
from utils.utils import constructBasicDict, tailorData

## show data

In [2]:
# Settings handed to the dataset classes and loaders in this notebook.
hparams = {
    'npratio': 4,
    'mode': 'demo',
    'batch_size': 3,
    'his_size': 10,
    'title_size': 15,
    'device': 'cuda:1',
    'attrs': ['title', 'category', 'subcategory'],
}

# customize your path here
# Every MIND file lives under "<prefix>_train/" or "<prefix>_dev/", where the
# prefix already carries the dataset variant selected by hparams['mode'].
mind_prefix = '/home/peitian_zhang/Data/MIND/MIND' + hparams['mode']

news_file_train = mind_prefix + '_train/news.tsv'
news_file_test = mind_prefix + '_dev/news.tsv'
news_file_pair = (news_file_train, news_file_test)

behavior_file_train = mind_prefix + '_train/behaviors.tsv'
behavior_file_test = mind_prefix + '_dev/behaviors.tsv'
behavior_file_pair = (behavior_file_train, behavior_file_test)

# Path for model parameters, relative to the current working directory.
save_path = 'models/model_params/NPA_' + hparams['mode'] + '.model'

# Only build the dictionaries when the expected vocab pickle is missing
# (constructBasicDict is presumably what creates it — confirm).
vocab_pkl = 'data/dictionaries/vocab_{}_{}.pkl'.format(hparams['mode'], '_'.join(hparams['attrs']))
if not os.path.exists(vocab_pkl):
    constructBasicDict(
        news_file_pair,
        behavior_file_pair,
        hparams['mode'],
        hparams['attrs'],
    )

# Use the configured GPU when CUDA is available, otherwise the CPU.
device = torch.device(hparams['device'] if torch.cuda.is_available() else "cpu")

dataset_train = MIND_map(
    hparams=hparams,
    news_file=news_file_train,
    behaviors_file=behavior_file_train,
)

dataset_test = MIND_iter(
    hparams=hparams,
    news_file=news_file_test,
    behaviors_file=behavior_file_test,
)

# Attach 300-d GloVe vectors to the vocabulary taken from the training set.
vocab = dataset_train.vocab
embedding = GloVe(dim=300, cache='.vector_cache')
vocab.load_vectors(embedding)

loader_train = DataLoader(
    dataset_train,
    batch_size=hparams['batch_size'],
    shuffle=True,
    pin_memory=True,
    num_workers=3,
    drop_last=True,
)
# NOTE(review): drop_last=True here means a trailing partial test batch is
# skipped — fine for a demo peek, confirm before using for evaluation.
loader_test = DataLoader(
    dataset_test,
    batch_size=hparams['batch_size'],
    pin_memory=True,
    num_workers=0,
    drop_last=True,
)

In [3]:
# Pull the first batch from each loader and display both as the cell result.
a, b = next(iter(loader_train)), next(iter(loader_test))
(a, b)

({'user_index': tensor([[1263],
          [ 718],
          [ 384]]),
  'neg_pad': tensor([[2],
          [0],
          [0]]),
  'his_mask': tensor([[[False],
           [False],
           [False],
           [False],
           [ True],
           [ True],
           [ True],
           [ True],
           [ True],
           [ True]],
  
          [[False],
           [False],
           [False],
           [False],
           [False],
           [False],
           [False],
           [ True],
           [ True],
           [ True]],
  
          [[False],
           [False],
           [False],
           [False],
           [False],
           [False],
           [False],
           [False],
           [False],
           [False]]]),
  'clicked_title': tensor([[[ 1564, 15019, 21662,    18, 11544,     6,  1263,  3690,  2759,    20,
             9501,   490,   104,    97,  5219],
           [   89,   884,    10,   143,   152,    12,     9,    12,  3866,    17,
             1548,  

### Tailor data to demo size

In [None]:
# NOTE(review): tailorData comes from utils.utils and its body is not visible
# here; going by the section heading it presumably trims each behaviors file to
# the given number of entries (500 for dev, 2000 for train). Confirm whether it
# overwrites its input before running.
# These paths point at the MINDsmall files regardless of hparams['mode'].
tailorData('/home/peitian_zhang/Data/MIND/MINDsmall_dev/behaviors.tsv',500)
tailorData('/home/peitian_zhang/Data/MIND/MINDsmall_train/behaviors.tsv',2000)

### Analyze MIND Datasets
- average title length
- average abstract length
- average history length
- average impression size (number of candidate news per impression)
- count of histories longer than 50
- count of empty histories
- count of multi-clicked impressions

In [None]:
# Corpus statistics for the training split. The seven result names
# (avg_title_length, avg_abstract_length, avg_his_length, avg_imp_length,
# cnt_his_lg_50, cnt_his_eq_0, cnt_imp_multi) are read by the summary cell.
#
# Tokens are counted with str.split() (no separator) so that an empty field
# yields zero tokens; split(' ') would return [''] and count it as one.
cnt_his_lg_50 = 0   # behaviors whose click history has more than 50 items
cnt_his_eq_0 = 0    # behaviors with an empty click history
cnt_imp_multi = 0   # impressions in which more than one candidate was clicked

# --- news.tsv: title / abstract lengths --------------------------------------
title_tokens = 0
abstract_tokens = 0
count = 0
with open(news_file_train, "r", encoding='utf-8') as rd:
    for line in rd:
        # Eight tab-separated columns are expected; only the title (4th) and
        # abstract (5th) are used. Unpacking still fails loudly on a malformed row.
        _, _, _, title, ab, _, _, _ = line.strip("\n").split('\t')
        title_tokens += len(title.split())
        abstract_tokens += len(ab.split())
        count += 1
# Guard against an empty file instead of raising ZeroDivisionError.
avg_title_length = title_tokens / count if count else 0.0
avg_abstract_length = abstract_tokens / count if count else 0.0

# --- behaviors.tsv: history / impression statistics --------------------------
his_total = 0
imp_total = 0
count = 0
with open(behavior_file_train, "r", encoding='utf-8') as rd:
    for line in rd:
        # Take the last four columns so the row parses with or without a
        # leading impression-id column.
        _, _, history, impr = line.strip("\n").split('\t')[-4:]
        his = history.split()
        imp = impr.split()

        if len(his) > 50:
            cnt_his_lg_50 += 1
        if not his:
            cnt_his_eq_0 += 1

        # Candidates are written as "<news_id>-<label>"; label 1 marks a click.
        clicked = sum(1 for cand in imp if cand.rsplit('-', 1)[-1] == '1')
        if clicked > 1:
            cnt_imp_multi += 1

        his_total += len(his)
        imp_total += len(imp)
        count += 1
avg_his_length = his_total / count if count else 0.0
avg_imp_length = imp_total / count if count else 0.0

In [None]:
# Print the corpus statistics computed by the analysis cell, one per line.
print(
    "avg_title_length:{}\n avg_abstract_length:{}\n avg_his_length:{}\n avg_impr_length:{}\n cnt_his_lg_50:{}\n cnt_his_eq_0:{}\n cnt_imp_multi:{}".format(
        avg_title_length,
        avg_abstract_length,
        avg_his_length,
        avg_imp_length,
        cnt_his_lg_50,
        cnt_his_eq_0,
        cnt_imp_multi,
    )
)