In [6]:
import os
import sys
os.chdir('/home/peitian_zhang/Codes/NR')
sys.path.append('/home/peitian_zhang/Codes/NR')

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import FastText
from torch.utils.data import DataLoader
from utils.MIND import MIND_iter,MIND_map
from utils.utils import constructBasicDict

## show data

In [7]:
hparams = {
    'npratio':1,
    'mode':'demo',
    'batch_size':1,
    'his_size':2,
    'title_size':5,
    'gpu':'cuda:0',
    'attrs': ['title','category','subcategory']
}

# customize your path here
news_file_train = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_train/news.tsv'
news_file_test = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_dev/news.tsv'
behavior_file_train = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_train/behaviors.tsv'
behavior_file_test = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_dev/behaviors.tsv'
save_path = '/home/peitian_zhang/Codes/NR/models/model_param/NPA_'+ hparams['mode'] +'.model'

if not os.path.exists('data/dictionaries/vocab_{}_{}_{}.pkl'.format(hparams['mode'],'train','_'.join(hparams['attrs']))):
    constructBasicDict(news_file_train,behavior_file_train,hparams['mode'],'train',hparams['attrs'])
if not os.path.exists('data/dictionaries/vocab_{}_{}_{}.pkl'.format(hparams['mode'],'test','_'.join(hparams['attrs']))):
    constructBasicDict(news_file_test,behavior_file_test,hparams['mode'],'test',hparams['attrs'])

device = torch.device(hparams['gpu']) if torch.cuda.is_available() else torch.device("cpu")

vocab_train = dataset_train.vocab
embedding = FastText('simple',cache='.vector_cache')
vocab_train.load_vectors(embedding)

vocab_test = dataset_test.vocab
vocab_test.load_vectors(embedding)

In [8]:
# map style dataloader to return one log for training
dataset_train = MIND_map(hparams=hparams,mode='train',news_file=news_file_train,behaviors_file=behavior_file_train)
# iter style dataloader to return one candidate for training
dataset_test = MIND_iter(hparams=hparams,mode='test',news_file=news_file_test,behaviors_file=behavior_file_test)

dataset_train[0], next(iter(dataset_test))

({'user_index': array([1]),
  'clicked_title': array([[3275,   10, 3445, 4246, 3208],
         [ 325,  690,   28,  512, 1042]]),
  'clicked_category': array([[66],
         [ 4]]),
  'clicked_subcategory': array([[208],
         [ 42]]),
  'candidate_title': array([[1540, 2502,    6,  155,  185],
         [2428, 1145,   96,  302,  838]]),
  'candidate_category': array([[4],
         [3]]),
  'candidate_subcategory': array([[15],
         [13]]),
  'labels': array([1, 0])},
 {'impression_index': 0,
  'user_index': array([1]),
  'clicked_title': array([[4357,   10, 3816, 4712, 2547],
         [ 689,  650,  609,   31, 1099]]),
  'clicked_category': array([[61],
         [ 3]]),
  'clicked_subcategory': array([[200],
         [ 13]]),
  'candidate_title': array([[ 259, 6983,    5, 1321, 1349]]),
  'candidate_category': array([[4]]),
  'candidate_subcategory': array([[15]]),
  'labels': array([0])})

In [14]:
loader_train = DataLoader(dataset_train,batch_size=hparams['batch_size'],shuffle=True,pin_memory=True,num_workers=3,drop_last=True)
loader_test = DataLoader(dataset_test,batch_size=hparams['batch_size'],pin_memory=True,num_workers=0,drop_last=True)
a = next(iter(loader_train))
b = next(iter(loader_test))
a,b

({'user_index': tensor([[1558]]),
  'clicked_title': tensor([[[29361,  2736,    50,   206,   468],
           [ 1406,  3240,   737,    83,  3442]]]),
  'clicked_category': tensor([[[66],
           [24]]]),
  'clicked_subcategory': tensor([[[220],
           [488]]]),
  'candidate_title': tensor([[[7725, 1350,  796, 1179,    6],
           [ 151,   12,    9,   12, 2227]]]),
  'candidate_category': tensor([[[24],
           [ 3]]]),
  'candidate_subcategory': tensor([[[488],
           [ 13]]]),
  'labels': tensor([[1, 0]])},
 {'impression_index': tensor([0]),
  'user_index': tensor([[1]]),
  'clicked_title': tensor([[[4357,   10, 3816, 4712, 2547],
           [ 689,  650,  609,   31, 1099]]]),
  'clicked_category': tensor([[[61],
           [ 3]]]),
  'clicked_subcategory': tensor([[[200],
           [ 13]]]),
  'candidate_title': tensor([[[ 259, 6983,    5, 1321, 1349]]]),
  'candidate_category': tensor([[[4]]]),
  'candidate_subcategory': tensor([[[15]]]),
  'labels': tensor([[0]])})

### tailor Data to demo size

In [None]:
tailorData('D:/Data/NR_data/MINDsmall_dev/behaviors.tsv',300)
tailorData('D:/Data/NR_data/MINDsmall_train/behaviors.tsv',300)

### Analyze MIND Datasets
- average title length
- average abstract length
- average history length
- average impression capacity
- count of history exceeding 50
- count of multi-clicked impressions 

In [10]:
avg_title_length = 0
avg_abstract_length = 0
avg_his_length = 0
avg_imp_length = 0
cnt_his_lg_50 = 0
cnt_imp_multi = 0

with open(news_file_train,"r",encoding='utf-8') as rd:
    count = 0
    for idx in rd:
        nid, vert, subvert, title, ab, url, _, _ = idx.strip("\n").split('\t')
        avg_title_length += len(title.split(' '))
        avg_abstract_length += len(ab.split(' '))
        count += 1
avg_title_length = avg_title_length/count
avg_abstract_length = avg_abstract_length/count

with open(behavior_file_train, "r", encoding='utf-8') as rd:
    count = 0
    for idx in rd:
        uid, time, history, impr = idx.strip("\n").split('\t')[-4:]
        his = history.split(' ')
        imp = impr.split(' ')
        if len(his) > 50:
            cnt_his_lg_50 += 1
        if len(imp) > 50:
            cnt_imp_multi += 1
        avg_his_length += len(his)
        avg_imp_length += len(imp)
        count += 1
avg_his_length = avg_his_length/count
avg_imp_length = avg_imp_length/count

In [11]:
print("avg_title_length:{}\n avg_abstract_length:{}\n avg_his_length:{}\n avg_impr_length:{}\n cnt_his_lg_50:{}\n cnt_imp_multi:{}".format(avg_title_length,avg_abstract_length,avg_his_length,avg_imp_length,cnt_his_lg_50,cnt_imp_multi))

avg_title_length:10.67731736385395
 avg_abstract_length:36.4448570331045
 avg_his_length:32.99787212887438
 avg_impr_length:37.40116394684935
 cnt_his_lg_50:447829
 cnt_imp_multi:567571
