In [5]:
import os
import sys
os.chdir('/home/peitian_zhang/Codes/NR')
sys.path.append('/home/peitian_zhang/Codes/NR')

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from utils.MIND import MIND_iter,MIND_map
from utils.utils import constructBasicDict

## show data

In [8]:
hparams = {
    'npratio':4,
    'mode':'demo',
    'batch_size':5,
    'his_size':2,
    'title_size':5,
    'gpu':'cuda:0',
    'attrs': ['title','category','subcategory']
}

# customize your path here
news_file_train = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_train/news.tsv'
news_file_test = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_dev/news.tsv'
news_file_pair = (news_file_train,news_file_test)

behavior_file_train = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_train/behaviors.tsv'
behavior_file_test = '/home/peitian_zhang/Data/MIND/MIND'+hparams['mode']+'_dev/behaviors.tsv'
behavior_file_pair = (behavior_file_train,behavior_file_test)

save_path = '/home/peitian_zhang/Codes/NR/models/model_param/NPA_'+ hparams['mode'] +'.model'

if not os.path.exists('data/dictionaries/vocab_{}_{}.pkl'.format(hparams['mode'],'_'.join(hparams['attrs']))):
    constructBasicDict(news_file_pair,behavior_file_pair,hparams['mode'],hparams['attrs'])

device = torch.device(hparams['gpu']) if torch.cuda.is_available() else torch.device("cpu")

dataset_train = MIND_map(hparams=hparams,news_file=news_file_train,behaviors_file=behavior_file_train)

dataset_test = MIND_iter(hparams=hparams,news_file=news_file_test,behaviors_file=behavior_file_test)

vocab = dataset_train.vocab
embedding = GloVe(dim=300,cache='.vector_cache')
vocab.load_vectors(embedding)

# vocab_test = dataset_test.vocab
# vocab_test.load_vectors(embedding)

loader_train = DataLoader(dataset_train,batch_size=hparams['batch_size'],shuffle=True,pin_memory=True,num_workers=3,drop_last=True)
loader_test = DataLoader(dataset_test,batch_size=hparams['batch_size'],pin_memory=True,num_workers=0,drop_last=True)

In [9]:
a = next(iter(loader_train))
b = next(iter(loader_test))
a,b

({'user_index': tensor([[ 233],
          [ 821],
          [1461],
          [1481],
          [  30]]),
  'clicked_title': tensor([[[ 2632,  2172,  1261,    11, 14704],
           [  450,    21, 31928,   116,  1925]],
  
          [[ 2366,  1217,  5202,    35,  5965],
           [  450,  1923,    40,  1387,    11]],
  
          [[  367, 11233,   365,   205,     5],
           [  747,   799,   665,   847,  4014]],
  
          [[    8,   429,    10,  6047,   471],
           [ 3965,   229,   752,    40,     9]],
  
          [[   15,   157,    91,  3080,     7],
           [   98,   528,   565,  1102,  2501]]]),
  'clicked_category': tensor([[[ 3],
           [ 3]],
  
          [[ 3],
           [ 4]],
  
          [[ 3],
           [ 3]],
  
          [[34],
           [28]],
  
          [[ 3],
           [28]]]),
  'clicked_subcategory': tensor([[[ 13],
           [ 43]],
  
          [[ 43],
           [ 45]],
  
          [[ 23],
           [ 23]],
  
          [[189],
        

### tailor Data to demo size

In [None]:
tailorData('D:/Data/NR_data/MINDsmall_dev/behaviors.tsv',300)
tailorData('D:/Data/NR_data/MINDsmall_train/behaviors.tsv',300)

### Analyze MIND Datasets
- average title length
- average abstract length
- average history length
- average impression capacity
- count of history exceeding 50
- count of multi-clicked impressions 

In [10]:
avg_title_length = 0
avg_abstract_length = 0
avg_his_length = 0
avg_imp_length = 0
cnt_his_lg_50 = 0
cnt_imp_multi = 0

with open(news_file_train,"r",encoding='utf-8') as rd:
    count = 0
    for idx in rd:
        nid, vert, subvert, title, ab, url, _, _ = idx.strip("\n").split('\t')
        avg_title_length += len(title.split(' '))
        avg_abstract_length += len(ab.split(' '))
        count += 1
avg_title_length = avg_title_length/count
avg_abstract_length = avg_abstract_length/count

with open(behavior_file_train, "r", encoding='utf-8') as rd:
    count = 0
    for idx in rd:
        uid, time, history, impr = idx.strip("\n").split('\t')[-4:]
        his = history.split(' ')
        imp = impr.split(' ')
        if len(his) > 50:
            cnt_his_lg_50 += 1
        if len(imp) > 50:
            cnt_imp_multi += 1
        avg_his_length += len(his)
        avg_imp_length += len(imp)
        count += 1
avg_his_length = avg_his_length/count
avg_imp_length = avg_imp_length/count

In [11]:
print("avg_title_length:{}\n avg_abstract_length:{}\n avg_his_length:{}\n avg_impr_length:{}\n cnt_his_lg_50:{}\n cnt_imp_multi:{}".format(avg_title_length,avg_abstract_length,avg_his_length,avg_imp_length,cnt_his_lg_50,cnt_imp_multi))

avg_title_length:10.67731736385395
 avg_abstract_length:36.4448570331045
 avg_his_length:32.99787212887438
 avg_impr_length:37.40116394684935
 cnt_his_lg_50:447829
 cnt_imp_multi:567571
