In [1]:
import os
import sys
# Move the working directory one level up and extend the import path before the
# project import below is resolved.
# NOTE(review): the chdir is relative, so re-running this cell moves up one more
# directory each time; run it exactly once per kernel.
os.chdir('../')
sys.path.append('../')

import torch
from utils.utils import prepare,analyse

# Hyper-parameter dictionary shared by the cells below (passed to prepare,
# analyse and MIND_news).
hparams = {
    'npratio':4,
    # 'scale' is interpolated into the dictionary file names loaded by MIND_news
    'scale':'demo',
    'batch_size':10,
    'his_size':50,
    # titles are truncated / zero-padded to this many tokens in MIND_news
    'title_size':15,
    # NOTE(review): 'cuda:1' addresses the second GPU; adjust on machines with fewer devices
    'device':'cuda:1',
    # 'attrs' is joined with '_' to form the vocabulary file name in MIND_news
    'attrs': ['title'],
    'news_id':True,
    'k': ''
}

# Make the configured device the current CUDA device for this process.
device = torch.device(hparams['device'])
torch.cuda.set_device(device)

## Show data

In [2]:
# prepare() yields three values; the first is discarded and the remaining two are
# kept as the train and test loaders.
_, loader_train, loader_test = prepare(hparams)

In [None]:
# Pull the first batch out of the test loader for inspection.
a = next(iter(loader_test))

In [None]:
# Display the batch taken from loader_test.
a

### Analyze MIND Datasets
- average title length
- average abstract length
- average history length
- average impression capacity
- count of history exceeding 50
- count of empty history
- count of multi-clicked impressions 

In [None]:
# Run the analysis on a derived configuration instead of editing `hparams` in place:
# overwriting hparams['scale'] here would leak into every later cell (e.g. MIND_news
# builds its dictionary paths from hparams['scale']) and make results depend on
# which cells were executed.
# NOTE(review): this is a shallow copy, so nested values such as 'attrs' are shared.
analysis_hparams = {**hparams, 'scale': 'large', 'mode': 'tra'}
analyse(analysis_hparams)

### Tailor data to demo size

In [None]:
# NOTE(review): tailorData is not imported in any visible cell, so it must already be
# in the kernel namespace for this cell to run (presumably from utils.utils; confirm).
# Presumably trims each behaviors file to the given number of rows (TODO confirm).
tailorData('/home/peitian_zhang/Data/MIND/MINDsmall_dev/behaviors.tsv',500)
tailorData('/home/peitian_zhang/Data/MIND/MINDsmall_train/behaviors.tsv',2000)

In [None]:
# json is needed by this cell; importing it here keeps the cell runnable on a fresh
# kernel instead of relying on a later cell having been executed first.
import json

# Give every distinct news id a 1-based index, in order of first appearance.
nid2index = {}
with open('/home/peitian_zhang/Data/MIND/MINDlarge_test/news.tsv','r',encoding='utf-8') as f:
    for line in f:
        # each row must have exactly eight tab-separated columns; only the id is used
        nid,_,_,_,_,_,_,_ = line.strip("\n").split('\t')

        if nid not in nid2index:
            nid2index[nid] = len(nid2index) + 1

# NOTE(review): the input is the MINDlarge_test news file but the output is named
# "..._large_dev.json"; confirm the intended split.
with open('/home/peitian_zhang/Codes/News-Recommendation/data/dictionaries/nid2idx_large_dev.json','w',encoding='utf-8') as h:
    json.dump(nid2index,h,ensure_ascii=False)

In [None]:
import json

# Give every distinct user id a 1-based index, in order of first appearance.
uid2index = {}
# `with` closes the file even when a malformed row makes the unpacking below raise.
with open('/home/peitian_zhang/Data/MIND/MINDlarge_test/behaviors.tsv','r',encoding='utf-8') as f:
    for line in f:
        # each row must have exactly five tab-separated columns; only the user id is used
        _,uid,_,_,_ = line.strip("\n").split('\t')

        if uid not in uid2index:
            uid2index[uid] = len(uid2index) + 1

# NOTE(review): the input is the MINDlarge_test behaviors file but the output is named
# "..._large_dev.json"; confirm the intended split.
with open('/home/peitian_zhang/Codes/News-Recommendation/data/dictionaries/uid2idx_large_dev.json','w',encoding='utf-8') as h:
    json.dump(uid2index,h,ensure_ascii=False)

In [None]:
# NOTE(review): constructVocab is not imported in any visible cell; it must already be
# in the kernel namespace (presumably from utils.utils; confirm).
# NOTE(review): the inputs are the MINDlarge dev/test/train news files, yet the output
# file is named "vocab_demo_..."; confirm the intended scale.
constructVocab(['/home/peitian_zhang/Data/MIND/MINDlarge_dev/news.tsv','/home/peitian_zhang/Data/MIND/MINDlarge_test/news.tsv','/home/peitian_zhang/Data/MIND/MINDlarge_train/news.tsv'], '/home/peitian_zhang/Codes/News-Recommendation/data/dictionaries/vocab_demo_title_category_subcategory.pkl', ['title','category','subcategory'])

In [None]:
from utils.utils import constructNid2idx

In [None]:
# Presumably builds the news-id -> index dictionary for the 'small' scale from the
# train and dev news files (behaviour lives in utils.utils; confirm there).
constructNid2idx('/home/peitian_zhang/Data/MIND/MINDsmall_train/news.tsv','/home/peitian_zhang/Data/MIND/MINDsmall_dev/news.tsv','small')

In [None]:
# Same call for the 'large' scale, using the MINDlarge train and dev news files.
constructNid2idx('/home/peitian_zhang/Data/MIND/MINDlarge_train/news.tsv','/home/peitian_zhang/Data/MIND/MINDlarge_dev/news.tsv','large')


In [None]:
# Same call for the 'demo' scale, using the MINDdemo train and dev news files.
constructNid2idx('/home/peitian_zhang/Data/MIND/MINDdemo_train/news.tsv','/home/peitian_zhang/Data/MIND/MINDdemo_dev/news.tsv','demo')

In [5]:
import numpy as np
from torch.utils.data import Dataset,IterableDataset, DataLoader, get_worker_info
from utils.utils import newsample,getId2idx,word_tokenize_vocab,getVocab

In [11]:
class MIND_news(Dataset):
    """ Map style dataset over the rows of a news file

    Every item is one news row, returned as its tokenized title truncated or
    zero-padded to ``title_size``. The news file is parsed lazily, on the first
    ``len()`` call or the first item access.

    Args:
        hparams(dict): pre-defined dictionary of hyper parameters; the keys read here
            are npratio, batch_size, title_size, his_size, attrs, k and scale
        news_file(str): path of news_file
        col_spliter(str): column delimiter used in news_file
    """
    def __init__(self,hparams,news_file,col_spliter='\t'):
        # store the configuration; the news file itself is read in init_news
        self.npratio = hparams['npratio']
        self.news_file = news_file
        self.col_spliter = col_spliter
        self.batch_size = hparams['batch_size']
        self.title_size = hparams['title_size']
        self.his_size = hparams['his_size']
        self.attrs = hparams['attrs']
        self.k = hparams['k']

        # dictionary files are located by scale (and, for the vocabulary, by attrs)
        self.vocab = getVocab('data/dictionaries/vocab_{}_{}.pkl'.format(hparams['scale'],'_'.join(hparams['attrs'])))
        self.nid2index = getId2idx('data/dictionaries/nid2idx_{}_train.json'.format(hparams['scale']))
        self.uid2index = getId2idx('data/dictionaries/uid2idx_{}.json'.format(hparams['scale']))

    def __len__(self):
        """ number of news rows; parses the news file on first use """
        if not hasattr(self, "news_title_array"):
            self.init_news()

        return len(self.news_title_array)

    def init_news(self):
        """ read the news file and build the per-news arrays

        Sets:
            news_title_array: one row per news, title tokens truncated / zero-padded
                to title_size
            title_pad: one row per news holding the number of padded positions;
                row i describes row i of news_title_array
        """
        title_token = []
        # no sentinel row here: title_pad must stay index-aligned with title_token
        title_pad = []

        with open(self.news_file,"r",encoding='utf-8') as rd:
            for line in rd:
                line = line.strip("\n")
                # tolerate blank lines (e.g. an extra newline at the end of the file)
                if not line:
                    continue

                # a row must have exactly eight columns; only the title is used
                nid, vert, subvert, title, ab, url, _, _ = line.split(self.col_spliter)

                title = word_tokenize_vocab(title,self.vocab)
                # a negative repeat count gives [], so over-long titles are only truncated
                title_token.append(title[:self.title_size] + [0] * (self.title_size - len(title)))
                title_pad.append([max(self.title_size - len(title), 0)])

        self.news_title_array = np.asarray(title_token)
        self.title_pad = np.asarray(title_pad)

    def __getitem__(self, idx):
        """ fetch the padded title of news No.idx

        Args:
            idx (int): row index in news_file order, start from zero

        Returns:
            dict with a single key "candidate_title" holding the title token row
        """
        if not hasattr(self, "news_title_array"):
            self.init_news()

        return {
            "candidate_title": self.news_title_array[idx]
        }

In [12]:
# Build the dataset on the demo training news file; the file itself is only parsed
# on the first len() call or item access.
mind = MIND_news(hparams,'/home/peitian_zhang/Data/MIND/MINDdemo_train/news.tsv')

In [13]:
# Look at the second news row (index 1) through normal subscripting.
mind[1]

{'candidate_title': array([ 345,  698, 3561,    9, 3803, 3033,    0,    0,    0,    0,    0,
           0,    0,    0,    0])}