In [None]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import GloVe

from utils.preprocess import MINDIterator
from utils.utils import getVocab,getLoss,getLabel,constructBasicDict
from utils.utils import run_eval

In [None]:
# Hyper-parameters handed to MINDIterator and read by the model below.
hparams = {
    'mode':'small',
    'batch_size':5,
    'title_size':18,
    'his_size':50,
    'npratio':4,
    'dropout_p':0.2,
    'filter_num':150,
    'embedding_dim':300,
    'metrics':'group_auc,ndcg@4,mean_mrr',
    'gpu':'cuda:0',
    'load_mode':3
}

# customize your path here
news_file = r'D:\Data\NR_data\dev\news.tsv'
behavior_file_train = r'D:\Data\NR_data\dev\behaviors_train.tsv'
behavior_file_test = r'D:\Data\NR_data\dev\behaviors_test.tsv'

# if user2id,word2id,news2id hasn't been constructed
# (relies on `os`, which must be imported in the notebook's import cell)
if not os.path.exists('data/vocab_'+hparams['mode']+'.pkl'):
    constructBasicDict(news_file,behavior_file_train,hparams['mode'])

# use the configured GPU when CUDA is available, otherwise fall back to CPU
device = torch.device(hparams['gpu']) if torch.cuda.is_available() else torch.device("cpu")

iterator = MINDIterator(hparams=hparams)

# torchtext.vocab.Vocab object
vocab = iterator.word_dict
# take the vector size from hparams so the pretrained vectors always match
# the 'embedding_dim' the model is built with, instead of a second literal 300
embedding = GloVe(dim=hparams['embedding_dim'],cache='.vector_cache')
vocab.load_vectors(embedding)

In [None]:
class FIMModel(nn.Module):
    """FIM model (work in progress): only the news encoder is implemented here.

    The encoder looks up pretrained word vectors for a batch of token ids and
    passes them through three stacked 1d convolutions whose dilation rate
    grows from 1 to 3, each followed by layer normalization and ReLU.

    Args:
        hparam (dict): hyper-parameters. Required keys: 'npratio', 'dropout_p',
            'metrics', 'batch_size', 'title_size', 'his_size', 'filter_num',
            'embedding_dim'. Optional keys: 'kernel_size' (default 3, must be
            odd), 'user_dim', 'preference_dim' (default None) and 'gpu'
            (default 'cuda:0').
        vocab: object exposing the pretrained embedding matrix as
            ``vocab.vectors`` (indexed by token id).

    Raises:
        KeyError: if a required key is missing from ``hparam``.
        ValueError: if 'kernel_size' is even, because then the padding used
            here cannot keep the signal length unchanged.
    """
    def __init__(self,hparam,vocab):
        super().__init__()
        # read every setting from the argument, not from a notebook global
        self.npratio = hparam['npratio']
        self.dropout_p = hparam['dropout_p']
        self.metrics = hparam['metrics']

        self.batch_size = hparam['batch_size']
        self.title_size = hparam['title_size']
        self.his_size = hparam['his_size']

        self.filter_num = hparam['filter_num']
        self.embedding_dim = hparam['embedding_dim']
        self.kernel_size = hparam.get('kernel_size',3)
        # not used by the encoder yet, so they stay optional
        self.user_dim = hparam.get('user_dim')
        self.preference_dim = hparam.get('preference_dim')

        if self.kernel_size % 2 == 0:
            raise ValueError('kernel_size must be odd, got {}'.format(self.kernel_size))

        self.device = torch.device(hparam.get('gpu','cuda:0')) if torch.cuda.is_available() else torch.device('cpu')

        # pretrained embedding
        self.embedding = vocab.vectors
        # elements in the slice along dim will sum up to 1
        self.softmax = nn.functional.softmax

        # padding = dilation * (kernel_size - 1) / 2 keeps the signal length
        # unchanged at every level (equals the dilation rate for kernel_size 3)
        half_kernel = (self.kernel_size - 1) // 2
        self.CNN_d1 = nn.Conv1d(in_channels=self.embedding_dim,out_channels=self.filter_num,kernel_size=self.kernel_size,dilation=1,padding=1*half_kernel)
        self.CNN_d2 = nn.Conv1d(in_channels=self.filter_num,out_channels=self.filter_num,kernel_size=self.kernel_size,dilation=2,padding=2*half_kernel)
        self.CNN_d3 = nn.Conv1d(in_channels=self.filter_num,out_channels=self.filter_num,kernel_size=self.kernel_size,dilation=3,padding=3*half_kernel)

        self.ReLU = nn.ReLU()
        # one LayerNorm over the filter (channel) axis, shared by all levels
        self.LayerNorm = nn.LayerNorm(self.filter_num)

    def _dilated_block(self,conv,signal):
        """ apply one level of the stack: conv -> LayerNorm -> ReLU

        Args:
            conv: the nn.Conv1d of this level
            signal: tensor of [batch_size * channels * signal_length]

        Returns:
            tensor of [batch_size * filter_num * signal_length]
        """
        signal = conv(signal)
        # nn.LayerNorm normalizes the trailing axis, so move channels last and back
        signal = self.LayerNorm(signal.transpose(1,2)).transpose(1,2)
        return self.ReLU(signal)

    def _HDC(self,news_embedding_batch):
        """ stack 1d CNN with dilation rate expanding from 1 to 3

        Args:
            news_embedding_batch: tensor of [batch_size * embedding_dim * signal_length]

        Returns:
            dict: news_repr of in different lexical level(dilation rate),
                keys 'd1','d2','d3', each of [batch_size * filter_num * signal_length]
        """
        news_embedding_d1 = self._dilated_block(self.CNN_d1,news_embedding_batch)
        news_embedding_d2 = self._dilated_block(self.CNN_d2,news_embedding_d1)
        news_embedding_d3 = self._dilated_block(self.CNN_d3,news_embedding_d2)

        return {
            'd1':news_embedding_d1,
            'd2':news_embedding_d2,
            'd3':news_embedding_d3
            }

    def _news_encoder(self,news_set):
        """ encode set of news to news representation of [batch_size * filter_num * signal_length(title_size + category_length + subcategory_length)]

        Args:
            news_set: integer token ids of [batch_size * signal_length]
                (tensor or array-like)

        Returns:
            dict: output of _HDC, one representation per dilation level
        """
        # index on the device the embedding matrix lives on
        token_ids = torch.as_tensor(news_set,dtype=torch.long,device=self.embedding.device)
        # [batch, length, embedding_dim] -> [batch, embedding_dim, length] for Conv1d,
        # then follow the conv weights so the input is on the module's device
        news_embedding = self.embedding[token_ids].permute(0,2,1).to(self.CNN_d1.weight.device)
        news_embedding_stack = self._HDC(news_embedding)
        return news_embedding_stack

In [None]:
# Build the training-data iterator from the news file and the training
# behaviour file, then pull a single item from it with next() and display it
# to inspect what the iterator yields.
train = iterator.load_data_from_file(news_file,behavior_file_train)
record = next(train)
record

In [1]:
# os is needed for the vocabulary-cache check below; this cell is run on a
# fresh kernel, so it has to import everything it uses itself
import os

from utils.mind import MINDDataset
from utils.utils import news_token_generator_group,constructBasicDict
from torchtext.data.functional import simple_space_split,numericalize_tokens_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.data import Dataset
from torchtext.vocab import build_vocab_from_iterator

# Hyper-parameters handed to MINDDataset.
hparams = {
    'mode':'small',
    'batch_size':5,
    'title_size':18,
    'his_size':50,
    'npratio':4,
    'dropout_p':0.2,
    'filter_num':150,
    'embedding_dim':300,
    'metrics':'group_auc,ndcg@4,mean_mrr',
    'gpu':'cuda:0',
    'load_mode':3,
    'col_spliter':'\t'
}

# customize your path here
news_file = r'D:\Data\NR_data\dev\news.tsv'
behavior_file_train = r'D:\Data\NR_data\dev\behaviors_train.tsv'
behavior_file_test = r'D:\Data\NR_data\dev\behaviors_test.tsv'

# build the basic dictionaries only when the cached vocabulary file is missing
if not os.path.exists('data/vocab_'+hparams['mode']+'.pkl'):
    constructBasicDict(news_file,behavior_file_train,hparams['mode'],['title','category','subcategory'])

mindDS = MINDDataset(hparams=hparams)
tokenizer = get_tokenizer('basic_english')
vocab = mindDS.vocab

In [4]:
# Run the dataset's news initialisation on the news file. The arrays inspected
# in the following cells are presumably populated by this call — TODO confirm.
mindDS._init_news(news_file)

In [6]:
# Display the title array held by the dataset. Judging from the output below,
# rows look like integer token ids padded with trailing zeros — TODO confirm.
mindDS.news_title_array

array([[    0,     0,     0, ...,     0,     0,     0],
       [    8,  3518,  1288, ...,     0,     0,     0],
       [  370,   722,  3556, ...,     0,     0,     0],
       ...,
       [  239,    12, 25014, ...,     0,     0,     0],
       [   53,     8,  2699, ...,     0,     0,     0],
       [   70,     4,   146, ...,     0,     0,     0]])

In [7]:
# Check the shape of the dataset's subcategory array.
mindDS.subcategory_title_array.shape

(51282, 1)