In [1]:
from torch.utils.data import Dataset
import pandas as pd
from ast import literal_eval
from os import path
import numpy as np
from config_nghia import model_name
import importlib
import torch
from torch.utils.data import DataLoader

In [2]:
try:
    config = getattr(importlib.import_module('config_nghia'), f"{model_name}Config")
except AttributeError:
    print(f"{model_name} not included!")
    exit() # load the config

In [43]:
class BaseDataset(Dataset):
    def __init__(self, behaviors_path , news_path):
        ''' 
        behaviors_path: path to behaviors.tsv
        news_path: path to news.tsv
        '''
        super().__init__()
        self.behavior_parsed = pd.read_table(behaviors_path) 
        self.news_parsed = pd.read_table(
            news_path,
            index_col='id', #neu them id vao thi se try suat bang id chu khong 
            usecols= ['id' , 'category' , 'subcategory' , 'title' , 'abstract'],
            #"news": ['category', 'subcategory', 'title', 'abstract'],
            converters={attribute: literal_eval
                for attribute in ['category', 'subcategory', 'title', 'abstract']
            }
        )
        self.id2int = { id : i  for i , id in enumerate(self.news_parsed.index)}
        self.news2dict = self.news_parsed.to_dict('index')
        for id in self.news2dict:
            for name_col in self.news2dict[id]:
                self.news2dict[id][name_col] = torch.tensor(self.news2dict[id][name_col])
        # create padding 
        padding = {
            'category' : 0,
            'subcategory' : 0,
            'title' : [0] * config.num_words_title,
            'abstract' : [0] * config.num_words_abstract,
        }
        for index in padding:
            padding[index] = torch.tensor(padding[index])
        self.padding = padding
                
    def __len__(self):
        return len(self.behavior_parsed)
    def __getitem__(self, index):
        # print(self.behavior_parsed.iloc[index])
        item = {}
        row = self.behavior_parsed.iloc[index]
        item['clicked'] = list(map(int , row.clicked.split()))
        item['clicked_news']  = [self.news2dict[id] if id in self.news2dict else self.padding
                                 for id in row.candidate_news.split()]
        print(self.news_parsed.iloc[index])
        return (self.behavior_parsed.user_id.iloc[index], self.news_parsed.category.iloc[index])
        

In [46]:
behavior_path = './save_process/behaviors_processed.tsv'
news_path = './save_process/news_processed.tsv'
dataset = BaseDataset(behavior_path, news_path)
train_loader = DataLoader(dataset, batch_size=1, shuffle=False)
for i, data in enumerate(train_loader):
    print(data) 
    break

category                                                       1
subcategory                                                    2
title          [1, 2, 3, 4, 5, 6, 7, 5, 8, 6, 0, 0, 9, 0, 0, ...
abstract       [10, 1, 0, 5, 0, 5, 8, 11, 12, 1, 13, 14, 15, ...
Name: N55528, dtype: object
[tensor([0]), tensor([1])]


In [55]:
df = pd.read_table(
            news_path,
            index_col='id', #neu them id vao thi se try suat bang id chu khong 
            usecols= ['id' , 'category' , 'subcategory' , 'title' , 'abstract'], #"news": ['category', 'subcategory', 'title', 'abstract'],
            converters={attribute: literal_eval
                for attribute in ['category', 'subcategory', 'title', 'abstract']
            }
        )
df.head()
id2int = {id: i for i , id in enumerate(df.index)}
news2dict = df.to_dict('index')

In [67]:
for index in news2dict:
    for name_col in news2dict[index].keys(): 
        # print(name_col)
        news2dict[index][name_col] = torch.tensor(news2dict[index][name_col])
        print(news2dict[index][name_col]) # attribute of dictitonnary
        
    # print(index)
    break

tensor(1)
tensor(2)
tensor([1, 2, 3, 4, 5, 6, 7, 5, 8, 6, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0])
tensor([10,  1,  0,  5,  0,  5,  8, 11, 12,  1, 13, 14, 15, 16, 17, 18,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])


  after removing the cwd from sys.path.


In [57]:
behavior_parsed = pd.read_table(behavior_path) 
# behavior_parsed.iloc[1]
row = behavior_parsed.iloc[1]
row

user_id                                                           1
history           N58715 N32109 N51180 N33438 N54827 N28488 N611...
candidate_news                                 N23513 N31958 N46976
clicked                                                       1 0 0
Name: 1, dtype: object

In [61]:
padding = {
            'category' : 0,
            'subcategory' : 0,
            'title' : [0] * config.num_words_title,
            'abstract' : [0] * config.num_words_abstract,
        }
list_temp = [
    news2dict[x] if x in news2dict else padding
    for x in row.candidate_news.split()]


In [66]:
list(map(int , row.clicked.split()))

[1, 0, 0]