In [None]:
import os
from typing import Callable, Optional

import numpy as np
import torch

from torch_geometric.data import (
    HeteroData,
    InMemoryDataset,
    download_url,
    extract_zip,
)

class Taobao(InMemoryDataset):
    r"""Taobao is a dataset of user behaviors from Taobao offered by Alibaba,
    provided by the `Tianchi Alicloud platform
    <https://tianchi.aliyun.com/dataset/649>`_.

    Taobao is a heterogeneous graph for recommendation.
    Nodes represent users with user IDs, items with item IDs, and categories
    with category IDs.
    Edges between users and items represent different types of user behaviors
    towards items (along with timestamps).
    Edges between items and categories assign each item to its set of
    categories.

    The processed graph stores, on the :obj:`('user', 'item')` edge type, a
    :obj:`time` attribute (raw timestamps) and a :obj:`behavior` attribute
    encoded as :obj:`pv=0`, :obj:`cart=1`, :obj:`buy=2`, :obj:`fav=3`.

    Args:
        root (str): Root directory where the dataset should be saved.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            every access. (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)

    """
    url = ('https://alicloud-dev.oss-cn-hangzhou.aliyuncs.com/'
           'UserBehavior.csv.zip')

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super().__init__(root, transform, pre_transform)
        # Load the collated graph written by `process()`.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> str:
        """Name of the extracted raw CSV expected in the raw directory."""
        return 'UserBehavior.csv'

    @property
    def processed_file_names(self) -> str:
        """Name of the processed file written by `process()`."""
        return 'data.pt'

    def download(self) -> None:
        """Download the zipped CSV, extract it and delete the archive."""
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        os.remove(path)

    def process(self) -> None:
        """Turn the raw CSV into a `HeteroData` graph and save it to disk."""
        import pandas as pd

        cols = ['userId', 'itemId', 'categoryId', 'behaviorType', 'timestamp']
        df = pd.read_csv(self.raw_paths[0], names=cols)

        # Keep only events inside the closed window below. The readable dates
        # are wall-clock times in UTC+8:
        # start: 1511539200 = 2017.11.25-00:00:00
        # end:   1512316799 = 2017.12.03-23:59:59
        start = 1511539200
        end = 1512316799
        df = df[(df["timestamp"] >= start) & (df["timestamp"] <= end)]

        df = df.drop_duplicates()

        behavior_dict = {'pv': 0, 'cart': 1, 'buy': 2, 'fav': 3}
        df['behaviorType'] = df['behaviorType'].map(behavior_dict)

        num_entries = {}
        for name in ['userId', 'itemId', 'categoryId']:
            # Map IDs to consecutive integers. The column is passed as a 1-D
            # array so the inverse is 1-D as well; with a 2-D `(N, 1)` input
            # NumPy >= 2.0 returns an inverse of that same 2-D shape.
            unique_ids, inverse = np.unique(df[name].values,
                                            return_inverse=True)
            df[name] = inverse
            num_entries[name] = unique_ids.shape[0]

        data = HeteroData()

        data['user'].num_nodes = num_entries['userId']
        data['item'].num_nodes = num_entries['itemId']
        data['category'].num_nodes = num_entries['categoryId']

        # user -> item interactions with their timestamp and behavior code.
        row = torch.from_numpy(df['userId'].values)
        col = torch.from_numpy(df['itemId'].values)
        data['user', 'item'].edge_index = torch.stack([row, col], dim=0)
        data['user', 'item'].time = torch.from_numpy(df['timestamp'].values)
        behavior = torch.from_numpy(df['behaviorType'].values)
        data['user', 'item'].behavior = behavior

        # item -> category membership, one edge per distinct pair.
        df = df[['itemId', 'categoryId']].drop_duplicates()
        row = torch.from_numpy(df['itemId'].values)
        col = torch.from_numpy(df['categoryId'].values)
        data['item', 'category'].edge_index = torch.stack([row, col], dim=0)

        data = data if self.pre_transform is None else self.pre_transform(data)

        torch.save(self.collate([data]), self.processed_paths[0])

In [None]:
# Build the Taobao dataset rooted at ./Taobao/ (presumably the PyG base class
# triggers download()/process() when the files are missing — confirm against
# the installed PyG version).
data = Taobao(root='Taobao/')

In [None]:
# The dataset holds a single graph; `dataset` is that HeteroData object
# (process() collates exactly one graph).
dataset = data[0]

In [None]:
# Display the graph summary (node/edge types and attribute shapes).
dataset

In [None]:
import random

# Draw a reproducible random subset of item ids to build a smaller graph.
# The id range comes from the dataset itself instead of a hard-coded item
# count, and a dedicated seeded generator makes the sample identical on every
# Restart & Run All.
NUM_SAMPLED_ITEMS = 250_000
SAMPLE_SEED = 0
items = random.Random(SAMPLE_SEED).sample(
    range(int(dataset['item'].num_nodes)), NUM_SAMPLED_ITEMS)

In [None]:
from torch_geometric.utils import subgraph

In [None]:
# Pack timestamp and behavior code side by side as a two-column edge_attr
# (column 0: time, column 1: behavior) so that `subgraph` filters them
# together with the edges.
user_item_store = dataset['user', 'to', 'item']
user_item_store.edge_attr = torch.stack(
    (user_item_store.time, user_item_store.behavior),
    dim=1,
)

In [None]:
# Restrict the user->item edges (and their edge_attr) to the sampled ids.
# The subset is built directly as int64: going through a float32
# `torch.Tensor` first is only exact for ids below 2**24.
# NOTE(review): `subgraph` appears to apply the same subset to both rows of
# edge_index, so user ids are matched against the sampled item ids as well —
# confirm this is intended (bipartite_subgraph would filter the item side only).
sub_uti = subgraph(
    torch.tensor(items, dtype=torch.long),
    dataset['user', 'to', 'item'].edge_index,
    dataset['user', 'to', 'item'].edge_attr,
)

In [None]:
# Restrict the item->category edges to the sampled ids.
# The subset is built directly as int64: going through a float32
# `torch.Tensor` first is only exact for ids below 2**24.
# NOTE(review): `subgraph` appears to apply the same subset to both rows of
# edge_index, so category ids are matched against the sampled item ids too —
# confirm this is intended.
sub_itc = subgraph(
    torch.tensor(items, dtype=torch.long),
    dataset['item', 'to', 'category'].edge_index,
)

In [None]:
# Largest node id still referenced per node type; these size the feature
# matrices of the reduced graph. Items appear in both edge types, so their
# bound is the larger of the two.
uti_edge_index, itc_edge_index = sub_uti[0], sub_itc[0]
max_user_id = uti_edge_index[0].max()
max_items_id = max(
    int(uti_edge_index[1].max()),
    int(itc_edge_index[0].max()),
)
max_category_id = itc_edge_index[1].max()

In [None]:
from torch_geometric.transforms import RemoveIsolatedNodes

In [None]:
# Assemble the reduced heterogeneous graph.
subdata = HeteroData()

# Constant one-dimensional node features: one row of [1.] per node id up to
# the largest id in use.
for node_type, max_id in (
    ('user', max_user_id),
    ('item', max_items_id),
    ('category', max_category_id),
):
    subdata[node_type].x = torch.ones((int(max_id) + 1, 1))

# user -> item edges, with edge_attr split back into its two columns
# (timestamp, behavior code).
subdata['user', 'to', 'item'].edge_index = sub_uti[0]
t, behavior = sub_uti[1].unbind(dim=1)
subdata['user', 'to', 'item'].t = t
subdata['user', 'to', 'item'].behavior = behavior

# item -> category edges.
subdata['item', 'to', 'category'].edge_index = sub_itc[0]

# Drop nodes that lost all their edges in the sub-sampling.
subdata = RemoveIsolatedNodes()(subdata)

In [None]:
# Map behavior codes to relation names. The codes must mirror `behavior_dict`
# in `Taobao.process` ('pv': 0, 'cart': 1, 'buy': 2, 'fav': 3); the previous
# mapping had 'buy' and 'cart' swapped, which stored purchases under the cart
# relation and vice versa.
behaviors = {
    0: 'pageview',
    1: 'cart',
    2: 'buy',
    3: 'fav',
}

In [None]:
# Timestamps have one-second resolution. The user->item edges are grouped into
# fixed-width windows of `split` seconds spanning the whole observed range;
# each window becomes one HeteroData snapshot.

min_t = torch.min(subdata['user','to','item'].t)
max_t = torch.max(subdata['user','to','item'].t)


edge_index = subdata['user','to','item'].edge_index
behavior = subdata['user','to','item'].behavior
timestamps = subdata['user','to','item'].t

# torch.save does not create missing directories.
os.makedirs('TAOBAO-5', exist_ok=True)

# Static parts of the graph: node features and the item->category edges.
torch.save(subdata['user'].x, 'TAOBAO-5/user.pt')
torch.save(subdata['category'].x,'TAOBAO-5/category.pt')
torch.save(subdata['item'].x, 'TAOBAO-5/item.pt')
torch.save(subdata['item','to','category'].edge_index, 'TAOBAO-5/itc_edge_index.pt')

split = 60*5 #snap every 5min
#end = 86400 #process the first day
end = int(max_t - min_t) #process all the snap (offset of the last event from min_t, as a plain int)

# One HeteroData per window; the +1 makes room for the window that contains
# the offset `end` itself.
snapshots = [HeteroData() for _ in range((end // split) + 1)]
for snap in snapshots:
    for relation in behaviors.values():
        # Empty placeholder so every relation exists in every snapshot.
        # Edge indices are integer node ids, hence int64.
        snap['user', relation, 'item'].edge_index = torch.empty((2, 0), dtype=torch.long)

In [None]:
# Distribute every user->item edge into the snapshot covering its timestamp
# and the relation matching its behavior code.
#
# This is done in one vectorised pass instead of growing each edge_index with
# a torch.cat per edge (quadratic). Edge indices are stored as int64 node ids.
user_item = subdata['user', 'to', 'item']

# Start from empty buckets so re-running the cell never accumulates duplicates
# and every bucket has the same dtype.
for snap in snapshots:
    for relation in behaviors.values():
        snap['user', relation, 'item'].edge_index = torch.empty((2, 0), dtype=torch.long)

offsets = (user_item.t - min_t).long().numpy()   # seconds since the first event
codes = user_item.behavior.long().numpy()

# `snapshots` has (end // split) + 1 entries, i.e. a slot for the offset `end`
# itself, so the bound is inclusive; only offsets beyond `end` are skipped.
kept_idx = np.nonzero(offsets <= int(end))[0]
kept_codes = codes[kept_idx]

unknown_codes = set(np.unique(kept_codes).tolist()) - set(behaviors)
if unknown_codes:
    raise KeyError(f'behavior codes without a relation name: {sorted(unknown_codes)}')

# Give each (snapshot, behavior) pair one integer key, then sort stably so the
# edges of a bucket keep their original relative order.
num_codes = max(behaviors) + 1
group_key = (offsets[kept_idx] // split) * num_codes + kept_codes
order = np.argsort(group_key, kind='stable')
keys, counts = np.unique(group_key, return_counts=True)

sorted_edges = user_item.edge_index.long()[:, torch.from_numpy(kept_idx[order])]

start = 0
for key, count in zip(keys.tolist(), counts.tolist()):
    slot, code = divmod(key, num_codes)
    bucket = sorted_edges[:, start:start + count].clone()
    snapshots[slot]['user', behaviors[code], 'item'].edge_index = bucket
    start += count

print(f'processed {kept_idx.size} edges')

In [None]:
# Number of time windows (snapshots) that were created.
len(snapshots)

In [None]:
# Write one file per (snapshot index, edge type). The edge type is a tuple, so
# its string form ends up verbatim in the file name.
for i, snapshot in enumerate(snapshots):
    per_relation = snapshot.edge_index_dict
    for edge_t in per_relation:
        torch.save(per_relation[edge_t], f'TAOBAO-5/{i}_{edge_t}.pt')