# RHGN JD dataset process

In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
import dgl
import time
import pickle
import fasttext
import torch
from torch import nn
from utils import *
from utils import neighbormap,split_char,filter_sample,combination

Using backend: pytorch[06:29:49] /opt/dgl/src/runtime/tensordispatch.cc:43: TensorDispatcher: dlopen failed: /home/purificato/.local/lib/python3.8/site-packages/dgl/tensoradapter/pytorch/libtensoradapter_pytorch_1.10.1.so: cannot open shared object file: No such file or directory



In [2]:
def show_df_info(df):
    print(df.info())
    print('####### Repeat ####### \n', df.duplicated().any())
    print('####### Count ####### \n', df.nunique())
    print('####### Example ####### \n',df.head())


def label_statics(label_df, label_list):
    print("####### nCount #######")
    for label in label_list:
        print(label_df[label].value_counts())
    print("####### nPercent #######")
    for label in label_list:
        print(label_df[label].value_counts()/label_df.shape[0])

## Base paths

In [3]:
raw_data_path = '/home/purificato/papers_code/CIKM21_RHGN/jd_data'
save_path = './input_jd_data/orig'

## USER analysis

In [4]:
df_user = pd.read_csv(os.path.join(raw_data_path, "user"))
df_user.dropna(inplace=True)

show_df_info(df_user)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    100000 non-null  int64 
 1   gender     100000 non-null  int64 
 2   age_range  100000 non-null  object
dtypes: int64(2), object(1)
memory usage: 3.1+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 user_id      100000
gender            2
age_range        13
dtype: int64
####### Example ####### 
    user_id  gender age_range
0        0       0     21~25
1        1       0     31~35
2        2       0     26~30
3        3       1     46~50
4        4       1     31~35


In [5]:
age_dic = {'11~15':0, '16~20':0, '21~25':0, '26~30':1, '31~35':1, '36~40':2, '41~45':2, '46~50':3, '51~55':3, '56~60':4, '61~65':4, '66~70':4, '71~':4}

df_user[["age_range"]] = df_user[["age_range"]].applymap(lambda x:age_dic[x])
df_user.rename(columns={"user_id":"uid", "age_range":"age"}, inplace=True)

show_df_info(df_user)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   uid     100000 non-null  int64
 1   gender  100000 non-null  int64
 2   age     100000 non-null  int64
dtypes: int64(3)
memory usage: 3.1 MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid       100000
gender         2
age            5
dtype: int64
####### Example ####### 
    uid  gender  age
0    0       0    0
1    1       0    1
2    2       0    1
3    3       1    3
4    4       1    1


In [6]:
label_statics(df_user, df_user.columns[1:])

####### nCount #######
0    59500
1    40500
Name: gender, dtype: int64
1    50824
2    20847
0    19288
3     7100
4     1941
Name: age, dtype: int64
####### nPercent #######
0    0.595
1    0.405
Name: gender, dtype: float64
1    0.50824
2    0.20847
0    0.19288
3    0.07100
4    0.01941
Name: age, dtype: float64


### bin_age

In [7]:
# Add bin_age column
df_user["bin_age"] = df_user["age"]
df_user["bin_age"] = df_user["bin_age"].replace(1,0)
df_user["bin_age"] = df_user["bin_age"].replace(2,1)
df_user["bin_age"] = df_user["bin_age"].replace(3,1)
df_user["bin_age"] = df_user["bin_age"].replace(4,1)

label_statics(df_user, df_user.columns[1:])

####### nCount #######
0    59500
1    40500
Name: gender, dtype: int64
1    50824
2    20847
0    19288
3     7100
4     1941
Name: age, dtype: int64
0    70112
1    29888
Name: bin_age, dtype: int64
####### nPercent #######
0    0.595
1    0.405
Name: gender, dtype: float64
1    0.50824
2    0.20847
0    0.19288
3    0.07100
4    0.01941
Name: age, dtype: float64
0    0.70112
1    0.29888
Name: bin_age, dtype: float64


## ITEM analysis

In [8]:
df_item = pd.read_csv(os.path.join(raw_data_path, "item_info"))
df_item.dropna(inplace=True)
df_item.rename(columns={"item_id":"pid", "brand_code":"brand"}, inplace=True)
df_item.reset_index(drop=True, inplace=True)

show_df_info(df_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4730503 entries, 0 to 4730502
Data columns (total 11 columns):
 #   Column     Dtype  
---  ------     -----  
 0   pid        int64  
 1   cid1       int64  
 2   cid2       int64  
 3   cid3       int64  
 4   cid1_name  object 
 5   cid2_name  object 
 6   cid3_name  object 
 7   brand      float64
 8   price      float64
 9   item_name  object 
 10  seg_name   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 397.0+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 pid          4730503
cid1              49
cid2             454
cid3            4098
cid1_name         49
cid2_name        443
cid3_name       3801
brand         133439
price          49875
item_name    4686250
seg_name     4728800
dtype: int64
####### Example ####### 
             pid  cid1   cid2   cid3 cid1_name cid2_name cid3_name    brand  \
0  100000002008  1315   1345  12015      服饰内衣        内衣      秋衣秋裤  12002.0   
1  100000002009  6196   6

In [9]:
df_item = df_item.sample(frac=0.15, random_state=11)
df_item.reset_index(drop=True, inplace=True)

show_df_info(df_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 709575 entries, 0 to 709574
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   pid        709575 non-null  int64  
 1   cid1       709575 non-null  int64  
 2   cid2       709575 non-null  int64  
 3   cid3       709575 non-null  int64  
 4   cid1_name  709575 non-null  object 
 5   cid2_name  709575 non-null  object 
 6   cid3_name  709575 non-null  object 
 7   brand      709575 non-null  float64
 8   price      709575 non-null  float64
 9   item_name  709575 non-null  object 
 10  seg_name   709575 non-null  object 
dtypes: float64(2), int64(4), object(5)
memory usage: 59.6+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 pid          709575
cid1             46
cid2            412
cid3           3614
cid1_name        46
cid2_name       402
cid3_name      3382
brand         83674
price         20162
item_name    708363
seg_name     709516
dtype: int64


## CLICK analysis

In [10]:
df_click = pd.read_csv(os.path.join(raw_data_path, "user_click"), usecols=[0,1])
df_click.dropna(inplace=True)

df_click.rename(columns={"user_id":"uid", "item_id":"pid"}, inplace=True)
df_click.reset_index(drop=True, inplace=True)

show_df_info(df_click)

  exec(code_obj, self.user_global_ns, self.user_ns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52983323 entries, 0 to 52983322
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   uid     int64 
 1   pid     object
dtypes: int64(1), object(1)
memory usage: 808.5+ MB
None
####### Repeat ####### 
 True
####### Count ####### 
 uid      85177
pid    8335109
dtype: int64
####### Example ####### 
    uid       pid
0    0   1150551
1    0  11133236
2    0   6888601
3    0   5812383
4    0   4803330


In [11]:
df_click = df_click.sample(frac=0.15, random_state=11)
df_click.reset_index(drop=True, inplace=True)

show_df_info(df_click)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7947498 entries, 0 to 7947497
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   uid     int64 
 1   pid     object
dtypes: int64(1), object(1)
memory usage: 121.3+ MB
None
####### Repeat ####### 
 True
####### Count ####### 
 uid      81173
pid    2739082
dtype: int64
####### Example ####### 
      uid          pid
0  91559       851954
1  43488      5089253
2  44568     12367746
3  18466  26220866219
4  80782      8591502


In [12]:
df_click = df_click[df_click["uid"].isin(df_user["uid"])]
df_click = df_click[df_click["pid"].isin(df_item["pid"])]

df_click.drop_duplicates(inplace=True)
df_click.reset_index(drop=True, inplace=True)

show_df_info(df_click)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454909 entries, 0 to 454908
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   uid     454909 non-null  int64 
 1   pid     454909 non-null  object
dtypes: int64(1), object(1)
memory usage: 6.9+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid     41145
pid    188573
dtype: int64
####### Example ####### 
      uid          pid
0  29190  16237834468
1  56519      4596108
2  73751  21845916575
3  30253     11798319
4  88255      1228007


## Filter

In [13]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=True)
    count = playcount_groupbyid.size()
    return count

def filter_triplets(tp, user, item, min_uc=0, min_sc=0):
    # Only keep the triplets for users who clicked on at least min_uc items
    if min_uc > 0:
        usercount = get_count(tp, user)
        tp = tp[tp[user].isin(usercount.index[usercount >= min_uc])]
    
    # Only keep the triplets for items which were clicked on by at least min_sc users. 
    if min_sc > 0:
        itemcount = get_count(tp, item)
        tp = tp[tp[item].isin(itemcount.index[itemcount >= min_sc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, user), get_count(tp, item) 
    return tp, usercount, itemcount

### Filter "df_click" (item interactions >= 2)

In [14]:
# Before filtering
users = set(df_click.uid.tolist())
items = set(df_click.pid.tolist())

print(len(users), len(items))

41145 188573


In [15]:
df_click, uid_activity, pid_popularity = filter_triplets(df_click, 'uid', 'pid', min_uc=0, min_sc=2)

sparsity = 1. * df_click.shape[0] / (uid_activity.shape[0] * pid_popularity.shape[0])

print("After filtering, there are %d interaction events from %d users and %d items (sparsity: %.4f%%)" % 
      (df_click.shape[0], uid_activity.shape[0], pid_popularity.shape[0], sparsity * 100))

After filtering, there are 315970 interaction events from 38322 users and 49634 items (sparsity: 0.0166%)


In [16]:
# After filtering
users = set(df_click.uid.tolist())
items = set(df_click.pid.tolist())

print(len(users), len(items))

38322 49634


## Process

In [17]:
df_user = df_user[df_user['uid'].isin(users)]
df_item = df_item[df_item['pid'].isin(items)]
df_user.reset_index(drop=True, inplace=True)
df_item.reset_index(drop=True, inplace=True)

In [18]:
df_user = df_user.astype({"uid": "str"}, copy=False)
df_item = df_item.astype({'pid': 'str', 'cid1': 'str', 'cid2': 'str', 'cid3': 'str', 'brand': 'str'}, copy=False)
df_click = df_click.astype({'uid': 'str', 'pid': 'str'}, copy=False)

In [19]:
show_df_info(df_user)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38322 entries, 0 to 38321
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   uid      38322 non-null  object
 1   gender   38322 non-null  int64 
 2   age      38322 non-null  int64 
 3   bin_age  38322 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.2+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 uid        38322
gender         2
age            5
bin_age        2
dtype: int64
####### Example ####### 
   uid  gender  age  bin_age
0   8       1    2        1
1  15       0    3        1
2  18       1    1        0
3  19       1    1        0
4  21       0    2        1


In [20]:
# Build a dictionary and remove duplicate items
user_dic = {k: v for v,k in enumerate(df_user.uid)}
cid1_dic = {k: v for v, k in enumerate(df_item.cid1_name.drop_duplicates())}  
cid2_dic = {k: v for v, k in enumerate(df_item.cid2_name.drop_duplicates())}
cid3_dic = {k: v for v, k in enumerate(df_item.cid3_name.drop_duplicates())}
brand_dic = {k: v for v, k in enumerate(df_item.brand.drop_duplicates())}
item_dic = {}
c1, c2, c3, brand = [], [], [], []
for i in range(len(df_item)):
    k = df_item.at[i,'pid']
    v = i
    item_dic[k] = v
    c1.append(cid1_dic[df_item.at[i,'cid1_name']])
    c2.append(cid2_dic[df_item.at[i,'cid2_name']])
    c3.append(cid3_dic[df_item.at[i,'cid3_name']])
    brand.append(brand_dic[df_item.at[i,'brand']])

df_item.drop(columns=["cid1_name", "cid2_name", "cid3_name", "price", "item_name", "seg_name"], inplace=True)

show_df_info(df_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49634 entries, 0 to 49633
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   pid     49634 non-null  object
 1   cid1    49634 non-null  object
 2   cid2    49634 non-null  object
 3   cid3    49634 non-null  object
 4   brand   49634 non-null  object
dtypes: object(5)
memory usage: 1.9+ MB
None
####### Repeat ####### 
 False
####### Count ####### 
 pid      49634
cid1        41
cid2       297
cid3      2056
brand    15737
dtype: int64
####### Example ####### 
            pid   cid1   cid2   cid3     brand
0  28664028364   1316   1381   1391   14031.0
1  29553196788   1320   1581   2644  151523.0
2      5153158  16750  16754  16801   27179.0
3  11817573566   1315   1346   9789   35951.0
4      7472300  12218  12222  12243  240300.0


In [21]:
label_statics(df_user, df_user.columns[1:])

####### nCount #######
0    24587
1    13735
Name: gender, dtype: int64
1    18944
2     8919
0     6773
3     2954
4      732
Name: age, dtype: int64
0    25717
1    12605
Name: bin_age, dtype: int64
####### nPercent #######
0    0.64159
1    0.35841
Name: gender, dtype: float64
1    0.494337
2    0.232738
0    0.176739
3    0.077084
4    0.019101
Name: age, dtype: float64
0    0.671077
1    0.328923
Name: bin_age, dtype: float64


In [32]:
df_user['bin_age'] = df_user['bin_age'].replace(1,2)
df_user['bin_age'] = df_user['bin_age'].replace(0,1)
df_user['bin_age'] = df_user['bin_age'].replace(2,0)

label_statics(df_user, df_user.columns[1:])

####### nCount #######
0    24587
1    13735
Name: gender, dtype: int64
1    18944
2     8919
0     6773
3     2954
4      732
Name: age, dtype: int64
1    25717
0    12605
Name: bin_age, dtype: int64
####### nPercent #######
0    0.64159
1    0.35841
Name: gender, dtype: float64
1    0.494337
2    0.232738
0    0.176739
3    0.077084
4    0.019101
Name: age, dtype: float64
1    0.671077
0    0.328923
Name: bin_age, dtype: float64


### Save

In [33]:
df_user.to_csv(os.path.join(save_path, "user.csv"), index=False)
df_item.to_csv(os.path.join(save_path, "item.csv"), index=False)
df_click.to_csv(os.path.join(save_path, "click.csv"), index=False)

## Generate graph

In [34]:
u = {v:k for k,v in user_dic.items()}
i = {v:k for k,v in item_dic.items()}
pickle.dump(u, open(os.path.join("./input_jd_data", "attweight", "user_dic.pkl"), 'wb'))
pickle.dump(i, open(os.path.join("./input_jd_data", "attweight", "item_dic.pkl"), 'wb'))

In [35]:
click_user = [user_dic[user] for user in df_click.uid]
click_item = [item_dic[item] for item in df_click.pid]

In [36]:
data_dict = {
    ('user', 'click', 'item'): (torch.tensor(click_user), torch.tensor(click_item)),
    ('item', 'click-by', 'user'): (torch.tensor(click_item), torch.tensor(click_user))
}

In [37]:
G = dgl.heterograph(data_dict)

In [38]:
model = fasttext.load_model('/home/purificato/papers_code/CIKM21_RHGN/jd_data/fasttext/fastText/cc.zh.200.bin')

temp = {k: model.get_sentence_vector(v) for v, k in cid1_dic.items()}
cid1_feature = torch.tensor([temp[k] for _, k in cid1_dic.items()])

temp = {k: model.get_sentence_vector(v) for v, k in cid2_dic.items()}
cid2_feature = torch.tensor([temp[k] for _, k in cid2_dic.items()])

temp = {k: model.get_sentence_vector(v) for v, k in cid3_dic.items()}
cid3_feature = torch.tensor([temp[k] for _, k in cid3_dic.items()])

temp = {k: model.get_sentence_vector(v) for v, k in brand_dic.items()}
brand_feature = torch.tensor([temp[k] for _, k in brand_dic.items()])



In [39]:
# Passing labels into label
label_gender = df_user.gender
label_age = df_user.age
label_bin_age = df_user.bin_age

In [40]:
G.nodes['user'].data['gender'] = torch.tensor(label_gender[:G.number_of_nodes('user')])
G.nodes['user'].data['age'] = torch.tensor(label_age[:G.number_of_nodes('user')])
G.nodes['user'].data['bin_age'] = torch.tensor(label_bin_age[:G.number_of_nodes('user')])
G.nodes['item'].data['cid1'] = torch.tensor(c1[:G.number_of_nodes('item')])
G.nodes['item'].data['cid2'] = torch.tensor(c2[:G.number_of_nodes('item')])
G.nodes['item'].data['cid3'] = torch.tensor(c3[:G.number_of_nodes('item')])
G.nodes['item'].data['brand'] = torch.tensor(brand[:G.number_of_nodes('item')])

### Save

In [41]:
torch.save(G, os.path.join(save_path, "G_new.pkl"))
torch.save(cid1_feature, os.path.join(save_path, "cid1_feature.npy"))
torch.save(cid2_feature, os.path.join(save_path, "cid2_feature.npy"))
torch.save(cid3_feature, os.path.join(save_path, "cid3_feature.npy"))
torch.save(brand_feature, os.path.join(save_path, "brand_feature.npy"))