In [1]:
import pandas as pd
import numpy as np

In [None]:
user_behavior_path = "topk-off-policy-correction/DBRL/resources/user_behavior.csv"

In [3]:
behavior_df = pd.read_csv(
    user_behavior_path,
    header=None,
    names=["user", "item", "behavior", "time"],
)

In [4]:
behavior_df

Unnamed: 0,user,item,behavior,time
0,286924,13546315,pv,1124981
1,893324,19447070,cart,292579
2,893324,19447070,pv,296258
3,153813,1251077,pv,243310
4,1403217,19735607,pv,476795
...,...,...,...,...
80475447,297365,30558739,pv,570709
80475448,297365,30558739,pv,570795
80475449,297365,30558739,pv,571783
80475450,297365,30558739,pv,999896


In [6]:
behavior = behavior_df.sort_values(by="time").reset_index(drop=True)
behavior = behavior_df.drop_duplicates(subset=["user", "item", "behavior"])

In [15]:
user_counts = behavior.groupby("user")[["user"]].count().rename(
    columns={"user": "count_user"}
).sort_values("count_user", ascending=False)
user_counts

Unnamed: 0_level_0,count_user
user,Unnamed: 1_level_1
61807,668
190074,505
731080,476
104821,464
716818,463
...,...
1530309,1
1974022,1
655354,1
1076751,1


In [16]:
short_users = np.array(
    user_counts[
        (user_counts.count_user > 5) & (user_counts.count_user <= 50)
    ].index
)

long_users = np.array(
    user_counts[
        (user_counts.count_user > 50) & (user_counts.count_user <= 200)
    ].index
)

short_chosen_users = np.random.choice(short_users, 60000, replace=False)
long_chosen_users = np.random.choice(long_users, 20000, replace=False)
chosen_users = np.concatenate([short_chosen_users, long_chosen_users])
chosen_users

array([ 878704,  649565, 1530901, ...,  284393,  593974, 1839762])

In [20]:
behavior = behavior[behavior.user.isin(chosen_users)]
print(f"n_users: {behavior.user.nunique()}")
print(f"n_items: {behavior.item.nunique()}")
print(f"total behavior: {len(behavior)}")

n_users: 80000
n_items: 1044959
total behavior: 3227545


In [21]:
behavior = behavior.sort_values(by="time").reset_index(drop=True)
behavior

Unnamed: 0,user,item,behavior,time
0,816997,12870435,pv,254
1,1243360,8798725,pv,359
2,669390,30093359,pv,607
3,1263779,3180376,pv,661
4,975830,37930700,pv,941
...,...,...,...,...
3227540,451266,23027488,pv,1382395
3227541,1520584,23598296,pv,1382396
3227542,1877201,2554174,pv,1382396
3227543,1947955,33157392,pv,1382398


In [22]:
# behavior.to_csv("tianchi.csv", header=None, index=False)

In [35]:
unique_user = behavior.user.unique()
user_map = {str(uid): str(idx) for idx, uid in enumerate(unique_user)}
unique_item = behavior.item.unique()
item_map = {str(iid): str(idx) for idx, iid in enumerate(unique_item)}

In [36]:
user_map, item_map

({'816997': '0',
  '1243360': '1',
  '669390': '2',
  '1263779': '3',
  '975830': '4',
  '1843845': '5',
  '1651276': '6',
  '1385396': '7',
  '856500': '8',
  '1936599': '9',
  '725039': '10',
  '1416508': '11',
  '1881961': '12',
  '1133765': '13',
  '913241': '14',
  '958039': '15',
  '1923194': '16',
  '1207781': '17',
  '1249402': '18',
  '1566861': '19',
  '1859049': '20',
  '646545': '21',
  '793980': '22',
  '186874': '23',
  '743181': '24',
  '1131830': '25',
  '263697': '26',
  '622249': '27',
  '944197': '28',
  '1178916': '29',
  '1935699': '30',
  '807759': '31',
  '36452': '32',
  '413600': '33',
  '1245175': '34',
  '1301425': '35',
  '37615': '36',
  '292295': '37',
  '1120503': '38',
  '1364485': '39',
  '1063685': '40',
  '1697846': '41',
  '917897': '42',
  '1460493': '43',
  '369393': '44',
  '1778991': '45',
  '1454548': '46',
  '1974162': '47',
  '575471': '48',
  '1585385': '49',
  '875863': '50',
  '29797': '51',
  '1787452': '52',
  '1273524': '53',
  '1242160'