In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from datetime import timedelta

import pandas as pd
from feast import FeatureStore
from loguru import logger
from pydantic import BaseModel

from negative_sampling import generate_negative_samples

In [3]:
class Args(BaseModel):
    testing: bool = False
    run_name: str = "000-sequence-modeling"
    notebook_persist_dp: str = None
    random_seed: int = 41

    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "event_timestamp"

    neg_to_pos_ratio: int = 1

    def init(self):
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


args = Args().init()

print(args.model_dump_json(indent=2))

{
  "testing": false,
  "run_name": "000-sequence-modeling",
  "notebook_persist_dp": "/home/duong/Documents/datn1/src/feature_engineer/data/000-sequence-modeling",
  "random_seed": 41,
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "event_timestamp",
  "neg_to_pos_ratio": 1
}


In [None]:
pvc_path = os.getenv("PVC_PATH")
# pvc_path = "/home/duong/Documents/datn1/data"
if not pvc_path:
    raise ValueError("PVC_PATH environment variable not set")
train_features_path = f"{pvc_path}/train_features.parquet"
val_feature_path = f"{pvc_path}/val_features.parquet"
full_features_df_path = f"{pvc_path}/full_features_neg_sampling_df.parquet"
train_neg_df_path = f"{pvc_path}/train_features_neg_df.parquet"
val_neg_df_path = f"{pvc_path}/val_features_neg_df.parquet"

In [5]:
train_df = pd.read_parquet(train_features_path)
val_df = pd.read_parquet(val_feature_path)

assert val_df[args.timestamp_col].min() > train_df[args.timestamp_col].max()
val_timestamp = train_df[args.timestamp_col].max() + timedelta(seconds=1)
logger.info(f"{val_timestamp=}")

[32m2025-06-28 17:30:01.347[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mval_timestamp=Timestamp('2021-08-11 00:12:17.369000')[0m


In [6]:
full_df = pd.concat([train_df, val_df], axis=0)
full_df

Unnamed: 0,user_id,parent_asin,rating,event_timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,price,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,item_sequence
0,AF2UBRLFZTLECH44DEMVKDWS7Z5A,B00DQC2FPM,5.0,2015-01-02 17:54:49.000,1420221289,10,4.700000,4,5.000000,2,...,187.5,3195,1261,3,5.0,"B09QM5JMCD,B09PH8Z5R8,B000084JMC,B09NPJGN9N","2014-01-04T01:00:02.000Z,2015-01-03T00:39:56.0...","[-1, -1, -1, -1, -1, -1, 1388772002, 142022039...","[-1, -1, -1, -1, -1, -1, 5, 1, 1, 1]","[-1, -1, -1, -1, -1, -1, 3587, 3558, 102, 3535]"
1,AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,B0BFGWHBB5,5.0,2015-06-14 19:21:09.000,1434309669,14,4.785714,5,4.600000,1,...,83.27,2759,3767,0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4143]"
2,AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,B09NPJGN9N,5.0,2015-06-14 19:21:11.000,1434309671,13,5.000000,7,5.000000,1,...,9.69,2759,3535,1,5.0,B0BFGWHBB5,2015-06-15T02:21:09.000Z,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1434309669]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 0]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3767]"
3,AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,B08GGHG8S6,5.0,2015-06-14 19:21:18.000,1434309678,30,4.666667,7,4.714286,3,...,,2759,3198,2,5.0,"B0BFGWHBB5,B09NPJGN9N","2015-06-15T02:21:09.000Z,2015-06-15T02:21:11.000Z","[-1, -1, -1, -1, -1, -1, -1, -1, 1434309669, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, 0]","[-1, -1, -1, -1, -1, -1, -1, -1, 3767, 3535]"
4,AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,B006EFMSSM,5.0,2015-06-14 19:21:22.000,1434309682,24,4.833333,4,5.000000,2,...,,2759,862,3,5.0,"B0BFGWHBB5,B09NPJGN9N,B08GGHG8S6","2015-06-15T02:21:09.000Z,2015-06-15T02:21:11.0...","[-1, -1, -1, -1, -1, -1, -1, 1434309669, 14343...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]","[-1, -1, -1, -1, -1, -1, -1, 3767, 3535, 3198]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,AFKQD5SINKETJBCBOTWOCWMSOA5Q,B0979WX1W4,5.0,2021-12-13 05:43:30.735,1639374210,0,,0,,0,...,,4754,3432,0,,"B0033BM3K8,B0BT9H7VF3,B01ASVCZ44,B01N9KR2SZ,B0...","2015-05-28T23:08:50.000Z,2016-12-11T20:26:20.0...","[1432829330, 1481462780, 1494592575, 151508893...","[8, 8, 7, 7, 7, 7, 6, 6, 6, 5]","[569, 3911, 2098, 2342, 1859, 1340, 1305, 4073..."
857,AFJCPOPHR46UYP7S4YR4YXYMZHQA,B08XQMJCXL,5.0,2022-03-07 23:43:01.126,1646696581,2,5.000000,0,,0,...,39.95,4612,3334,0,,"B000W3TD4Y,B09M7XZ33P,B084K4J39K,B01N64HQ1X,B0...","2010-12-26T21:58:56.000Z,2011-12-13T07:56:13.0...","[-1, -1, -1, -1, -1, 1293375536, 1323737773, 1...","[-1, -1, -1, -1, -1, 9, 9, 8, 7, 6]","[-1, -1, -1, -1, -1, 363, 3508, 3004, 2336, 1562]"
858,AGNNDSTERX7WEWRHQGBVUW5EORUQ,B0BHT45FW9,5.0,2022-01-20 01:23:03.152,1642641783,6,4.166667,1,5.000000,1,...,17.98,8030,3790,0,,"B09NXVL2P2,B06XRGBBXP,B00IL7IFOM,B004K6KM8K,B0...","2018-06-07T09:41:33.364Z,2018-11-16T02:25:15.8...","[-1, -1, 1528339293, 1542309915, 1562260149, 1...","[-1, -1, 7, 7, 6, 6, 6, 6, 6, 6]","[-1, -1, 3542, 2367, 1450, 734, 827, 586, 1008..."
859,AH2SXPFOHZKKPEPBTFL7K4ZLUVVQ,B078X1Q2HC,5.0,2021-10-05 13:01:25.980,1633438885,3,5.000000,0,,0,...,49.36,9325,2499,0,,"B00B2B051A,B01L8JF64G,B0BFXK2HJW,B012CRQ7S2,B0...","2015-10-27T21:10:20.000Z,2015-10-27T21:10:58.0...","[-1, -1, -1, 1445955020, 1445955058, 148536225...","[-1, -1, -1, 8, 8, 7, 7, 7, 7, 7]","[-1, -1, -1, 1096, 2261, 3776, 1972, 2741, 193..."


## Generate negative samples


In [7]:
full_features_df = full_df

In [8]:
meta_features = ["main_category", "categories", "price"]

item_timestamp_features = [
    "parent_asin_rating_cnt_365d",
    "parent_asin_rating_avg_prev_rating_365d",
    "parent_asin_rating_cnt_90d",
    "parent_asin_rating_avg_prev_rating_90d",
    "parent_asin_rating_cnt_30d",
    "parent_asin_rating_avg_prev_rating_30d",
    "parent_asin_rating_cnt_7d",
    "parent_asin_rating_avg_prev_rating_7d",
]

item_features_df = full_features_df.drop_duplicates(subset=[args.item_col])[
    [args.item_col, "item_indice", *meta_features]
]

In [9]:
features = [
    "item_sequence",
    "user_rating_list_10_recent_asin_timestamp",
    "item_sequence_ts",
    "item_sequence_ts_bucket",
    "user_id",
    "user_rating_cnt_90d",
    "user_rating_avg_prev_rating_90d",
    "user_rating_list_10_recent_asin",
]

neg_df = generate_negative_samples(
    full_features_df,
    "user_indice",
    "item_indice",
    args.rating_col,
    neg_label=0,
    neg_to_pos_ratio=args.neg_to_pos_ratio,
    seed=args.random_seed,
    features=features,
)

neg_df = neg_df.pipe(
    lambda df: pd.merge(
        df, item_features_df, how="left", on="item_indice", validate="m:1"
    )
)

  0%|          | 0/91561 [00:00<?, ?it/s]

In [10]:
# Kiểm tra các bản ghi bị trùng item_indice
duplicated = item_features_df[item_features_df["item_indice"].duplicated(keep=False)]
print("Các bản ghi trùng item_indice:")
print(duplicated.sort_values("item_indice"))

# Kiểm tra xem có phải do mapping từ item_col sang item_indice không đúng
print("\nKiểm tra mapping:")
print(duplicated[[args.item_col, "item_indice"]])

Các bản ghi trùng item_indice:
Empty DataFrame
Columns: [parent_asin, item_indice, main_category, categories, price]
Index: []

Kiểm tra mapping:
Empty DataFrame
Columns: [parent_asin, item_indice]
Index: []


In [11]:
neg_df

Unnamed: 0,user_indice,item_indice,rating,event_timestamp,item_sequence,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,user_id,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,parent_asin,main_category,categories,price
0,3195,1079,0,2015-01-02 17:54:49.000,"[-1, -1, -1, -1, -1, -1, 3587, 3558, 102, 3535]","2014-01-04T01:00:02.000Z,2015-01-03T00:39:56.0...","[-1, -1, -1, -1, -1, -1, 1388772002, 142022039...","[-1, -1, -1, -1, -1, -1, 5, 1, 1, 1]",AF2UBRLFZTLECH44DEMVKDWS7Z5A,3,5.0,"B09QM5JMCD,B09PH8Z5R8,B000084JMC,B09NPJGN9N",B00ARQW0NW,Toys & Games,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",
1,2759,171,0,2015-06-14 19:21:09.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4143]",,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,0,,,B000AS2AL4,Toys & Games,"[Toys & Games, Baby & Toddler Toys, Early Deve...",8.99
2,2759,3057,0,2015-06-14 19:21:11.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3767]",2015-06-15T02:21:09.000Z,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1434309669]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 0]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,1,5.0,B0BFGWHBB5,B087GY3245,Toys & Games,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",
3,2759,160,0,2015-06-14 19:21:18.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 3767, 3535]","2015-06-15T02:21:09.000Z,2015-06-15T02:21:11.000Z","[-1, -1, -1, -1, -1, -1, -1, -1, 1434309669, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, 0]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,2,5.0,"B0BFGWHBB5,B09NPJGN9N",B0009P5GUA,Toys & Games,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",
4,2759,464,0,2015-06-14 19:21:22.000,"[-1, -1, -1, -1, -1, -1, -1, 3767, 3535, 3198]","2015-06-15T02:21:09.000Z,2015-06-15T02:21:11.0...","[-1, -1, -1, -1, -1, -1, -1, 1434309669, 14343...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,3,5.0,"B0BFGWHBB5,B09NPJGN9N,B08GGHG8S6",B001N11OI2,Toys & Games,"[Toys & Games, Games & Accessories, Board Games]",14.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91556,4754,1074,0,2021-12-13 05:43:30.735,"[569, 3911, 2098, 2342, 1859, 1340, 1305, 4073...","2015-05-28T23:08:50.000Z,2016-12-11T20:26:20.0...","[1432829330, 1481462780, 1494592575, 151508893...","[8, 8, 7, 7, 7, 7, 6, 6, 6, 5]",AFKQD5SINKETJBCBOTWOCWMSOA5Q,0,,"B0033BM3K8,B0BT9H7VF3,B01ASVCZ44,B01N9KR2SZ,B0...",B00APVXSM6,Toys & Games,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",25.98
91557,4612,1172,0,2022-03-07 23:43:01.126,"[-1, -1, -1, -1, -1, 363, 3508, 3004, 2336, 1562]","2010-12-26T21:58:56.000Z,2011-12-13T07:56:13.0...","[-1, -1, -1, -1, -1, 1293375536, 1323737773, 1...","[-1, -1, -1, -1, -1, 9, 9, 8, 7, 6]",AFJCPOPHR46UYP7S4YR4YXYMZHQA,0,,"B000W3TD4Y,B09M7XZ33P,B084K4J39K,B01N64HQ1X,B0...",B00CIXVITY,Toys & Games,"[Toys & Games, STEM Toys, Technology & Enginee...",
91558,8030,3608,0,2022-01-20 01:23:03.152,"[-1, -1, 3542, 2367, 1450, 734, 827, 586, 1008...","2018-06-07T09:41:33.364Z,2018-11-16T02:25:15.8...","[-1, -1, 1528339293, 1542309915, 1562260149, 1...","[-1, -1, 7, 7, 6, 6, 6, 6, 6, 6]",AGNNDSTERX7WEWRHQGBVUW5EORUQ,0,,"B09NXVL2P2,B06XRGBBXP,B00IL7IFOM,B004K6KM8K,B0...",B09SLSZK1F,Toys & Games,[],12.6
91559,9325,3433,0,2021-10-05 13:01:25.980,"[-1, -1, -1, 1096, 2261, 3776, 1972, 2741, 193...","2015-10-27T21:10:20.000Z,2015-10-27T21:10:58.0...","[-1, -1, -1, 1445955020, 1445955058, 148536225...","[-1, -1, -1, 8, 8, 7, 7, 7, 7, 7]",AH2SXPFOHZKKPEPBTFL7K4ZLUVVQ,0,,"B00B2B051A,B01L8JF64G,B0BFXK2HJW,B012CRQ7S2,B0...",B097B1NFCP,AMAZON FASHION,"[Toys & Games, Preschool, Toddler Toys, Preten...",21.94


In [12]:
store = FeatureStore(repo_path="../../feature_store", fs_yaml_file="feature_store.yaml")



In [13]:
%%time
ts_features = [
    f"parent_asin_feature_view:{feature}" for feature in item_timestamp_features
]

neg_ts_features_df = store.get_historical_features(
    neg_df[[args.item_col, args.timestamp_col]].drop_duplicates(), ts_features
).to_df()
assert neg_ts_features_df.duplicated().sum() == 0, display(
    neg_ts_features_df.loc[neg_ts_features_df.duplicated()]
)

25/06/28 17:32:04 WARN Utils: Your hostname, duong resolves to a loopback address: 127.0.1.1; using 192.168.1.103 instead (on interface enp3s0)
25/06/28 17:32:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/duong/.ivy2/cache
The jars for the packages stored in: /home/duong/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
org.apache.hadoop#hadoop-common added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b3060d42-5793-4efd-ad95-5a948cdad971;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/duong/Documents/datn1/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.hadoop#hadoop-common;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-protobuf_3_7;1.1.1 in central
	found org.apache.hadoop#hadoop-annotations;3.3.4 in central
	found org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central
	found com.google.guava#guava;27.0-jre in central
	found com.google.guava#failureaccess;1.0 in central
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central
	found com.google.code.findbugs#jsr305;3.0.2 in central
	found org.checkerframework#checker-qual;2.5.2 in central
	found com.google.j2objc#j2objc-annotations;1.1 in central
	found org.codehaus.mojo#animal-sniffer-annotations;1.17 in central
	found commons-cli#commons-cli;1.2 in central
	found org.apache.commons#commons-math3;3.1.1 in central
	found or

CPU times: user 884 ms, sys: 63.4 ms, total: 947 ms
Wall time: 18 s


In [14]:
neg_df = pd.merge(
    neg_df, neg_ts_features_df, on=[args.item_col, args.timestamp_col], how="left"
)
neg_df

Unnamed: 0,user_indice,item_indice,rating,event_timestamp,item_sequence,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,user_id,user_rating_cnt_90d,...,categories,price,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,parent_asin_rating_avg_prev_rating_30d,parent_asin_rating_cnt_7d,parent_asin_rating_avg_prev_rating_7d
0,3195,1079,0,2015-01-02 17:54:49.000,"[-1, -1, -1, -1, -1, -1, 3587, 3558, 102, 3535]","2014-01-04T01:00:02.000Z,2015-01-03T00:39:56.0...","[-1, -1, -1, -1, -1, -1, 1388772002, 142022039...","[-1, -1, -1, -1, -1, -1, 5, 1, 1, 1]",AF2UBRLFZTLECH44DEMVKDWS7Z5A,3,...,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",,2.0,4.000000,0.0,,0.0,,0.0,
1,2759,171,0,2015-06-14 19:21:09.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4143]",,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,0,...,"[Toys & Games, Baby & Toddler Toys, Early Deve...",8.99,2.0,5.000000,0.0,,0.0,,0.0,
2,2759,3057,0,2015-06-14 19:21:11.000,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3767]",2015-06-15T02:21:09.000Z,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1434309669]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 0]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,1,...,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",,,,,,,,,
3,2759,160,0,2015-06-14 19:21:18.000,"[-1, -1, -1, -1, -1, -1, -1, -1, 3767, 3535]","2015-06-15T02:21:09.000Z,2015-06-15T02:21:11.000Z","[-1, -1, -1, -1, -1, -1, -1, -1, 1434309669, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 0, 0]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,2,...,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",,2.0,5.000000,0.0,,0.0,,0.0,
4,2759,464,0,2015-06-14 19:21:22.000,"[-1, -1, -1, -1, -1, -1, -1, 3767, 3535, 3198]","2015-06-15T02:21:09.000Z,2015-06-15T02:21:11.0...","[-1, -1, -1, -1, -1, -1, -1, 1434309669, 14343...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0]",AEW3V4ZWDGC6G33C6I5YSTNVJXSQ,3,...,"[Toys & Games, Games & Accessories, Board Games]",14.33,3.0,1.333333,3.0,1.333333,1.0,2.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91556,4754,1074,0,2021-12-13 05:43:30.735,"[569, 3911, 2098, 2342, 1859, 1340, 1305, 4073...","2015-05-28T23:08:50.000Z,2016-12-11T20:26:20.0...","[1432829330, 1481462780, 1494592575, 151508893...","[8, 8, 7, 7, 7, 7, 6, 6, 6, 5]",AFKQD5SINKETJBCBOTWOCWMSOA5Q,0,...,"[Toys & Games, Preschool, Pre-Kindergarten Toy...",25.98,9.0,4.888889,1.0,5.000000,0.0,,0.0,
91557,4612,1172,0,2022-03-07 23:43:01.126,"[-1, -1, -1, -1, -1, 363, 3508, 3004, 2336, 1562]","2010-12-26T21:58:56.000Z,2011-12-13T07:56:13.0...","[-1, -1, -1, -1, -1, 1293375536, 1323737773, 1...","[-1, -1, -1, -1, -1, 9, 9, 8, 7, 6]",AFJCPOPHR46UYP7S4YR4YXYMZHQA,0,...,"[Toys & Games, STEM Toys, Technology & Enginee...",,0.0,,0.0,,0.0,,0.0,
91558,8030,3608,0,2022-01-20 01:23:03.152,"[-1, -1, 3542, 2367, 1450, 734, 827, 586, 1008...","2018-06-07T09:41:33.364Z,2018-11-16T02:25:15.8...","[-1, -1, 1528339293, 1542309915, 1562260149, 1...","[-1, -1, 7, 7, 6, 6, 6, 6, 6, 6]",AGNNDSTERX7WEWRHQGBVUW5EORUQ,0,...,[],12.6,6.0,3.666667,1.0,5.000000,0.0,,0.0,
91559,9325,3433,0,2021-10-05 13:01:25.980,"[-1, -1, -1, 1096, 2261, 3776, 1972, 2741, 193...","2015-10-27T21:10:20.000Z,2015-10-27T21:10:58.0...","[-1, -1, -1, 1445955020, 1445955058, 148536225...","[-1, -1, -1, 8, 8, 7, 7, 7, 7, 7]",AH2SXPFOHZKKPEPBTFL7K4ZLUVVQ,0,...,"[Toys & Games, Preschool, Toddler Toys, Preten...",21.94,1.0,5.000000,0.0,,0.0,,0.0,


## Concating positive data with negative samples

In [15]:
full_features_df = (
    pd.concat([full_features_df, neg_df], axis=0)
    .reset_index(drop=True)
    .sample(frac=1, replace=False, random_state=args.random_seed)
)

In [16]:
key_cols = [
    args.user_col,
    args.item_col,
    "user_indice",
    "item_indice",
    "item_sequence",
    "item_sequence_ts_bucket",
    args.rating_col,
    args.timestamp_col,
]
assert (
    full_features_df[key_cols].isna().sum().sum() == 0
), "Null values found at key colums"

In [17]:
val_timestamp

Timestamp('2021-08-11 00:12:17.369000')

## Split back train test

In [18]:
to_unix_ts = lambda s: s.astype("int64") // 10**6
train_neg_df = full_features_df.loc[lambda df: df[args.timestamp_col].lt(val_timestamp)]
val_neg_df = full_features_df.loc[lambda df: df[args.timestamp_col].ge(val_timestamp)]
train_neg_df

Unnamed: 0,user_id,parent_asin,rating,event_timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,price,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,item_sequence
98413,AESGLJAZY4EBKQ2AZ2NHWVETH4TA,B01N4H4NZY,0.0,2017-05-03 18:06:23.000,,,,,,,...,24.99,2370,2329,3,4.666667,"B00CEUMJ62,B09PH7VX5F,B0001XNTJA,B07Z5RPTH5","2014-11-19T05:02:56.000Z,2017-03-05T02:56:35.0...","[-1, -1, -1, -1, -1, -1, 1416348176, 148865739...","[-1, -1, -1, -1, -1, -1, 6, 5, 5, 5]","[-1, -1, -1, -1, -1, -1, 1165, 3552, 121, 2890]"
59779,AERAZWOLE52UMWNJZDGI5RZ2KAEQ,B005AW85YG,1.0,2018-12-17 02:54:30.417,1.545015e+09,5.0,4.800000,1.0,4.000000,0.0,...,24.95,2270,811,0,,"B07TKPRMC9,B09M7Z5FBY,B017VXCL78","2014-11-08T00:33:22.000Z,2015-03-15T00:28:44.0...","[-1, -1, -1, -1, -1, -1, -1, 1415381602, 14263...","[-1, -1, -1, -1, -1, -1, -1, 7, 7, 5]","[-1, -1, -1, -1, -1, -1, -1, 2785, 3509, 2052]"
180382,AHB7JRJX7ZGSSLIEUWWIVNL23PGA,B07J5XPL2D,0.0,2015-01-10 06:03:04.000,,14.0,4.785714,10.0,4.800000,7.0,...,11.99,9947,2659,2,5.000000,"B0B9RQ1J4V,B000N178E2,B00ARV3CKM,B08NGJJKV3","2014-01-27T14:25:39.000Z,2014-02-20T04:41:59.0...","[-1, -1, -1, -1, -1, -1, 1390807539, 139284611...","[-1, -1, -1, -1, -1, -1, 5, 5, 5, 4]","[-1, -1, -1, -1, -1, -1, 3741, 297, 1082, 3270]"
40088,AEAZQVCFV4PDSYZOBA2ARFZPKBIA,B01KMUUQVK,5.0,2020-04-26 17:00:26.235,1.587920e+09,10.0,4.600000,0.0,,0.0,...,32.99,659,2251,2,4.500000,"B0197UC222,B00J5KVWQW,B08J8KS7HN,B004LKWP8K,B0...","2017-02-09T04:45:42.000Z,2017-02-09T04:49:40.0...","[-1, -1, -1, -1, -1, 1486590342, 1486590580, 1...","[-1, -1, -1, -1, -1, 7, 7, 5, 2, 2]","[-1, -1, -1, -1, -1, 2060, 1529, 3215, 738, 1850]"
124870,AH2W7OQEETDS7RQHDMMPK7F4X67Q,B007EA4UBY,0.0,2018-08-24 17:00:26.835,,19.0,4.578947,3.0,4.666667,1.0,...,22.1,9336,917,0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4143]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53491,AHCDVVCIICIZIDC2R6YHNANO5XSA,B00I0D3O4S,5.0,2017-01-04 19:53:44.000,1.483560e+09,6.0,3.833333,2.0,3.000000,1.0,...,22.99,10066,1421,0,,B09LTZC5MP,2016-03-03T21:35:57.000Z,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1457015757]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 5]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 3502]"
106817,AF3OWF4OGSLGHKFC6KTKW77U3HWQ,B005AW85YG,0.0,2020-11-16 18:48:43.354,,4.0,5.000000,0.0,,0.0,...,24.95,3273,811,1,3.000000,"B0C2FFP1JM,B0BKQYFFLB,B0BP4G56Y8,B0C87RRT31,B0...","2019-09-10T20:05:23.193Z,2019-09-10T20:07:35.0...","[-1, -1, -1, -1, 1568120723, 1568120855, 15681...","[-1, -1, -1, -1, 6, 6, 6, 6, 6, 0]","[-1, -1, -1, -1, 3988, 3806, 3865, 4109, 4049,..."
61324,AEBREHKS4UMU5MBDH4WN6WQILD4Q,B0BPG2B61H,5.0,2017-08-20 11:46:32.114,1.503230e+09,11.0,4.727273,3.0,4.333333,1.0,...,23.99,729,3871,0,,"B09M7XZ33P,B000PGRXG8,B09NPJGN9N","2013-05-28T18:06:23.000Z,2013-07-08T02:55:13.0...","[-1, -1, -1, -1, -1, -1, -1, 1369739183, 13732...","[-1, -1, -1, -1, -1, -1, -1, 7, 7, 7]","[-1, -1, -1, -1, -1, -1, -1, 3508, 323, 3535]"
132003,AH7BKWN5OQPMQXJ4ID2CDIUSZ5CA,B09ND9TCQW,0.0,2015-10-02 18:14:01.000,,,,,,,...,44.0,9768,3523,6,4.333333,"B00I0CEJC0,B0BFGWHBB5,B00ITOB30U,B09VTZ1V5C,B0...","2015-10-01T23:16:53.000Z,2015-10-02T00:06:03.0...","[-1, -1, -1, -1, 1443716213, 1443719163, 14437...","[-1, -1, -1, -1, 3, 3, 3, 2, 1, 1]","[-1, -1, -1, -1, 1418, 3767, 1482, 3616, 3276,..."


In [19]:
val_neg_df

Unnamed: 0,user_id,parent_asin,rating,event_timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,price,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,item_sequence
91021,AFUJHZXN6PSURHQIIJHMOPORATFA,B07HD27YZT,5.0,2021-11-09 18:59:32.529,1.636484e+09,10.0,3.900000,1.0,5.0,0.0,...,,5670,2649,0,,"B00RCZ5NJY,B001AYYA7E,B0C9K2SMBZ,B09C18KZK5,B0...","2019-03-07T04:37:28.331Z,2020-02-25T21:11:13.6...","[-1, -1, -1, -1, -1, 1551908248, 1582639873, 1...","[-1, -1, -1, -1, -1, 6, 6, 6, 5, 5]","[-1, -1, -1, -1, -1, 1778, 415, 4118, 3453, 2705]"
90704,AGPPSB3U2ZYJBCCOQPNWOSRFHEGA,B0BTZ8RBSC,5.0,2022-05-10 18:23:28.240,1.652207e+09,0.0,,0.0,,0.0,...,12.99,8226,3919,0,,"B00123COXK,B0BV8DDRPC,B09ZKXZPCX,B007SYH2GQ,B0...","2019-12-28T06:15:11.762Z,2020-12-22T12:49:59.4...","[-1, -1, -1, -1, 1577488511, 1608616199, 16104...","[-1, -1, -1, -1, 6, 6, 6, 6, 6, 6]","[-1, -1, -1, -1, 375, 3923, 3666, 948, 2648, 2..."
91107,AHAC2AD342E34AFRDL6L5HI3PB2A,B08JC8TJ79,5.0,2021-08-17 01:03:32.856,1.629162e+09,1.0,5.000000,0.0,,0.0,...,19.99,9862,3223,0,,"B00C6Q1Z6E,B00U7EXD0I,B00T03T5X6,B07R7W3LM4,B0...","2015-01-04T02:05:56.000Z,2016-04-13T10:57:42.0...","[-1, -1, -1, -1, -1, 1420311956, 1460519862, 1...","[-1, -1, -1, -1, -1, 8, 8, 8, 7, 7]","[-1, -1, -1, -1, -1, 1148, 1874, 1828, 2739, 4..."
90845,AHFM5YXIKQSRMEMGMB3KKT3BDALQ,B01MS6B7BV,5.0,2021-09-09 03:17:04.699,1.631157e+09,4.0,4.500000,0.0,,0.0,...,14.39,10374,2295,0,,"B07N93ZTG8,B0BKVYHDFX,B0BDPQYVX3,B08B2BJ5CM,B0...","2019-05-26T06:49:54.856Z,2019-07-14T02:27:09.6...","[-1, 1558828194, 1563046029, 1563046507, 15630...","[-1, 6, 6, 6, 6, 6, 6, 6, 5, 5]","[-1, 2707, 3810, 3759, 3126, 3185, 3679, 3284,..."
182777,AEOEKFWEQ3KDALPEFKWLWZU3DN5Q,B004UIATJA,0.0,2022-04-23 13:01:43.519,,3.0,4.666667,1.0,5.0,0.0,...,,1981,781,2,5.0,"B086H3Y7XW,B0C3RYYVG6,B07H1TJYVB,B0BTNM3FB1,B0...","2019-07-24T04:01:00.028Z,2019-08-14T23:42:44.1...","[1563915660, 1565800964, 1573313664, 157331498...","[6, 6, 6, 6, 6, 6, 6, 6, 4, 4]","[3039, 4013, 2642, 3917, 1351, 3251, 3410, 318..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182598,AEURVVYODVZ7AQGPFML6W2GVHV6Q,B005XVCR48,0.0,2021-09-16 20:51:47.406,,8.0,5.000000,3.0,5.0,0.0,...,,2619,846,1,5.0,"B00EDBY7X8,B07C4NGT17,B0C6YXFX2G,B087P21XYJ,B0...","2020-10-06T09:23:54.382Z,2020-10-06T09:28:41.7...","[1601951034, 1601951321, 1601951398, 160434040...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5]","[1294, 2559, 4090, 3082, 389, 2659, 2611, 3331..."
182506,AEFP5S6SYAYP6SB5YLIDZBXWUOAA,B00CMRD4SY,0.0,2022-01-11 19:28:06.586,,1.0,5.000000,1.0,5.0,1.0,...,8.4,1143,1181,0,,"B0CCDJQ5B8,B0C5BRR4VM,B09VV7YKJH,B09BVG7GMH,B0...","2019-07-31T00:26:48.844Z,2019-10-09T10:31:59.5...","[-1, -1, -1, 1564507608, 1570591919, 157801650...","[-1, -1, -1, 6, 6, 6, 6, 6, 5, 5]","[-1, -1, -1, 4129, 4053, 3621, 3449, 3529, 355..."
91453,AEGYGL7KYUVYJFYPHQ3GA2RUZJ5A,B0178J7KY4,5.0,2021-09-05 15:06:35.091,1.630854e+09,1.0,5.000000,0.0,,0.0,...,19.99,1274,2036,0,,"B08P1LJQ8D,B000ELQVAI,B0BV8DDRPC,B0C4XMNLPT,B0...","2015-10-06T10:47:05.000Z,2017-09-12T08:21:16.5...","[1444103225, 1505179276, 1555249681, 155525033...","[8, 7, 6, 6, 6, 6, 6, 6, 5, 5]","[3285, 198, 3923, 4042, 1366, 3974, 2237, 3633..."
182685,AHT7KNX3DFU3RA266FAIADDMNL7Q,B00009NQQO,0.0,2021-11-08 01:33:20.650,,2.0,3.000000,1.0,1.0,0.0,...,14.05,11749,111,0,,"B001P9OGRS,B004TT6RD2,B0BW3QTWJJ,B09XN32NG9,B0...","2016-12-07T09:58:21.000Z,2017-01-06T07:00:26.0...","[-1, -1, -1, -1, -1, 1481079501, 1483660826, 1...","[-1, -1, -1, -1, -1, 7, 7, 7, 6, 6]","[-1, -1, -1, -1, -1, 467, 776, 3934, 3651, 2303]"


## Check

In [20]:
user = val_neg_df.sample(n=1)[args.user_col].values[0]
logger.info(f"Checking user {user}...")
check_df = train_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
assert (
    check_df[args.rating_col].gt(0).sum() * (args.neg_to_pos_ratio + 1)
    == check_df.shape[0]
), "Unexpected number of pos and neg samples"

[32m2025-06-28 17:32:55.460[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mChecking user AGN5KJZU3FYSKVWXWM66LXYWL5CQ...[0m


In [21]:
val_check_df = val_neg_df.loc[lambda df: df[args.user_col].eq(user)].sort_values(
    args.timestamp_col
)
item = val_check_df.loc[lambda df: df[args.rating_col].gt(0)][args.item_col].values[0]
logger.info(f"Checking item {item}...")
assert (
    train_neg_df.loc[lambda df: df[args.item_col].eq(item)].shape[0] > 5
), f"Item {item} does not appear much in training data"

[32m2025-06-28 17:32:57.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mChecking item B0BNKLX52M...[0m


In [22]:
check_df

Unnamed: 0,user_id,parent_asin,rating,event_timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,price,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,item_sequence
162549,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B006ZJHKA8,0.0,2019-06-28 00:11:21.535,,1.0,3.0,0.0,,0.0,...,,7982,887,0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4143]"
70988,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B07XRR3QZ8,5.0,2019-06-28 00:11:21.535,1561681000.0,7.0,4.571429,5.0,4.8,2.0,...,16.99,7982,2850,0,,,,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4143]"
70989,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B07TMRMPR2,5.0,2019-07-18 11:36:46.425,1563450000.0,1.0,5.0,1.0,5.0,1.0,...,,7982,2789,1,5.0,B07XRR3QZ8,2019-06-28T07:11:21.535Z,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1561680681]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2850]"
162550,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B00CQHZ0ZK,0.0,2019-07-18 11:36:46.425,,1.0,4.0,0.0,,0.0,...,,7982,1197,1,5.0,B07XRR3QZ8,2019-06-28T07:11:21.535Z,"[-1, -1, -1, -1, -1, -1, -1, -1, -1, 1561680681]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 4]","[-1, -1, -1, -1, -1, -1, -1, -1, -1, 2850]"
70990,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B07PHTWLNJ,3.0,2019-08-20 18:44:07.354,1566327000.0,0.0,,0.0,,0.0,...,45.99,7982,2721,2,5.0,"B07XRR3QZ8,B07TMRMPR2","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.425Z","[-1, -1, -1, -1, -1, -1, -1, -1, 1561680681, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 5, 5]","[-1, -1, -1, -1, -1, -1, -1, -1, 2850, 2789]"
162551,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B09Y998YMS,0.0,2019-08-20 18:44:07.354,,9.0,5.0,2.0,5.0,0.0,...,54.99,7982,3659,2,5.0,"B07XRR3QZ8,B07TMRMPR2","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.425Z","[-1, -1, -1, -1, -1, -1, -1, -1, 1561680681, 1...","[-1, -1, -1, -1, -1, -1, -1, -1, 5, 5]","[-1, -1, -1, -1, -1, -1, -1, -1, 2850, 2789]"
162552,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B01LXS9TD1,0.0,2019-12-01 17:46:39.216,,2.0,3.0,0.0,,0.0,...,,7982,2267,0,,"B07XRR3QZ8,B07TMRMPR2,B07PHTWLNJ","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.4...","[-1, -1, -1, -1, -1, -1, -1, 1561680681, 15634...","[-1, -1, -1, -1, -1, -1, -1, 5, 5, 5]","[-1, -1, -1, -1, -1, -1, -1, 2850, 2789, 2721]"
70991,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B09HJQ9BX7,5.0,2019-12-01 17:46:39.216,1575222000.0,1.0,5.0,1.0,5.0,1.0,...,,7982,3473,0,,"B07XRR3QZ8,B07TMRMPR2,B07PHTWLNJ","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.4...","[-1, -1, -1, -1, -1, -1, -1, 1561680681, 15634...","[-1, -1, -1, -1, -1, -1, -1, 5, 5, 5]","[-1, -1, -1, -1, -1, -1, -1, 2850, 2789, 2721]"
162553,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B00K5OLKDC,0.0,2020-02-11 11:21:22.013,,3.0,5.0,0.0,,0.0,...,,7982,1566,1,5.0,"B07XRR3QZ8,B07TMRMPR2,B07PHTWLNJ,B09HJQ9BX7","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.4...","[-1, -1, -1, -1, -1, -1, 1561680681, 156344980...","[-1, -1, -1, -1, -1, -1, 5, 5, 5, 5]","[-1, -1, -1, -1, -1, -1, 2850, 2789, 2721, 3473]"
70992,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B001US2C1G,3.0,2020-02-11 11:21:22.013,1581420000.0,4.0,4.5,4.0,4.5,0.0,...,,7982,488,1,5.0,"B07XRR3QZ8,B07TMRMPR2,B07PHTWLNJ,B09HJQ9BX7","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.4...","[-1, -1, -1, -1, -1, -1, 1561680681, 156344980...","[-1, -1, -1, -1, -1, -1, 5, 5, 5, 5]","[-1, -1, -1, -1, -1, -1, 2850, 2789, 2721, 3473]"


In [23]:
val_check_df

Unnamed: 0,user_id,parent_asin,rating,event_timestamp,timestamp_unix,parent_asin_rating_cnt_365d,parent_asin_rating_avg_prev_rating_365d,parent_asin_rating_cnt_90d,parent_asin_rating_avg_prev_rating_90d,parent_asin_rating_cnt_30d,...,price,user_indice,item_indice,user_rating_cnt_90d,user_rating_avg_prev_rating_90d,user_rating_list_10_recent_asin,user_rating_list_10_recent_asin_timestamp,item_sequence_ts,item_sequence_ts_bucket,item_sequence
91426,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B0BNKLX52M,3.0,2021-12-16 16:05:26.060,1639671000.0,1.0,5.0,1.0,5.0,1.0,...,44.99,7982,3858,0,,"B07XRR3QZ8,B07TMRMPR2,B07PHTWLNJ,B09HJQ9BX7,B0...","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.4...","[-1, -1, 1561680681, 1563449806, 1566326647, 1...","[-1, -1, 6, 6, 6, 6, 6, 6, 5, 5]","[-1, -1, 2850, 2789, 2721, 3473, 488, 3346, 30..."
182987,AGN5KJZU3FYSKVWXWM66LXYWL5CQ,B001NQHN7S,0.0,2021-12-16 16:05:26.060,,1.0,5.0,1.0,5.0,1.0,...,,7982,465,0,,"B07XRR3QZ8,B07TMRMPR2,B07PHTWLNJ,B09HJQ9BX7,B0...","2019-06-28T07:11:21.535Z,2019-07-18T18:36:46.4...","[-1, -1, 1561680681, 1563449806, 1566326647, 1...","[-1, -1, 6, 6, 6, 6, 6, 6, 5, 5]","[-1, -1, 2850, 2789, 2721, 3473, 488, 3346, 30..."


In [24]:
# Persist
full_features_df.to_parquet(full_features_df_path, index=False)
train_neg_df.to_parquet(train_neg_df_path, index=False)
val_neg_df.to_parquet(val_neg_df_path, index=False)