In [1]:
import json
import sys
import os
import pandas as pd
from loguru import logger
from pydantic import BaseModel
from tqdm.auto import tqdm

In [2]:
class Args(BaseModel):
    """Configuration values for this notebook, declared as a pydantic model."""

    # NOTE(review): the next three fields are not referenced in the visible
    # cells; presumably they parameterize downstream sampling/training —
    # TODO confirm against the consumer of this config.
    num_negative_samples: int = 5
    window_size: int = 1
    batch_size: int = 16

    # Column names used as the default group key and value column of
    # get_sequence.
    user_col: str = "user_id"
    item_col: str = "parent_asin"


# Build the config with its defaults and print it as indented JSON so the
# values in effect are recorded in the notebook output.
args = Args()
print(args.model_dump_json(indent=2))

{
  "num_negative_samples": 5,
  "window_size": 1,
  "batch_size": 16,
  "user_col": "user_id",
  "item_col": "parent_asin"
}


## Load Data

In [None]:
# All paths below are rooted at the directory named by the PVC_PATH
# environment variable. For a local run, export PVC_PATH instead of
# hardcoding a machine-specific directory in this cell.
pvc_path = os.environ.get("PVC_PATH")
if pvc_path is None or pvc_path == "":
    raise ValueError("PVC_PATH environment variable not set")

# Parquet feature tables read by the loading cells.
train_features_path = f"{pvc_path}/train_features.parquet"
val_feature_path = f"{pvc_path}/val_features.parquet"

# JSONL files written by the persist cells (one JSON list per line).
sequences_path = f"{pvc_path}/train_item_sequence.jsonl"
val_sequences_path = f"{pvc_path}/val_item_sequence.jsonl"
batch_sequences_overfit_path = f"{pvc_path}/batch_sequences_overfit_path.jsonl"

In [4]:
def get_sequence(df, user_col=args.user_col, item_col=args.item_col):
    """Collect the ``item_col`` values of each ``user_col`` group into a list.

    Args:
        df: DataFrame containing at least ``user_col`` and ``item_col``.
        user_col: Column to group by.
        item_col: Column whose values are gathered per group.

    Returns:
        A list of lists, one per group that has more than one item. Items
        keep the order in which their rows appear in ``df``.
        NOTE(review): presumably ``df`` is already sorted chronologically so
        the lists are true interaction sequences — confirm upstream.
    """
    items_per_user = df.groupby(user_col)[item_col].agg(list)
    # Single-item groups are discarded: they carry no item-to-item context.
    has_several_items = items_per_user.apply(len) > 1
    return items_per_user.loc[has_several_items].values.tolist()

In [5]:
# Read the training features and build the per-user item lists from them.
train_df = pd.read_parquet(train_features_path)
item_sequence = get_sequence(train_df)
# Display how many sequences were kept.
len(item_sequence)

12409

In [6]:
# Read the validation features and build the per-user item lists from them.
val_df = pd.read_parquet(val_feature_path)
val_item_sequence = get_sequence(val_df)
# Display how many sequences were kept.
len(val_item_sequence)

137

## Persist

In [7]:
def _write_jsonl(path, sequences):
    """Write each sequence to ``path`` as one JSON document per line.

    Args:
        path: Destination file; it is created, or truncated if it exists.
        sequences: Iterable of JSON-serializable sequences.
    """
    # json.dumps escapes non-ASCII characters by default, so pinning UTF-8
    # leaves the written bytes unchanged while removing the dependency on the
    # platform's default text encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(sequence) + "\n" for sequence in sequences)


# Train and validation sequences go through the same writer instead of two
# copy-pasted loops.
_write_jsonl(sequences_path, item_sequence)
_write_jsonl(val_sequences_path, val_item_sequence)

logger.info(f"{len(item_sequence)=:,.0f} {len(val_item_sequence)=:,.0f}")

[32m2025-06-28 17:33:51.163[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mlen(item_sequence)=12,409 len(val_item_sequence)=137[0m


## Persist a small subset of the data for overfitting

In [8]:
# Keep only the first few training sequences and write them to their own
# JSONL file (one JSON list per line).
num_sequences = 2
batch_item_sequence = item_sequence[:num_sequences]

with open(batch_sequences_overfit_path, "w") as out_file:
    out_file.writelines(json.dumps(seq) + "\n" for seq in batch_item_sequence)