In [8]:
import os
import sys
from typing import Optional

import datasets
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger
from pydantic import BaseModel

# Put the parent of the working directory at the front of the import path so the
# project-local `src` package below can be imported from this notebook.
sys.path.insert(0, "..")
from src.data_prep_utils import handle_dtypes, parse_dt

# Load variables from the .env file one directory up; override=True lets values from
# the file replace variables that are already set in the process environment.
load_dotenv("../.env", override=True)
# Restrict the `datasets` library's logging to errors.
datasets.logging.set_verbosity_error()

In [9]:
class Args(BaseModel):
    """Configuration for this data-preparation notebook run.

    Every field has a default. ``notebook_persist_dp`` is ``None`` right after
    construction and is filled in by :meth:`init`.
    """

    run_name: str = "000-prep-data"
    # When True, init() resolves the persist path but does not create the directory.
    testing: bool = True
    # Absolute path of this run's persist directory. It stays None until init() is
    # called, so the annotation is Optional[str] rather than a bare str with a
    # None default.
    notebook_persist_dp: Optional[str] = None
    random_seed: int = 41

    # Input interaction splits (parquet files).
    train_fp: str = "../data/train.parquet"
    val_fp: str = "../data/val.parquet"

    # Column names used in the interaction data.
    user_col: str = "user_id"
    item_col: str = "parent_asin"
    rating_col: str = "rating"
    timestamp_col: str = "timestamp"

    # Number of most recent days withheld from the OLTP database so that they can
    # later be replayed to simulate newly arriving data.
    num_days_holdout: int = 30
    holdout_fp: str = "../data/holdout.parquet"

    # Output PostgreSQL table
    table_name: str = "amz_review_rating_raw"

    def init(self):
        """Resolve the persist directory and create it unless ``testing`` is set.

        The directory is ``data/<run_name>`` made absolute against the current
        working directory.

        Returns:
            This same instance, so the call can be chained after construction.
        """
        self.notebook_persist_dp = os.path.abspath(f"data/{self.run_name}")
        if not self.testing:
            os.makedirs(self.notebook_persist_dp, exist_ok=True)

        return self


# Build the configuration from defaults and resolve the persist directory.
args = Args().init()

# Echo the resolved configuration so the settings of this run are visible in the output.
print(args.model_dump_json(indent=2))

{
  "run_name": "000-prep-data",
  "testing": true,
  "notebook_persist_dp": "/home/duong/Documents/datn1/notebooks/data/000-prep-data",
  "random_seed": 41,
  "train_fp": "../data/train.parquet",
  "val_fp": "../data/val.parquet",
  "user_col": "user_id",
  "item_col": "parent_asin",
  "rating_col": "rating",
  "timestamp_col": "timestamp",
  "num_days_holdout": 30,
  "holdout_fp": "../data/holdout.parquet",
  "table_name": "amz_review_rating_raw"
}


## Load data


In [10]:
# Fetch the item metadata configuration of the Amazon Reviews 2023 dataset.
metadata_raw = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_meta_Toys_and_Games",
    trust_remote_code=True,
)

# The metadata lives in the "full" split; materialise it as a DataFrame and display it.
metadata_raw_df = metadata_raw["full"].to_pandas()
metadata_raw_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Toys & Games,"KUNGOON Happy Anniversary Balloon Banner,Weddi...",4.5,241,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Pretty Cool!', 'Product assembly a...",Kunggo,[],"{""Package Dimensions"": ""10.12 x 8.03 x 0.51 in...",B08GPM7CQN,,,
1,Toys & Games,Gothic Mothman Plushie Doll with Bright Red Ey...,1.3,2,[🦋 Mothman’s bright red eyes could stare you d...,[🦋 Description: Mothman’s bright red eyes coul...,18.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Felicy,"[Toys & Games, Stuffed Animals & Plush Toys, P...","{""Item Weight"": ""2.47 ounces"", ""Manufacturer r...",B09X9XW42H,,,
2,Toys & Games,Melody Jane Dollhouse Builders DIY 1:24 Scale ...,4.2,67,[1:24 Scale - Plastic - Approximate cut out si...,[],,"{'hi_res': [None, 'https://m.media-amazon.com/...",{'title': ['Cutemini wooden window double door...,Melody Jane Dolls Houses,"[Toys & Games, Dolls & Accessories, Dollhouse ...","{""Item Weight"": ""0.48 ounces"", ""Manufacturer r...",B01I9QET6M,,,
3,Toys & Games,Traxxas Stampede 4X4: 1/10 Scale 4wd Monster T...,4.5,48,[Waterproof electronics for all-weather drivin...,[Stampede 4X4 is built Traxxas Tough to withst...,,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Traxxas Slash 2WD Short Course Rac...,Traxxas,"[Toys & Games, Remote & App Controlled Vehicle...","{""Product Dimensions"": ""15.63 x 13.39 x 8.94 i...",B019XEEX1A,,,
4,Toys & Games,Hot Wheels Monster Truck 1:24 Scale 2022 Bone ...,4.8,17699,[Designed in 1:24 scale with durable die-cast ...,[The Hot Wheels Monster Trucks 1:24 scale die-...,27.98,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Hot Wheels 1:24 Scale Monster Truc...,Hot Wheels,"[Toys & Games, Preschool, Pre-Kindergarten Toys]","{""Product Dimensions"": ""5 x 6.27 x 5.5 inches""...",B09G7K3JWQ,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890869,Toys & Games,Dollhouse Miniature 1:12 Scale Fire Place Acce...,4.6,2,[],[Unless stated otherwise this item is 1:12 sca...,16.09,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Melody Jane Dolls Houses,"[Toys & Games, Dolls & Accessories, Dollhouse ...","{""Product Dimensions"": ""2.99 x 2.52 x 0.08 inc...",B00BGO1PDU,,,
890870,Sports & Outdoors,Hacko Games Pride Deck Poker Cards,4.6,5,[Custom deck of playing cards],[Pride is a fantastically color card system. A...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Hacko Games,"[Toys & Games, Games & Accessories, Card Games...","{""Item Package Dimensions L x W x H"": ""3.54 x ...",B07T16B3W1,,,
890871,Toys & Games,Mini Squee-Z-Bubs & Bubbles (Sold Individually...,3.7,7,"[Toysmith 774546 Mini Squee-z Bubbles, Educati...",[Toysmith 774546 Mini Squee-z Bubbles. Toysmit...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Toysmith,"[Toys & Games, Sports & Outdoor Play, Bubbles,...","{""Product Dimensions"": ""2 x 4.2 x 1.1 inches"",...",B002IOZ92K,,,
890872,Toys & Games,Sentosphère Aquarellum Junior Butterflies & Fl...,4.6,141,"[Complete kit., Paint without going over the l...",[Fantastic. A few drops of paint and any child...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Sentosphère,"[Toys & Games, Arts & Crafts, Craft Kits, Pain...","{""Product Dimensions"": ""10.43 x 7.68 x 1.18 in...",B06XJVLKDD,,,


In [11]:
# Both interaction splits are read below, so verify that each file exists and fail
# fast with a pointer to the notebook that produces them.
for required_fp in (args.train_fp, args.val_fp):
    if not os.path.exists(required_fp):
        raise FileNotFoundError(
            f"{required_fp} does not exist, you need to run the notebook 000-prep-data in the parent recsys-mvp folder first"
        )

train_df = pd.read_parquet(args.train_fp)
val_df = pd.read_parquet(args.val_fp)
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
20,AH6CATODIVPVUOJEWHRSRCSKAOHA,B08NJSW98M,3.0,1414371251000
21,AH6CATODIVPVUOJEWHRSRCSKAOHA,B000CBURPU,3.0,1414371300000
22,AH6CATODIVPVUOJEWHRSRCSKAOHA,B004TT6RD2,5.0,1414371371000
24,AH6CATODIVPVUOJEWHRSRCSKAOHA,B005XVCPJ0,4.0,1417576924000
26,AH6CATODIVPVUOJEWHRSRCSKAOHA,B08DR6G1MY,5.0,1431641545000
...,...,...,...,...
3114120,AHXRC3ZXTYTIRYOB2X5GVCADKDOQ,B00QCBBHHQ,5.0,1433377310000
3114122,AHXRC3ZXTYTIRYOB2X5GVCADKDOQ,B00QCBBHIU,5.0,1433377544000
3114123,AHXRC3ZXTYTIRYOB2X5GVCADKDOQ,B00OCLA5GM,5.0,1433377644000
3114124,AHXRC3ZXTYTIRYOB2X5GVCADKDOQ,B00J0RZKH2,5.0,1433377680000


## Merge metadata

In [12]:
# Item metadata columns to attach to every interaction row.
cols = ["main_category", "title", "description", "categories", "price"]

# Project the metadata once: the join key plus the selected feature columns.
item_features_df = metadata_raw_df[[args.item_col] + cols]

# Left-join so that every interaction row is kept, with or without matching metadata.
train_features_df = train_df.merge(item_features_df, how="left", on=args.item_col)
val_features_df = val_df.merge(item_features_df, how="left", on=args.item_col)

# Tag each split with its origin before stacking them row-wise.
labelled_splits = [
    train_features_df.assign(source="train"),
    val_features_df.assign(source="val"),
]

full_df = (
    pd.concat(labelled_splits, axis=0)
    .assign(
        # Convert each cell's value to a plain Python list.
        description=lambda df: df["description"].apply(list),
        categories=lambda df: df["categories"].apply(list),
    )
    # Project helpers from src.data_prep_utils; presumably timestamp parsing and
    # dtype normalisation. TODO confirm against that module.
    .pipe(parse_dt)
    .pipe(handle_dtypes)
)
full_df

Unnamed: 0,user_id,parent_asin,rating,timestamp,main_category,title,description,categories,price,source
0,AH6CATODIVPVUOJEWHRSRCSKAOHA,B08NJSW98M,3.0,2014-10-27 00:54:11.000,Toys & Games,Award Winning Hape Mighty Mini Band Wooden Per...,"[Product Description, Get ready to move and gr...","[Toys & Games, Learning & Education, Musical I...",34.99,train
1,AH6CATODIVPVUOJEWHRSRCSKAOHA,B000CBURPU,3.0,2014-10-27 00:55:00.000,Musical Instruments,8 Inch Rainstick,[Halilit by Edushape musical instruments are t...,"[Toys & Games, Learning & Education, Musical I...",,train
2,AH6CATODIVPVUOJEWHRSRCSKAOHA,B004TT6RD2,5.0,2014-10-27 00:56:11.000,Toys & Games,Melissa & Doug Sunny Patch Bella Butterfly Tea...,"[Product Description, With rosy colors and cha...","[Toys & Games, Preschool, Pre-Kindergarten Toy...",,train
3,AH6CATODIVPVUOJEWHRSRCSKAOHA,B005XVCPJ0,4.0,2014-12-03 03:22:04.000,Toys & Games,Fisher-Price Brilliant Basics Activity Puzzle,"[Product Description, Which one goes where? A ...","[Toys & Games, Preschool, Pre-Kindergarten Toy...",,train
4,AH6CATODIVPVUOJEWHRSRCSKAOHA,B08DR6G1MY,5.0,2015-05-14 22:12:25.000,Toys & Games,Little Tikes Anchors Away Pirate Ship – Amazon...,"[Product Description, Ahoy matey! Get ready fo...","[Toys & Games, Preschool, Toddler Toys, Activi...",79.65,train
...,...,...,...,...,...,...,...,...,...,...
907,AEJPAQAXIU23T3W6AGLMIEMFF7JA,B083LCLQQY,5.0,2021-12-08 23:05:39.706,Toys & Games,"KOKODI LCD Writing Tablet, 10 Inch Colorful To...",[],"[Toys & Games, Arts & Crafts, Drawing & Writin...",19.99,val
908,AGPID74S4Z4ATACH7NBRJJEA3C5Q,B00GCGG6FO,4.0,2022-04-07 01:28:17.303,Toys & Games,Buckle Toys - Blu Whale - Develop Motor Skills...,[],"[Toys & Games, Stuffed Animals & Plush Toys, P...",19.99,val
909,AFKUIPCXHGODKZ5AF7IVJ7DXZIZA,B091RGT9F8,5.0,2021-12-20 18:25:53.289,Toys & Games,Fun Forts Glow Fort Building Kit for Kids - 81...,[],"[Toys & Games, Preschool, Pre-Kindergarten Toy...",39.99,val
910,AGKI3YWKSLNVXTOK5IYA7PSUAHTQ,B0BHT45FW9,5.0,2021-08-19 18:33:38.116,Toys & Games,CozyBomB Magnetic Fishing Pool Toys Game for K...,[],"[Toys & Games, Preschool, Pre-Kindergarten Toy...",17.98,val


In [13]:
# Cut-off: `num_days_holdout` days before the latest interaction, rendered as a
# YYYY-MM-DD string (the time-of-day part is dropped by the format).
holdout_date = (
    full_df["timestamp"].max() - pd.Timedelta(days=args.num_days_holdout)
).strftime("%Y-%m-%d")
logger.info(f"{holdout_date=}")

# Rows strictly before the cut-off are inserted; rows on or after it are held out.
is_before_cutoff = full_df["timestamp"].lt(holdout_date)
is_on_or_after_cutoff = full_df["timestamp"].ge(holdout_date)
to_insert_df = full_df.loc[is_before_cutoff]
holdout_df = full_df.loc[is_on_or_after_cutoff]

print("5 first line of to_insert_df")
print(to_insert_df.head(5))

print("\n5 first line of holdout_df:")
print(holdout_df.head(5))

[32m2025-07-26 17:47:47.024[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mholdout_date='2022-06-15'[0m


5 first line of to_insert_df
                        user_id parent_asin  rating           timestamp  \
0  AH6CATODIVPVUOJEWHRSRCSKAOHA  B08NJSW98M     3.0 2014-10-27 00:54:11   
1  AH6CATODIVPVUOJEWHRSRCSKAOHA  B000CBURPU     3.0 2014-10-27 00:55:00   
2  AH6CATODIVPVUOJEWHRSRCSKAOHA  B004TT6RD2     5.0 2014-10-27 00:56:11   
3  AH6CATODIVPVUOJEWHRSRCSKAOHA  B005XVCPJ0     4.0 2014-12-03 03:22:04   
4  AH6CATODIVPVUOJEWHRSRCSKAOHA  B08DR6G1MY     5.0 2015-05-14 22:12:25   

         main_category                                              title  \
0         Toys & Games  Award Winning Hape Mighty Mini Band Wooden Per...   
1  Musical Instruments                                   8 Inch Rainstick   
2         Toys & Games  Melissa & Doug Sunny Patch Bella Butterfly Tea...   
3         Toys & Games      Fisher-Price Brilliant Basics Activity Puzzle   
4         Toys & Games  Little Tikes Anchors Away Pirate Ship – Amazon...   

                                         description  \
0

In [14]:
# Check items: every item in the validation split must also occur in the training
# split. The column name comes from the shared config instead of a literal so this
# check stays in step with the rest of the notebook.
train_items = train_df[args.item_col].unique()
val_items = val_df[args.item_col].unique()
items_not_in_train = set(val_items) - set(train_items)
num_missing_items = len(items_not_in_train)

# Check users: same requirement for the user column.
train_users = train_df[args.user_col].unique()
val_users = val_df[args.user_col].unique()
users_not_in_train = set(val_users) - set(train_users)
num_missing_users = len(users_not_in_train)

# Validation checks: stop the notebook if the validation split contains unseen
# items or users.
assert (
    num_missing_items == 0
), f"""
Error: Found {num_missing_items} items in validation set that do not appear in training set!
This may cause item cold-start problems when evaluating the model."""

assert (
    num_missing_users == 0
), f"""
Error: Found {num_missing_users} users in validation set that do not appear in training set!
This may cause user cold-start problems when evaluating the model."""

In [15]:
import pandas as pd


def print_min_max_time(df, df_name):
    """Print the smallest and largest ``timestamp`` of ``df`` in three forms.

    Each extreme is shown as the raw value, as a UTC datetime (the raw value is
    interpreted as epoch milliseconds) and converted to the ``US/Pacific`` zone.

    Args:
        df: DataFrame with a ``timestamp`` column holding epoch-millisecond values.
        df_name: Heading printed above the two lines.
    """
    timestamps = df["timestamp"]

    # Build both report lines before printing anything.
    report_lines = []
    for label, raw_value in (("Min", timestamps.min()), ("Max", timestamps.max())):
        as_utc = pd.to_datetime(raw_value, unit="ms").tz_localize("UTC")
        # The printed label says "PST", but US/Pacific follows daylight saving, so
        # the shown offset can be -07:00 as well as -08:00.
        as_pacific = as_utc.tz_convert("US/Pacific")
        report_lines.append(
            f"- {label} timestamp (raw): {raw_value} → Datetime (UTC): {as_utc} → PST: {as_pacific}"
        )

    print(f"\n{df_name}:")
    for line in report_lines:
        print(line)


# Apply to each dataset split
print_min_max_time(train_df, "Train")
print_min_max_time(val_df, "Validation")


Train:
- Min timestamp (raw): 1036400058000 → Datetime (UTC): 2002-11-04 08:54:18+00:00 → PST: 2002-11-04 00:54:18-08:00
- Max timestamp (raw): 1628640736369 → Datetime (UTC): 2021-08-11 00:12:16.369000+00:00 → PST: 2021-08-10 17:12:16.369000-07:00

Validation:
- Min timestamp (raw): 1628648461740 → Datetime (UTC): 2021-08-11 02:21:01.740000+00:00 → PST: 2021-08-10 19:21:01.740000-07:00
- Max timestamp (raw): 1657847308934 → Datetime (UTC): 2022-07-15 01:08:28.934000+00:00 → PST: 2022-07-14 18:08:28.934000-07:00


## Upload data to PostgreSQL (RDS) and S3

In [None]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine, text

# Connection settings for the PostgreSQL instance that receives the interaction data.
# Credentials may be supplied through the OLTP_DB_USERNAME / OLTP_DB_PASSWORD
# environment variables.
# NOTE(review): the fallback values are the credentials that used to be hard-coded
# here; they are kept only so the notebook keeps running unchanged. Remove them once
# the variables are provisioned, and rotate the password since it is in the history.
username = os.getenv("OLTP_DB_USERNAME", "postgres")
password = os.getenv("OLTP_DB_PASSWORD", "postgres")
host = "simulate-oltp-db.cdkwg6wyo7r8.ap-southeast-1.rds.amazonaws.com"
port = "5432"
database = "raw_data"
schema = "public"
table_name = "reviews"

# URL-escape the credentials so characters such as "@", ":" or "/" cannot corrupt
# the connection URL.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)
engine = create_engine(connection_string)

# engine.begin() opens a transaction that is committed when the block exits normally
# and rolled back if it raises. A bare engine.connect() never commits on its own
# under SQLAlchemy 2.x, which would silently discard the DDL.
with engine.begin() as conn:
    conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {schema};"))

# Replace the table with the rows that precede the holdout cut-off.
to_insert_df.to_sql(table_name, engine, if_exists="replace", index=False, schema=schema)

# Tighten the column types and add the primary key in one transaction, so a failure
# in either statement leaves the freshly written table unchanged.
# NOTE(review): `price` is part of the primary key. PostgreSQL primary-key columns
# must be NOT NULL, so ADD CONSTRAINT fails if any row has a NULL price. Confirm
# that handle_dtypes guarantees non-null prices, or whether a key such as
# (user_id, parent_asin, timestamp) was intended.
with engine.begin() as conn:
    conn.execute(
        text(
            f"""
        ALTER TABLE {schema}.{table_name}
        ALTER COLUMN user_id TYPE varchar(64),
        ALTER COLUMN price TYPE varchar(32);
    """
        )
    )
    conn.execute(
        text(
            f"""
        ALTER TABLE {schema}.{table_name}
        ADD CONSTRAINT reviews_pkey PRIMARY KEY (user_id, timestamp, price);
    """
        )
    )

  conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {schema};"))


In [None]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine, text

# Connection settings for the PostgreSQL instance that holds the reviews table.
# Credentials may be supplied through the OLTP_DB_USERNAME / OLTP_DB_PASSWORD
# environment variables.
# NOTE(review): the fallback values are the credentials that used to be hard-coded
# here; they are kept only so the notebook keeps running unchanged. Remove them once
# the variables are provisioned.
username = os.getenv("OLTP_DB_USERNAME", "postgres")
password = os.getenv("OLTP_DB_PASSWORD", "postgres")
host = "simulate-oltp-db.cdkwg6wyo7r8.ap-southeast-1.rds.amazonaws.com"
port = "5432"
database = "raw_data"
schema = "public"
reviews_table = "reviews"
new_table = "new_reviews"

# URL-escape the credentials so special characters cannot corrupt the connection URL.
connection_string = (
    f"postgresql+psycopg2://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)
engine = create_engine(connection_string)

# Recreate the empty table with the same structure as the reviews table. Both
# statements run in one transaction: engine.begin() commits when the block exits
# normally and rolls back on error, whereas a bare engine.connect() would leave the
# DDL uncommitted under SQLAlchemy 2.x.
with engine.begin() as conn:
    conn.execute(text(f"DROP TABLE IF EXISTS {schema}.{new_table};"))
    conn.execute(
        text(
            f"CREATE TABLE {schema}.{new_table} (LIKE {schema}.{reviews_table} INCLUDING ALL);"
        )
    )

# Report success only after the transaction above has been committed.
print(
    f"Đã tạo bảng {schema}.{new_table} tương tự {schema}.{reviews_table} thành công."
)

Đã tạo bảng public.new_reviews tương tự public.reviews thành công.


In [None]:
import os

import boto3
import pandas as pd
from dotenv import load_dotenv

# NOTE(review): the first cell of this notebook loads "../.env", while this call
# loads ".env" from the working directory. Confirm which file is intended.
load_dotenv(".env", override=True)

AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_DEFAULT_REGION = os.getenv("AWS_DEFAULT_REGION", "ap-southeast-1")
S3_BUCKET = os.getenv("S3_BUCKET")
# Used both as the local file name and as the object key in the bucket.
HOLDOUT_FP = "holdout.parquet"

# Fail before writing or uploading anything when no destination bucket is configured,
# instead of passing None to the S3 client.
if not S3_BUCKET:
    raise ValueError(
        "S3_BUCKET is not set; define it in the environment or in the .env file before uploading"
    )

# Write the held-out rows in chronological order.
holdout_df.sort_values("timestamp", ascending=True).to_parquet(HOLDOUT_FP, index=False)

s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_DEFAULT_REGION,
)
s3.upload_file(HOLDOUT_FP, S3_BUCKET, HOLDOUT_FP)
print(f"Đã upload file {HOLDOUT_FP} lên s3://{S3_BUCKET}/{HOLDOUT_FP}")

Đã upload file holdout.parquet lên s3://recsys-ops/holdout.parquet


In [None]:
# NOTE(review): every line of this cell is commented out, so running it does nothing;
# the output saved beneath the cell comes from an earlier run of this code. Before
# re-enabling it (preferably as a module under src/ rather than a notebook cell):
#   - the saved output ends with "Failed to log drift details: 'drift_by_columns'",
#     i.e. result["metrics"][0]["result"] had no "drift_by_columns" key in that run;
#   - load_data() reads absolute paths under /home/duong/..., which only exist on
#     one machine;
#   - pd.api.types.is_datetime64tz_dtype is echoed twice in the saved output,
#     presumably by a deprecation warning; confirm against the installed pandas.
# import os
# import time

# import boto3
# import pandas as pd
# from evidently.metric_preset import DataDriftPreset
# from evidently.report import Report

# # ---- 1. Load configurations from environment variables ----
# REGION = os.getenv("AWS_REGION")
# ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
# SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

# # ---- 2. Column configuration ----
# cols_check = ["user_id", "parent_asin", "rating", "timestamp", "main_category", "price"]

# # ---- 3. Function to fix data types ----


# def fix_dtypes(df):
#     df = df.copy()
#     for col in cols_check:
#         if col not in df.columns:
#             df[col] = None
#     df["user_id"] = df["user_id"].astype(str)
#     df["parent_asin"] = df["parent_asin"].astype(str)
#     df["main_category"] = df["main_category"].astype(str)
#     df["rating"] = pd.to_numeric(df["rating"], errors="coerce").astype("float64")
#     df["price"] = pd.to_numeric(df["price"], errors="coerce").astype("float64")
#     df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
#     if pd.api.types.is_datetime64tz_dtype(df["timestamp"]):
#         df["timestamp"] = df["timestamp"].dt.tz_localize(None)
#     df = df[cols_check].dropna()
#     return df


# # ---- 4. Load reference and current data from parquet files ----


# def load_data():
#     print(
#         "Loading reference data from /home/duong/Documents/datn1/data/to_insert_df.parquet..."
#     )
#     df_ref = pd.read_parquet("/home/duong/Documents/datn1/data/to_insert_df.parquet")
#     print(
#         "Loading current data from /home/duong/Documents/datn1/notebooks/holdout.parquet..."
#     )
#     df_current = pd.read_parquet(
#         "/home/duong/Documents/datn1/notebooks/holdout.parquet"
#     )

#     print("Reference data shape:", df_ref.shape)
#     print("Current data shape:", df_current.shape)

#     df_ref = fix_dtypes(df_ref)
#     df_current = fix_dtypes(df_current)

#     print("Reference columns and types:")
#     print(df_ref.dtypes)
#     print(df_ref.head())
#     print("Current columns and types:")
#     print(df_current.dtypes)
#     print(df_current.head())

#     return df_ref, df_current


# # ---- 5. Function to log p-value and drift score for each column ----


# def log_drift_details(report):
#     # Extract drift score and p-value from Evidently output
#     result = report.as_dict()
#     try:
#         feature_metrics = result["metrics"][0]["result"]["drift_by_columns"]
#         for col, metrics in feature_metrics.items():
#             score = metrics.get("drift_score")
#             stattest = metrics.get("stattest_name")
#             p_value = metrics.get("stattest_threshold")
#             actual_pvalue = metrics.get("stattest_p_value")
#             detected = metrics.get("drift_detected")
#             print(
#                 f"[{col}] | Drift Score: {score:.4f} | p-value: {actual_pvalue:.4g} | StatTest: {stattest} | Drift Detected: {detected}"
#             )
#     except Exception as ex:
#         print("Failed to log drift details:", ex)


# # ---- 6. Function to upload HTML report to S3 ----


# def upload_html_to_s3(local_path, bucket, s3_key):
#     s3 = boto3.client(
#         "s3",
#         region_name=REGION,
#         aws_access_key_id=ACCESS_KEY,
#         aws_secret_access_key=SECRET_KEY,
#     )
#     s3.upload_file(local_path, bucket, s3_key, ExtraArgs={"ContentType": "text/html"})
#     print(f"Uploaded to s3://{bucket}/{s3_key}")


# # ---- 7. Main function ----


# def main():
#     # Load data
#     df_ref, df_current = load_data()

#     # Check for column and dtype mismatches
#     if not all(df_ref.columns == df_current.columns):
#         print("Column mismatch between reference and current data, exiting.")
#         print("df_ref.columns:", df_ref.columns)
#         print("df_current.columns:", df_current.columns)
#         return
#     if not all(df_ref.dtypes == df_current.dtypes):
#         print("Dtype mismatch between reference and current data, exiting.")
#         print("df_ref.dtypes:", df_ref.dtypes)
#         print("df_current.dtypes:", df_current.dtypes)
#         return

#     # Run Evidently report
#     print("Running Evidently report...")
#     report = Report(metrics=[DataDriftPreset()])
#     report.run(reference_data=df_ref, current_data=df_current)

#     # Log drift details
#     log_drift_details(report)

#     # Save HTML report
#     report_path = f"drift_report_{int(time.time())}.html"
#     report.save_html(report_path)
#     print(f"Successfully exported report to file: {report_path}")

#     # Upload to S3
#     s3_bucket = os.getenv("S3_BUCKET", "recsys-ops")
#     s3_key = os.getenv("S3_KEY", "drift_report.html")
#     upload_html_to_s3(report_path, bucket=s3_bucket, s3_key=s3_key)


# if __name__ == "__main__":
#     main()

Loading reference data from /home/duong/Documents/datn1/data/to_insert_df.parquet...
Loading current data from /home/duong/Documents/datn1/notebooks/holdout.parquet...
Reference data shape: (91561, 10)
Current data shape: (51, 10)
Reference columns and types:
user_id                  object
parent_asin              object
rating                  float64
timestamp        datetime64[ns]
main_category            object
price                   float64
dtype: object
                        user_id parent_asin  rating           timestamp  \
0  AH6CATODIVPVUOJEWHRSRCSKAOHA  B08NJSW98M     3.0 2014-10-27 00:54:11   
4  AH6CATODIVPVUOJEWHRSRCSKAOHA  B08DR6G1MY     5.0 2015-05-14 22:12:25   
5  AH6CATODIVPVUOJEWHRSRCSKAOHA  B00G70DFC2     5.0 2016-05-13 15:16:08   
6  AH6CATODIVPVUOJEWHRSRCSKAOHA  B00068Q7LC     5.0 2016-05-13 15:24:17   
9  AH6CATODIVPVUOJEWHRSRCSKAOHA  B08KQCK5Y1     5.0 2016-06-05 19:27:21   

  main_category  price  
0  Toys & Games  34.99  
4  Toys & Games  79.65  
5  Toys 

  if pd.api.types.is_datetime64tz_dtype(df["timestamp"]):
  if pd.api.types.is_datetime64tz_dtype(df["timestamp"]):


Failed to log drift details: 'drift_by_columns'
Successfully exported report to file: drift_report_1753530792.html
Uploaded to s3://recsys-ops/drift_report.html
