In [None]:
import sys

import datasets
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from pydantic import BaseModel

datasets.logging.set_verbosity_error()

sys.path.insert(0, "..")

from src.data_prep_utils import handle_dtypes, parse_dt

In [10]:
class Args(BaseModel):
    """Notebook-level settings collected in one pydantic model."""

    # Hugging Face Hub repository id handed to `load_dataset`.
    hf_dataset_path: str = "McAuley-Lab/Amazon-Reviews-2023"
    # Number of rows drawn from the training frame for the report plots.
    report_sample_num_rows: int = 10_000
    # Seed forwarded as `random_state` when the sample is drawn.
    random_seed: int = 41


# All defaults are used as-is; override fields here to tune the run.
args = Args()

# Echo the effective configuration so the rendered notebook records it.
print(args.model_dump_json())

{"hf_dataset_path":"McAuley-Lab/Amazon-Reviews-2023","report_sample_num_rows":10000,"random_seed":41}


In [4]:
# Load the "raw_meta_Toys_and_Games" configuration ("full" split) from the
# repository configured in `args.hf_dataset_path`.
# NOTE(review): trust_remote_code=True presumably allows the dataset
# repository's own loading script to run locally — confirm the source is trusted.
metadata = load_dataset(
    args.hf_dataset_path,
    name="raw_meta_Toys_and_Games",
    split="full",
    trust_remote_code=True,
)
# Show the first record to inspect the available fields.
metadata[0]

{'main_category': 'Toys & Games',
 'title': 'KUNGOON Happy Anniversary Balloon Banner,Wedding Anniversary Party Decorations,Love Party and Anniversary Party Supplies,16 Inch Gold Aluminum Foil.',
 'average_rating': 4.5,
 'rating_number': 241,
 'features': [],
 'description': [],
 'price': 'None',
 'images': {'hi_res': ['https://m.media-amazon.com/images/I/51tskkWgFmL._AC_SL1000_.jpg',
   'https://m.media-amazon.com/images/I/51pCkL9l5LL._AC_SL1000_.jpg',
   'https://m.media-amazon.com/images/I/61vjFq1pe1L._AC_SL1000_.jpg'],
  'large': ['https://m.media-amazon.com/images/I/41enwmDCoZL._AC_.jpg',
   'https://m.media-amazon.com/images/I/414NxCPIOmL._AC_.jpg',
   'https://m.media-amazon.com/images/I/516olygyHAL._AC_.jpg'],
  'thumb': ['https://m.media-amazon.com/images/I/41enwmDCoZL._AC_US40_.jpg',
   'https://m.media-amazon.com/images/I/414NxCPIOmL._AC_US40_.jpg',
   'https://m.media-amazon.com/images/I/516olygyHAL._AC_US40_.jpg'],
  'variant': ['MAIN', 'PT01', 'PT02']},
 'videos': {'title

In [5]:
# Load the "5core_timestamp_Toys_and_Games" configuration. No `split=` is
# passed, so the result is indexed by split name below.
dataset = load_dataset(
    args.hf_dataset_path,
    name="5core_timestamp_Toys_and_Games",
    trust_remote_code=True,
)

# Materialise the "train" split as a pandas DataFrame for the analysis cells.
train_raw_df = dataset["train"].to_pandas()
train_raw_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B006GBITXC,3.0,1452647382000
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00TLEMSVK,4.0,1454675785000
2,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00SO7HF6I,3.0,1454676014000
3,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00MZG6OO8,3.0,1471541996000
4,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B007JWWUDW,5.0,1471542588000
...,...,...,...,...
3114942,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00BY2ER66,1.0,1379154911000
3114943,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00CVDMCH8,3.0,1379155054000
3114944,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00BY2ER6G,5.0,1379155144000
3114945,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00I8Z6GAM,5.0,1420664276000


In [6]:
# Convert the loaded metadata split into a pandas DataFrame and display it.
metadata_raw_df = metadata.to_pandas()
metadata_raw_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Toys & Games,"KUNGOON Happy Anniversary Balloon Banner,Weddi...",4.5,241,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': ['Pretty Cool!', 'Product assembly a...",Kunggo,[],"{""Package Dimensions"": ""10.12 x 8.03 x 0.51 in...",B08GPM7CQN,,,
1,Toys & Games,Gothic Mothman Plushie Doll with Bright Red Ey...,1.3,2,[🦋 Mothman’s bright red eyes could stare you d...,[🦋 Description: Mothman’s bright red eyes coul...,18.99,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Felicy,"[Toys & Games, Stuffed Animals & Plush Toys, P...","{""Item Weight"": ""2.47 ounces"", ""Manufacturer r...",B09X9XW42H,,,
2,Toys & Games,Melody Jane Dollhouse Builders DIY 1:24 Scale ...,4.2,67,[1:24 Scale - Plastic - Approximate cut out si...,[],,"{'hi_res': [None, 'https://m.media-amazon.com/...",{'title': ['Cutemini wooden window double door...,Melody Jane Dolls Houses,"[Toys & Games, Dolls & Accessories, Dollhouse ...","{""Item Weight"": ""0.48 ounces"", ""Manufacturer r...",B01I9QET6M,,,
3,Toys & Games,Traxxas Stampede 4X4: 1/10 Scale 4wd Monster T...,4.5,48,[Waterproof electronics for all-weather drivin...,[Stampede 4X4 is built Traxxas Tough to withst...,,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Traxxas Slash 2WD Short Course Rac...,Traxxas,"[Toys & Games, Remote & App Controlled Vehicle...","{""Product Dimensions"": ""15.63 x 13.39 x 8.94 i...",B019XEEX1A,,,
4,Toys & Games,Hot Wheels Monster Truck 1:24 Scale 2022 Bone ...,4.8,17699,[Designed in 1:24 scale with durable die-cast ...,[The Hot Wheels Monster Trucks 1:24 scale die-...,27.98,{'hi_res': ['https://m.media-amazon.com/images...,{'title': ['Hot Wheels 1:24 Scale Monster Truc...,Hot Wheels,"[Toys & Games, Preschool, Pre-Kindergarten Toys]","{""Product Dimensions"": ""5 x 6.27 x 5.5 inches""...",B09G7K3JWQ,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890869,Toys & Games,Dollhouse Miniature 1:12 Scale Fire Place Acce...,4.6,2,[],[Unless stated otherwise this item is 1:12 sca...,16.09,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Melody Jane Dolls Houses,"[Toys & Games, Dolls & Accessories, Dollhouse ...","{""Product Dimensions"": ""2.99 x 2.52 x 0.08 inc...",B00BGO1PDU,,,
890870,Sports & Outdoors,Hacko Games Pride Deck Poker Cards,4.6,5,[Custom deck of playing cards],[Pride is a fantastically color card system. A...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Hacko Games,"[Toys & Games, Games & Accessories, Card Games...","{""Item Package Dimensions L x W x H"": ""3.54 x ...",B07T16B3W1,,,
890871,Toys & Games,Mini Squee-Z-Bubs & Bubbles (Sold Individually...,3.7,7,"[Toysmith 774546 Mini Squee-z Bubbles, Educati...",[Toysmith 774546 Mini Squee-z Bubbles. Toysmit...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Toysmith,"[Toys & Games, Sports & Outdoor Play, Bubbles,...","{""Product Dimensions"": ""2 x 4.2 x 1.1 inches"",...",B002IOZ92K,,,
890872,Toys & Games,Sentosphère Aquarellum Junior Butterflies & Fl...,4.6,141,"[Complete kit., Paint without going over the l...",[Fantastic. A few drops of paint and any child...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Sentosphère,"[Toys & Games, Arts & Crafts, Craft Kits, Pain...","{""Product Dimensions"": ""10.43 x 7.68 x 1.18 in...",B06XJVLKDD,,,


In [7]:
# Run the project helpers in order: `parse_dt` first, then `handle_dtypes`.
# NOTE(review): judging by the displayed frames, `parse_dt` turns the integer
# `timestamp` column into datetimes; the casts done by `handle_dtypes` are not
# visible in this notebook — see src/data_prep_utils.
train_df = handle_dtypes(parse_dt(train_raw_df))
train_df

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B006GBITXC,3.0,2016-01-13 01:09:42
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00TLEMSVK,4.0,2016-02-05 12:36:25
2,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00SO7HF6I,3.0,2016-02-05 12:40:14
3,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B00MZG6OO8,3.0,2016-08-18 17:39:56
4,AGKASBHYZPGTEPO6LWZPVJWB2BVA,B007JWWUDW,5.0,2016-08-18 17:49:48
...,...,...,...,...
3114942,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00BY2ER66,1.0,2013-09-14 10:35:11
3114943,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00CVDMCH8,3.0,2013-09-14 10:37:34
3114944,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00BY2ER6G,5.0,2013-09-14 10:39:04
3114945,AHMDS2PYZIJWE6SBXGDTBSJ4SRLA,B00I8Z6GAM,5.0,2015-01-07 20:57:56


## Interaction distribution over time

In [13]:
def plot_interaction_by_dayofweek(df, timezone="US/Eastern"):
    """Show a bar chart of the number of interactions per day of the week.

    Args:
        df: DataFrame with a timezone-naive datetime ``timestamp`` column. The
            values are localized as UTC before being converted to ``timezone``.
        timezone: Name of the time zone in which the day of the week is
            evaluated.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    day_order = [
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    ]

    local_timestamps = df["timestamp"].dt.tz_localize("UTC").dt.tz_convert(timezone)
    plot_df = df.assign(day_of_week=local_timestamps.dt.day_name())

    # fill_value=0 keeps days with no interactions as an integer zero. Without
    # it a missing day becomes NaN, which is rendered as a "nan" bar label and
    # silently casts every count to float.
    interaction_count_by_day = (
        plot_df.groupby("day_of_week").size().reindex(day_order, fill_value=0)
    )

    # Create the plot
    fig = px.bar(
        x=interaction_count_by_day.index,
        y=interaction_count_by_day.values,
        labels={"x": "Day of the Week", "y": "Number of Interactions"},
        title=f"Interaction Count by Day of the Week (assuming {timezone} timezone)",
        text=[f"{val:,.0f}" for val in interaction_count_by_day.values],
        height=500,
        width=700,
    )

    fig.update_layout(xaxis_tickmode="linear")

    fig.show()


timezone = "US/Eastern"  # Assumed time zone for this Amazon data
# Cap the sample size at the number of available rows: sampling without
# replacement raises ValueError when more rows are requested than exist.
train_sample_df = train_df.sample(
    min(args.report_sample_num_rows, len(train_df)),
    replace=False,
    random_state=args.random_seed,
)
plot_interaction_by_dayofweek(train_sample_df, timezone)

In [14]:
def plot_interaction_by_hour(df, timezone="US/Eastern"):
    """Show a bar chart of the number of interactions per hour of the day.

    Args:
        df: DataFrame with a timezone-naive datetime ``timestamp`` column. The
            values are localized as UTC before being converted to ``timezone``.
        timezone: Name of the time zone in which the hour is evaluated.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Local wall-clock hour of every interaction.
    localized = df["timestamp"].dt.tz_localize("UTC").dt.tz_convert(timezone)
    plot_df = df.assign(hour_of_day=localized.dt.hour)

    # One count per hour 0..23; hours with no interactions are reported as 0.
    per_hour = plot_df.groupby("hour_of_day").size()
    interaction_count_by_hour = per_hour.reindex(range(24), fill_value=0)

    counts = interaction_count_by_hour.values
    bar_labels = [f"{val:,.0f}" for val in counts]

    fig = px.bar(
        x=interaction_count_by_hour.index,
        y=counts,
        labels={"x": "Hour of the Day", "y": "Number of Interactions"},
        title=f"Interaction Count by Hour of the Day (assuming {timezone} timezone)",
        text=bar_labels,
        height=500,
        width=1200,
    )
    fig.update_layout(xaxis_tickmode="linear")
    fig.show()


# Assumed time zone for this Amazon data.
timezone = "US/Eastern"
plot_interaction_by_hour(df=train_sample_df, timezone=timezone)

In [15]:
def plot_interaction_by_month(df, timezone="US/Eastern"):
    """Show a bar chart of the number of interactions per calendar month.

    Args:
        df: DataFrame with a ``timestamp`` column. Values that are not already
            datetimes are parsed with ``pd.to_datetime``. The timestamps must be
            timezone-naive; they are localized as UTC before being converted
            to ``timezone``. ``df`` itself is never modified.
        timezone: Name of the time zone in which the month is evaluated.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    # Parse into a local Series instead of assigning back into ``df``: writing
    # to the caller's frame is a hidden side effect and triggers
    # SettingWithCopy warnings when ``df`` is a sample or slice.
    timestamps = df["timestamp"]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps)

    # Extract the month from the timestamp (localized)
    local_timestamps = timestamps.dt.tz_localize("UTC").dt.tz_convert(timezone)
    plot_df = df.assign(month_of_year=local_timestamps.dt.month)

    # Group by month and count number of interactions (0 for empty months)
    interaction_count_by_month = (
        plot_df.groupby("month_of_year").size().reindex(range(1, 13), fill_value=0)
    )

    # Plot
    fig = px.bar(
        x=interaction_count_by_month.index,
        y=interaction_count_by_month.values,
        labels={"x": "Month", "y": "Number of Interactions"},
        title=f"Interaction Count by Month (assuming {timezone} timezone)",
        text=[f"{val:,.0f}" for val in interaction_count_by_month.values],
        height=500,
        width=1200,
    )

    # tickvals/ticktext are only honoured when tickmode is "array"; with
    # "linear" Plotly ignores them and the month names are never shown.
    fig.update_layout(
        xaxis_tickmode="array",
        xaxis_tickvals=list(interaction_count_by_month.index),
        xaxis_ticktext=[
            pd.to_datetime(str(month), format="%m").strftime("%B")
            for month in interaction_count_by_month.index
        ],
    )

    fig.show()


# Assumed time zone, matching the other time-distribution plots in this notebook.
timezone = "US/Eastern"
plot_interaction_by_month(df=train_sample_df, timezone=timezone)

## Sparsity


In [16]:
def calculate_sparsity(df, user_col="user_id", item_col="parent_asin"):
    """Return the sparsity of the user-item interaction matrix implied by ``df``.

    Sparsity is ``1 - filled_cells / (n_users * n_items)``, where
    ``filled_cells`` is the number of distinct (user, item) pairs. Repeated
    interactions between the same user and item fill a single cell, so they
    are counted once; counting raw rows instead would understate sparsity and
    can even produce a negative value.

    Args:
        df: Interaction DataFrame with one row per interaction.
        user_col: Name of the column identifying the user.
        item_col: Name of the column identifying the item.

    Returns:
        The fraction of empty cells in the user-item matrix.

    Raises:
        ZeroDivisionError: If ``df`` contains no users or no items.
    """
    n_users = df[user_col].nunique()
    n_items = df[item_col].nunique()
    # dropna() mirrors nunique(), which ignores missing ids.
    n_filled = len(df[[user_col, item_col]].dropna().drop_duplicates())
    return 1 - n_filled / (n_users * n_items)


# Report the sparsity of the full training frame as a percentage with four decimals.
print(f"Sparsity: {calculate_sparsity(train_df):,.4%}")

Sparsity: 99.9944%
