In [2]:
import pandas as pd
from pathlib import Path

# -----------------------------------------------------
# Load RAW data (2015-2025)
# -----------------------------------------------------
RAW_PATH = Path("../data/raw")

# Path.glob yields entries in arbitrary, filesystem-dependent order. Sorting
# keeps the concatenated row order (and therefore the order in which the
# categories first appear in the listing below) reproducible across re-runs.
raw_files = sorted(RAW_PATH.glob("year=*/*.parquet"))

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list, so stop early with a message that names the directory searched.
if not raw_files:
    raise FileNotFoundError(
        f"No parquet files matching 'year=*/*.parquet' under {RAW_PATH.resolve()}"
    )

df = pd.concat(
    [pd.read_parquet(p) for p in raw_files],
    ignore_index=True
)

# Normalise dtypes once, right after loading.
df['calendar_date'] = pd.to_datetime(df['calendar_date'])
# errors='coerce' turns unparseable case counts into NaN instead of raising.
df['cases'] = pd.to_numeric(df['cases'], errors='coerce')
df['year'] = df['calendar_date'].dt.year

print("Data Loaded:", df['year'].min(), "to", df['year'].max())

# Ensure category column is string.
# NOTE(review): astype(str) may render missing values as the literal text
# 'nan' -- confirm the raw data has no null categories.
df['category'] = df['category'].astype(str)

# Year-wise unique categories: one row per year, with the 'category' column
# holding the array of distinct categories in order of first appearance.
year_wise_categories = (
    df.groupby('year')['category']
      .unique()
      .reset_index()
)

# Print one bulleted list of categories per year.
for year, categories in zip(
    year_wise_categories['year'], year_wise_categories['category']
):
    print(f"\nYear: {year}")
    for cat in categories:
        print(f" - {cat}")


Data Loaded: 2015 to 2025

Year: 2015
 - Shoes
 - Ammunition
 - Apparel
 - Other

Year: 2016
 - Ammunition
 - Other
 - Shoes
 - Apparel

Year: 2017
 - Ammunition
 - Shoes
 - Other
 - Apparel

Year: 2018
 - Shoes
 - Ammunition
 - Other
 - Apparel

Year: 2019
 - Apparel
 - Ammunition
 - Shoes
 - Other

Year: 2020
 - Apparel
 - Shoes
 - Ammunition
 - Other

Year: 2021
 - Apparel
 - Shoes
 - Ammunition
 - Other

Year: 2022
 - Ammunition
 - Apparel
 - Other
 - Shoes

Year: 2023
 - Other
 - Shoes
 - Ammunition
 - Apparel

Year: 2024
 - Apparel
 - Other
 - Shoes
 - Ammunition

Year: 2025
 - Apparel
 - Ammunition
 - Shoes
 - Other


In [3]:
# Export the distinct (calendar_date, year, category) combinations to CSV.
#
# calendar_date is already converted to datetime when the raw data is loaded,
# so it is not re-converted here; this cell only reads from `df` and does not
# mutate it.
day_year_category = (
    df[['calendar_date', 'year', 'category']]
    .drop_duplicates()
    # 'category' is the final tiebreaker so that rows sharing a date are
    # written in a reproducible order regardless of input row order.
    .sort_values(['year', 'calendar_date', 'category'])
)

# Written relative to the notebook's working directory; the index is omitted
# because it carries no information after de-duplication and sorting.
day_year_category.to_csv(
    "day_year_wise_categories.csv",
    index=False
)

print("Day-wise category file created")


Day-wise category file created
