In [None]:
import boto3
import pandas as pd
import json
import re

# ---- Setup ----
# AWS region passed to the DynamoDB client below.
REGION = "eu-west-2"
# DynamoDB table that fetch_data() scans.
TABLE_NAME = "FashionAnalysis"
# S3 bucket that save_top10() uploads the aggregated files to.
BUCKET = "fashion-trend-forecast-data"

# Low-level DynamoDB client pinned to REGION.
dynamodb = boto3.client("dynamodb", region_name=REGION)
# NOTE(review): unlike the DynamoDB client, no region_name is passed here, so
# the S3 client relies on the ambient boto3/AWS configuration — confirm that
# this is intended.
s3 = boto3.client("s3")

# ---- Normalization Helpers ----
# Characters treated as plain spaces before matching, so that "Fall/Winter",
# "fall_winter", "Fall & Winter" and "Fall-Winter" all reduce to the same text.
_SEASON_SEPARATORS = str.maketrans({ch: " " for ch in "_-/&\u2013\u2014"})

# A leading season keyword, at most one following word, then a 4-digit year.
_SEASON_PATTERN = re.compile(
    r"(fall|spring|summer|winter|pre fall|prefall|resort|couture)\s*(\w*)\s*(\d{4})"
)


def normalize_season(season: str) -> str:
    """Return a canonical "<Season> <Year>" label for a free-form season string.

    The input is lower-cased, separator characters (``_ - / &`` and en/em
    dashes) are turned into spaces, and runs of whitespace are collapsed.
    If the cleaned text starts with a known season keyword followed
    (optionally after one extra word) by a four-digit year, the result is one
    of ``Fall-Winter``, ``Spring-Summer``, ``Pre-Fall``, ``Resort``,
    ``Couture`` or the title-cased keyword, followed by the year.

    Args:
        season: Raw season text, e.g. ``"fall_winter 2024"``.

    Returns:
        The normalized label. Falsy input (``None``, ``""``) is returned
        unchanged; text that does not match the pattern is returned stripped
        and title-cased.
    """
    if not season:
        return season

    cleaned = " ".join(season.lower().translate(_SEASON_SEPARATORS).split())

    match = _SEASON_PATTERN.match(cleaned)
    if match is None:
        # Fallback: title case original
        return season.strip().title()

    # The keyword checks look at the whole cleaned string, not only the
    # matched prefix, so e.g. "spring 2024 couture" is labelled "Couture".
    if "fall" in cleaned and "winter" in cleaned:
        season_name = "Fall-Winter"
    elif "spring" in cleaned and "summer" in cleaned:
        season_name = "Spring-Summer"
    elif "pre fall" in cleaned or "prefall" in cleaned:
        season_name = "Pre-Fall"
    elif "resort" in cleaned:
        season_name = "Resort"
    elif "couture" in cleaned:
        season_name = "Couture"
    else:
        season_name = match.group(1).title()

    return f"{season_name} {match.group(3)}"


# A single letter that follows an apostrophe at the end of a word, as in the
# possessive "Tod'S" that str.title() produces.
_POSSESSIVE_SUFFIX = re.compile(r"(?<=\w)(['\u2019])(\w)\b")


def normalize_brand(brand: str) -> str:
    """Return a title-cased brand name with tidy whitespace.

    Leading/trailing whitespace is removed and internal runs of whitespace
    are collapsed to one space, so spacing differences do not produce
    distinct brand values. ``str.title()`` treats an apostrophe as a word
    boundary and would turn ``"tod's"`` into ``"Tod'S"``; a single letter
    following an apostrophe at the end of a word is therefore lower-cased
    again. Longer suffixes such as ``"L'Oreal"`` are left as ``title()``
    produced them.

    Args:
        brand: Raw brand/designer name.

    Returns:
        The normalized name. Falsy input (``None``, ``""``) is returned
        unchanged.
    """
    if not brand:
        return brand

    titled = " ".join(brand.split()).title()
    return _POSSESSIVE_SUFFIX.sub(
        lambda m: m.group(1) + m.group(2).lower(), titled
    )

# ---- Fetch Data from DynamoDB ----
def fetch_data():
    """Scan the whole DynamoDB table and return its rows as a DataFrame.

    Every page of the scan is read by following ``LastEvaluatedKey``. From
    each item only the string (``"S"``) value of the attributes ``season``,
    ``designer``, ``color_name``, ``item_name`` and ``materials`` is kept; an
    attribute that is absent or has no ``"S"`` value becomes ``None``.
    ``season`` and ``designer`` are passed through ``normalize_season`` and
    ``normalize_brand``.

    Returns:
        A ``pandas.DataFrame`` with the five columns listed above. The
        columns are present even when the table is empty, so callers can
        group on them without a ``KeyError``.
    """
    columns = ["season", "designer", "color_name", "item_name", "materials"]

    def string_attr(item, name):
        # Items use DynamoDB's typed JSON, e.g. {"season": {"S": "..."}}.
        return item.get(name, {}).get("S")

    items = []
    scan_kwargs = {"TableName": TABLE_NAME}
    while True:
        response = dynamodb.scan(**scan_kwargs)
        items.extend(response["Items"])

        # A page without LastEvaluatedKey is the final one.
        last_key = response.get("LastEvaluatedKey")
        if not last_key:
            break
        scan_kwargs["ExclusiveStartKey"] = last_key

    # Flatten into list of dicts
    rows = [
        {
            "season": normalize_season(string_attr(item, "season")),
            "designer": normalize_brand(string_attr(item, "designer")),
            "color_name": string_attr(item, "color_name"),
            "item_name": string_attr(item, "item_name"),
            "materials": string_attr(item, "materials"),
        }
        for item in items
    ]
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# ---- Aggregate and Save ----
def save_top10(df, group_key, output_key, top_n=10):
    """Count (season, designer, group_key) combinations and upload the top rows.

    Rows of ``df`` are grouped by ``season``, ``designer`` and ``group_key``
    and counted into a ``frequency`` column. The counts are sorted by
    descending frequency and the first ``top_n`` rows of each season are
    kept. The result is serialized as newline-delimited JSON (one record per
    line) and written to ``BUCKET`` under ``output_key``.

    Args:
        df: DataFrame containing ``season``, ``designer`` and ``group_key``
            columns.
        group_key: Name of the column to rank (e.g. ``"color_name"``).
        output_key: S3 object key to write.
        top_n: Number of rows kept per season; defaults to 10.

    Note:
        pandas' ``groupby`` excludes rows whose grouping keys are missing by
        default, so such rows do not contribute to the counts.
    """
    # Group and count
    counts = (
        df.groupby(["season", "designer", group_key])
          .size()
          .reset_index(name="frequency")
          # A stable sort keeps tied frequencies in the order produced by the
          # grouping step, so the rows that survive the cut are well-defined.
          .sort_values("frequency", ascending=False, kind="mergesort")
    )

    # Take the top rows per season
    top_rows = counts.groupby("season").head(top_n)

    # Convert to JSON (NDJSON format)
    records = top_rows.to_dict(orient="records")
    json_str = "\n".join(json.dumps(record) for record in records)

    # Upload to S3
    s3.put_object(
        Bucket=BUCKET,
        Key=output_key,
        Body=json_str,
        ContentType="application/json"
    )
    print(f"✅ Uploaded {output_key} to S3.")

# ---- Main ----
def main():
    """Fetch the table, print a preview, and upload one ranking per attribute."""
    frame = fetch_data()
    print("Fetched data sample:", frame.head())

    # (column to rank, destination S3 key), processed in this order:
    # colors, items, materials.
    exports = (
        ("color_name", "processed/colors.json"),
        ("item_name", "processed/items.json"),
        ("materials", "processed/materials.json"),
    )
    for column, destination in exports:
        save_top10(frame, column, destination)

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Fetched data sample:                season       designer color_name item_name materials
0  Spring-Summer 2024        Miu Miu      black       bag   leather
1  Spring Summer 2025  Louis Vuitton        red     dress      silk
2  Spring Summer 2025        Miu Miu       gray     dress    cotton
3  Spring Summer 2025         Chanel      black    jacket     denim
4    Fall-Winter 2024        Miu Miu      black    jacket     nylon
✅ Uploaded processed/colors.json to S3.
✅ Uploaded processed/items.json to S3.
✅ Uploaded processed/materials.json to S3.
