In [1]:

! pip install h5py



In [2]:
import sys
import h5py
from pathlib import Path
import zipfile, os, sys
import numpy as np
from collections import defaultdict
import re
import json
import pandas as pd

# Project layout is resolved relative to the notebook's working directory:
# ROOT is its parent, and ROOT/src is put on the import path so the local
# loader module imported right after this can be found.
ROOT = Path.cwd().parent
SRC = ROOT / "src"

# Re-running this cell must not stack duplicate entries on sys.path.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

from fashiongen_h5_loader import (ensure_h5_ready, build_vocab_from_h5, make_loaders_with_existing_val)

# 1. Import Data

In [4]:
# The data folder sits directly under ROOT; stop early if it is absent.
DATA_DIR = ROOT.joinpath("data")
assert DATA_DIR.exists(), f"Missing data dir: {DATA_DIR}"
print("DATA_DIR:", DATA_DIR)

DATA_DIR: /Users/ruilu/Shop-Till-You-Drop/data


In [5]:
def find_split_file(h5_paths, keyword):
    """Return the first path (as ``str``) whose file name contains ``keyword``.

    Matching is case-insensitive and looks at the file name only, not at the
    parent directories. Returns ``None`` when no path matches.
    """
    keyword = keyword.lower()
    return next((str(p) for p in h5_paths if keyword in p.name.lower()), None)


# 1) Find all .h5 files directly (no zip assumption); sorted for a stable order.
h5s = sorted(DATA_DIR.rglob("*.h5"))

print("DATA_DIR:", DATA_DIR)
print("Found H5 files:")
for p in h5s:
    print(" -", p, " -> ", p.name)

# 2) Identify splits by filename. "val" also matches "validation", so a single
#    keyword is enough for the validation split.
train_h5 = find_split_file(h5s, "train")
val_h5 = find_split_file(h5s, "val")
test_h5 = find_split_file(h5s, "test")

print("train_h5:", train_h5)
print("val_h5  :", val_h5)
print("test_h5 :", test_h5)

# train_h5 is opened directly by later cells, so fail here with a clear message
# instead of letting a None path reach h5py.
assert train_h5 is not None, f"No training .h5 file found under {DATA_DIR}"

DATA_DIR: /Users/ruilu/Shop-Till-You-Drop/data
Found H5 files:
 - /Users/ruilu/Shop-Till-You-Drop/data/fashiongen_256_256_train.h5  ->  fashiongen_256_256_train.h5
 - /Users/ruilu/Shop-Till-You-Drop/data/fashiongen_256_256_validation.h5  ->  fashiongen_256_256_validation.h5
train_h5: /Users/ruilu/Shop-Till-You-Drop/data/fashiongen_256_256_train.h5
val_h5  : /Users/ruilu/Shop-Till-You-Drop/data/fashiongen_256_256_validation.h5
test_h5 : None


In [6]:
# List the top-level keys stored in the training file.
with h5py.File(train_h5, "r") as train_file:
    top_level_keys = list(train_file.keys())
print("Keys in H5:", top_level_keys)

Keys in H5: ['index', 'index_2', 'input_brand', 'input_category', 'input_composition', 'input_concat_description', 'input_department', 'input_description', 'input_gender', 'input_image', 'input_msrpUSD', 'input_name', 'input_pose', 'input_productID', 'input_season', 'input_subcategory']


In [7]:
# h5s / train_h5 / val_h5 / test_h5 were already resolved by the discovery cell
# earlier in this notebook; re-scanning DATA_DIR here would only repeat that
# work, so this cell just reports what was found.
print("DATA_DIR:", DATA_DIR)
print("Found H5 files:")
for p in h5s:
    print(" -", p.name)

print("train_h5:", train_h5)
print("val_h5  :", val_h5)
print("test_h5 :", test_h5)

DATA_DIR: /Users/ruilu/Shop-Till-You-Drop/data
Found H5 files:
 - fashiongen_256_256_train.h5
 - fashiongen_256_256_validation.h5
train_h5: /Users/ruilu/Shop-Till-You-Drop/data/fashiongen_256_256_train.h5
val_h5  : /Users/ruilu/Shop-Till-You-Drop/data/fashiongen_256_256_validation.h5
test_h5 : None


In [8]:
# 2) Build the label vocabulary from the training split and write it to JSON.
#    The path is derived from DATA_DIR (rather than re-spelling "data" under
#    ROOT) so it stays in sync with where the .h5 files are read from.
vocab_path = DATA_DIR / "label_vocab.json"
_ = build_vocab_from_h5(  # return value is not needed; only the JSON file is used
    train_h5,
    label_key="input_category",
    add_other=True,
    min_count=5,
    out_json=str(vocab_path),
)

[vocab] Saved 49 labels to /Users/ruilu/Shop-Till-You-Drop/data/label_vocab.json


In [9]:
# 3. Build datasets/loaders
# Gather the call arguments first so the tunable settings are easy to scan.
loader_kwargs = dict(
    train_h5=train_h5,
    val_h5=val_h5,
    vocab_json=str(vocab_path),
    image_key="input_image",
    label_key="input_category",
    train_ratio=0.8,
    batch_size=64,
    num_workers=8,
)
(
    train_loader,
    val_loader,
    test_loader,
    train_ds,
    val_ds,
    test_ds,
) = make_loaders_with_existing_val(**loader_kwargs)

In [10]:
# Show the key attributes configured on the training dataset.
for attr_name in ("image_key", "label_key", "caption_key"):
    print(f"{attr_name}:", getattr(train_ds, attr_name))

image_key: input_image
label_key: input_category
caption_key: input_description


In [11]:
# For every key the training dataset uses, print the shape and dtype of the
# matching entry in the training file. Keys that are None are skipped.
with h5py.File(train_h5, "r") as h5_file:
    dataset_keys = (train_ds.image_key, train_ds.label_key, train_ds.caption_key)
    for key in dataset_keys:
        if key is not None:
            entry = h5_file[key]
            shape = getattr(entry, "shape", None)
            dtype = getattr(entry, "dtype", None)
            print(f"{key}: shape={shape}, dtype={dtype}")

input_image: shape=(260490, 256, 256, 3), dtype=uint8
input_category: shape=(260490, 1), dtype=|S100
input_description: shape=(260490, 1), dtype=|S400


In [12]:
pip install "numpy==1.26.4"


Note: you may need to restart the kernel to use updated packages.


In [13]:
import numpy as np
import torch, torchvision

# Print the installed version of each library, one per line.
for label, module in (("NumPy", np), ("Torch", torch), ("Torchvision", torchvision)):
    print(f"{label}:", module.__version__)

NumPy: 1.26.4
Torch: 2.2.2
Torchvision: 0.17.2


In [16]:
# Fetch a single batch from the training loader and print its image/label shapes.
a = next(iter(train_loader))
image_shape, label_shape = a["image"].shape, a["label"].shape
print(image_shape, label_shape)

torch.Size([64, 3, 256, 256]) torch.Size([64])


In [17]:
# Fetch a single batch from the test loader and print its image/label shapes.
b = next(iter(test_loader))
image_shape, label_shape = b["image"].shape, b["label"].shape
print(image_shape, label_shape)

torch.Size([64, 3, 256, 256]) torch.Size([64])


In [18]:
# Fetch a single batch from the validation loader and print its image/label shapes.
c = next(iter(val_loader))
image_shape, label_shape = c["image"].shape, c["label"].shape
print(image_shape, label_shape)

torch.Size([64, 3, 256, 256]) torch.Size([64])


In [None]:
#### Start building the model structure