In [None]:
import sys
import h5py
from pathlib import Path
import zipfile, os, sys
import numpy as np
from collections import defaultdict
import re
import json
import pandas as pd

ROOT = Path.cwd().parent          
SRC  = ROOT / "src"
sys.path.insert(0, str(SRC))      

from fashiongen_h5_loader import (
    ensure_h5_ready, build_vocab_from_h5, make_loaders_with_existing_val
)

ImportError: cannot import name 'make_loaders_with_existing_val' from 'fashiongen_h5_loader' (/Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/src/fashiongen_h5_loader.py)

# 1. Import Data

In [9]:
DATA_DIR = ROOT / "data"
assert DATA_DIR.exists(), f"Missing data dir: {DATA_DIR}"
print("DATA_DIR:", DATA_DIR)

DATA_DIR: /Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/data


In [None]:
# 1) Unzip both train/validation if needed; search h5 recursively
h5s = []
for zp in sorted(DATA_DIR.glob("*.zip")):
    out_dir = DATA_DIR / zp.stem
    out_dir.mkdir(parents=True, exist_ok=True)
    # extract only missing files
    with zipfile.ZipFile(zp, "r") as zf:
        for m in zf.infolist():
            # skip macOS metadata
            if m.filename.startswith("__MACOSX/"): 
                continue
            target = out_dir / m.filename
            if not target.exists():
                zf.extract(m, out_dir)
    # collect any .h5 inside (recursively)
    h5s.extend(sorted(out_dir.rglob("*.h5")))

In [None]:
h5s = sorted(set(h5s))
print("Found H5 files:")
for p in h5s: print(" -", p)

Found H5 files:
 - /Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/data/fashiongen_256_256_train.h5/fashiongen_256_256_train.h5
 - /Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/data/fashiongen_256_256_validation.h5/fashiongen_256_256_validation.h5


In [14]:
# 2) Identify splits by filename
lower = [p.name.lower() for p in h5s]
train_h5 = next((str(p) for p in h5s if "train" in p.name.lower()), None)
val_h5   = next((str(p) for p in h5s if "val" in p.name.lower() or "validation" in p.name.lower()), None)
test_h5  = next((str(p) for p in h5s if "test" in p.name.lower()), None)
print("train_h5:", train_h5)
print("val_h5  :", val_h5)
print("test_h5 :", test_h5)


train_h5: /Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/data/fashiongen_256_256_train.h5/fashiongen_256_256_train.h5
val_h5  : /Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/data/fashiongen_256_256_validation.h5/fashiongen_256_256_validation.h5
test_h5 : None


In [17]:
# 3) Build label vocab from train 
vocab_path = ROOT / "data/label_vocab.json"
_ = build_vocab_from_h5(
    train_h5,
    label_key="input_category",  
    add_other=True,
    min_count=5,
    out_json=str(vocab_path)
)

[vocab] Saved 49 labels to /Users/jiaqingzhang/Downloads/GT_DL/project/Shop-Till-You-Drop/data/label_vocab.json


In [None]:
# 4) Build datasets, Stratified 80/20
def _norm(s): return re.sub(r"\s+", " ", str(s).strip().lower())

with open(vocab_path) as f:
    vocab = json.load(f)
other_id = vocab.get("__other__", 0)

# Read labels from train.h5
with h5py.File(train_h5, "r") as f:
    lab_dset = f["input_category"]  
    N = lab_dset.shape[0]
    labels = np.empty(N, dtype=np.int64)
    for i in range(N):
        x = lab_dset[i]
        if not np.issubdtype(lab_dset.dtype, np.integer):
            x = _norm(x.decode() if hasattr(x, "decode") else str(x))
            labels[i] = vocab.get(x, other_id)
        else:
            labels[i] = int(x)

In [35]:
vc_pd = pd.Series(labels).value_counts().sort_index()
print(vc_pd.head(10))

1     2858
2      110
3      335
4       17
5       12
6     4515
7       71
8     1120
9     9458
10    1533
Name: count, dtype: int64


In [36]:
vc_desc = vc_pd.sort_values(ascending=False)  
print(vc_desc.head(10))

45    47764
42    44591
19    35035
30    23230
20    13586
35    11398
39    10478
9      9458
36     7416
37     6952
Name: count, dtype: int64


In [38]:
rng = np.random.default_rng(42)
train_idx, test_idx = [], []
by_class = defaultdict(list)
for i, y in enumerate(labels):
    by_class[int(y)].append(i)
for y, idxs in by_class.items():
    rng.shuffle(idxs)
    n = len(idxs)
    n_tr = int(n * 0.8)                 
    train_idx += idxs[:n_tr]
    test_idx  += idxs[n_tr:]
train_idx, test_idx = map(np.array, (train_idx, test_idx))

In [43]:
print(f"Train Size: {len(train_idx)};\nTest Size: {len(test_idx)}")

Train Size: 208375;
Test Size: 52115


In [None]:
# 4) Build datasets/loaders
train_ds, val_ds, test_ds = make_datasets_from_h5(
    train_h5=train_h5, val_h5=val_h5, test_h5=test_h5,
    vocab_json=str(vocab_path),
    image_key="images",
    label_key_candidates=("category","articleType","class","label"),
    caption_key_candidates=("captions","descriptions","caption","description"),
    image_size=256, normalize="imagenet",
    drop_unknown=False,
)
train_loader, val_loader, test_loader = make_loaders(
    train_ds, val_ds, test_ds,
    batch_size=64, num_workers=8, use_weighted_sampler=False
)