In [3]:
import os
# Create the local "data" output directory used by the cells below.
# exist_ok=True makes re-running this cell a no-op instead of an error.
os.makedirs("data", exist_ok=True)


In [4]:
import time, re
import requests
import pandas as pd

# Artworks endpoint that fetch_page() queries, and the CSV the scrape loop writes.
BASE = "https://openaccess-api.clevelandart.org/api/artworks/"
OUT = "data/cma_objects_20k.csv"

# One shared Session for all page requests; `headers` is passed on every call.
session = requests.Session()
headers = {"User-Agent": "Mozilla/5.0", "Accept": "application/json"}

# Case-insensitive keyword filter: an object is kept only if one of these
# printmaking terms appears in its text fields (the list can be widened later).
printish = re.compile(r"(print|poster|lithograph|etching|engraving|woodcut|linocut|screenprint|serigraph|aquatint)",
                      re.IGNORECASE)

def fetch_page(skip, limit=1000):
    """Request one page of artworks from the API and return the decoded JSON.

    Parameters
    ----------
    skip : int
        Value sent as the ``skip`` query parameter (offset of the page).
    limit : int, default 1000
        Value sent as the ``limit`` query parameter (page size).

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status (via ``raise_for_status``).
    """
    response = session.get(
        BASE,
        params={"skip": skip, "limit": limit},
        headers=headers,
        timeout=60,  # seconds
    )
    response.raise_for_status()
    return response.json()

target = 20000  # stop once this many rows have been kept
limit = 1000    # page size requested from the API
skip = 0        # offset of the next page to request

rows = []
seen_ids = set()


def _record_from_object(obj):
    """Turn one API object into a CSV row dict, or return None if it is filtered out.

    An object is dropped when it has no web-sized image URL or when none of
    its text fields matches the `printish` keyword pattern.
    """
    # image URL (CMA usually stores images under "images"); prefer the
    # web-sized url. Type checks keep an unexpected payload shape from
    # aborting the whole scrape.
    images = obj.get("images") or {}
    web = images.get("web") if isinstance(images, dict) else None
    img = web.get("url") if isinstance(web, dict) else None
    if not img:
        return None

    # Use whichever text fields exist to filter print/poster-like work
    technique = obj.get("technique") or ""
    medium = obj.get("medium") or ""
    type_ = obj.get("type") or ""
    dept = obj.get("department") or ""
    title = obj.get("title") or ""

    hay = " | ".join([technique, medium, type_, dept, title])
    if not printish.search(hay):
        return None

    # Only the first creator is recorded; guard against entries that are not dicts.
    creators = obj.get("creators") or []
    artist = ""
    if isinstance(creators, list) and creators and isinstance(creators[0], dict):
        artist = creators[0].get("description") or creators[0].get("name") or ""

    return {
        "source": "CMA",
        "id": obj.get("id"),
        "title": title,
        "artist": artist,
        "department": dept,
        "type": type_,
        "technique": technique,
        "medium": medium,
        "creation_date": obj.get("creation_date") or "",
        # NOTE(review): the saved CSV shows values like "['Italy, 15th century']",
        # i.e. this field arrives as a list and is written as its Python repr.
        # Consider joining it into a plain string if downstream code needs text.
        "culture": obj.get("culture") or "",
        "image_url": img,
        "canonical_url": obj.get("url") or ""
    }


completed = False
try:
    while len(rows) < target:
        data = fetch_page(skip=skip, limit=limit)
        items = data.get("data", [])
        if not items:
            # An empty page means the collection has been exhausted.
            break

        for obj in items:
            oid = obj.get("id")
            if oid in seen_ids:
                continue
            seen_ids.add(oid)

            record = _record_from_object(obj)
            if record is None:
                continue
            rows.append(record)

            # Periodic checkpoint so a long scrape can be inspected while running.
            if len(rows) % 250 == 0:
                pd.DataFrame(rows).to_csv(OUT, index=False)
                print(f"Kept {len(rows)}/{target} | skip={skip}")

            if len(rows) >= target:
                break

        skip += limit
        print(f"Scanned up to skip={skip} | kept={len(rows)}")
        time.sleep(0.15)  # polite pacing
    completed = True
finally:
    # Runs on normal completion, on HTTP errors and on KeyboardInterrupt, so
    # rows collected since the last checkpoint are never lost. When a run
    # fails before keeping anything, an existing CSV is left untouched.
    if completed or rows:
        df = pd.DataFrame(rows)
        df.to_csv(OUT, index=False)
        print("DONE:" if completed else "PARTIAL:", len(df), "saved to", OUT)


Scanned up to skip=1000 | kept=11
Scanned up to skip=2000 | kept=15
Scanned up to skip=3000 | kept=69
Scanned up to skip=4000 | kept=134
Scanned up to skip=5000 | kept=222
Kept 250/20000 | skip=5000
Scanned up to skip=6000 | kept=300
Scanned up to skip=7000 | kept=352
Scanned up to skip=8000 | kept=425
Kept 500/20000 | skip=8000
Scanned up to skip=9000 | kept=506
Scanned up to skip=10000 | kept=600
Kept 750/20000 | skip=10000
Scanned up to skip=11000 | kept=767
Scanned up to skip=12000 | kept=905
Scanned up to skip=13000 | kept=905
Kept 1000/20000 | skip=13000
Scanned up to skip=14000 | kept=1012
Kept 1250/20000 | skip=14000
Scanned up to skip=15000 | kept=1251
Scanned up to skip=16000 | kept=1477
Kept 1500/20000 | skip=16000
Scanned up to skip=17000 | kept=1692
Kept 1750/20000 | skip=17000
Scanned up to skip=18000 | kept=1823
Kept 2000/20000 | skip=18000
Scanned up to skip=19000 | kept=2059
Scanned up to skip=20000 | kept=2145
Kept 2250/20000 | skip=20000
Scanned up to skip=21000 | ke

In [5]:
import pandas as pd
# Reload the object metadata CSV from disk and preview its first 10 rows.
df = pd.read_csv("data/cma_objects_20k.csv")
df.head(10)


Unnamed: 0,source,id,title,artist,department,type,technique,medium,creation_date,culture,image_url,canonical_url
0,CMA,143142,Battle of the Nudes,"Antonio del Pollaiuolo (Italian, 1431/32–1498)",Prints,Print,engraving,,1470s–80s,"['Italy, 15th century']",https://openaccess-cdn.clevelandart.org/1967.1...,https://clevelandart.org/art/1967.127
1,CMA,109147,Closed Eyes,"Odilon Redon (French, 1840–1916)",Prints,Print,lithograph on China paper laid on wove paper,,1890,"['France, 19th century']",https://openaccess-cdn.clevelandart.org/1927.3...,https://clevelandart.org/art/1927.306
2,CMA,111654,"South Wind, Clear Sky, from Thirty-Six Views o...","Katsushika Hokusai (Japanese, 1760–1849)",Japanese Art,Print,color woodblock print,,early 1830s,"['Japan, Edo period (1615–1868)']",https://openaccess-cdn.clevelandart.org/1930.1...,https://clevelandart.org/art/1930.189
3,CMA,112586,Venus Reclining in a Landscape,"Giulio Campagnola (Italian, 1482–1515)",Prints,Print,engraving,,c. 1508–9,"['Italy, late 15th-early 16th Century']",https://openaccess-cdn.clevelandart.org/1931.2...,https://clevelandart.org/art/1931.205
4,CMA,103244,Sudden Shower over Shin-Ōhashi Bridge and Atak...,"Utagawa Hiroshige (Japanese, 1797–1858)",Japanese Art,Print,color woodblock print,,1857,"['Japan, Edo period (1615–1868)']",https://openaccess-cdn.clevelandart.org/1921.3...,https://clevelandart.org/art/1921.318
5,CMA,108501,Melencolia I,"Albrecht Dürer (German, 1471–1528)",Prints,Print,engraving,,1514,['Germany'],https://openaccess-cdn.clevelandart.org/1926.2...,https://clevelandart.org/art/1926.211
6,CMA,128048,Equestrian Portrait of the Emperor Maximilian,"Hans Burgkmair (German, 1473–1531)",Prints,Print,"woodcut, in black and white on paper washed blue",,1508,['Germany'],https://openaccess-cdn.clevelandart.org/1950.7...,https://clevelandart.org/art/1950.72
7,CMA,106442,Virgin and Child with the Infant Saint John,"Cristofano Robetta (Italian, 1462–1534)",Prints,Print,engraving,,1500–1520,"['Italy, 16th century']",https://openaccess-cdn.clevelandart.org/1924.5...,https://clevelandart.org/art/1924.514
8,CMA,111675,Ichikawa Ebizō IV (Danjurō) as Takemura Sadano...,"Tōshūsai Sharaku (Japanese, active 1794–95)",Japanese Art,Print,color woodblock print,,1794,"['Japan, Edo period (1615–1868)']",https://openaccess-cdn.clevelandart.org/1930.2...,https://clevelandart.org/art/1930.205
9,CMA,111676,"Otani Tokuji I as the Retainer Sodesuke in ""Fl...","Tōshūsai Sharaku (Japanese, active 1794–95)",Japanese Art,Print,color woodblock print,,1794,"['Japan, Edo period (1615–1868)']",https://openaccess-cdn.clevelandart.org/1930.2...,https://clevelandart.org/art/1930.206


In [9]:

import os, time, hashlib
import pandas as pd
import requests

# Input metadata CSV (must contain an "image_url" column) and the directory
# that receives the downloaded image files.
IN_CSV = "data/cma_objects_20k.csv"
IMG_DIR = "data/images_cma"
os.makedirs(IMG_DIR, exist_ok=True)  # safe to re-run

df = pd.read_csv(IN_CSV)

# One shared Session for all image requests; `headers` is passed on every call.
session = requests.Session()
headers = {"User-Agent": "Mozilla/5.0"}

def safe_name(url):
    """Map an image URL to a deterministic local file name.

    The name is the hexadecimal MD5 digest of the UTF-8 encoded URL followed
    by ``.jpg``, so it never contains characters that are awkward in file
    names and the same URL always yields the same name.
    """
    hasher = hashlib.md5()
    hasher.update(url.encode("utf-8"))
    return "{}.jpg".format(hasher.hexdigest())

# Download every image referenced in `df` into IMG_DIR.
# The loop is resumable: files that already exist are skipped.
downloaded = 0
failures = []  # (url, error) pairs, kept for inspection after the run

for i, row in df.iterrows():
    url = row["image_url"]
    # A blank CSV cell is read back by pandas as NaN (a float), which can
    # neither be hashed into a file name nor fetched.
    if not isinstance(url, str) or not url:
        failures.append((url, "missing image_url"))
        continue

    fn = safe_name(url)
    out_path = os.path.join(IMG_DIR, fn)

    if os.path.exists(out_path):
        continue

    # Write to a temporary name and rename only after the full body is on
    # disk. Otherwise an interrupted write would leave a truncated .jpg that
    # the exists() check above would treat as finished on the next run.
    tmp_path = out_path + ".part"
    try:
        r = session.get(url, headers=headers, timeout=40)
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, out_path)
    except Exception as exc:
        # Best effort: remember the failure and carry on with the next image.
        failures.append((url, repr(exc)))
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        continue

    downloaded += 1
    if downloaded % 200 == 0:
        print("Downloaded:", downloaded, "| processed rows:", i+1)

    time.sleep(0.05)  # be polite

print("Done. Newly downloaded:", downloaded)
if failures:
    print("Failed:", len(failures), "(see `failures` for details)")


Downloaded: 200 | processed rows: 200
Downloaded: 400 | processed rows: 400
Downloaded: 600 | processed rows: 600
Downloaded: 800 | processed rows: 800
Downloaded: 1000 | processed rows: 1000
Downloaded: 1200 | processed rows: 1201


KeyboardInterrupt: 

In [10]:
import os
# Count the files in the image directory whose name ends in ".jpg"
# (compared case-insensitively).
n = sum(
    1
    for name in os.listdir("data/images_cma")
    if name.lower().endswith(".jpg")
)
print("Images downloaded:", n)


Images downloaded: 1378


In [12]:
import os, glob
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter

# Directory holding the downloaded images and the CSV that receives one
# feature row per image.
IMG_DIR = "data/images_cma"
OUT_FEATS = "data/features_cma_basic.csv"

# glob returns matches in arbitrary order; sorting makes the processing order
# (and therefore the row order of OUT_FEATS) reproducible between runs.
paths = sorted(glob.glob(os.path.join(IMG_DIR, "*.jpg")))
print("Images found:", len(paths))

# Resume support: file names already present in OUT_FEATS are skipped later.
if os.path.exists(OUT_FEATS):
    df_old = pd.read_csv(OUT_FEATS)
    done = set(df_old["file"].astype(str))
    print("Already processed:", len(done))
else:
    df_old = pd.DataFrame()
    done = set()

def edge_density_pil(gray_arr, threshold=18):
    """Return the fraction of pixels whose gradient magnitude exceeds `threshold`.

    Parameters
    ----------
    gray_arr : 2-D array
        Grayscale image; it is converted to float32 before differentiation.
        Each axis needs at least two elements (a requirement of np.gradient).
    threshold : float, default 18
        Gradient magnitude above which a pixel counts as an edge. The default
        keeps the value this function previously hard-coded, chosen to
        roughly mimic "edges present".

    Returns
    -------
    float
        Share of pixels (0.0 to 1.0) with gradient magnitude > threshold.
    """
    # simple gradient magnitude as a proxy for edges
    gy, gx = np.gradient(gray_arr.astype(np.float32))
    mag = np.sqrt(gx**2 + gy**2)
    return float((mag > threshold).mean())

def sharpness_pil(gray_arr):
    """Return the variance of a finite-difference Laplacian of `gray_arr`.

    The image is converted to float32, differentiated once along both axes,
    and each first derivative is differentiated again along its own axis.
    The sum of the two second derivatives approximates the Laplacian, and its
    variance is returned as a Python float.
    """
    img = gray_arr.astype(np.float32)
    d_rows, d_cols = np.gradient(img)

    # Only the pure second derivatives are needed, so differentiate each
    # first derivative along its own axis.
    d2_rows = np.gradient(d_rows, axis=0)
    d2_cols = np.gradient(d_cols, axis=1)

    laplacian = d2_cols + d2_rows
    return float(laplacian.var())

def feats(path):
    """Compute basic colour and texture features for one image file.

    The image is converted to RGB, resized to 256x256 and scaled to [0, 1].

    Parameters
    ----------
    path : str
        Path of the image file to read.

    Returns
    -------
    dict
        "file" (base name of `path`), per-channel "r_mean"/"g_mean"/"b_mean"
        and "r_std"/"g_std"/"b_std", "contrast" (standard deviation of the
        luminance), "edge_density" and "sharpness".
    """
    fn = os.path.basename(path)

    # The context manager releases the file handle deterministically;
    # convert() and resize() return new in-memory images, so `im` stays
    # usable after the source file is closed.
    with Image.open(path) as src:
        im = src.convert("RGB").resize((256,256))
    arr = np.asarray(im).astype(np.float32) / 255.0

    mean = arr.mean(axis=(0,1))
    std = arr.std(axis=(0,1))

    # Weighted sum of R, G, B (the weights match the Rec. 709 luma coefficients).
    lum = 0.2126*arr[:,:,0] + 0.7152*arr[:,:,1] + 0.0722*arr[:,:,2]
    contrast = float(lum.std())

    # Back to the 0-255 range for the gradient-based measures; astype
    # truncates the fractional part.
    gray = (lum * 255.0).astype(np.uint8)

    ed = edge_density_pil(gray)
    sh = sharpness_pil(gray)

    return {
        "file": fn,
        "r_mean": float(mean[0]), "g_mean": float(mean[1]), "b_mean": float(mean[2]),
        "r_std": float(std[0]), "g_std": float(std[1]), "b_std": float(std[2]),
        "contrast": contrast,
        "edge_density": ed,
        "sharpness": sh
    }

def _save_features(pending, existing):
    """Append `pending` feature rows to `existing`, rewrite OUT_FEATS, return the combined frame."""
    df_new = pd.DataFrame(pending)
    combined = pd.concat([existing, df_new], ignore_index=True) if len(existing) else df_new
    combined.to_csv(OUT_FEATS, index=False)
    return combined


rows = []          # feature rows computed since the last checkpoint
processed = 0
skipped = 0
failed = 0
failed_files = []  # (file name, error) pairs, kept for inspection after the run

try:
    for p in paths:
        fn = os.path.basename(p)
        if fn in done:
            skipped += 1
            continue
        try:
            rows.append(feats(p))
        except Exception as exc:
            # Best effort: an unreadable image is recorded and skipped.
            failed += 1
            failed_files.append((fn, repr(exc)))
            continue
        processed += 1

        # Checkpoint every 200 newly processed images.
        if processed % 200 == 0:
            df_old = _save_features(rows, df_old)
            rows = []
            print(f"Processed {processed} | skipped {skipped} | failed {failed}")
finally:
    # Flush the tail of the run. This also executes on KeyboardInterrupt, so
    # rows computed since the last checkpoint are not lost.
    if rows:
        df_old = _save_features(rows, df_old)
        rows = []

df_out = df_old
print("Saved:", OUT_FEATS)
print(f"Total: processed {processed} | skipped {skipped} | failed {failed}")


Images found: 1378
Processed 200 | skipped 0 | failed 0
Processed 400 | skipped 0 | failed 0
Processed 600 | skipped 0 | failed 0
Processed 800 | skipped 0 | failed 0
Processed 1000 | skipped 0 | failed 0
Processed 1200 | skipped 0 | failed 0
Saved: data/features_cma_basic.csv


In [13]:
import pandas as pd
# Reload the feature table from disk, report its (rows, columns) shape and
# preview the first rows.
df = pd.read_csv("data/features_cma_basic.csv")
print(df.shape)
df.head()


(1378, 10)


Unnamed: 0,file,r_mean,g_mean,b_mean,r_std,g_std,b_std,contrast,edge_density,sharpness
0,001a30cb9db153534ce3067323e036e7.jpg,0.446982,0.394393,0.357631,0.199286,0.203897,0.171056,0.20045,0.152191,196.814667
1,0029c3e846e69049f792b6d5bf124703.jpg,0.584591,0.435318,0.286998,0.151701,0.182455,0.220051,0.172381,0.226242,241.831848
2,00339040dd904be8a5e1e6e5cb147b67.jpg,0.438422,0.430737,0.411382,0.278123,0.27324,0.261619,0.273427,0.257065,435.878967
3,00c35fef6d0f0ea221025799aa88f371.jpg,0.581593,0.578999,0.535431,0.278166,0.250481,0.235102,0.254179,0.129654,181.338364
4,00fc7230659a96eb9f368a6adcdf771a.jpg,0.691278,0.681643,0.632228,0.193556,0.191277,0.184626,0.191264,0.425095,589.471497
