<a href="https://colab.research.google.com/github/physicalintelligencelab/openmotor-backend/blob/main/Motor_Dataset_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!pip install -q --upgrade requests tqdm


In [13]:
# ░█▀█░▀█▀░█▀█░█▀▄░█▀█  COLAB OpenMotor SCRAPER 2.3  ░█
!pip -q install requests tqdm

import csv, json, re, sys, requests, time
from urllib.parse import quote_plus
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

KW_POS = [
    "motor learning", "motor adaptation", "visuomotor", "reach adaptation",
    "sensorimotor adaptation", "force field learning", "saccade adaptation",
    "implicit motor", "explicit motor", "error-based learning"
]
KW_NEG = ["fmri", "mri", "eeg", "meg"]

HEADERS = {"User-Agent": "motor-learning-harvester/2.3 (colab)"}

def good(txt: str) -> bool:
    t = txt.lower()
    return any(p in t for p in KW_POS) and not any(n in t for n in KW_NEG)

def to_html(link_dict, *keys):
    for k in keys:
        if k in link_dict:
            return link_dict[k]
    return None

def verify(url: str, timeout=2):
    try:
        r = requests.get(url, headers=HEADERS, stream=True,
                         allow_redirects=True, timeout=timeout)
        size = int(r.headers.get("content-length", 0)) / 1e6 if r.headers.get("content-length") else None
        return (r.status_code < 400, size)
    except requests.RequestException:
        return (False, None)

def datacite_hits():
    for kw in tqdm(KW_POS, desc="DataCite", ncols=80):
        url = (f"https://api.datacite.org/dois?query={quote_plus(kw)}"
               f"&resource-type-id=dataset&page[size]=200")
        try:
            for item in requests.get(url, headers=HEADERS, timeout=4).json().get("data", []):
                attr  = item["attributes"]
                title = attr.get("title","")
                desc  = attr.get("description","")
                if good(title + " " + desc) and attr.get("url"):
                    yield title, attr["url"], "DataCite"
        except requests.Timeout:
            continue

def zenodo_hits():
    for kw in tqdm(KW_POS, desc="Zenodo", ncols=80):
        url = f"https://zenodo.org/api/records/?q={quote_plus(kw)}&type=dataset&size=200"
        try:
            hits = requests.get(url, headers=HEADERS, timeout=4).json()["hits"]["hits"]
        except requests.Timeout:
            continue
        for rec in hits:
            meta  = rec["metadata"]
            title = meta.get("title","")
            desc  = meta.get("description","")
            html  = to_html(rec.get("links", {}), "html", "self_html", "preview_html")
            if good(title + " " + desc) and html and rec.get("files"):
                yield title, html, "Zenodo"

def osf_has_files(node_id):
    base = f"https://api.osf.io/v2/nodes/{node_id}"
    try:
        files = requests.get(f"{base}/files", headers=HEADERS, timeout=4).json().get("data", [])
        if files: return True

        kids = requests.get(f"{base}/children", headers=HEADERS, timeout=4).json().get("data", [])
        for kid in kids:
            if requests.get(kid["relationships"]["files"]["links"]["related"]["href"],
                            headers=HEADERS, timeout=4).json().get("data", []):
                return True
    except requests.Timeout:
        pass
    return False

def osf_hits():
    for kw in tqdm(KW_POS, desc="OSF", ncols=80):
        url = (f"https://api.osf.io/v2/search/?q={quote_plus(kw)}"
               f"&filter[resource_type]=project&per_page=100")
        try:
            data = requests.get(url, headers=HEADERS, timeout=4).json().get("data", [])
        except requests.Timeout:
            continue
        for item in data:
            if not item or "attributes" not in item: continue
            attrs = item["attributes"]; title = attrs.get("title","")
            if not good(title): continue
            link  = attrs.get("public_url") or to_html(item.get("links", {}), "html")
            if not link: continue
            node_id = item["id"]
            if osf_has_files(node_id):
                yield title, link, "OSF"

_pat = re.compile(r'"/datasets/(ds\d{6,})"')
def openneuro_hits():
    for kw in tqdm(KW_POS, desc="OpenNeuro", ncols=80):
        try:
            html = requests.get(f"https://openneuro.org/search?q={quote_plus(kw)}",
                                headers=HEADERS, timeout=4).text
            for ds in set(_pat.findall(html)):
                yield f"OpenNeuro {ds}", f"https://openneuro.org/datasets/{ds}", "OpenNeuro"
        except requests.Timeout:
            continue

def harvest():
    gens = [datacite_hits, zenodo_hits, osf_hits, openneuro_hits]
    print("\n🔎  Collecting candidates …")
    cand = []
    for g in gens: cand.extend(list(g()))
    print(f"\n⚙️   {len(cand)} raw candidates. Verifying …")

    ok_rows, seen = [], set()
    with ThreadPoolExecutor(max_workers=20) as pool:
        for (title, url, src), (ok, size) in tqdm(
            zip(cand, pool.map(lambda x: verify(x[1]), cand)),
            total=len(cand), ncols=80, desc="Verify"):
            if ok and url not in seen:
                seen.add(url)
                ok_rows.append((title, url, src, size if size else ""))
    return ok_rows

def main(out="/content/motor_learning_datasets.csv"):
    rows = harvest()
    if not rows:
        sys.exit("❌  Still no live motor-learning datasets; APIs may be blocked.")
    with open(out, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows([("title","url","source","size_MB"), *rows])
    print(f"\n Saved {len(rows)} datasets → {out}")

main()



🔎  Collecting candidates …


DataCite: 100%|█████████████████████████████████| 10/10 [00:28<00:00,  2.83s/it]
Zenodo: 100%|███████████████████████████████████| 10/10 [00:42<00:00,  4.29s/it]
OSF: 100%|██████████████████████████████████████| 10/10 [00:41<00:00,  4.12s/it]
OpenNeuro: 100%|████████████████████████████████| 10/10 [00:00<00:00, 64.55it/s]



⚙️   62 raw candidates. Verifying …


Verify: 100%|███████████████████████████████████| 62/62 [00:04<00:00, 14.14it/s]


✅  Saved 56 datasets → /content/motor_learning_datasets.csv



