In [None]:
import os
import requests
import gzip
import shutil

# Source of the original MNIST IDX archives (Yann LeCun's site).
BASE_URL = 'http://yann.lecun.com/exdb/mnist/'

# Maps each remote archive name to the extracted file's path relative to
# ROOT_INPUT. Every file lives in a directory that shares its own name,
# i.e. '<stem>.gz' -> '<stem>/<stem>'.
FILES = {
    f'{stem}.gz': f'{stem}/{stem}'
    for stem in (
        'train-images-idx3-ubyte',
        'train-labels-idx1-ubyte',
        't10k-images-idx3-ubyte',
        't10k-labels-idx1-ubyte',
    )
}

# Absolute path of the 'input' directory that sits next to the current
# working directory (the comment in the original notes this matches the
# loader's '../input/...' paths). Created up front if it is missing.
ROOT_INPUT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'input'))
os.makedirs(ROOT_INPUT, exist_ok=True)

def download_and_extract(gz_name, dest_rel):
    """Download one gzip archive from BASE_URL and unpack it under ROOT_INPUT.

    The archive is streamed to ``<dest>.gz``, decompressed into
    ``<dest>.part`` and only then renamed to its final name, so the
    destination path never holds a partially written file. Because an
    existing destination is taken to mean "already downloaded", a partial
    file there would otherwise be skipped forever on later runs.

    Args:
        gz_name: Archive file name, appended to ``BASE_URL`` to form the URL.
        dest_rel: Path of the extracted file, relative to ``ROOT_INPUT``.

    Returns:
        None. Progress messages are printed.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
        OSError: On file-system errors or an invalid gzip stream.
        EOFError: If the downloaded archive is truncated.
    """
    dest_path = os.path.join(ROOT_INPUT, dest_rel)
    if os.path.exists(dest_path):
        print(f'Skipping, already exists: {dest_path}')
        return
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

    url = BASE_URL + gz_name
    tmp_gz = dest_path + '.gz'
    tmp_out = dest_path + '.part'
    try:
        print(f'Downloading {url} -> {tmp_gz}')
        # (connect, read) timeouts: without them a stalled server would
        # block this call indefinitely.
        with requests.get(url, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(tmp_gz, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        print(f'Extracting {tmp_gz} -> {dest_path}')
        with gzip.open(tmp_gz, 'rb') as f_in, open(tmp_out, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Publish the result in one step, only after extraction succeeded.
        os.replace(tmp_out, dest_path)
    finally:
        # Remove temporaries on success and on failure alike; after a
        # successful rename the '.part' file is already gone.
        for leftover in (tmp_gz, tmp_out):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass

# Fetch every archive. A failure on one file is reported and the loop moves
# on, so the remaining files are still attempted.
for gz, dest in FILES.items():
    try:
        download_and_extract(gz, dest)
    except Exception as e:
        print('Error downloading', gz, e)

# Report what is actually on disk: path and size for each extracted file,
# or a MISSING line when it is absent.
print('\nDownloaded files:')
for dest in FILES.values():
    p = os.path.join(ROOT_INPUT, dest)
    if not os.path.exists(p):
        print('MISSING:', p)
        continue
    print(p, os.path.getsize(p), 'bytes')

Path to dataset files: /home/omar/.cache/kagglehub/datasets/hojjatk/mnist-dataset/versions/1
