In [None]:
import sys
import subprocess
import os
import shutil

# ==========================================
#        DATA SETUP SCRIPT
# ==========================================
# This script automatically downloads the PCAM dataset (700MB)
# from Kaggle and places it in a './data' folder for the project.

# --- USER CONFIGURATION ---
# PLEASE READ: You need a Kaggle API Token to download this data.
# 1. Go to https://www.kaggle.com/settings -> API -> Create New Token
# 2. Copy the 'KGAT' token (or username/key pair)
# 3. Paste it below inside the quotes, replacing the placeholder text.
# NOTE: The token is a secret credential. Do not commit or share this file
#       once a real token has been pasted in.
# ------------------------------------------
# PASTE YOUR TOKEN BELOW
MY_KAGGLE_TOKEN = "YOUR_TOKEN_HERE"
# ------------------------------------------

def install_and_import():
    """Return the ``kagglehub`` module, installing it with pip if it is missing.

    The install is run as ``sys.executable -m pip`` so the package is installed
    for the interpreter that is executing this code (for example the current
    Jupyter kernel) rather than for whichever ``pip`` is first on PATH.

    Returns:
        The imported ``kagglehub`` module.

    Raises:
        subprocess.CalledProcessError: If the pip install exits non-zero.
        ImportError: If ``kagglehub`` still cannot be imported after install.
    """
    try:
        import kagglehub
    except ImportError:
        import importlib

        print("Installing required library: kagglehub...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kagglehub'])
        # The import system caches directory listings; clear them so the
        # package that was installed a moment ago is visible to the retry.
        importlib.invalidate_caches()
        import kagglehub
    return kagglehub

def download_data():
    """Download the PCAM dataset from Kaggle into the local ``./data`` folder.

    Steps:
      1. Validate ``MY_KAGGLE_TOKEN`` and export it as ``KAGGLE_API_TOKEN``.
      2. Make sure ``kagglehub`` is importable (installing it if needed).
      3. Stop early when ``./data`` already exists and is not empty.
      4. Download ``nicolaish/pcam-data`` and copy it from the kagglehub
         cache into ``./data``.

    Progress and problems are reported with ``print``.

    Returns:
        None.
    """
    target_dir = "./data"
    # Placeholder spellings that mean "no token was pasted". The first is the
    # value the configuration section ships with; the second is kept for
    # copies of the script that use the older placeholder text.
    placeholders = ("YOUR_TOKEN_HERE", "PASTE_YOUR_KGAT_TOKEN_HERE")

    # 1. Setup Authentication
    # Tolerate None and stray whitespace from copy/paste.
    token = (MY_KAGGLE_TOKEN or "").strip()
    if not token or token in placeholders:
        print("STOP: You must paste your Kaggle API Token in the script first!")
        return

    os.environ["KAGGLE_API_TOKEN"] = token

    # 2. Import library safely
    kagglehub = install_and_import()

    # 3. Check if data already exists to save time.
    # isdir (not exists) so a stray *file* named 'data' is not passed to listdir.
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print(f"Data folder '{target_dir}' already exists and is not empty.")
        print("Skipping download.")
        return

    # 4. Download
    print("Starting download of 'nicolaish/pcam-data'...")
    print("    (Note: This is ~700MB. It may take some time depending on internet speed)")

    # This downloads to the system cache first
    cache_path = kagglehub.dataset_download("nicolaish/pcam-data")
    print(f"    Download complete. Cached at: {cache_path}")

    # 5. Copy into the project folder
    print(f"Moving files to project folder: {target_dir}...")
    # Always copy: reaching this point means target_dir is missing or empty.
    # dirs_exist_ok=True lets copytree fill an existing empty folder, which
    # was previously skipped while still reporting success.
    shutil.copytree(cache_path, target_dir, dirs_exist_ok=True)

    print("SUCCESS! Dataset is ready.")
    print(f"   You can now load your data from: {os.path.abspath(target_dir)}")

# Entry point: run the download when this file is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    download_data()