In [5]:
pip install requests tqdm

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Configuration
BASE_URL = "https://dataverse.harvard.edu"
DOI = "doi:10.7910/DVN/HYJDE0"  # The DOI from your text
OUTPUT_DIR = "downloaded_data"  # Where to save files on your PC
MAX_WORKERS = 5  # Number of simultaneous downloads (increase for speed, decrease if errors)

def get_dataset_files(persistent_id):
    """Fetches the list of all files in the dataset via API."""
    api_url = f"{BASE_URL}/api/datasets/:persistentId/?persistentId={persistent_id}"
    response = requests.get(api_url)
    data = response.json()
    
    if data['status'] != 'OK':
        print("Error fetching dataset metadata.")
        return []
    
    # Extract file information from the latest version
    files = []
    for item in data['data']['latestVersion']['files']:
        file_info = {
            'id': item['dataFile']['id'],
            'filename': item['dataFile']['filename'],
            'directory': item.get('directoryLabel', ''), # Gets folder like 'CHN_exec'
            'size': item['dataFile']['filesize']
        }
        files.append(file_info)
    return files

def download_file(file_info):
    """Downloads a single file and places it in the correct folder."""
    file_id = file_info['id']
    filename = file_info['filename']
    directory = file_info['directory']
    
    # Construct local path (e.g., downloaded_data/CHN_exec/news.jsonl)
    full_folder_path = os.path.join(OUTPUT_DIR, directory)
    if not os.path.exists(full_folder_path):
        os.makedirs(full_folder_path, exist_ok=True)
        
    file_path = os.path.join(full_folder_path, filename)
    
    # Skip if already exists (resume capability)
    if os.path.exists(file_path) and os.path.getsize(file_path) == file_info['size']:
        return f"Skipped {filename} (already exists)"

    # Download URL format seen in your screenshot
    download_url = f"{BASE_URL}/api/access/datafile/{file_id}"
    
    try:
        with requests.get(download_url, stream=True) as r:
            r.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        return None # Success
    except Exception as e:
        return f"Failed to download {filename}: {str(e)}"

def main():
    print(f"Fetching file list for {DOI}...")
    files = get_dataset_files(DOI)
    print(f"Found {len(files)} files.")

    # Optional: Filter if you ONLY want 'news.jsonl' files
    # files = [f for f in files if 'news.jsonl' in f['filename']]

    print(f"Starting download with {MAX_WORKERS} threads...")
    
    # Use ThreadPoolExecutor to download in parallel
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Tqdm creates the progress bar
        results = list(tqdm(executor.map(download_file, files), total=len(files), unit="file"))

    # Print any errors
    for res in results:
        if res:
            print(res)
            
    print("\nDownload complete!")

if __name__ == "__main__":
    main()

Fetching file list for doi:10.7910/DVN/HYJDE0...
Found 498 files.
Starting download with 5 threads...


100%|██████████| 498/498 [01:55<00:00,  4.29file/s]

Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped images.jsonl (already exists)
Skipped imag


