# Dataset Download (Small Samples)

This notebook downloads small sample datasets into `data/raw/`.

In [1]:
import os
from pathlib import Path
import urllib.request

# Resolve the project root: if the kernel was started inside a `notebooks`
# folder, step up one level so `data/` is created beside it, not inside it.
cwd = Path.cwd()
root = cwd.parent if cwd.name == 'notebooks' else cwd

# All raw downloads live under <root>/data/raw/<dataset>/.
base = root.joinpath('data', 'raw')
for dataset_dir in ('wikipedia', 'covid', 'world_bank', 'finance'):
    # parents=True also creates data/ and data/raw/; exist_ok=True makes
    # re-running this cell a no-op.
    base.joinpath(dataset_dir).mkdir(parents=True, exist_ok=True)

print('Folders ready:', base)


Folders ready: C:\Users\SANTANU\Downloads\migraph\data\raw


## Wikipedia sample (Simple English dump)

This is much smaller than the full English Wikipedia dump.

In [2]:
from pathlib import Path
import urllib.request

# Source URL and local destination for the Simple English Wikipedia dump.
wiki_url = 'https://dumps.wikimedia.org/simplewiki/latest/simplewiki-latest-pages-articles.xml.bz2'
wiki_out = base / 'wikipedia' / 'simplewiki-latest-pages-articles.xml.bz2'

if wiki_out.exists():
    # Skip the download when a previous run already fetched the file.
    print('Already exists:', wiki_out)
else:
    # Download to a sibling ".part" file and rename only on success, so an
    # interrupted or failed transfer never leaves a truncated file at
    # `wiki_out` that the exists() check above would later accept as complete.
    wiki_tmp = wiki_out.with_name(wiki_out.name + '.part')
    print('Downloading:', wiki_url)
    try:
        urllib.request.urlretrieve(wiki_url, wiki_tmp)
        wiki_tmp.replace(wiki_out)
    finally:
        # After a successful rename the temp file is gone; otherwise remove
        # whatever partial data was written.
        if wiki_tmp.exists():
            wiki_tmp.unlink()


Already exists: C:\Users\SANTANU\Downloads\migraph\data\raw\wikipedia\simplewiki-latest-pages-articles.xml.bz2


## COVID dataset (OWID latest snapshot)

This is a small CSV snapshot for UI testing.

In [3]:
from pathlib import Path
import urllib.request

# Source URL and local destination for the OWID "latest" COVID CSV.
covid_url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/latest/owid-covid-latest.csv'
covid_out = base / 'covid' / 'owid-covid-latest.csv'

if covid_out.exists():
    # Skip the download when a previous run already fetched the file.
    print('Already exists:', covid_out)
else:
    # Download to a sibling ".part" file and rename only on success, so an
    # interrupted or failed transfer never leaves a truncated file at
    # `covid_out` that the exists() check above would later accept as complete.
    covid_tmp = covid_out.with_name(covid_out.name + '.part')
    print('Downloading:', covid_url)
    try:
        urllib.request.urlretrieve(covid_url, covid_tmp)
        covid_tmp.replace(covid_out)
    finally:
        # After a successful rename the temp file is gone; otherwise remove
        # whatever partial data was written.
        if covid_tmp.exists():
            covid_tmp.unlink()


Already exists: C:\Users\SANTANU\Downloads\migraph\data\raw\covid\owid-covid-latest.csv


## World Bank samples (GDP, GDP per capita, inflation, unemployment; all countries)

One ZIP download per indicator from the World Bank API (requested with `downloadformat=csv`).

In [4]:
# (output filename, World Bank indicator code) pairs; one download per entry.
wb_specs = [
    ('world_bank_gdp_multi.zip', 'NY.GDP.MKTP.CD'),
    ('world_bank_gdp_pc_multi.zip', 'NY.GDP.PCAP.CD'),
    ('world_bank_inflation_multi.zip', 'FP.CPI.TOTL.ZG'),
    ('world_bank_unemployment_multi.zip', 'SL.UEM.TOTL.ZS'),
]

for filename, indicator in wb_specs:
    # `country/all` requests every country for the given indicator.
    wb_url = f"https://api.worldbank.org/v2/country/all/indicator/{indicator}?downloadformat=csv"
    wb_out = base / 'world_bank' / filename
    if wb_out.exists():
        # Skip indicators that a previous run already fetched.
        print('Already exists:', wb_out)
        continue

    # Download to a sibling ".part" file and rename only on success, so an
    # interrupted or failed transfer never leaves a truncated file at
    # `wb_out` that the exists() check above would later accept as complete.
    wb_tmp = wb_out.with_name(wb_out.name + '.part')
    print('Downloading:', wb_url)
    try:
        urllib.request.urlretrieve(wb_url, wb_tmp)
        wb_tmp.replace(wb_out)
    finally:
        # After a successful rename the temp file is gone; otherwise remove
        # whatever partial data was written.
        if wb_tmp.exists():
            wb_tmp.unlink()


Already exists: C:\Users\SANTANU\Downloads\migraph\data\raw\world_bank\world_bank_gdp_multi.zip
Already exists: C:\Users\SANTANU\Downloads\migraph\data\raw\world_bank\world_bank_gdp_pc_multi.zip
Downloading: https://api.worldbank.org/v2/country/all/indicator/FP.CPI.TOTL.ZG?downloadformat=csv


Downloading: https://api.worldbank.org/v2/country/all/indicator/SL.UEM.TOTL.ZS?downloadformat=csv


## Finance sample (AAPL daily from Stooq)

Small single-ticker CSV.

In [5]:
from pathlib import Path
import urllib.request

# Source URL and local destination for the AAPL daily price CSV from Stooq.
fin_url = 'https://stooq.com/q/d/l/?s=aapl.us&i=d'
fin_out = base / 'finance' / 'aapl.us.csv'

if fin_out.exists():
    # Skip the download when a previous run already fetched the file.
    print('Already exists:', fin_out)
else:
    # Download to a sibling ".part" file and rename only on success, so an
    # interrupted or failed transfer never leaves a truncated file at
    # `fin_out` that the exists() check above would later accept as complete.
    fin_tmp = fin_out.with_name(fin_out.name + '.part')
    print('Downloading:', fin_url)
    try:
        urllib.request.urlretrieve(fin_url, fin_tmp)
        fin_tmp.replace(fin_out)
    finally:
        # After a successful rename the temp file is gone; otherwise remove
        # whatever partial data was written.
        if fin_tmp.exists():
            fin_tmp.unlink()


Already exists: C:\Users\SANTANU\Downloads\migraph\data\raw\finance\aapl.us.csv
