In [1]:
# pip
# !pip install tqdm
# !pip install dask

In [2]:
# conda
# !conda install tqdm
# !conda install dask

In [1]:
# imports (developed and tested with Python 3.10.9)
import os
import urllib
from threading import Thread, Lock
import dask
from tqdm import tqdm

In [2]:
# Directory (relative to the notebook) that downloaded files are written to.
data_dir = 'data'

# Dataset locations: each entry pairs the 'url' to fetch with the 'filename'
# it is stored under inside data_dir.
historic_arrest_loc = dict(
    url='https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=10000',
    filename='arrest.json',
)
historic_complaint_loc = dict(
    url='https://data.cityofnewyork.us/resource/qgea-i56i.json?$limit=10000',
    filename='complaint.json',
)
historic_court_summons_loc = dict(
    url='https://data.cityofnewyork.us/resource/sv2w-rv3k.json?$limit=10000',
    filename='summons.json',
)
traffic_speed_loc = dict(
    url='https://data.cityofnewyork.us/resource/i4gi-tjb9.json?$limit=10000',
    filename='speed.json',
)
turnstile_loc = dict(
    url='https://data.ny.gov/resource/i55r-43gk.json?$limit=10000',
    filename='turnstile.json',
)
# NOTE(review): this URL points at a .csv file but is saved as 'subway.json';
# confirm how later cells parse it before renaming the file.
subway_loc = dict(
    url='http://web.mta.info/developers/data/nyct/subway/Stations.csv?$limit=10000',
    filename='subway.json',
)

In [3]:
# Download switches read by download_dataset_thread:
#   downloadflag - master switch; when False nothing is downloaded.
#   redownload   - when True, files that already exist are fetched again.
downloadflag, redownload = True, True

# Held by download_dataset_thread for the whole of each download, so
# downloads started from several threads run one at a time.
thread_lock = Lock()

# download utils
def download_dataset_thread(loc, folder):
    """Download one dataset into ``folder``, showing a tqdm progress bar.

    Nothing happens when the module-level ``downloadflag`` is false, or when
    the target file already exists and ``redownload`` is false. The checks and
    the transfer run while holding ``thread_lock``, so concurrent callers
    download one at a time.

    The payload is written to ``<filename>.part`` first and only moved over
    the final path once the transfer has completed, so a failed re-download
    does not destroy a previously downloaded copy.

    Args:
        loc: Mapping with the ``'url'`` to fetch and the ``'filename'`` to
            store it under.
        folder: Destination directory; created if it does not exist.

    Raises:
        urllib.error.URLError: If the URL cannot be retrieved.
        OSError: If the folder or the file cannot be written.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import the function explicitly.
    from urllib.request import urlretrieve

    target = os.path.join(folder, loc['filename'])
    partial = target + '.part'

    with thread_lock:
        if not downloadflag:
            return
        if os.path.exists(target) and not redownload:
            return

        if folder:
            os.makedirs(folder, exist_ok=True)

        try:
            with tqdm(unit="B", unit_scale=True, desc=loc['filename'], miniters=1) as progress_bar:

                def report(block_num, block_size, total_size):
                    # urlretrieve passes a running block count (the first call
                    # is for block 0, before any data is read), so turn it
                    # into an absolute byte count instead of adding one block
                    # per call.
                    received = block_num * block_size
                    if total_size > 0:
                        progress_bar.total = total_size
                        # The last block is usually only partly filled.
                        received = min(received, total_size)
                    progress_bar.update(received - progress_bar.n)

                urlretrieve(loc['url'], partial, report)
                # When the server sends no Content-Length the block estimate
                # overshoots; settle on the real size before the bar closes.
                progress_bar.update(os.path.getsize(partial) - progress_bar.n)

            # Publish the file only after a complete transfer.
            os.replace(partial, target)
        finally:
            # Still present only if the transfer or the move failed.
            if os.path.exists(partial):
                os.remove(partial)
        
def download_dataset(loc, folder):
    """Run ``download_dataset_thread(loc, folder)`` on a worker thread and wait for it.

    The worker is joined straight after it is started, so this call returns
    only once the download attempt has finished. An exception raised inside
    the worker is not re-raised here.

    Args:
        loc: Mapping with the ``'url'`` and ``'filename'`` of the dataset.
        folder: Directory the file is written to.
    """
    worker = Thread(target=download_dataset_thread, args=(loc, folder))
    worker.start()
    # Block until the worker is done.
    worker.join()
        

In [4]:
# Download each dataset in turn into data_dir.
# NOTE(review): traffic_speed_loc is defined in the config cell but is not
# part of this list - confirm that skipping it is intentional.
datasets_to_download = (
    historic_arrest_loc,
    historic_complaint_loc,
    historic_court_summons_loc,
    turnstile_loc,
    subway_loc,
)
for dataset in datasets_to_download:
    download_dataset(dataset, data_dir)

arrest.json: 6.98MB [00:01, 4.70MB/s]
complaint.json: 10.0MB [00:01, 5.35MB/s]
summons.json: 6.77MB [00:02, 2.98MB/s]s]
turnstile.json: 2.17MB [00:01, 1.97MB/s]
subway.json: 65.5kB [00:00, 484kB/s] /s]
subway.json: 65.5kB [00:00, 482kB/s]