# Download CT Data

Contains a routine to download data from Zenodo and save it to Google Drive, as well as another one for unzipping the corresponding files.

In [None]:
import os
import shutil
import concurrent.futures
import subprocess
import glob
from tqdm import tqdm
from google.colab import drive
import time

# Mount Google Drive at /content/drive so later cells can read from and write
# to it through ordinary file paths.
drive.mount('/content/drive')

# Setup folders to download to: a local `data` folder, relative to the current
# working directory. exist_ok=True makes re-running this cell harmless.
os.makedirs('data', exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def download_and_move(filename, base_url):
    """Download one file from a Zenodo record into the local ``data`` folder.

    Despite its name, this function only downloads; it does not move anything
    to Google Drive. The download is skipped when a file with the same name
    already exists in the Drive data folder.

    Args:
        filename: Name of the file to fetch, e.g. ``subset0.zip``.
        base_url: URL prefix the file name is appended to.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    url = f'{base_url}/{filename}?download=1'
    output_path = f'data/{filename}'
    # Plain space, no backslash: os.path.exists() does not go through a shell,
    # so a shell-style "My\ Drive" escape would name a path that never exists
    # and the skip below would never trigger.
    drive_path = f'drive/My Drive/deep-learning-with-pytorch/data/{filename}'

    if os.path.exists(drive_path):
        print(f'{filename} already exists')
        return

    print(f'Downloading {filename}')
    # Argument list instead of a shell command line: the URL and the output
    # path reach wget verbatim, with no quoting or word-splitting involved.
    result = subprocess.run(['wget', '-O', output_path, url])
    if result.returncode != 0:
        # Do not claim success when wget reported an error.
        print(f'Failed downloading {filename} '
              f'(wget exit code {result.returncode})')
        return
    print(f'Finished downloading {filename}')

In [None]:
# Part 1: subset0.zip .. subset6.zip, served from Zenodo record 3723295.
base_url_part1 = 'https://zenodo.org/records/3723295/files'
filenames_part1 = [f'subset{number}.zip' for number in range(0, 7)]

# Part 2: subset7.zip .. subset9.zip, served from Zenodo record 4121926.
base_url_part2 = 'https://zenodo.org/records/4121926/files'
filenames_part2 = [f'subset{number}.zip' for number in range(7, 10)]

In [None]:
# Download the part-1 files, at most three at a time.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(download_and_move, fn, base_url_part1) for fn in filenames_part1]
    concurrent.futures.wait(futures)

# wait() only blocks until the futures are done; it never re-raises what a
# worker raised. Inspect each future so a failed download is not silent.
for fn, future in zip(filenames_part1, futures):
    error = future.exception()
    if error is not None:
        print(f'Download of {fn} failed: {error!r}')

In [None]:
# Download the part-2 files, at most three at a time.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    futures = [executor.submit(download_and_move, fn, base_url_part2) for fn in filenames_part2]
    concurrent.futures.wait(futures)

# wait() only blocks until the futures are done; it never re-raises what a
# worker raised. Inspect each future so a failed download is not silent.
for fn, future in zip(filenames_part2, futures):
    error = future.exception()
    if error is not None:
        print(f'Download of {fn} failed: {error!r}')

In [None]:
def move_file(filename):
    """Move a file from the local ``data`` folder to the Drive data folder.

    Args:
        filename: Name of the file inside ``data`` to move.

    Returns:
        None. A failed move (for example a missing source file or a missing
        destination folder) is reported on stdout instead of being raised, so
        a loop over many files keeps going.
    """
    output_path = f'data/{filename}'
    # Plain space: no shell is involved any more, so no "\ " escaping is
    # needed (and "\ " is not a valid Python string escape anyway).
    drive_path = f'drive/My Drive/deep-learning-with-pytorch/data/{filename}'
    try:
        shutil.move(output_path, drive_path)
    except OSError as error:
        print(f'Could not move {output_path}: {error}')

In [None]:
# Hand subset0.zip .. subset9.zip to move_file() one at a time, with a
# progress bar over the ten files.
subset_indices = range(10)
for index in tqdm(subset_indices):
    file = 'subset{}.zip'.format(index)
    move_file(file)

  0%|          | 0/10 [00:00<?, ?it/s]

mv: cannot stat 'data/subset0.zip': No such file or directory


100%|██████████| 10/10 [21:01<00:00, 126.11s/it]


In [None]:
source_dir = '/content/drive/MyDrive/deep-learning-with-pytorch/data'
target_dir = '/content/data'

# For every zip archive in the Drive data folder: copy it to the notebook's
# local disk, unzip it there, copy the extracted folder back to Drive and
# finally delete the local copies. Sorted so the processing order is
# deterministic (glob returns entries in arbitrary order).
drive_zip_files = sorted(glob.glob(os.path.join(source_dir, '*.zip')))
for drive_zip_file in drive_zip_files:
    start = time.time()

    file_name = os.path.basename(drive_zip_file)
    unzipped_folder_name = os.path.splitext(file_name)[0]
    local_zip_file = os.path.join(target_dir, file_name)
    # Assumes the archive extracts into a folder named like the zip file
    # without its extension -- TODO confirm for every archive.
    local_folder_path = os.path.join(target_dir, unzipped_folder_name)
    drive_folder_path = os.path.join(source_dir, unzipped_folder_name)

    print(f'\n\n=== Processing file {unzipped_folder_name} ===')
    # NOTE(review): an archive is skipped only when a local copy of the zip
    # already exists; confirm this is the intended "already handled" marker.
    if os.path.exists(local_zip_file):
        print('    Skipping file')
        continue

    try:
        # Copy file from drive to notebook
        shutil.copy(drive_zip_file, target_dir)
        copy_time = time.time()
        print(f'    Finished copying in {copy_time - start} seconds')

        # Unzip file in notebook
        unzip_result = subprocess.run([
            'unzip',
            '-q', local_zip_file,
            '-d', target_dir
        ])
        # unzip exits with 0 on success and 1 when it only emitted warnings;
        # anything higher is an error, so do not publish a partial folder.
        if unzip_result.returncode > 1:
            print(f'    unzip failed with exit code '
                  f'{unzip_result.returncode}, not copying back to drive')
            continue
        unzip_time = time.time()
        print(f'    Finished unzipping in {unzip_time - copy_time} seconds')

        # Copy back to google drive
        shutil.copytree(
            local_folder_path,
            drive_folder_path,
            dirs_exist_ok=True
        )
        copy_back_time = time.time()
        print('    Finished copy back to drive in ' +
              f'{copy_back_time - unzip_time} seconds')
    finally:
        # Delete unused local files. Done on failure as well: a leftover
        # local zip would make the skip check above ignore this archive on
        # every later run.
        if os.path.exists(local_zip_file):
            os.remove(local_zip_file)
        shutil.rmtree(local_folder_path, ignore_errors=True)
    print(f'    Finished file in {time.time() - start} seconds')



=== Processing file subset0 ===
    Skipping file


=== Processing file subset2 ===
    Finished copying in 137.53758096694946 seconds
    Finished unzipping in 213.3952956199646 seconds
    Finished copy back to drive in 174.1155514717102 seconds
    Finished file in 525.9430611133575 seconds


=== Processing file subset1 ===
    Finished copying in 120.98802447319031 seconds
    Finished unzipping in 188.9628221988678 seconds
    Finished copy back to drive in 145.1288800239563 seconds
    Finished file in 457.8082287311554 seconds


=== Processing file subset3 ===
    Finished copying in 116.02492332458496 seconds
    Finished unzipping in 191.0640811920166 seconds
    Finished copy back to drive in 196.4029471874237 seconds
    Finished file in 504.360493183136 seconds


=== Processing file subset4 ===
    Finished copying in 133.73262000083923 seconds
    Finished unzipping in 192.32238173484802 seconds
    Finished copy back to drive in 171.20644307136536 seconds
    Finished f

In [None]:
# Flush pending writes to Google Drive and unmount it, so the files written
# above are synced before the runtime goes away.
drive.flush_and_unmount()