# Downloading datasets from the ISIC Archive
This notebook uses the ```isicarchive``` package to download images from
the ISIC Archive, either by selecting them by dataset name (this downloads
all images of the respective datasets!), or by using a previously made
selection in the form of a CSV file (**alternative path** below).

## Preparation
When creating an instance of the ```IsicApi``` object, you must provide
an existing username and password that allow access to the requested
datasets! So, please first register a username, and ask an administrator,
such as Nick Kurtansky, to configure the appropriate access privileges!

As part of the first path, the call to ```cache_images()``` will download
information (and meta information) about all images that are available
(with your credentials). Next, the code selects all images from the
```IsicApi``` object that belong to the desired set of datasets, and then
downloads those images.

## Alternative path
The alternative path loads a CSV, and iterates over the image_id column.
Please note that if the metadata has not yet been cached (2nd code cell),
two HTTPS requests will be made per image--one for the metadata, one for
the image binary data.

In [None]:
# imports
import os
import shutil
import sys

import pandas as pd

# install/upgrade isicarchive package, then import
!{sys.executable} -m pip install --upgrade git+https://github.com/neuroelf/isicarchive
from isicarchive.api import IsicApi

# folder that receives the copies of the downloaded images
target_folder = r'C:\Users\weberj3\Documents\2020 Challenge\fromISIC'

# CSV file holding a previously made sub-selection of images
subselection_csv = r'C:\Users\weberj3\Desktop\ISIC_CHALLENGE_SET_2020.csv'

# login name for the ISIC Archive (please change to your own credentials!)
username = 'weberj3@mskcc.org'

# NEVER store a password in this document! It will be requested,
# or you can keep it in a privately readable ~/.netrc file.

# please adapt the next line to your system, and then create
# this folder, as well as the "cache" subfolder used below!
doc_folder = 'Z:\\10.Imaging Informatics\\'
cache_folder = os.sep.join([doc_folder + 'ISIC', 'cache'])

# debug flag; when enabled, every web-API operation is printed out
debug = False

# create the API object used by all following cells
api = IsicApi(username, cache_folder=cache_folder, debug=debug)

# Full dataset downloads path

In [None]:
# caching all images metadata; this can take a long time!!
# (this covers the information about all images that are
# available with the credentials used for the api object)
api.cache_images()

In [None]:
# list available datasets; as the last expression of the cell, the
# value is displayed by the notebook
# NOTE(review): presumably the full dataset names shown here are the
# ones to use as keys of the `datasets` dictionary -- please confirm
api.datasets

In [None]:
# datasets, please configure as {'Full name': 'folder', ...}
# - the key is the full dataset name used for selecting images
# - the value is the sub-folder of target_folder receiving the images
datasets = {
    'ISIC 2020 Challenge - MSKCC contribution': 'mskcc',
}

# create one sub-folder per dataset; makedirs also creates target_folder
# itself if it is still missing, and exist_ok avoids both the separate
# existence check and an error when the folder is already there
for d in datasets.values():
    os.makedirs(os.path.join(target_folder, d), exist_ok=True)

In [None]:
# pick all images whose dataset name is one of the configured datasets
wanted_names = list(datasets.keys())
selection = api.select_images(['dataset.name', 'in', wanted_names])

# number of selected images (displayed as the output of this cell)
len(selection)

In [None]:
# keep the keys of the selection; the download loop iterates over them
sk = list(selection.keys())

# write what api.selected_metadata() returns into a CSV in target_folder
metadata_csv = target_folder + os.sep + 'isic_challenge_2020_datasets_metadata.csv'
md = api.write_csv(metadata_csv, api.selected_metadata())

## This is the actual download; if you want only the metadata, skip the next cell!

In [None]:
# walk through all selected images; fetch each one that is not yet in
# the cache, then copy it into the folder of its dataset
for position, image_key in enumerate(sk):

    # progress output for every 100th image
    if position % 100 == 0:
        print(position)

    image = api.image(image_key)
    image_name = image.name
    image_id = image.id

    # file name under which the JPG is looked up in the cache folder:
    # <cache>/<second to last id char>/<last id char>/image_<id>_<name>.jpg
    cached_jpg = os.sep.join([
        cache_folder,
        image_id[-2],
        image_id[-1],
        'image_' + image_id + '_' + image_name,
    ]) + '.jpg'

    # not in the cache yet: load the image data, then release it again
    if not os.path.exists(cached_jpg):
        image.load_image_data()
        image.clear_data()

    # still no file in the cache: print the name and go on
    if not os.path.exists(cached_jpg):
        print(os.sep + image_name)
        continue

    # copy the cached file into the sub-folder configured for its dataset
    dataset_folder = datasets[image.dataset['name']]
    shutil.copyfile(
        cached_jpg,
        target_folder + os.sep + dataset_folder + os.sep + image_name + '.jpg')

# Alternative (CSV) path

In [None]:
# alternatively read sub-selection CSV (must contain an image_id column)
selected = pd.read_csv(subselection_csv)

# rows with an empty image_id are read as NaN and cannot be looked up,
# and repeated IDs would only be processed a second time; drop both
# (the order of the remaining IDs is kept) and report how many rows went
image_ids = selected['image_id'].dropna().drop_duplicates()
ignored = len(selected) - len(image_ids)
if ignored:
    print('ignoring', ignored, 'CSV rows with a missing or repeated image_id')

# list of image IDs the download loop iterates over
sk = list(image_ids)

In [None]:
# make sure the sub-folder of every configured dataset exists, so that
# copying below cannot fail because of a missing destination folder
for d in datasets.values():
    os.makedirs(target_folder + os.sep + d, exist_ok=True)

# iterate over list: fetch every image that is not yet in the cache,
# then copy it into the folder configured for its dataset
for idx, s in enumerate(sk):

    # progress output for every 100th image
    if idx % 100 == 0:
        print(idx)

    si = api.image(s)
    sname = si.name
    sid = si.id
    fname = os.sep + sname

    # a CSV selection may contain images of datasets that have no entry
    # in `datasets`; report these and carry on, instead of aborting the
    # whole loop with a KeyError when building the destination path
    dname = si.dataset['name']
    if dname not in datasets:
        print('no folder configured for dataset', repr(dname) + ',', 'skipping', sname)
        continue

    # file name (without extension) under which the image is looked up
    # in the cache folder
    tname = cache_folder + os.sep + sid[-2] + os.sep + sid[-1] + os.sep + 'image_' + sid + '_' + sname

    # not in the cache yet: load the image data, then release it again
    if not os.path.exists(tname + '.jpg'):
        si.load_image_data()
        si.clear_data()

    # copy the cached file, or print the name if it is still missing
    if os.path.exists(tname + '.jpg'):
        shutil.copyfile(tname + '.jpg', target_folder + os.sep + datasets[dname] + fname + '.jpg')
    else:
        print(fname)