In [6]:
from fastai import *
from fastai.vision import *
from collections import defaultdict

In [7]:
# Dataset root; the listing shows the CSVs, image folders and saved models it contains.
path = Path('data/protein')
path.ls()

[PosixPath('data/protein/train.csv'),
 PosixPath('data/protein/ext_train_images'),
 PosixPath('data/protein/train_images'),
 PosixPath('data/protein/protein_predictions_tta.csv'),
 PosixPath('data/protein/ext_train.csv'),
 PosixPath('data/protein/protein_predictions.csv'),
 PosixPath('data/protein/test_images'),
 PosixPath('data/protein/sample_submission.csv'),
 PosixPath('data/protein/models'),
 PosixPath('data/protein/ext_data_images_urls.txt'),
 PosixPath('data/protein/protein_predictions_no_tta.csv'),
 PosixPath('data/protein/.ipynb_checkpoints')]

#### Build the image download URLs

In [8]:
# Table of external samples; the preview shows an `Id` and a `Target` column.
df = pd.read_csv(path/'ext_train.csv')
df.head()

Unnamed: 0,Id,Target
0,10580_1610_C1_1,21 13 25 2 0
1,10580_1610_C1_2,21 13 25 2 0
2,10580_1756_B1_1,21 13 25 2 0
3,10580_1756_B1_2,21 13 25 2 0
4,10580_1758_B1_1,21 13 25 2 0


In [9]:
# Base address for image downloads; URLs are formed as <prefix><folder>/<file id>_<colour>.jpg.
URL_prefix = "https://v18.proteinatlas.org/images/"

In [11]:
URLs = defaultdict(list)
colors = ['red', 'green', 'blue', 'yellow']

# Map each sample Id to the URLs of its four colour-channel images.
# Only the `Id` column is needed, so it is read directly instead of unpacking whole rows
# (row unpacking fails as soon as the frame carries any additional column).
# The loop variable is deliberately not called `id`, which would shadow the builtin
# for every later cell of the notebook.
for sample_id in df['Id']:
    # The first '_'-separated token of an Id is the server-side folder, the rest is the file id.
    folder, _sep, file_id = sample_id.partition('_')
    URLs[sample_id].extend(f'{URL_prefix}{folder}/{file_id}_{c}.jpg' for c in colors)

In [12]:
def download_url(url:str, dest:str, overwrite:bool=False, pbar:ProgressBar=None,
                 show_progress=True, chunk_size=1024*1024, timeout=4)->None:
    """Download `url` to `dest` unless it exists and not `overwrite`.

    The body is streamed into `<dest>.part` and only moved onto `dest` once the whole
    transfer has succeeded, so a failed or interrupted download never leaves a file at
    `dest` (which the exists-check would otherwise treat as already downloaded).

    Args:
        url: address to fetch.
        dest: target file path; its parent directory must already exist.
        overwrite: re-download even when `dest` is already present.
        pbar: optional parent progress bar, forwarded as `parent` to `progress_bar`.
        show_progress: display a byte-level progress bar; switched off automatically
            when the response carries no usable `Content-Length` header.
        chunk_size: number of bytes requested per streamed chunk.
        timeout: timeout in seconds handed to `requests.get`.

    Raises:
        Whatever `requests` raises for connection problems, plus the error raised by
        `Response.raise_for_status()` when the server answers with an error status.
    """
    if os.path.exists(dest) and not overwrite: return

    u = requests.get(url, stream=True, timeout=timeout)
    part = f'{dest}.part'
    try:
        # Without this check an error page (404, 500, ...) would be saved as if it were the image.
        u.raise_for_status()
        try: file_size = int(u.headers["Content-Length"])
        except (KeyError, TypeError, ValueError): show_progress = False

        with open(part, 'wb') as f:
            nbytes = 0
            if show_progress: pbar = progress_bar(range(file_size), auto_update=False, leave=False, parent=pbar)
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress: pbar.update(nbytes)
                f.write(chunk)
        # Publish the finished file in a single step.
        os.replace(part, dest)
    finally:
        # Always release the streamed connection and discard any incomplete staging file.
        u.close()
        if os.path.exists(part): os.remove(part)

In [19]:
def parallel(func, arr:Collection, max_workers:int=None):
    """Call `func(item, index)` for every item of `arr`, using up to `max_workers` processes.

    `max_workers` falls back to `defaults.cpus` when it is None. With fewer than two
    workers the calls run one after another in the current process. Otherwise every call
    is submitted to a process pool and a progress bar advances as calls finish; the
    futures' results are never read.
    """
    n_workers = ifnone(max_workers, defaults.cpus)
    if n_workers >= 2:
        with ProcessPoolExecutor(max_workers=n_workers) as pool:
            pending = []
            for idx, item in enumerate(arr):
                pending.append(pool.submit(func, item, idx))
            # Iterate purely to drive the progress bar until every future has completed.
            for _done in progress_bar(concurrent.futures.as_completed(pending), total=len(arr)):
                pass
        return
    for idx, item in enumerate(arr):
        func(item, idx)

In [29]:
def download_image(url,dest, timeout=4):
    "Fetch `url` into `dest` with `download_url` (never overwriting, no progress bar); failures are printed, not raised."
    try:
        download_url(url, dest, overwrite=False, show_progress=False, timeout=timeout)
    except Exception as err:
        print(f"Error {url} {err}")

In [23]:
def _download_image_inner(url_dest, i, timeout=4):
    "Unpack a `(url, dest)` pair and hand it to `download_image`; the index `i` is accepted but unused."
    src, target = url_dest
    download_image(src, target, timeout=timeout)

In [25]:
def download_images(urls:Collection, max_pics:int=1000, max_workers:int=8, timeout=4):
    """Download the images described by `urls`, at most `max_pics` of them.

    Args:
        urls: `(url, dest)` pairs; each `url` is fetched into the file `dest`.
        max_pics: cap on the number of pairs processed, counted from the start of
            `urls`; None removes the cap.
        max_workers: forwarded to `parallel`.
        timeout: per-request timeout in seconds, forwarded to each download.
    """
    # Materialise first so any collection can be sliced; `[:None]` keeps every pair.
    selected = list(urls)[:max_pics]
    parallel(partial(_download_image_inner, timeout=timeout), selected, max_workers=max_workers)

In [22]:
# Pair every URL with its local destination, <Id>_<colour>.jpg inside ext_train_images.
# A comprehension keeps the loop variables out of the notebook namespace; in particular
# nothing is bound to `id`, so the builtin stays usable in later cells.
img_dir = path/'ext_train_images'
URL_list = [
    # The last '_'-separated token of the URL is "<colour>.jpg".
    (url, img_dir/f"{sample_id}_{url.split('_')[-1]}")
    for sample_id, sample_urls in URLs.items()
    for url in sample_urls
]
URL_list[:4]

[('https://v18.proteinatlas.org/images/10580/1610_C1_1_red.jpg',
  PosixPath('data/protein/ext_train_images/10580_1610_C1_1_red.jpg')),
 ('https://v18.proteinatlas.org/images/10580/1610_C1_1_green.jpg',
  PosixPath('data/protein/ext_train_images/10580_1610_C1_1_green.jpg')),
 ('https://v18.proteinatlas.org/images/10580/1610_C1_1_blue.jpg',
  PosixPath('data/protein/ext_train_images/10580_1610_C1_1_blue.jpg')),
 ('https://v18.proteinatlas.org/images/10580/1610_C1_1_yellow.jpg',
  PosixPath('data/protein/ext_train_images/10580_1610_C1_1_yellow.jpg'))]

In [30]:
# Fetch every (url, destination) pair in URL_list, passing a max_pics far above the default of 1000.
download_images(URL_list, max_pics=1000000)