<a href="https://colab.research.google.com/github/murilogustineli/fathomnet-2024/blob/main/20240910_mg_download_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download images in under 20 minutes instead of 4 hours

- [Kaggle notebook to download images](https://www.kaggle.com/code/picekl/download-images-under-20m-instead-of-4h)


In [2]:
# Mount Google Drive at /content/drive; later cells read and write the
# competition files under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

# Load IPython's autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

Mounted at /content/drive


In [3]:
# Change directory to the project folder on the mounted drive so that the
# relative paths used by later cells ("train.json", "./training-data", ...)
# resolve against it
%cd '/content/drive/MyDrive/Kaggle/FathomNet2024'
# List the contents of the current folder to verify the expected files exist
!ls

/content/drive/MyDrive/Kaggle/FathomNet2024
eval.json  notebooks  test-data  training-data	train.json


In [4]:
"""
download_images

Script to retrieve images for the 2024 FathomNet out-of-sample challenge as part of FGVC 10.

Assumes COCO formatted annotation file has been downloaded from http://www.kaggle.com/competitions/fathomnet-out-of-sample-detection
"""
# Authors:
# Eric Orenstein (eorenstein@mbari.org)
# Lukas Picek (lukaspicek@gmail.com)


import os
import json
import logging
import argparse
import requests
import progressbar
from PIL import Image
from tqdm import tqdm
from shutil import copyfileobj
from multiprocessing import Pool, cpu_count


def download_img(args, resize=True, max_size=(1280, 720), timeout=30):
    """
    Download a single image and optionally shrink it in place.

    The arguments are packed into one tuple so the function can be handed
    directly to ``Pool.imap_unordered``.

    :param args: Tuple of (name, url, outdir)
    :param resize: if True, shrink the saved image so it fits inside
        ``max_size`` while keeping the aspect ratio
    :param max_size: (width, height) bounding box used when resizing
    :param timeout: value passed to ``requests.get`` as its ``timeout``
    :return: 1 if the image was downloaded, 0 if it already existed or the
        download failed
    """
    name, url, outdir = args
    file_name = os.path.join(outdir, name)

    # Only download if the image does not exist in the outdir
    if os.path.exists(file_name):
        return 0  # Indicate that image already exists

    # Stream into a per-process temporary file and rename it only after the
    # transfer succeeded. A failed or interrupted download therefore never
    # leaves a truncated file under the final name, which a later run would
    # mistake for a finished image and skip.
    tmp_name = f"{file_name}.{os.getpid()}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as resp:
            # Do not save an HTTP error page as if it were the image
            resp.raise_for_status()
            resp.raw.decode_content = True
            with open(tmp_name, 'wb') as f:
                copyfileobj(resp.raw, f)
        os.replace(tmp_name, file_name)
    except Exception as err:
        # Best effort: report the failure and let the remaining downloads
        # continue instead of aborting the whole pool.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        print(f"Problem with downloading image: {url} ({err!r})")
        return 0

    # Resize the downloaded image while keeping the aspect ratio
    if resize:
        try:
            with Image.open(file_name) as img:
                img.thumbnail(max_size, Image.LANCZOS)
                img.save(file_name)
        except Exception as err:
            print(f"Problem with resizing image: {file_name} ({err!r})")

    return 1  # Indicate that image was downloaded


def image_data_setup(imgs, outdir=None):
    """
    Download images to an output dir using a pool of worker processes.

    :param imgs: iterable of tuples (name, url)
    :param outdir: desired directory [defaults to an ``images`` folder inside
        the current working directory]
    """

    # Set the out directory to default if not specified
    if not outdir:
        outdir = os.path.join(os.getcwd(), 'images')

    # Make the directory (and any missing parents) if it does not exist;
    # exist_ok guards against another process creating it in between.
    if not os.path.exists(outdir):
        os.makedirs(outdir, exist_ok=True)
        logging.info(f"Created directory {outdir}")

    # Prepare arguments for multiprocessing
    args_list = [(name, url, outdir) for name, url in imgs]
    if not args_list:
        return  # Nothing to download, so do not spawn any workers

    # Use twice the number of CPU cores (the work is mostly waiting on the
    # network), but never more workers than there are images.
    num_processes = min(cpu_count() * 2, len(args_list))

    # The context manager terminates the workers even if the loop raises,
    # so a failed run does not leave stray processes behind.
    with Pool(processes=num_processes) as pool:
        # Use tqdm for progress bar
        with tqdm(total=len(args_list)) as pbar:
            for _ in pool.imap_unordered(download_img, args_list):
                pbar.update(1)

        pool.close()
        pool.join()


def download_image_data(dataset_path: str, outpath: str) -> None:
    """
    Read an annotation file and download every image it lists.

    :param dataset_path: path to the JSON annotation file; its ``images``
        entries must provide ``file_name`` and ``coco_url``
    :param outpath: directory the images are written to
    """
    logging.info(f'opening {dataset_path}')
    with open(dataset_path, 'r') as handle:
        image_records = json.load(handle)['images']

    logging.info(f'retrieving {len(image_records)} images')

    # Keep only what the downloader needs: (file name, source URL) per image
    name_url_pairs = []
    for record in image_records:
        name_url_pairs.append((record['file_name'], record['coco_url']))

    image_data_setup(name_url_pairs, outdir=outpath)


In [5]:
# Annotation file for the training split and the folder its images are saved to
train_dataset = "/content/drive/MyDrive/Kaggle/FathomNet2024/train.json"
train_outpath = "/content/drive/MyDrive/Kaggle/FathomNet2024/training-data"

# Images already present in train_outpath are skipped by download_img
download_image_data(train_dataset, train_outpath)
# f8cf3738-69db-4706-9688-100eb76d6ad5.png
# NOTE(review): the file name above has no explanation; presumably an image
# looked at while debugging. Confirm or remove.

100%|██████████| 8058/8058 [00:29<00:00, 274.90it/s] 


In [6]:
# Annotation file for the evaluation split and the folder its images are saved to
eval_dataset = "/content/drive/MyDrive/Kaggle/FathomNet2024/eval.json"
eval_outpath = "/content/drive/MyDrive/Kaggle/FathomNet2024/test-data"

# Images already present in eval_outpath are skipped by download_img
download_image_data(eval_dataset, eval_outpath)

100%|██████████| 2686/2686 [00:00<00:00, 3294.48it/s]


In [7]:
# check number of downloaded training and eval images
# Should be 8058 (training) and 2686 (eval)

import json

def check_num_images(json_file:str, image_folder_name:str) -> None:
    """
    Print how many images an annotation file lists next to how many files
    are present in the matching image folder.

    :param json_file: path to the JSON annotation file (must contain an
        ``images`` list)
    :param image_folder_name: image folder, relative to the current working
        directory
    """
    with open(json_file) as f:
        data = json.load(f)

    # get total number of images in json file
    total_imgs = len(data["images"])

    # Derive the label from the file name only, so a parent directory that
    # happens to contain "train" cannot mislabel the eval split.
    folder_name = "train" if "train" in os.path.basename(json_file) else "test"

    # Count regular files only; sub-directories such as .ipynb_checkpoints
    # are not images and would inflate the number.
    with os.scandir(os.path.join(".", image_folder_name)) as entries:
        num_imgs = sum(1 for entry in entries if entry.is_file())

    print(f"json images: {total_imgs}")
    print(f"{folder_name} images: {num_imgs}\n")

In [8]:
# Check if images were downloaded properly
check_num_images("train.json", "training-data")
check_num_images("eval.json", "test-data")

json images: 8058
train images: 8058

json images: 2686
test images: 2686



In [9]:
# size of datasets
!du -sh ./training-data
!du -sh ./test-data

7.2G	./training-data
2.3G	./test-data


In [10]:
# image labels
with open("train.json") as f:
    data = json.load(f)

print(f"COCO obj detection format: {list(data.keys())}\n")

# analysing the json
print(f"Number of images: {len(data['images'])}")
print(f"Number of annotations: {len(data['annotations'])}")
print(f"Number of categories: {len(data['categories'])}\n")

# annotation format
data["images"][12], data["annotations"][10_000], data["categories"][17]

COCO obj detection format: ['info', 'images', 'licenses', 'annotations', 'categories']

Number of images: 8058
Number of annotations: 35954
Number of categories: 18



({'id': 20,
  'width': 720,
  'height': 486,
  'file_name': '86c480bc-9be5-4252-ade1-66826ef979f6.png',
  'license': 0,
  'flickr_url': 'https://fathomnet.org/static/m3/staging/Doc%20Ricketts/images/0565/01_07_11_07.png',
  'coco_url': 'https://fathomnet.org/static/m3/staging/Doc%20Ricketts/images/0565/01_07_11_07.png',
  'date_captured': '2013-12-12 00:00:00'},
 {'id': 13748,
  'image_id': 20,
  'category_id': 18,
  'segmentation': [],
  'area': 260.0,
  'bbox': [581.0, 231.0, 13.0, 20.0],
  'iscrowd': 0},
 {'id': 18, 'name': 'Worm', 'supercategory': ''})