Mount Google Drive to get access to folders within Google Drive.

In [None]:
from google.colab import drive
# Mount point must stay "/content/drive": PROJECT_FOLDER below is built on top
# of "/content/drive/My Drive/...".
# force_remount=True is passed so that re-running this cell mounts again even
# if the drive is already mounted (presumably to refresh a stale mount —
# confirm against the Colab documentation).
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


Script for loading, processing, and categorizing raw dataset images by their labels. Based on source code from: https://www.kaggle.com/lyakaap/fast-resized-image-download-python-3. The raw dataset is stored in a `.csv` file with the following structure: `(id, url, label)`. Note that the code below reads the label from a column named `state_id`.

In [None]:
import multiprocessing
import os
from io import BytesIO
from urllib import request
import pandas as pd
import re
import tqdm
from PIL import Image

# --- Paths (everything lives under the mounted Google Drive project folder) ---
PROJECT_FOLDER = '/content/drive/My Drive/machine_learning_pipeline/'
RAW_DATASET_FILE = '{}raw_training_dataset.csv'.format(PROJECT_FOLDER)
DATASET_FOLDER = '{}training_dataset/'.format(PROJECT_FOLDER)
# Placeholder only: load_data() repoints this at the folder of the label that
# is currently being processed, and download_image() reads it from there.
LABEL_FOLDER = str()

# --- Image / download settings ---
# IMG_SIZE: edge length (pixels) of the square images that get saved; it is
#           also the size requested through the "s<N>" URL segment.
# IMG_QUALITY: JPEG quality passed to PIL when saving.
IMG_SIZE, IMG_QUALITY = 224, 90
# Number of worker processes in the download pool.
NUM_WORKERS = 8

def overwrite_urls(df, img_size=None):
    """Rewrite the size segment of every image URL to request ``img_size``.

    The second-to-last ``/``-separated segment of each URL is inspected. When
    it starts with ``s<digits>`` (e.g. ``s1600``) the whole segment is replaced
    by ``s<img_size>``; any other URL is left untouched. Note that the check is
    a prefix match, so a segment such as ``s1600-c`` is replaced entirely.

    Args:
        df: DataFrame with a ``url`` column.
        img_size: Size to request. Defaults to the module-level ``IMG_SIZE``.

    Returns:
        A new DataFrame with the rewritten ``url`` column. The input frame is
        not modified, which avoids pandas' SettingWithCopyWarning when ``df``
        is a slice of a larger frame.
    """
    if img_size is None:
        img_size = IMG_SIZE
    size_pattern = re.compile('s[0-9]+')
    replacement = 's{}'.format(img_size)

    def rewrite_url(url):
        # Missing values (NaN/None) and other non-strings cannot be split.
        if not isinstance(url, str):
            return url
        segments = url.split('/')
        # A URL without any '/' has no second-to-last segment to inspect.
        if len(segments) < 2 or size_pattern.match(segments[-2]) is None:
            return url
        segments[-2] = replacement
        return '/'.join(segments)

    # assign() builds a new frame; map() keeps the index aligned and also
    # works for an empty frame.
    return df.assign(url=df['url'].map(rewrite_url))

def parse_data(df):
    """Return the first two fields of every row of ``df`` as a list.

    Each list element is the leading (up to) two values of one row, taken
    positionally; download_image() unpacks them as ``(key, url)``.
    """
    rows = df.values
    # Slice the leading two columns once, then split the result row by row.
    return list(rows[:, :2])

def download_image(key_url, label_folder=None, timeout=30):
    """Download one image, convert it to a square RGB JPEG, and save it.

    Args:
        key_url: ``(key, url)`` pair; the image is saved as ``<key>.jpg``.
        label_folder: Destination folder. Defaults to the module-level
            ``LABEL_FOLDER`` that load_data() sets for the current label.
        timeout: Timeout in seconds passed to ``urlopen`` so that a stalled
            connection cannot block a worker indefinitely.

    Returns:
        0 if the image already exists or was saved successfully, 1 if any
        step (download, parse, convert, resize, save) failed. Failures are
        reported with a printed warning rather than raised, so one bad image
        does not abort the whole pool.
    """
    (key, url) = key_url
    if label_folder is None:
        label_folder = LABEL_FOLDER
    filename = os.path.join(label_folder, '{}.jpg'.format(key))
    print(filename)

    if os.path.exists(filename):
        print('Image {} already exists. Skipping download.'.format(filename))
        return 0

    # "except Exception" (not a bare except) keeps KeyboardInterrupt and
    # SystemExit working inside the worker processes.
    try:
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(url, timeout=timeout) as response:
            image_data = response.read()
    except Exception:
        print('Warning: Could not download image {} from {}'.format(key, url))
        return 1

    try:
        pil_image = Image.open(BytesIO(image_data))
    except Exception:
        print('Warning: Failed to parse image {}'.format(key))
        return 1

    try:
        pil_image = pil_image.convert('RGB')
    except Exception:
        print('Warning: Failed to convert image {} to RGB'.format(key))
        return 1

    try:
        pil_image = pil_image.resize((IMG_SIZE, IMG_SIZE))
    except Exception:
        print('Warning: Failed to resize image {}'.format(key))
        return 1

    try:
        pil_image.save(filename, format='JPEG', quality=IMG_QUALITY)
    except Exception:
        print('Warning: Failed to save image {}'.format(filename))
        # A partially written file would be mistaken for a finished download
        # on the next run (see the exists() check above), so remove it.
        if os.path.exists(filename):
            try:
                os.remove(filename)
            except OSError:
                # Best-effort cleanup; the failure is already reported above.
                pass
        return 1

    return 0

def loader(df):
    """Download every image listed in ``df`` using a pool of worker processes.

    The rows of ``df`` are turned into ``(key, url)`` pairs by parse_data()
    and handed to download_image() in parallel. The number of failed
    downloads is printed at the end.

    Args:
        df: DataFrame whose first two columns are the image key and its URL.
    """
    key_url_list = parse_data(df)
    # The "with" block terminates the pool on exit, so worker processes are
    # cleaned up even if a worker or the progress bar raises. All results are
    # consumed by sum() before the block is left.
    with multiprocessing.Pool(processes=NUM_WORKERS) as pool:
        # download_image returns 0 on success and 1 on failure, so the sum is
        # the failure count. Results arrive in completion order.
        failures = sum(tqdm.tqdm(pool.imap_unordered(download_image, key_url_list),
                                 total=len(key_url_list)))
    print('Total number of download failures:', failures)

#Our wrapper function below
def load_data(df):
    """Download all images in ``df`` into one sub-folder per label.

    For each distinct value of the ``state_id`` column (most frequent first,
    as ordered by ``value_counts``), a folder ``DATASET_FOLDER/<label>/`` is
    created if needed and the images of that label are downloaded into it.

    Args:
        df: DataFrame with the image key and URL as its first two columns and
            a ``state_id`` column holding the label.

    Side effects:
        Creates folders on disk and rebinds the module-level ``LABEL_FOLDER``
        for every label.
    """
    global LABEL_FOLDER

    if not os.path.exists(DATASET_FOLDER):
        print('Creating folder called %s to store dataset images...' %DATASET_FOLDER)
        # makedirs also creates missing parent folders and tolerates the
        # folder appearing between the check and the call.
        os.makedirs(DATASET_FOLDER, exist_ok=True)

    for label in df['state_id'].value_counts().index:
        # Copy the slice so later column assignments act on an independent
        # frame instead of triggering pandas' SettingWithCopyWarning.
        target_df = df.loc[df['state_id'] == label].copy()
        # download_image() reads LABEL_FOLDER to decide where to save.
        # NOTE(review): workers see this value because loader() creates its
        # pool after the assignment; presumably this relies on fork-started
        # processes — confirm before running with the "spawn" start method.
        LABEL_FOLDER = DATASET_FOLDER + str(label) + '/'
        if not os.path.exists(LABEL_FOLDER):
            print('Creating folder to store dataset images with label %s...' %label)
            os.makedirs(LABEL_FOLDER, exist_ok=True)
        print('Processing %i image(s) for label %s...' %(target_df.shape[0], label))
        loader(overwrite_urls(target_df))

Run the snippet below to load and process the raw dataset images and sort them into folders according to their labels.

In [None]:
# RAW_DATASET_FILE is a .csv file, so it is parsed with read_csv; read_excel
# cannot read plain-text CSV content.
# Rows without a usable URL are dropped before downloading: both missing
# values and the literal string "None".
df = (
    pd.read_csv(RAW_DATASET_FILE)
    .dropna(subset=['url'])
    .query('url != "None"')
)
load_data(df)

Creating folder to store dataset images with label state_10...
Processing 2 image(s) for label state_10...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_10/67eda47e8894efab.jpg
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_10/4c41866f83acbb36.jpg


100%|██████████| 2/2 [00:00<00:00,  3.99it/s]

Total number of download failures: 1
Processing 2 image(s) for label state_2...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_2/650c989dd3493748.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_2/650c989dd3493748.jpg already exists. Skipping download.
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_2/675e1bb968bc5150.jpg


100%|██████████| 2/2 [00:00<00:00,  6.59it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_3...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_3/05e63ca9b2cde1f4.jpg
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_3/192333daaf6119cf.jpg


100%|██████████| 2/2 [00:00<00:00,  6.01it/s]

Total number of download failures: 1
Processing 2 image(s) for label state_9...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_9/20eb403510b50595.jpg
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_9/d9da398f41f4ffab.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_9/20eb403510b50595.jpg already exists. Skipping download.


100%|██████████| 2/2 [00:00<00:00,  3.55it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_6...





/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_6/056708de792326b9.jpg


  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_6/05982415c5276aab.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_6/056708de792326b9.jpg already exists. Skipping download.


100%|██████████| 2/2 [00:00<00:00,  6.83it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_1...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_1/97c0a12e07ae8dd5.jpg
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_1/d369520abdc48e11.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_1/97c0a12e07ae8dd5.jpg already exists. Skipping download.


100%|██████████| 2/2 [00:00<00:00,  5.05it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_7...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_7/6ae1e206c579f649.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_7/6ae1e206c579f649.jpg already exists. Skipping download.
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_7/0545b16a0dd7bd69.jpg


100%|██████████| 2/2 [00:00<00:00,  3.97it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_5...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_5/fc49cb32ef7f1e89.jpg
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_5/e5154234ecf68940.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_5/fc49cb32ef7f1e89.jpg already exists. Skipping download.


100%|██████████| 2/2 [00:00<00:00,  7.91it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_4...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_4/08672eddcb2b7c93.jpg
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_4/c9764ed7eed88330.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_4/08672eddcb2b7c93.jpg already exists. Skipping download.


100%|██████████| 2/2 [00:00<00:00,  5.37it/s]

Total number of download failures: 0
Processing 2 image(s) for label state_8...



  0%|          | 0/2 [00:00<?, ?it/s]

/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_8/7b1e476a7c89bb34.jpg
Image /content/drive/My Drive/machine_learning_pipeline/training_dataset/state_8/7b1e476a7c89bb34.jpg already exists. Skipping download.
/content/drive/My Drive/machine_learning_pipeline/training_dataset/state_8/ec9f2f8587976142.jpg


100%|██████████| 2/2 [00:00<00:00,  4.34it/s]

Total number of download failures: 0



