In [1]:
# From Kaggle: https://www.kaggle.com/c/dog-breed-identification
# Author: Morpheus Hsieh

from __future__ import print_function, division

import os, sys
import argparse
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from pandas import Series, DataFrame

import torch
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

In [2]:
# Name of the project sub-folder that every working directory ends with.
proj_path = 'dog-breed'

# Root directory under which the data / models / processed / pretrained
# trees live.
base_path = r'/home/jovyan/'

# Every working directory has the shape <base_path>/<kind>/<proj_path>.
data_path, models_path, proc_path, trained_path = (
    os.path.join(base_path, kind, proj_path)
    for kind in ('data', 'models', 'processed', 'pretrained')
)

# Echo the resolved locations so the notebook records where it read/wrote.
for caption, location in (
    ('Base path      :', base_path),
    ('Data path      :', data_path),
    ('models path    :', models_path),
    ('Processed path :', proc_path),
    ('Pretrained path:', trained_path),
):
    print(caption, location)

Base path      : /home/jovyan/
Data path      : /home/jovyan/data/dog-breed
models path    : /home/jovyan/models/dog-breed
Processed path : /home/jovyan/processed/dog-breed
Pretrained path: /home/jovyan/pretrained/dog-breed


In [3]:
# Locate the label file inside the data directory.
labels_filename = 'labels.csv'
label_pathfile = os.path.join(data_path, labels_filename)
print('Label pathfile: ', label_pathfile)

# Read the labels into a DataFrame and remember its column names.
df_label = pd.read_csv(label_pathfile)
label_columns = df_label.columns.tolist()
print()
print('Label columns: ', label_columns)

# DataFrame.info() writes its report to stdout itself and returns None,
# so the print() around it also emits a trailing "None" line.
print()
print(df_label.info())

# Preview the first rows.
print()
print(df_label.head())

Label pathfile:  /home/jovyan/data/dog-breed/labels.csv

Label columns:  ['id', 'breed']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10222 entries, 0 to 10221
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10222 non-null  object
 1   breed   10222 non-null  object
dtypes: object(2)
memory usage: 159.8+ KB
None

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [4]:
# Check whether each training image file exists

def check_image_path(id, base_path, ext='.jpg'):
    """Return the path of an image file if it exists on disk, else None.

    Args:
        id: File name without extension (a string; it is concatenated
            with `ext`).
        base_path: Directory the file is looked up in.
        ext: File extension appended to `id`, including the leading dot.

    Returns:
        The joined path `base_path/id+ext` when `os.path.exists` reports
        it as present, otherwise None.
    """
    candidate = os.path.join(base_path, id + ext)
    if not os.path.exists(candidate):
        return None
    return candidate

# Directory that holds the training images.
train_img_path = os.path.join(data_path, 'train')

# Resolve every id to its image path; ids without a file on disk get None.
df_label['image'] = df_label['id'].apply(check_image_path, args=(train_img_path,))

num = 10
print('Top %d data of dataframe:' % num)
print(df_label.head(num))

# Count the rows whose image file was not found (stored as None above).
missing_mask = df_label['image'].isna()
none_count = missing_mask.sum()
print("\nNum of 'None' in 'image' column:", none_count)

Top 10 data of dataframe:
                                 id               breed  \
0  000bec180eb18c7604dcecc8fe0dba07         boston_bull   
1  001513dfcb2ffafc82cccf4d8bbaba97               dingo   
2  001cdf01b096e06d78e9e5112d419397            pekinese   
3  00214f311d5d2247d5dfe4fe24b2303d            bluetick   
4  0021f9ceb3235effd7fcde7f7538ed62    golden_retriever   
5  002211c81b498ef88e1b40b9abf84e1d  bedlington_terrier   
6  00290d3e1fdd27226ba27a8ce248ce85  bedlington_terrier   
7  002a283a315af96eaea0e28e7163b21b              borzoi   
8  003df8b8a8b05244b1d920bb6cf451f9             basenji   
9  0042188c895a2f14ef64a918ed9c7b64  scottish_deerhound   

                                               image  
0  /home/jovyan/data/dog-breed/train/000bec180eb1...  
1  /home/jovyan/data/dog-breed/train/001513dfcb2f...  
2  /home/jovyan/data/dog-breed/train/001cdf01b096...  
3  /home/jovyan/data/dog-breed/train/00214f311d5d...  
4  /home/jovyan/data/dog-breed/train/0021f9ceb323

In [5]:
# Create breed dict
def getBreedDict(series):
    """Map every distinct breed name to a 1-based integer ID.

    The distinct values of `series` are sorted and numbered 1, 2, 3, ...
    in that order. The number of distinct values is printed as a side
    effect.

    Args:
        series: Iterable of hashable, mutually comparable breed names.

    Returns:
        dict mapping breed name -> ID (forward mapping only).
    """
    unique_breeds = set(series)
    print('Breed class: ', len(unique_breeds))

    # IDs follow sorted order and start at 1, not 0.
    return {
        name: ident
        for ident, name in enumerate(sorted(unique_breeds), start=1)
    }

# Breed column as a NumPy array; reused later to look up each row's breed ID.
SersBreed = df_label["breed"].to_numpy()

# Build the breed-name -> ID mapping from the breed column.
breed_dict = getBreedDict(SersBreed)

print('\nBreed dict:')
print(json.dumps(breed_dict, indent=4))

Breed class:  120

Breed dict:
{
    "affenpinscher": 1,
    "afghan_hound": 2,
    "african_hunting_dog": 3,
    "airedale": 4,
    "american_staffordshire_terrier": 5,
    "appenzeller": 6,
    "australian_terrier": 7,
    "basenji": 8,
    "basset": 9,
    "beagle": 10,
    "bedlington_terrier": 11,
    "bernese_mountain_dog": 12,
    "black-and-tan_coonhound": 13,
    "blenheim_spaniel": 14,
    "bloodhound": 15,
    "bluetick": 16,
    "border_collie": 17,
    "border_terrier": 18,
    "borzoi": 19,
    "boston_bull": 20,
    "bouvier_des_flandres": 21,
    "boxer": 22,
    "brabancon_griffon": 23,
    "briard": 24,
    "brittany_spaniel": 25,
    "bull_mastiff": 26,
    "cairn": 27,
    "cardigan": 28,
    "chesapeake_bay_retriever": 29,
    "chihuahua": 30,
    "chow": 31,
    "clumber": 32,
    "cocker_spaniel": 33,
    "collie": 34,
    "curly-coated_retriever": 35,
    "dandie_dinmont": 36,
    "dhole": 37,
    "dingo": 38,
    "doberman": 39,
    "english_foxhound": 40,
    

In [6]:
# save breed dict to csv file

import csv

def pair2dict(keys, data):
    """Turn a mapping into a list of row dicts, one per item.

    Each (key, value) item of `data` becomes a dict in which `keys[0]`
    labels the item's key and `keys[1]` labels the item's value.

    Args:
        keys: Sequence of column names. Positions 0 and 1 label the
            item's key and value; fewer than two names yields shorter
            rows, more than two raises IndexError.
        data: Mapping whose items are converted.

    Returns:
        list of dicts, in the iteration order of `data.items()`.
    """
    rows = []
    for name, ident in data.items():
        pair = (name, ident)
        rows.append({keys[pos]: pair[pos] for pos in range(len(keys))})
    return rows

# Column names of the CSV: breed name and its integer ID.
keys = ['breed', 'breed_id']

# One {'breed': ..., 'breed_id': ...} row per entry of breed_dict.
toCSV = pair2dict(keys, breed_dict)
print(json.dumps(toCSV[:5], indent=2))

# This is the first write into proc_path; create the directory (and any
# missing parents) so a fresh environment does not fail with
# FileNotFoundError. exist_ok=True makes re-running the cell harmless.
os.makedirs(proc_path, exist_ok=True)
csv_abspath = os.path.join(proc_path, 'breeds_dict.csv')

# newline='' lets the csv module control line endings itself.
with open(csv_abspath, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=keys)
    writer.writeheader()
    writer.writerows(toCSV)

[
  {
    "breed": "affenpinscher",
    "breed_id": 1
  },
  {
    "breed": "afghan_hound",
    "breed_id": 2
  },
  {
    "breed": "african_hunting_dog",
    "breed_id": 3
  },
  {
    "breed": "airedale",
    "breed_id": 4
  },
  {
    "breed": "american_staffordshire_terrier",
    "breed_id": 5
  }
]


In [7]:
# Append breed ID to dataframe
# Look up the integer ID of every row's breed, in row order, and attach
# the result as a new column.
bid_list = [breed_dict[breed_name] for breed_name in SersBreed]
df_label['breed_id'] = bid_list

# Destination of the enriched label table.
csv_lbls_processed = os.path.join(proc_path, 'labels_processed.csv')

num = 10
print('Top %d data of dataframe:' % num)
print(df_label.head(num))

# Persist the table (without the index column) for later notebooks.
print("\nProcessed csv: '{}'".format(csv_lbls_processed))
df_label.to_csv(csv_lbls_processed, index=False)

Top 10 data of dataframe:
                                 id               breed  \
0  000bec180eb18c7604dcecc8fe0dba07         boston_bull   
1  001513dfcb2ffafc82cccf4d8bbaba97               dingo   
2  001cdf01b096e06d78e9e5112d419397            pekinese   
3  00214f311d5d2247d5dfe4fe24b2303d            bluetick   
4  0021f9ceb3235effd7fcde7f7538ed62    golden_retriever   
5  002211c81b498ef88e1b40b9abf84e1d  bedlington_terrier   
6  00290d3e1fdd27226ba27a8ce248ce85  bedlington_terrier   
7  002a283a315af96eaea0e28e7163b21b              borzoi   
8  003df8b8a8b05244b1d920bb6cf451f9             basenji   
9  0042188c895a2f14ef64a918ed9c7b64  scottish_deerhound   

                                               image  breed_id  
0  /home/jovyan/data/dog-breed/train/000bec180eb1...        20  
1  /home/jovyan/data/dog-breed/train/001513dfcb2f...        38  
2  /home/jovyan/data/dog-breed/train/001cdf01b096...        86  
3  /home/jovyan/data/dog-breed/train/00214f311d5d...        16  

In [8]:
# Split total rows to train and valid rows

# Fraction of the rows used for training; the remainder is validation.
FracForTrain = 0.8
print('Frac for train: ', FracForTrain)

df_rows = df_label.shape[0]
print('\nTotal rows: ', df_rows)

# int() truncates toward zero, so train_len is always a valid row count.
train_len = int(FracForTrain * df_rows)
print('\nTrain len: ', train_len)
print('Valid len: ', (df_rows - train_len))

# Positional split in file order (no shuffling): the first train_len rows
# form the training set, the rest the validation set. .iloc makes the
# positional intent explicit regardless of the index labels.
train_imgs = df_label['image'].iloc[:train_len].tolist()
valid_imgs = df_label['image'].iloc[train_len:].tolist()

train_lbls = df_label['breed_id'].iloc[:train_len].tolist()
valid_lbls = df_label['breed_id'].iloc[train_len:].tolist()

# Preview size; used both in the headings and in the slices below so the
# two can never disagree.
num = 10

# map(str, ...) keeps the preview from crashing if an image path is None
# (an image file that was not found on disk).
print('\nTop %d train images:'%num)
print('\n'.join(map(str, train_imgs[:num])))

print('\nTop %d valid images:'%num)
print('\n'.join(map(str, valid_imgs[:num])))

print('\nTop %d train labels:'%num)
print(train_lbls[:num])

print('\nTop %d valid labels:'%num)
print(valid_lbls[:num])

Frac for train:  0.8

Total rows:  10222

Train len:  8177
Valid len:  2045

Top 10 train images:
/home/jovyan/data/dog-breed/train/000bec180eb18c7604dcecc8fe0dba07.jpg
/home/jovyan/data/dog-breed/train/001513dfcb2ffafc82cccf4d8bbaba97.jpg
/home/jovyan/data/dog-breed/train/001cdf01b096e06d78e9e5112d419397.jpg
/home/jovyan/data/dog-breed/train/00214f311d5d2247d5dfe4fe24b2303d.jpg
/home/jovyan/data/dog-breed/train/0021f9ceb3235effd7fcde7f7538ed62.jpg
/home/jovyan/data/dog-breed/train/002211c81b498ef88e1b40b9abf84e1d.jpg
/home/jovyan/data/dog-breed/train/00290d3e1fdd27226ba27a8ce248ce85.jpg
/home/jovyan/data/dog-breed/train/002a283a315af96eaea0e28e7163b21b.jpg
/home/jovyan/data/dog-breed/train/003df8b8a8b05244b1d920bb6cf451f9.jpg
/home/jovyan/data/dog-breed/train/0042188c895a2f14ef64a918ed9c7b64.jpg

Top 10 valid images:
/home/jovyan/data/dog-breed/train/cc93915e06bc55626a02af95006a48c2.jpg
/home/jovyan/data/dog-breed/train/cc964d3bf1e317c9fbb0c0d4c8bc6b8f.jpg
/home/jovyan/data/dog-breed/

In [9]:
# Save numpy array as .npy file

phase = ['train', 'valid']
types = ['imgs', 'labels']

# File names with the type varying slowest:
# train_imgs, valid_imgs, train_labels, valid_labels.
fname = ['{}_{}.npy'.format(split, kind) for kind in types for split in phase]

# Payloads listed in the same order as the file names above.
data = [train_imgs, valid_imgs, train_lbls, valid_lbls]

print('Process start...')
for leaf, payload in zip(fname, data):
    f_abspath = os.path.join(proc_path, leaf)
    print("'{}' processing...".format(f_abspath))
    # np.save converts the Python list to an array before writing.
    np.save(f_abspath, payload)
print('Process end.')

Process start...
'/home/jovyan/processed/dog-breed/train_imgs.npy' processing...
'/home/jovyan/processed/dog-breed/valid_imgs.npy' processing...
'/home/jovyan/processed/dog-breed/train_labels.npy' processing...
'/home/jovyan/processed/dog-breed/valid_labels.npy' processing...
Process end.
