In [1]:
# From Kaggle: https://www.kaggle.com/c/dog-breed-identification
# Author: Morpheus Hsieh

from __future__ import print_function, division

import os, sys
import argparse
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from pandas import Series, DataFrame

import torch
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

from configs.config_train import get_cfg_defaults

In [2]:
RawPath = r'D:\GitWork\dog_breed\data\raw'
print('Raw path: ', RawPath)

label_fname = 'labels.csv'
df = pd.read_csv(os.path.join(RawPath, label_fname))

csv_columns = list(df.columns)
print('\nColumns: ', csv_columns)

print(); print(df.info())
print(); print(df.head())

Raw path:  D:\GitWork\dog_breed\data\raw

Columns:  ['id', 'breed']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10222 entries, 0 to 10221
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10222 non-null  object
 1   breed   10222 non-null  object
dtypes: object(2)
memory usage: 159.8+ KB
None

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [3]:
# Verify image exist or not
def id2ImgPath(path, ext='.jpg'):
    return (
        lambda f: os.path.join(path, f+ext) \
        if os.path.exists(os.path.join(path, f+ext)) else None
    )

img_path = os.path.join(RawPath, 'train')
id2imgP = id2ImgPath(img_path)

SersId = Series.to_numpy(df["id"])
# print('Type of Series id:' ,type(SersId))

img_list = [id2imgP(v) for v in SersId]
# print('Type of series imgs: ', type(img_list))
df['image'] = img_list

num = 10
print('Top %d data of dataframe:'%num)
print(df.head(num))

# if image not exist?
cnt_no_img = sum(x is None for x in img_list)
print('\nCount of none imgs: ', cnt_no_img)

Type of Series id: <class 'numpy.ndarray'>
Type of series imgs:  <class 'list'>
Top 10 data of dataframe:
                                 id               breed  \
0  000bec180eb18c7604dcecc8fe0dba07         boston_bull   
1  001513dfcb2ffafc82cccf4d8bbaba97               dingo   
2  001cdf01b096e06d78e9e5112d419397            pekinese   
3  00214f311d5d2247d5dfe4fe24b2303d            bluetick   
4  0021f9ceb3235effd7fcde7f7538ed62    golden_retriever   
5  002211c81b498ef88e1b40b9abf84e1d  bedlington_terrier   
6  00290d3e1fdd27226ba27a8ce248ce85  bedlington_terrier   
7  002a283a315af96eaea0e28e7163b21b              borzoi   
8  003df8b8a8b05244b1d920bb6cf451f9             basenji   
9  0042188c895a2f14ef64a918ed9c7b64  scottish_deerhound   

                                               image  
0  D:\GitWork\dog_breed\data\raw\train\000bec180e...  
1  D:\GitWork\dog_breed\data\raw\train\001513dfcb...  
2  D:\GitWork\dog_breed\data\raw\train\001cdf01b0...  
3  D:\GitWork\dog_breed\

In [4]:
# Create breed dict
def getBreedDict(series):
    # Collating the breed classes
    cls_set = set(series)
    cls_set_len = len(cls_set)
    print('Breed class: ', cls_set_len)

    # Create breed dict, both forward and backward dict
    cls_list = list(cls_set)
    cls_list.sort()
    breed_dict = { v:i for i, v in enumerate(cls_list) }
    return breed_dict

SersBreed = Series.to_numpy(df["breed"]) 
breed_dict = getBreedDict(SersBreed)

print('\nBreed dict:')
print(json.dumps(breed_dict, indent=4))

Breed class:  120

Breed dict:
{
    "affenpinscher": 0,
    "afghan_hound": 1,
    "african_hunting_dog": 2,
    "airedale": 3,
    "american_staffordshire_terrier": 4,
    "appenzeller": 5,
    "australian_terrier": 6,
    "basenji": 7,
    "basset": 8,
    "beagle": 9,
    "bedlington_terrier": 10,
    "bernese_mountain_dog": 11,
    "black-and-tan_coonhound": 12,
    "blenheim_spaniel": 13,
    "bloodhound": 14,
    "bluetick": 15,
    "border_collie": 16,
    "border_terrier": 17,
    "borzoi": 18,
    "boston_bull": 19,
    "bouvier_des_flandres": 20,
    "boxer": 21,
    "brabancon_griffon": 22,
    "briard": 23,
    "brittany_spaniel": 24,
    "bull_mastiff": 25,
    "cairn": 26,
    "cardigan": 27,
    "chesapeake_bay_retriever": 28,
    "chihuahua": 29,
    "chow": 30,
    "clumber": 31,
    "cocker_spaniel": 32,
    "collie": 33,
    "curly-coated_retriever": 34,
    "dandie_dinmont": 35,
    "dhole": 36,
    "dingo": 37,
    "doberman": 38,
    "english_foxhound": 39,
    "

In [5]:
# Append breed ID to dataframe
bid_list = [breed_dict[b] for b in SersBreed]
df['breed_id'] = bid_list

num = 10
print('Top %d data of dataframe:'%num)
print(df.head(num))

# save information to csv
ProcPath = r'D:\GitWork\dog_breed\data\processed'
csv_processed = os.path.join(ProcPath, 'processed_labels.csv')
print("\nProcessed csv: '{}'".format(csv_processed))

df.to_csv(csv_processed, index=False)

Processed csv: 'D:\GitWork\dog_breed\data\processed\processed_labels.csv'


In [6]:
# Split total rows to train and valid rows

FracForTrain = 0.8
print('Frac for train: ', FracForTrain)

df_rows = df.shape[0]
print('\nTotal rows: ', df_rows)

train_len = int(float(FracForTrain) * float(df_rows))
print('\nTrain len: ', train_len)
print('Valid len: ', (df_rows - train_len))

train_imgs = img_list[:train_len]
valid_imgs = img_list[train_len:]

train_lbls = bid_list[:train_len]
valid_lbls = bid_list[train_len:]

num = 10
print('\nTop %d train images:'%num)
print('\n'.join(train_imgs[:10]))

print('\nTop %d valid images:'%num)
print('\n'.join(train_imgs[:10]))

print('\nTop %d train labels:'%num)
print(train_lbls[:10])

print('\nTop %d valid labels:'%num)
print(valid_lbls[:10])

Frac for train:  0.8

Total rows:  10222

Train len:  8177
Valid len:  2045

Top 10 train images:
D:\GitWork\dog_breed\data\raw\train\000bec180eb18c7604dcecc8fe0dba07.jpg
D:\GitWork\dog_breed\data\raw\train\001513dfcb2ffafc82cccf4d8bbaba97.jpg
D:\GitWork\dog_breed\data\raw\train\001cdf01b096e06d78e9e5112d419397.jpg
D:\GitWork\dog_breed\data\raw\train\00214f311d5d2247d5dfe4fe24b2303d.jpg
D:\GitWork\dog_breed\data\raw\train\0021f9ceb3235effd7fcde7f7538ed62.jpg
D:\GitWork\dog_breed\data\raw\train\002211c81b498ef88e1b40b9abf84e1d.jpg
D:\GitWork\dog_breed\data\raw\train\00290d3e1fdd27226ba27a8ce248ce85.jpg
D:\GitWork\dog_breed\data\raw\train\002a283a315af96eaea0e28e7163b21b.jpg
D:\GitWork\dog_breed\data\raw\train\003df8b8a8b05244b1d920bb6cf451f9.jpg
D:\GitWork\dog_breed\data\raw\train\0042188c895a2f14ef64a918ed9c7b64.jpg

Top 10 valid images:
D:\GitWork\dog_breed\data\raw\train\000bec180eb18c7604dcecc8fe0dba07.jpg
D:\GitWork\dog_breed\data\raw\train\001513dfcb2ffafc82cccf4d8bbaba97.jpg
D:\G

In [7]:
# Save numpy array as .npy file

phase = ['train', 'valid']
types = ['imgs', 'labels']

fname = ['{}_{}.npy'.format(y, x) for x in types for y in phase]

data = [train_imgs, valid_imgs, train_lbls, valid_lbls]

proc_path = r'D:\GitWork\dog_breed\data\processed'

print('Process start...')
for i in range(len(fname)):
    f_abspath = os.path.join(proc_path, fname[i])
    print("'{}' processing...".format(f_abspath))
    np.save(f_abspath, data[i])
print('Process end.')

Process start...
'D:\GitWork\dog_breed\data\processed\train_imgs.npy' processing...
'D:\GitWork\dog_breed\data\processed\valid_imgs.npy' processing...
'D:\GitWork\dog_breed\data\processed\train_labels.npy' processing...
'D:\GitWork\dog_breed\data\processed\valid_labels.npy' processing...
Process end.
