In [1]:
# From Kaggle: https://www.kaggle.com/c/dog-breed-identification
# Author: Morpheus Hsieh

from __future__ import print_function, division

import os, sys
import argparse
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from pandas import Series, DataFrame

import torch
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

from configs.config_train import get_cfg_defaults

In [2]:
# Root folder of the dataset on disk.
DataPath = r'D:\GitWork\dog_breed\data'
print('Data path: ', DataPath)

# The raw files live in the 'raw' sub-folder; derive it from DataPath so the
# two locations cannot drift apart.
RawPath = os.path.join(DataPath, 'raw')
print('Raw path: ', RawPath)

# Load the label table into `df`.
label_fname = 'labels.csv'
df = pd.read_csv(os.path.join(RawPath, label_fname))

# NumPy arrays of the 'id' and 'breed' columns.
SersId = df["id"].to_numpy()
SersBd = df["breed"].to_numpy()

csv_columns = list(df.columns)
print('\nColumns: ', csv_columns)

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
print()
df.info()
print()
print(df.head())

Data path:  D:\GitWork\dog_breed\data
Raw path:  D:\GitWork\dog_breed\data\raw

Columns:  ['id', 'breed']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10222 entries, 0 to 10221
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10222 non-null  object
 1   breed   10222 non-null  object
dtypes: object(2)
memory usage: 159.8+ KB
None

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [3]:
# Split sizes: how many rows go to training and how many to validation

FracForTrain = 0.8
data_size = len(df)

# int() truncates the training share; validation receives the remainder, so
# the two lengths always add up to data_size.
len_for_train = int(FracForTrain * data_size)
len_for_valid = data_size - len_for_train

print('Data size: ', data_size)
print('\nFrac for train: ', FracForTrain)
print('\nLen for train: ', len_for_train)
print('Len for valid: ', len_for_valid)

Data size:  10222

Frac for train:  0.8

Len for train:  8177
Len for valid:  2045


In [4]:
# Process image data

# Cut the id array once at the training length: entries before the cut are
# the training ids, entries from the cut onward are the validation ids.
train_img_ids, valid_img_ids = np.split(SersId, [len_for_train])

def showSampleSeres(ids, title=None, num=10):
    """Print a title, a dashed rule and the first few entries of `ids`.

    Args:
        ids: Sliceable sequence (list, NumPy array, ...) of values to show.
        title: Heading printed above the rule. It is formatted as-is, so the
            default prints the text 'None'.
        num: Maximum number of entries to print (default 10).

    Returns:
        None. All output goes to stdout.
    """
    width = 32
    print('{}\n{}'.format(title, ('-' * width)))
    # Slice rather than index over a fixed range: a sequence shorter than
    # `num` is printed in full instead of raising IndexError.
    for item in ids[:num]:
        print('{}'.format(item))
    return

# Preview the first ids of each split. The second title starts with a newline
# so a blank line separates the two listings.
for _ids, _title in ((train_img_ids, 'train'), (valid_img_ids, '\nvalid')):
    showSampleSeres(_ids, _title)

# Check whether an image file exists
def isImgExist(path, ext='.jpg'):
    """Build a checker that resolves a file stem to its full path under `path`.

    Args:
        path: Directory the files are looked up in.
        ext: Extension appended to each stem (default '.jpg').

    Returns:
        A one-argument callable. Given a stem it returns
        os.path.join(path, stem + ext) when that path exists, otherwise None.
    """
    def _resolve(stem):
        candidate = os.path.join(path, stem + ext)
        if not os.path.exists(candidate):
            return None
        return candidate

    return _resolve

img_path = os.path.join(RawPath, 'train')
ire = isImgExist(img_path)

# One entry per id: the full image path, or None when the file is missing.
train_imgs = [ire(v) for v in train_img_ids]
valid_imgs = [ire(v) for v in valid_img_ids]

# str() every entry before joining: a missing image is None, and
# '\n'.join() raises TypeError on non-string items, so the preview would
# crash in exactly the situation this cell is meant to reveal.
print('\nVerify train image exist:')
print('\n'.join(str(p) for p in train_imgs[:10]))

print('\nVerify valid image exist:')
print('\n'.join(str(p) for p in valid_imgs[:10]))

# Count the ids whose image file was not found.
train_noimg_cnt = sum(x is None for x in train_imgs)
valid_noimg_cnt = sum(x is None for x in valid_imgs)
print('\nTrain no imgs cnt: ', train_noimg_cnt)
# This line reports the validation count, so it is labelled "Valid"
# (it previously repeated the "Train" label).
print('Valid no imgs cnt: ', valid_noimg_cnt)

train
--------------------------------
000bec180eb18c7604dcecc8fe0dba07
001513dfcb2ffafc82cccf4d8bbaba97
001cdf01b096e06d78e9e5112d419397
00214f311d5d2247d5dfe4fe24b2303d
0021f9ceb3235effd7fcde7f7538ed62
002211c81b498ef88e1b40b9abf84e1d
00290d3e1fdd27226ba27a8ce248ce85
002a283a315af96eaea0e28e7163b21b
003df8b8a8b05244b1d920bb6cf451f9
0042188c895a2f14ef64a918ed9c7b64

valid
--------------------------------
cc93915e06bc55626a02af95006a48c2
cc964d3bf1e317c9fbb0c0d4c8bc6b8f
cc97041986abdb8566a3ed4317f40c27
cc99de39a169a9aebaf34d4a514e266b
cc9b4190a7063f8e92dd21ff25152643
cca773094173965bbd04f829eea6eec7
ccb296c8257649527e45affde75d331d
ccb75b5d00281575fe98f1d56d23d7a9
ccbf2d7da8e85a3b60eb0ff8a87af58f
ccc369e93d792e44329a5f13ae6ae582

Verify train image exist:
D:\GitWork\dog_breed\data\raw\train\000bec180eb18c7604dcecc8fe0dba07.jpg
D:\GitWork\dog_breed\data\raw\train\001513dfcb2ffafc82cccf4d8bbaba97.jpg
D:\GitWork\dog_breed\data\raw\train\001cdf01b096e06d78e9e5112d419397.jpg
D:\GitWork\dog_

In [28]:
# Process breed data

def getTwoWayDict(series):
    """Build value->index and index->value lookups for the distinct values.

    The distinct values of `series` are sorted and numbered from 0. The
    number of distinct values is printed with the label 'Breed class: '.

    Args:
        series: Iterable of hashable, mutually sortable values.

    Returns:
        (forward, backward): `forward` maps each value to its index and
        `backward` maps each index back to its value.
    """
    ordered = sorted(set(series))
    print('Breed class: ', len(ordered))

    # Number the sorted values once, then invert that numbering.
    backward = dict(enumerate(ordered))
    forward = {name: idx for idx, name in backward.items()}

    return forward, backward

breed_sers = df['breed'].to_numpy()
breed_dict_fw, breed_dirct_bw = getTwoWayDict(breed_sers)
# `breed_dirct_bw` is a misspelling of the backward dict's name. It is kept
# so existing references keep working, and the intended spelling is exposed
# alongside it.
breed_dict_bw = breed_dirct_bw

print('\nBreed dict forward:')
print(json.dumps(breed_dict_fw, indent=4))

Breed class:  120

Breed dict:
{
    "affenpinscher": 0,
    "afghan_hound": 1,
    "african_hunting_dog": 2,
    "airedale": 3,
    "american_staffordshire_terrier": 4,
    "appenzeller": 5,
    "australian_terrier": 6,
    "basenji": 7,
    "basset": 8,
    "beagle": 9,
    "bedlington_terrier": 10,
    "bernese_mountain_dog": 11,
    "black-and-tan_coonhound": 12,
    "blenheim_spaniel": 13,
    "bloodhound": 14,
    "bluetick": 15,
    "border_collie": 16,
    "border_terrier": 17,
    "borzoi": 18,
    "boston_bull": 19,
    "bouvier_des_flandres": 20,
    "boxer": 21,
    "brabancon_griffon": 22,
    "briard": 23,
    "brittany_spaniel": 24,
    "bull_mastiff": 25,
    "cairn": 26,
    "cardigan": 27,
    "chesapeake_bay_retriever": 28,
    "chihuahua": 29,
    "chow": 30,
    "clumber": 31,
    "cocker_spaniel": 32,
    "collie": 33,
    "curly-coated_retriever": 34,
    "dandie_dinmont": 35,
    "dhole": 36,
    "dingo": 37,
    "doberman": 38,
    "english_foxhound": 39,
    "

In [33]:
# Show breed name and breed class for the given ids, in the order given
def showBreedIds(ids, num=10, frame=None, breed_map=None):
    """Print a small table of id, breed name and breed class index.

    Args:
        ids: Sliceable sequence of image ids; only the first `num` are shown.
        num: Maximum number of rows to print (default 10).
        frame: DataFrame with 'id' and 'breed' columns. When omitted, the
            notebook-level `df` is used.
        breed_map: Mapping of breed name -> class index. When omitted, the
            notebook-level `breed_dict_fw` is used.

    Returns:
        None. The table is written to stdout.

    Note:
        Each id is resolved with `.item()`, which requires the id to match
        exactly one row of `frame`.
    """
    if frame is None:
        frame = df
    if breed_map is None:
        breed_map = breed_dict_fw

    sample = ids[:num]
    breeds = [frame.loc[frame['id'] == i].breed.item() for i in sample]
    bids = [breed_map[b] for b in breeds]

    col_w1 = 32
    # default=0 keeps an empty sample from raising ValueError in max().
    col_w2 = max((len(b) for b in breeds), default=0)
    col_w3 = 4

    print('ids, breed, breed_class')
    print('-' * (col_w1 + col_w2 + col_w3 + 4))
    for img_id, breed, bid in zip(sample, breeds, bids):
        breed_str = breed.ljust(col_w2)
        class_str = str(bid).rjust(col_w3, ' ')
        print('{}, {}, {}'.format(img_id, breed_str, class_str))
    return
        
# Preview the breed name and class of the first ten training ids.
showBreedIds(train_img_ids, num=10)

# Build a converter from image id to breed class
def getBreedClass(df, dic):
    """Return a callable that maps an image id to its breed class index.

    The id -> breed table is built once, when this function is called, so
    each conversion is a dictionary lookup rather than a boolean-mask scan
    over the whole frame. Converting every id is therefore linear in the
    number of rows instead of quadratic.

    Args:
        df: DataFrame with 'id' and 'breed' columns.
        dic: Mapping of breed name -> class index.

    Returns:
        A one-argument callable. It raises ValueError for an id that is not
        in `df` (the same exception type the per-call `.item()` lookup
        produced) and KeyError for a breed that is not in `dic`.

    Note:
        The table is a snapshot of `df` taken at creation time. Ids are
        assumed to be unique; if an id were repeated, its last row would be
        used -- TODO confirm uniqueness of the 'id' column.
    """
    id_to_breed = dict(zip(df['id'], df['breed']))

    def _lookup(x):
        if x not in id_to_breed:
            raise ValueError('id not found in label table: {!r}'.format(x))
        return dic[id_to_breed[x]]

    return _lookup

func = getBreedClass(df, breed_dict_fw)

# Create the breed class label lists, aligned one-to-one with the id arrays
train_lbls = [func(x) for x in train_img_ids]
# Label typo fixed: "tran" -> "train".
print('\nTop 10 train labels: ')
print(train_lbls[:10])

valid_lbls = [func(x) for x in valid_img_ids]
print('\nTop 10 valid labels: ')
print(valid_lbls[:10])

ids, breed, breed_class
----------------------------------------------------------
000bec180eb18c7604dcecc8fe0dba07, boston_bull       ,   19
001513dfcb2ffafc82cccf4d8bbaba97, dingo             ,   37
001cdf01b096e06d78e9e5112d419397, pekinese          ,   85
00214f311d5d2247d5dfe4fe24b2303d, bluetick          ,   15
0021f9ceb3235effd7fcde7f7538ed62, golden_retriever  ,   49
002211c81b498ef88e1b40b9abf84e1d, bedlington_terrier,   10
00290d3e1fdd27226ba27a8ce248ce85, bedlington_terrier,   10
002a283a315af96eaea0e28e7163b21b, borzoi            ,   18
003df8b8a8b05244b1d920bb6cf451f9, basenji           ,    7
0042188c895a2f14ef64a918ed9c7b64, scottish_deerhound,   97

Top 10 tran labels: 
[19, 37, 85, 15, 49, 10, 10, 18, 7, 97]

Top 10 valid labels: 
[56, 54, 12, 84, 70, 78, 112, 100, 3, 52]
