In [1]:
# Kaggle: https://www.kaggle.com/c/dog-breed-identification
# Author: Morpheus Hsieh

from __future__ import print_function, division

import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from pandas import Series, DataFrame
from os.path import join, exists

import torch
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

# Report the versions of the deep-learning libraries in use.
for _label, _module in (('torch: ', torch), ('torchvision: ', torchvision)):
    print(_label, _module.__version__)

torch:  1.5.1
torchvision:  0.6.1


In [2]:
# --- Paths and constants used by the rest of the notebook -------------------
# NOTE: RawPath and ProcPath are absolute, machine-specific Windows paths;
# point them at your own copy of the data before running.

# Raw data directory: holds the `train`/`test` sub-folders and labels.csv.
RawPath = r'D:\GitWork\dog_breed\data\raw'
print('Raw path:', RawPath)

ImgTrain = join(RawPath, 'train')
ImgTest  = join(RawPath, 'test')

csv_labels = join(RawPath, 'labels.csv')
print("labels.csv: '{}'".format(csv_labels))

# Output directory for every file this notebook writes.
ProcPath = r'D:\GitWork\dog_breed\data\processed'
print('\nProc path:', ProcPath)

csv_breed_dict = join(ProcPath, 'breeds_dict.csv')
print("breeds_dict.csv: '{}'".format(csv_breed_dict))

csv_labels_processed = join(ProcPath, 'labels_processed.csv')

# NUM_CLASSES_BREED = 16
# Fraction of the label rows used for training; the rest is for validation.
FRAC_FOR_TRAIN = 0.8

Raw path: D:\GitWork\dog_breed\data\raw
labsels.csv: 'D:\GitWork\dog_breed\data\raw\labels.csv'

Proc path: D:\GitWork\dog_breed\data\processed
breed_dict.csv: 'D:\GitWork\dog_breed\data\processed\breeds_dict.csv'


In [3]:
# Load the raw label table (columns used below: `id` and `breed`).
df_labels = pd.read_csv(csv_labels)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df_labels.info()
print(); print(df_labels.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10222 entries, 0 to 10221
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10222 non-null  object
 1   breed   10222 non-null  object
dtypes: object(2)
memory usage: 159.8+ KB
None

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


In [4]:
# Create breed dictionary 

def getBreedDict(df):
    """Build the breed dictionary: one row per breed, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table with at least the columns ``breed`` and ``id``.
        Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``breed``, ``count`` (number of non-null ``id`` values for
        that breed) and ``breed_id`` (1-based position after sorting by
        ``count`` in descending order), on a fresh 0..n-1 index.
    """
    # Keep only the two relevant columns before counting. Otherwise every
    # additional column of `df` (e.g. ones attached to the label table by a
    # later cell) is counted as well and leaks into the result on a re-run.
    counts = (
        df[['breed', 'id']]
        .groupby('breed')
        .count()
        .sort_values(by='id', ascending=False)
    )
    counts.insert(0, 'breed', counts.index)
    counts = counts.rename(columns={'id': 'count'})
    counts = counts.reset_index(drop=True)
    # Ids start at 1, not 0: the most frequent breed gets breed_id == 1.
    counts['breed_id'] = counts.index + 1
    return counts

# Show the label table, then derive and show the breed dictionary.
print()
print(df_labels)
df_breedict = getBreedDict(df_labels)
print()
print(df_breedict)

# Write the dictionary to disk only once; an existing file is left untouched.
isExist = exists(csv_breed_dict)
if isExist:
    print('\n{} exist.'.format(csv_breed_dict))
else:
    df_breedict.to_csv(csv_breed_dict, index=False)


                                     id                     breed
0      000bec180eb18c7604dcecc8fe0dba07               boston_bull
1      001513dfcb2ffafc82cccf4d8bbaba97                     dingo
2      001cdf01b096e06d78e9e5112d419397                  pekinese
3      00214f311d5d2247d5dfe4fe24b2303d                  bluetick
4      0021f9ceb3235effd7fcde7f7538ed62          golden_retriever
...                                 ...                       ...
10217  ffd25009d635cfd16e793503ac5edef0                    borzoi
10218  ffd3f636f7f379c51ba3648a9ff8254f            dandie_dinmont
10219  ffe2ca6c940cddfee68fa3cc6c63213f                  airedale
10220  ffe5f6d8e2bff356e9482a80a6e29aac        miniature_pinscher
10221  fff43b07992508bc822f33d8ffd902ae  chesapeake_bay_retriever

[10222 rows x 2 columns]

                    breed  count  breed_id
0      scottish_deerhound    126         1
1             maltese_dog    117         2
2            afghan_hound    116         3
3       

In [5]:
# Process labels

# Build a breed -> breed_id lookup from the breed dictionary and use it to
# attach the numeric id to every label row.
mapping = dict(zip(df_breedict['breed'], df_breedict['breed_id']))
df_labels['breed_id'] = df_labels['breed'].map(mapping)
print(df_labels)

# Check, for each id, whether the corresponding image file exists
def id2ImgPath(path, ext='.jpg'):
    """Return a function that resolves an image id to its file path.

    The returned callable takes an id string ``f`` and gives back
    ``join(path, f + ext)`` if that file exists, otherwise ``None``.
    """
    def resolve(f):
        candidate = join(path, f + ext)
        if exists(candidate):
            return candidate
        return None

    return resolve

# Resolve every id to its training-image path. The resolver yields None for
# an id whose file is absent. ImgTrain is the same folder as
# join(RawPath, 'train'), so reuse it instead of re-deriving the path.
id2imgP = id2ImgPath(ImgTrain)

SersId = df_labels['id'].to_numpy()
df_labels['image'] = [id2imgP(v) for v in SersId]
print(df_labels)

# A missing file is stored as None in the `image` column and would otherwise
# flow silently into the saved CSV and the train/validation arrays built
# later, so report it here.
n_missing = int(df_labels['image'].isna().sum())
if n_missing:
    print('\nWARNING: {} of {} ids have no image file in {}'.format(
        n_missing, len(df_labels), ImgTrain))

# Save processed labels; an existing file is left untouched.
isExist = exists(csv_labels_processed)
if not isExist:
    df_labels.to_csv(csv_labels_processed, index=False)
else:
    print('\n{} exist.'.format(csv_labels_processed))

                                     id                     breed  breed_id
0      000bec180eb18c7604dcecc8fe0dba07               boston_bull        43
1      001513dfcb2ffafc82cccf4d8bbaba97                     dingo        73
2      001cdf01b096e06d78e9e5112d419397                  pekinese        95
3      00214f311d5d2247d5dfe4fe24b2303d                  bluetick        51
4      0021f9ceb3235effd7fcde7f7538ed62          golden_retriever       116
...                                 ...                       ...       ...
10217  ffd25009d635cfd16e793503ac5edef0                    borzoi        94
10218  ffd3f636f7f379c51ba3648a9ff8254f            dandie_dinmont        37
10219  ffe2ca6c940cddfee68fa3cc6c63213f                  airedale        11
10220  ffe5f6d8e2bff356e9482a80a6e29aac        miniature_pinscher        19
10221  fff43b07992508bc822f33d8ffd902ae  chesapeake_bay_retriever        54

[10222 rows x 3 columns]
                                     id                     br

In [6]:
# Split the dataframe into two parts: training and validation

df_rows = len(df_labels)
print('data len:', df_rows)

# Number of leading rows that form the training split (truncated to an int).
train_len = int(float(FRAC_FOR_TRAIN) * float(df_rows))
# print('Train len: ', train_len)

# Positional split: the first `train_len` rows are for training, the
# remaining rows are for validation. Row order is kept as-is (no shuffling).
train_imgs = df_labels['image'].iloc[:train_len]
train_lbls = df_labels['breed_id'].iloc[:train_len]

valid_imgs = df_labels['image'].iloc[train_len:]
valid_lbls = df_labels['breed_id'].iloc[train_len:]

train_sizes = (len(train_imgs), len(train_lbls))
valid_sizes = (len(valid_imgs), len(valid_lbls))
print('\nTrain size: img({}), lbl({})'.format(*train_sizes))
print('Valid size: img({}), lbl({})'.format(*valid_sizes))


def showTopN(array, title=None, num=10):
    """Print the first ``num`` entries of ``array``, one per line.

    Parameters
    ----------
    array : sliceable sequence (e.g. list or pandas Series)
        Items to preview; each one is converted with ``str()`` for display.
    title : str, optional
        Heading printed before the items as a blank line followed by
        ``"<title>:"``. Nothing is printed for a falsy title.
    num : int, default 10
        Maximum number of leading items to print.

    Returns
    -------
    None
    """
    if title:
        print('\n{}:'.format(title))
    # str.join() raises TypeError on non-string items (e.g. the None stored
    # for a missing image file), so convert every entry explicitly.
    print('\n'.join(str(item) for item in array[:num]))

# Preview the leading image paths of each split ...
for _heading, _paths in (('Train images', train_imgs), ('Valid images', valid_imgs)):
    showTopN(_paths, _heading)

# ... and the first ten breed ids of each split.
for _prefix, _ids in (('\nTrain labels:', train_lbls), ('\nValid labels:', valid_lbls)):
    print(_prefix, list(_ids[:10]))


data len: 10222

Train size: img(8177), lbl(8177)
Valid size: img(2045), lbl(2045)

Train images:
D:\GitWork\dog_breed\data\raw\train\000bec180eb18c7604dcecc8fe0dba07.jpg
D:\GitWork\dog_breed\data\raw\train\001513dfcb2ffafc82cccf4d8bbaba97.jpg
D:\GitWork\dog_breed\data\raw\train\001cdf01b096e06d78e9e5112d419397.jpg
D:\GitWork\dog_breed\data\raw\train\00214f311d5d2247d5dfe4fe24b2303d.jpg
D:\GitWork\dog_breed\data\raw\train\0021f9ceb3235effd7fcde7f7538ed62.jpg
D:\GitWork\dog_breed\data\raw\train\002211c81b498ef88e1b40b9abf84e1d.jpg
D:\GitWork\dog_breed\data\raw\train\00290d3e1fdd27226ba27a8ce248ce85.jpg
D:\GitWork\dog_breed\data\raw\train\002a283a315af96eaea0e28e7163b21b.jpg
D:\GitWork\dog_breed\data\raw\train\003df8b8a8b05244b1d920bb6cf451f9.jpg
D:\GitWork\dog_breed\data\raw\train\0042188c895a2f14ef64a918ed9c7b64.jpg

Valid images:
D:\GitWork\dog_breed\data\raw\train\cc93915e06bc55626a02af95006a48c2.jpg
D:\GitWork\dog_breed\data\raw\train\cc964d3bf1e317c9fbb0c0d4c8bc6b8f.jpg
D:\GitWork\

In [7]:
# Save numpy array as .npy file

phase = ['train', 'valid']
types = ['images', 'labels']

# File names in the order: train_images, valid_images, train_labels,
# valid_labels -- this must line up with the order of `data` below.
fname = []
for _kind in types:
    for _split in phase:
        fname.append('{}_{}.npy'.format(_split, _kind))
data = [train_imgs, valid_imgs, train_lbls, valid_lbls]

# NOTE(review): the image-path data holds Python objects (str / None), which
# np.save presumably stores as a pickled object array -- if so, loading it
# back needs np.load(..., allow_pickle=True). Confirm against the loader.
print('Process start...')
for _name, _payload in zip(fname, data):
    f_abspath = join(ProcPath, _name)
    print("'{}' processing...".format(f_abspath))
    np.save(f_abspath, _payload)
print('Process end.')

Process start...
'D:\GitWork\dog_breed\data\processed\train_images.npy' processing...
'D:\GitWork\dog_breed\data\processed\valid_images.npy' processing...
'D:\GitWork\dog_breed\data\processed\train_labels.npy' processing...
'D:\GitWork\dog_breed\data\processed\valid_labels.npy' processing...
Process end.
