In [1]:
# From: https://www.kaggle.com/c/dog-breed-identification/data
# Author: Morpheus Hsieh

from __future__ import print_function, division

import os, sys
import copy
import io
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from mpl_toolkits.axes_grid1 import ImageGrid
from os import listdir
from os.path import join, isfile
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

PyTorch Version:  1.5.1
Torchvision Version:  0.6.1


In [2]:
ProcPath = r'D:\GitWork\dog_breed\data\processed'
print('Proc path:', ProcPath)

TestPath = r'D:\GitWork\dog_breed\data\raw\test'
print("Test path: '{}'".format(TestPath))

ModelPath = r'D:\GitWork\dog_breed\models'
print("Model path: '{}'".format(ModelPath))

OutPath = r'D:\GitWork\dog_breed\output'
print("Output path: '{}'".format(OutPath))

BestModel = '20200916-160223_resnet50_acc94.pth'

BatchSize = 16
NumClasses = 16

Proc path: D:\GitWork\dog_breed\data\processed
Test path: 'D:\GitWork\dog_breed\data\raw\test'
Model path: 'D:\GitWork\dog_breed\models'
Output path: 'D:\GitWork\dog_breed\output'


In [3]:
# Read breed information from csv
CsvBreedsProc = 'breeds_processed.csv'
f_abspath = join(ProcPath, CsvBreedsProc)

df_breeds = pd.read_csv(f_abspath)

print(df_breeds.info())
print(); print(df_breeds.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   breed_id  120 non-null    int64 
 1   breed     120 non-null    object
 2   count     120 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.9+ KB
None

   breed_id                 breed  count
0         0    scottish_deerhound    126
1         1           maltese_dog    117
2         2          afghan_hound    116
3         3           entlebucher    115
4         4  bernese_mountain_dog    114


In [4]:
# Get most popular breeds

def getMostPopularBreeds(df, numClasses=16):
    df1 = df.sort_values(['count', 'breed'], ascending=(False, True))
    df1 = df1.head(numClasses)
    return df1

df_breeds_selected = getMostPopularBreeds(df_breeds, NumClasses)

selected_brds = list(df_breeds_selected['breed'])
print('\nSelected breeds: [\n  {}\n]'.format('\n  '.join(selected_brds)))

selected_bids = list(df_breeds_selected['breed_id'])
print('\nSelected breed ids:\n  {}'.format(selected_bids))


Selected breeds: [
  scottish_deerhound
  maltese_dog
  afghan_hound
  entlebucher
  bernese_mountain_dog
  shih-tzu
  great_pyrenees
  pomeranian
  basenji
  samoyed
  airedale
  tibetan_terrier
  cairn
  leonberg
  beagle
  japanese_spaniel
]

Selected breed ids:
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12, 14, 15]


In [5]:
# Build breed dictionaries

def df2dict(df, dire='forward'):
    dic = {}
    for i, row in df.iterrows():
        if dire == 'forward':
            dic[row['breed']] = row['breed_id']
        elif dire == 'reverse':
            dic[row['breed_id']] = row['breed']
    return dic

dict_breed_fw = df2dict(df_breeds_selected)
dict_breed_bw = df2dict(df_breeds_selected, 'reverse')

print('Breeds dict forward:'); 
print(json.dumps(dict_breed_fw, indent=2))

            
def prettyPrint(d, indent=0):
    print('{')
    for key, value in d.items():
        if isinstance(value, dict):
            print('  ' * indent + str(key))
            pretty(value, indent+1)
        else:
            print('  ' * (indent+1) + f"{key}: {value}")
    print('}')
                
print('\nBreeds dict reverse:'); 
prettyPrint(dict_breed_bw)

Breeds dict forward:
{
  "scottish_deerhound": 0,
  "maltese_dog": 1,
  "afghan_hound": 2,
  "entlebucher": 3,
  "bernese_mountain_dog": 4,
  "shih-tzu": 5,
  "great_pyrenees": 6,
  "pomeranian": 7,
  "basenji": 8,
  "samoyed": 9,
  "airedale": 10,
  "tibetan_terrier": 11,
  "cairn": 13,
  "leonberg": 12,
  "beagle": 14,
  "japanese_spaniel": 15
}

Breeds dict reverse:
{
  0: scottish_deerhound
  1: maltese_dog
  2: afghan_hound
  3: entlebucher
  4: bernese_mountain_dog
  5: shih-tzu
  6: great_pyrenees
  7: pomeranian
  8: basenji
  9: samoyed
  10: airedale
  11: tibetan_terrier
  13: cairn
  12: leonberg
  14: beagle
  15: japanese_spaniel
}


In [6]:
# Selected labels

def dfInfo2Str(df, indent=4):
    buf = io.StringIO()
    df.info(buf=buf)
    pad_str = (' ' * indent)
    old_str = '\n'
    new_str = '\n' + pad_str
    outstr = buf.getvalue().replace(old_str, new_str)
    return pad_str + outstr

CsvLabelsProc = 'labels_processed.csv'
f_abspath = join(ProcPath, CsvLabelsProc)

df_labels = pd.read_csv(f_abspath)
print('Origin labels:\n')
print(dfInfo2Str(df_labels))

df_labels_selected = df_labels[df_labels['breed_id'].isin(selected_bids)]

print('\nSelected labels:\n')
print(dfInfo2Str(df_labels_selected))


def dfHead2Str(df, num=10, indent=4):
    inStr = df.head(num).to_string()
    padstr = ' ' * indent
    outStr = padstr + inStr.replace('\n', '\n'+padstr)
    return outStr

print('\nSelected labels Head:')
print(dfHead2Str(df_labels_selected))

Origin labels:

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 10222 entries, 0 to 10221
    Data columns (total 2 columns):
     #   Column    Non-Null Count  Dtype 
    ---  ------    --------------  ----- 
     0   image     10222 non-null  object
     1   breed_id  10222 non-null  int64 
    dtypes: int64(1), object(1)
    memory usage: 159.8+ KB
    

Selected labels:

    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 1777 entries, 8 to 10219
    Data columns (total 2 columns):
     #   Column    Non-Null Count  Dtype 
    ---  ------    --------------  ----- 
     0   image     1777 non-null   object
     1   breed_id  1777 non-null   int64 
    dtypes: int64(1), object(1)
    memory usage: 41.6+ KB
    

Selected labels Head:
                                                                           image  breed_id
    8   D:\GitWork\dog_breed\data\raw\train\003df8b8a8b05244b1d920bb6cf451f9.jpg         8
    9   D:\GitWork\dog_breed\data\raw\train\0042188c895a

In [7]:
# Transform
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
])

class myDataset(Dataset):

    def __init__(self, df, transform=None):

        self.images = list(df['image'])
        self.labels = list(df['breed_id'])
        self.len = len(self.images)

        self.transform = transform

    def __getitem__(self, index):
        img_path = self.images[index]
        img_pil = Image.open(img_path)

        if self.transform is not None:
            img = self.transform(img_pil)

        lbl = int(self.labels[index])
        img_id = os.path.split(img_path)[1].replace('.jpg', '')
        
        return [img, lbl, img_id]

    def __len__(self):
        return self.len

    
dataSet = myDataset(df_labels_selected, transform=transform)
dataLoader = DataLoader(dataSet, batch_size=BatchSize, shuffle=False)
dataSize = len(dataSet)

imgs, lbls, ids = next(iter(dataLoader))
print('\nImage type:', type(imgs))
print('      size: ', imgs.size())

print('\nLabel type:', type(lbls))
print('      size: ', lbls.size())

img = imgs[0]
print('\nImage shape:', img.shape)
print(); print(img)

print('\nLabels:', lbls)


Image type: <class 'torch.Tensor'>
      size:  torch.Size([16, 3, 224, 224])

Label type: <class 'torch.Tensor'>
      size:  torch.Size([16])

Image shape: torch.Size([3, 224, 224])

tensor([[[0.5059, 0.4980, 0.5412,  ..., 0.5176, 0.4510, 0.3686],
         [0.5451, 0.5451, 0.5686,  ..., 0.5608, 0.4275, 0.3529],
         [0.6392, 0.6039, 0.5098,  ..., 0.5804, 0.5843, 0.4941],
         ...,
         [0.4588, 0.4667, 0.5098,  ..., 0.8392, 0.5412, 0.4510],
         [0.3882, 0.4510, 0.5686,  ..., 0.8627, 0.7294, 0.4588],
         [0.3333, 0.4863, 0.4980,  ..., 0.5490, 0.5529, 0.5725]],

        [[0.5373, 0.5373, 0.5843,  ..., 0.5451, 0.4941, 0.4235],
         [0.5647, 0.5725, 0.6118,  ..., 0.5804, 0.4627, 0.4000],
         [0.6392, 0.6196, 0.5490,  ..., 0.5961, 0.6078, 0.5255],
         ...,
         [0.5451, 0.5490, 0.5843,  ..., 0.8275, 0.5255, 0.4392],
         [0.4706, 0.5176, 0.6353,  ..., 0.8157, 0.6745, 0.4196],
         [0.4196, 0.5451, 0.5608,  ..., 0.4824, 0.4863, 0.5294]],

  

In [10]:
import torch.nn.functional as F

torch.cuda.empty_cache()    # Clearing GPU memory

# prediction
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print(device)

resnet = models.resnet50(pretrained=True)

# New final layer with NumClasses
num_ftrs = resnet.fc.in_features
resnet.fc = nn.Linear(num_ftrs, NumClasses)

resnet = resnet.cuda()

PreTrainedModel = 'resnet50_acc93_20200918-1645.pth' # BatchSize=100
# PreTrainedModel = 'resnet50_acc95_20200918-2043.pth'
model_abspath = join(OutPath, PreTrainedModel)
resnet.load_state_dict(torch.load(model_abspath))

resnet.eval()

col_names = ['id', 'breed', 'predict']
df_preds = pd.DataFrame(columns=col_names)

start_time = time.time()
print('\nStart testing...')

for i, (inputs, labels, ids) in enumerate(dataLoader):
    print(i, end=', ')

    inputs = Variable(inputs.cuda())
        
    with torch.set_grad_enabled(True):
        outputs = resnet(inputs)
        preds = torch.argmax(outputs, dim=1)
        # print(preds)
    
    df_tmp = pd.DataFrame({
        'id': ids,
        'breed': labels,
        'predict': preds.tolist()
    }) 
    df_preds = df_preds.append(df_tmp)
print()    

print('Testing time: {:10f} minutes'.format((time.time()-start_time)/60))    

print(); print(df_preds.info())
print(); print(df_preds.head())

cuda:0

Start testing...
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, Testing time:   0.270606 minutes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1777 entries, 0 to 0
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1777 non-null   object
 1   breed    1777 non-null   object
 2   predict  1777 non-null   object
dtypes: object(3)
memory usage: 55.5+ KB
None

                                 id breed predict
0  003df8b8a8b05244b1d920bb6cf451f9     8       8
1  0042188c895a2f14ef64a918ed9c7b64     0       2
2  006

In [11]:
from datetime import datetime

currDT = datetime.now()
currStr = currDT.strftime("%Y%m%d-%H%M%S")
fname_infer = 'Inference_{}.csv'.format(currStr)

df_preds.to_csv(join(OutPath, fname_infer), index=False)