In [1]:
# From: https://www.kaggle.com/c/dog-breed-identification/data
# Author: Morpheus Hsieh

from __future__ import print_function, division

import os, sys
import copy
import io
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from mpl_toolkits.axes_grid1 import ImageGrid
from os import listdir
from os.path import join, isfile
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

# Report the library versions this notebook was executed with
print(f"PyTorch Version:  {torch.__version__}")
print(f"Torchvision Version:  {torchvision.__version__}")

PyTorch Version:  1.5.1
Torchvision Version:  0.6.1


In [2]:
# Root of the project tree. The default is the original location; set the
# DOG_BREED_ROOT environment variable to run from another checkout without
# editing the notebook.
BaseDir = os.environ.get('DOG_BREED_ROOT', r'D:\GitWork\dog_breed')

# Folder holding the processed CSV files read by the following cells
ProcPath = join(BaseDir, 'processed')
print('Proc path:', ProcPath)

TestPath = join(BaseDir, 'data', 'test')
print("Test path: '{}'".format(TestPath))

# Folder holding the fine-tuned checkpoint named by PreTranModel
PreTranPath = join(BaseDir, 'pretrained')
print("Pretrained path: '{}'".format(PreTranPath))

# Folder the prediction / submission CSV files are written to
OutPath = join(BaseDir, 'output')
print("Output path: '{}'".format(OutPath))

PreTranModel = 'resnet50_20200918-2043_acc95.pth'
print("Pretrained model: '{}'".format(PreTranModel))

# Mini-batch size used by the DataLoader
BatchSize = 16
# Number of breeds kept, and size of the model's final layer
NumClasses = 16

Proc path: D:\GitWork\dog_breed\processed
Test path: 'D:\GitWork\dog_breed\data\test'
Pretrained path: 'D:\GitWork\dog_breed\pretrained'
Output path: 'D:\GitWork\dog_breed\output'
Pretrained model: 'resnet50_20200918-2043_acc95.pth'


In [3]:
# Load the per-breed summary table from the processed-data folder
CsvBreedsProc = 'breeds_processed.csv'
f_abspath = join(ProcPath, CsvBreedsProc)
df_breeds = pd.read_csv(f_abspath)

# Show the frame structure, a blank separator line, then the first rows
print(df_breeds.info())
print()
print(df_breeds.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   breed     120 non-null    object
 1   count     120 non-null    int64 
 2   breed_id  120 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.9+ KB
None

                  breed  count  breed_id
0    scottish_deerhound    126         0
1           maltese_dog    117         1
2          afghan_hound    116         2
3           entlebucher    115         3
4  bernese_mountain_dog    114         4


In [4]:
# Get most popular breeds

def getMostPopularBreeds(df, numClasses=16):
    """Return the rows of the `numClasses` most frequent breeds.

    Rows are ranked by 'count' (largest first); ties are broken by 'breed'
    in ascending order. The original index labels are kept.

    Args:
        df: DataFrame with at least 'count' and 'breed' columns.
        numClasses: number of top rows to keep.

    Returns:
        A DataFrame holding the first `numClasses` rows of the ranking.
    """
    ranked = df.sort_values(by=['count', 'breed'], ascending=[False, True])
    return ranked.iloc[:numClasses]

# Keep only the NumClasses most frequent breeds
df_breeds_selected = getMostPopularBreeds(df_breeds, NumClasses)

# Names of the kept breeds, printed one per line inside brackets
selected_brds = df_breeds_selected['breed'].tolist()
_breed_lines = '\n  '.join(selected_brds)
print('\nSelected breeds: [\n  {}\n]'.format(_breed_lines))

# Matching breed ids, in the same order as the names above
selected_bids = df_breeds_selected['breed_id'].tolist()
print('\nSelected breed ids:\n  {}'.format(selected_bids))


Selected breeds: [
  scottish_deerhound
  maltese_dog
  afghan_hound
  entlebucher
  bernese_mountain_dog
  shih-tzu
  great_pyrenees
  pomeranian
  basenji
  samoyed
  airedale
  tibetan_terrier
  cairn
  leonberg
  beagle
  japanese_spaniel
]

Selected breed ids:
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12, 14, 15]


In [5]:
# Build breed dictionaries

def df2dict(df, dire='forward'):
    """Build a lookup dictionary from the 'breed' and 'breed_id' columns.

    Args:
        df: DataFrame with 'breed' and 'breed_id' columns.
        dire: 'forward' maps breed -> breed_id, 'reverse' maps
            breed_id -> breed. Any other value yields an empty dict.

    Returns:
        The requested dictionary; for duplicated keys the last row wins.
    """
    if dire == 'forward':
        return dict(zip(df['breed'], df['breed_id']))
    if dire == 'reverse':
        return dict(zip(df['breed_id'], df['breed']))
    # Unknown direction: nothing is mapped
    return {}

# Name -> id and id -> name lookups for the selected breeds
dict_breed_fw = df2dict(df_breeds_selected, dire='forward')
dict_breed_rv = df2dict(df_breeds_selected, dire='reverse')

print('Breeds dict forward:')
print(json.dumps(dict_breed_fw, indent=2))

            
def prettyPrint(d, indent=0):
    """Print a dictionary as a brace-delimited block, one entry per line.

    Unlike json.dumps, keys and values are written with str(), so non-string
    keys (e.g. integer ids) are printed unquoted.

    Nested dictionaries are printed recursively, one level deeper. The
    original code called an undefined `pretty` helper here and raised
    NameError. Braces are indented to the current level; with the default
    indent=0 the output for a flat dictionary is unchanged.

    Args:
        d: dictionary to print.
        indent: nesting level; each level adds two spaces.
    """
    pad = '  ' * indent
    item_pad = '  ' * (indent + 1)

    print(pad + '{')
    for key, value in d.items():
        if isinstance(value, dict):
            # Key on its own line, followed by the nested block
            print(item_pad + f"{key}:")
            prettyPrint(value, indent + 1)
        else:
            print(item_pad + f"{key}: {value}")
    print(pad + '}')
                
# Blank separator line, heading, then the id -> name lookup
print()
print('Breeds dict reverse:')
prettyPrint(dict_breed_rv)

Breeds dict forward:
{
  "scottish_deerhound": 0,
  "maltese_dog": 1,
  "afghan_hound": 2,
  "entlebucher": 3,
  "bernese_mountain_dog": 4,
  "shih-tzu": 5,
  "great_pyrenees": 6,
  "pomeranian": 7,
  "basenji": 8,
  "samoyed": 9,
  "airedale": 10,
  "tibetan_terrier": 11,
  "cairn": 13,
  "leonberg": 12,
  "beagle": 14,
  "japanese_spaniel": 15
}

Breeds dict reverse:
{
  0: scottish_deerhound
  1: maltese_dog
  2: afghan_hound
  3: entlebucher
  4: bernese_mountain_dog
  5: shih-tzu
  6: great_pyrenees
  7: pomeranian
  8: basenji
  9: samoyed
  10: airedale
  11: tibetan_terrier
  13: cairn
  12: leonberg
  14: beagle
  15: japanese_spaniel
}


In [6]:
# Selected labels

def dfInfo2Str(df, indent=4):
    """Return the text of `df.info()` with every line indented.

    Args:
        df: DataFrame to describe.
        indent: number of spaces put in front of each line.

    Returns:
        The captured info text as a single string, left-padded line by line.
    """
    stream = io.StringIO()
    df.info(buf=stream)
    margin = ' ' * indent
    # Re-joining on newline + margin pads every line after the first;
    # the first line gets its margin explicitly.
    return margin + ('\n' + margin).join(stream.getvalue().split('\n'))

# Load the full label table from the processed-data folder
CsvLabelsProc = 'labels_processed.csv'
f_abspath = join(ProcPath, CsvLabelsProc)
df_labels = pd.read_csv(f_abspath)

print('Origin labels:\n')
print(dfInfo2Str(df_labels))

# Keep only the rows whose breed id is among the selected breeds
_is_selected = df_labels['breed_id'].isin(selected_bids)
df_labels_selected = df_labels.loc[_is_selected]

print('\nSelected labels:\n')
print(dfInfo2Str(df_labels_selected))


def dfHead2Str(df, num=10, indent=4):
    """Return the first rows of `df` as text with every line indented.

    Args:
        df: DataFrame to render.
        num: number of leading rows passed to `df.head`.
        indent: number of spaces put in front of each line.

    Returns:
        The `to_string()` rendering of the head, left-padded line by line.
    """
    margin = ' ' * indent
    rendered = df.head(num).to_string()
    return margin + ('\n' + margin).join(rendered.split('\n'))

# Blank separator line, heading, then the first selected rows
print()
print('Selected labels Head:')
print(dfHead2Str(df_labels_selected))

Origin labels:

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 10222 entries, 0 to 10221
    Data columns (total 4 columns):
     #   Column    Non-Null Count  Dtype 
    ---  ------    --------------  ----- 
     0   id        10222 non-null  object
     1   breed     10222 non-null  object
     2   breed_id  10222 non-null  int64 
     3   image     10222 non-null  object
    dtypes: int64(1), object(3)
    memory usage: 319.6+ KB
    

Selected labels:

    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 1777 entries, 8 to 10219
    Data columns (total 4 columns):
     #   Column    Non-Null Count  Dtype 
    ---  ------    --------------  ----- 
     0   id        1777 non-null   object
     1   breed     1777 non-null   object
     2   breed_id  1777 non-null   int64 
     3   image     1777 non-null   object
    dtypes: int64(1), object(3)
    memory usage: 69.4+ KB
    

Selected labels Head:
                                      id               breed  breed_id

In [7]:
# Preprocessing applied to every image: resize to 224x224, then convert
# the PIL image to a tensor
_resize = transforms.Resize((224, 224))
_to_tensor = transforms.ToTensor()
transform = transforms.Compose([_resize, _to_tensor])

class myDataset(Dataset):
    """Dataset yielding ``[image, label, image_id]`` for each row of a labels frame.

    Args:
        df: DataFrame with an 'image' column (path of each image file) and a
            'breed_id' column (integer class label).
        transform: optional callable applied to the loaded PIL image. When
            None, the RGB PIL image itself is returned.
    """

    def __init__(self, df, transform=None):
        self.images = list(df['image'])
        self.labels = list(df['breed_id'])
        self.len = len(self.images)

        self.transform = transform

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            A list ``[img, lbl, iid]``: the (optionally transformed) image,
            the label as a Python int, and the file name without extension.
        """
        img_path = self.images[index]

        # Force three channels so grayscale / RGBA / palette files produce
        # the same layout as RGB ones, and close the file handle as soon as
        # the pixel data has been copied.
        with Image.open(img_path) as img_file:
            img = img_file.convert('RGB')

        # Without a transform the PIL image is returned as is (previously
        # `img` was left unbound in that case and the return raised).
        if self.transform is not None:
            img = self.transform(img)

        lbl = int(self.labels[index])
        # Strip whatever extension the file has, not only '.jpg'
        iid = os.path.splitext(os.path.basename(img_path))[0]

        return [img, lbl, iid]

    def __len__(self):
        return self.len

    
dataSet = myDataset(df_labels_selected, transform=transform)
dataLoader = DataLoader(dataSet, batch_size=BatchSize, shuffle=False)
dataSize = len(dataSet)

# Pull the first batch only, to sanity-check types, shapes and contents
imgs, lbls, iids = next(iter(dataLoader))
for _title, _batch in (('Image', imgs), ('Label', lbls)):
    print('\n{} type:'.format(_title), type(_batch))
    print('      size: ', _batch.size())

print('\nImage ids:')
id_list = list(map(''.join, iids))
print('  ' + '\n  '.join(id_list))


# First image of the batch: shape, then the raw tensor values
img = imgs[0]
print('\nImage shape:', img.shape)
print()
print(img)

print('\nLabels:', lbls)


Image type: <class 'torch.Tensor'>
      size:  torch.Size([16, 3, 224, 224])

Label type: <class 'torch.Tensor'>
      size:  torch.Size([16])

Image ids:
  003df8b8a8b05244b1d920bb6cf451f9
  0042188c895a2f14ef64a918ed9c7b64
  00693b8bc2470375cc744a6391d397ec
  00bee065dcec471f26394855c5c2f3de
  013f8fdf6d638c7bb042f5f17e8a9fdc
  0162b3e0144fb1d1ab82fbff3ace9938
  01b36cb1b80ab8c3a7d2b7128ad21bdc
  01e787576c003930f96c966f9c3e1d44
  01ee3c7ff9bcaba9874183135877670e
  021b5a49189665c0442c19b5b33e8cf1
  022b34fd8734b39995a9f38a4f3e7b6b
  0267c5f1acbab52ae4a7927e0398612b
  0287b3374c33346e2b41f73af3a36261
  02a8ed20109bd62bd5894f276c08c8a2
  02c90d8109d9a48739b9887349d92b1f
  02d54f0dfb40038765e838459ae8c956

Image shape: torch.Size([3, 224, 224])

tensor([[[0.5059, 0.4980, 0.5412,  ..., 0.5176, 0.4510, 0.3686],
         [0.5451, 0.5451, 0.5686,  ..., 0.5608, 0.4275, 0.3529],
         [0.6392, 0.6039, 0.5098,  ..., 0.5804, 0.5843, 0.4941],
         ...,
         [0.4588, 0.4667, 0.5098,

In [8]:
# import torch.nn.functional as F

# Pick the device used for prediction: first GPU when CUDA is available,
# otherwise the CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0") if use_gpu else torch.device("cpu")
print(device)

cuda:0


In [9]:
# Model

# Build the ResNet-50 architecture only. load_state_dict below (strict by
# default) replaces every parameter and buffer with the fine-tuned checkpoint,
# so fetching the ImageNet weights (pretrained=True) would only add a download
# and a network dependency.
model = models.resnet50(pretrained=False)

# freeze all model parameters
for param in model.parameters():
    param.requires_grad = False

# New final layer with NumClasses outputs, matching the checkpoint's head
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, NumClasses)

# load pretrained model
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only host
f_abspath = join(PreTranPath, PreTranModel)
model.load_state_dict(torch.load(f_abspath, map_location=device))

# Same placement as before when a GPU is present; a no-op on CPU
model = model.to(device)

print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [10]:
# Prediction
import torch.nn.functional as nnf

# Output submission
fname_submission = 'submission.csv'
f_abspath = join(OutPath, fname_submission)

cols_preds = ['id', 'prediction']
cols_probs = ['id'] + selected_brds

# Column j of the model output is class index j. The predicted breed below is
# looked up with dict_breed_rv[index], i.e. the index is treated as a breed_id,
# so the probability columns must be labelled the same way. Labelling them in
# `selected_brds` order (as before) swapped breeds whenever the popularity
# ranking differs from the id order (here: cairn=13 / leonberg=12).
# A KeyError here means the selected breed ids are not 0..NumClasses-1.
class_breeds = [dict_breed_rv[k] for k in range(NumClasses)]

# Per-batch results are accumulated in plain lists and turned into frames
# once at the end; DataFrame.append in a loop is quadratic and was removed
# in pandas 2.0.
all_ids = []
all_pred_breeds = []
all_probs = []

start_time = time.time()
print('\nStart testing...')

model.eval()

for i, (inputs, labels, iids) in enumerate(dataLoader):

    # Follow the device chosen earlier instead of assuming CUDA
    inputs = inputs.to(device)
    iid_list = list(iids)

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        probs = nnf.softmax(outputs, dim=1)

    # Dump the first batch for a visual sanity check
    if i == 0:
        print()
        print(len(iid_list))
        print(iid_list)
        print()
        print(probs.shape)
        print(type(probs))
        print(probs)
        print()
        print(preds.shape)
        print(preds)
        print()

    all_ids.extend(iid_list)
    # .get() keeps the original behaviour of yielding None for an unknown index
    all_pred_breeds.extend(dict_breed_rv.get(x) for x in preds.tolist())
    all_probs.extend(probs.tolist())

    print(i, end=', ')


print()

print('Testing time: {:10f} minutes'.format((time.time()-start_time)/60))

# One row per image: predicted breed name
df_preds = pd.DataFrame(
    {'id': all_ids, 'prediction': all_pred_breeds},
    columns=cols_preds,
)

# One row per image: probability of every breed, labelled by class index and
# then laid out in the original `cols_probs` column order
df_probs = pd.DataFrame(all_probs, columns=class_breeds)
df_probs.insert(0, 'id', all_ids)
df_probs = df_probs[cols_probs]

print(); print(df_preds.info())
print(); print(df_preds.head())

print(); print(df_probs.info())
print(); print(df_probs.head())



Start testing...

16
['003df8b8a8b05244b1d920bb6cf451f9', '0042188c895a2f14ef64a918ed9c7b64', '00693b8bc2470375cc744a6391d397ec', '00bee065dcec471f26394855c5c2f3de', '013f8fdf6d638c7bb042f5f17e8a9fdc', '0162b3e0144fb1d1ab82fbff3ace9938', '01b36cb1b80ab8c3a7d2b7128ad21bdc', '01e787576c003930f96c966f9c3e1d44', '01ee3c7ff9bcaba9874183135877670e', '021b5a49189665c0442c19b5b33e8cf1', '022b34fd8734b39995a9f38a4f3e7b6b', '0267c5f1acbab52ae4a7927e0398612b', '0287b3374c33346e2b41f73af3a36261', '02a8ed20109bd62bd5894f276c08c8a2', '02c90d8109d9a48739b9887349d92b1f', '02d54f0dfb40038765e838459ae8c956']

torch.Size([16, 16])
<class 'torch.Tensor'>
tensor([[2.9665e-03, 8.5591e-04, 4.1428e-04, 1.2475e-02, 2.8350e-04, 1.1190e-04,
         2.5877e-04, 6.4222e-04, 9.6132e-01, 6.1602e-04, 2.5547e-03, 6.8683e-04,
         7.9547e-04, 1.6549e-03, 1.3718e-02, 6.4648e-04],
        [3.8631e-02, 1.0760e-01, 5.7525e-01, 1.3992e-02, 8.2670e-03, 9.9899e-03,
         2.5783e-02, 1.6165e-02, 6.7822e-03, 3.4850e-02

In [11]:
from datetime import datetime

# to_csv does not create missing folders, so make sure the target exists
os.makedirs(OutPath, exist_ok=True)

# A single timestamp shared by both files so they can be paired later
currDT = datetime.now()
currStr = currDT.strftime("%Y%m%d-%H%M%S")

# Predicted breed per image
fname = 'Prediction_{}.csv'.format(currStr)
df_preds.to_csv(join(OutPath, fname), index=False)

# Per-breed probabilities per image
fname = 'Submission_{}.csv'.format(currStr)
df_probs.to_csv(join(OutPath, fname), index=False)