In [1]:
# From: https://www.kaggle.com/c/dog-breed-identification/data
# Author: Morpheus Hsieh

from __future__ import print_function, division

import os, sys
import copy
import io
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from mpl_toolkits.axes_grid1 import ImageGrid
from os import listdir
from os.path import join, isfile
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

print("PyTorch Version: ",torch.__version__)
print("Torchvision Version: ",torchvision.__version__)

PyTorch Version:  1.5.1
Torchvision Version:  0.6.1


In [2]:
ProcPath = r'D:\GitWork\dog_breed\data\processed'
print('Proc path:', ProcPath)

TestPath = r'D:\GitWork\dog_breed\data\raw\test'
print("Test path: '{}'".format(TestPath))

ModelPath = r'D:\GitWork\dog_breed\models'
print("Model path: '{}'".format(ModelPath))

OutPath = r'D:\GitWork\dog_breed\output'
print("Output path: '{}'".format(OutPath))

BestModel = '20200916-160223_resnet50_acc94.pth'

BatchSize = 6
NumClasses = 16

Proc path: D:\GitWork\dog_breed\data\processed
Test path: 'D:\GitWork\dog_breed\data\raw\test'
Model path: 'D:\GitWork\dog_breed\models'
Output path: 'D:\GitWork\dog_breed\output'


In [3]:
# Read breed information from csv
CsvBreedsProc = 'breeds_processed.csv'
f_abspath = join(ProcPath, CsvBreedsProc)

df_breeds = pd.read_csv(f_abspath)

print(df_breeds.info())
print(); print(df_breeds.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   breed_id  120 non-null    int64 
 1   breed     120 non-null    object
 2   count     120 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.9+ KB
None

   breed_id                 breed  count
0         0    scottish_deerhound    126
1         1           maltese_dog    117
2         2          afghan_hound    116
3         3           entlebucher    115
4         4  bernese_mountain_dog    114


In [4]:
# Get most popular breeds

def getMostPopularBreeds(df, numClasses=16):
    df1 = df.sort_values(['count', 'breed'], ascending=(False, True))
    df1 = df1.head(numClasses)
    return df1

df_breeds_selected = getMostPopularBreeds(df_breeds, NumClasses)

selected_brds = list(df_breeds_selected['breed'])
print('\nSelected breeds: [\n  {}\n]'.format('\n  '.join(selected_brds)))

selected_bids = list(df_breeds_selected['breed_id'])
print('\nSelected breed ids:\n  {}'.format(selected_bids))


Selected breeds: [
  scottish_deerhound
  maltese_dog
  afghan_hound
  entlebucher
  bernese_mountain_dog
  shih-tzu
  great_pyrenees
  pomeranian
  basenji
  samoyed
  airedale
  tibetan_terrier
  cairn
  leonberg
  beagle
  japanese_spaniel
]

Selected breed ids:
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 12, 14, 15]


In [5]:
# Build breed dictionaries

def df2dict(df):
    dic = {}
    for i, row in df.iterrows():
        dic[row['breed']] = row['breed_id']
    return dic

dict_breed_fw = df2dict(df_breeds_selected)

print('Breeds dict forward:'); 
print(json.dumps(dict_breed_fw, indent=2))

Breeds dict forward:
{
  "scottish_deerhound": 0,
  "maltese_dog": 1,
  "afghan_hound": 2,
  "entlebucher": 3,
  "bernese_mountain_dog": 4,
  "shih-tzu": 5,
  "great_pyrenees": 6,
  "pomeranian": 7,
  "basenji": 8,
  "samoyed": 9,
  "airedale": 10,
  "tibetan_terrier": 11,
  "cairn": 13,
  "leonberg": 12,
  "beagle": 14,
  "japanese_spaniel": 15
}


In [6]:
# Selected labels

def dfInfo2Str(df, indent=4):
    buf = io.StringIO()
    df.info(buf=buf)
    pad_str = (' ' * indent)
    old_str = '\n'
    new_str = '\n' + pad_str
    outstr = buf.getvalue().replace(old_str, new_str)
    return pad_str + outstr

CsvLabelsProc = 'labels_processed.csv'
f_abspath = join(ProcPath, CsvLabelsProc)

df_labels = pd.read_csv(f_abspath)
print('Origin labels:\n')
print(dfInfo2Str(df_labels))

df_labels_selected = df_labels[df_labels['breed_id'].isin(selected_bids)]

print('\nSelected labels:\n')
print(dfInfo2Str(df_labels_selected))


def dfHead2Str(df, num=10, indent=4):
    inStr = df.head(num).to_string()
    padstr = ' ' * indent
    outStr = padstr + inStr.replace('\n', '\n'+padstr)
    return outStr

print('\nSelected labels Head:')
print(dfHead2Str(df_labels_selected))

Origin labels:

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 10222 entries, 0 to 10221
    Data columns (total 2 columns):
     #   Column    Non-Null Count  Dtype 
    ---  ------    --------------  ----- 
     0   image     10222 non-null  object
     1   breed_id  10222 non-null  int64 
    dtypes: int64(1), object(1)
    memory usage: 159.8+ KB
    

Selected labels:

    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 1777 entries, 8 to 10219
    Data columns (total 2 columns):
     #   Column    Non-Null Count  Dtype 
    ---  ------    --------------  ----- 
     0   image     1777 non-null   object
     1   breed_id  1777 non-null   int64 
    dtypes: int64(1), object(1)
    memory usage: 41.6+ KB
    

Selected labels Head:
                                                                           image  breed_id
    8   D:\GitWork\dog_breed\data\raw\train\003df8b8a8b05244b1d920bb6cf451f9.jpg         8
    9   D:\GitWork\dog_breed\data\raw\train\0042188c895a

In [10]:
# prediction
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print(device)

resnet = models.resnet50(pretrained=True)

# New final layer with NumClasses
num_ftrs = resnet.fc.in_features
resnet.fc = nn.Linear(num_ftrs, NumClasses)

PreTrainedModel = 'resnet50_acc94_20200918-1627.pth'
model_abspath = join(ModelPath, PreTrainedModel)
resnet.load_state_dict(torch.load(model_abspath))

resnet = resnet.cuda()

df_infer = pd.DataFrame(columns=['id', 'breed_id', 'inference'])

# Transform
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean = [0.485, 0.456, 0.406],
        std  = [0.229, 0.224, 0.225]
    )
])

# num_test = 100
# df_test = df_labels_selected.head(num_test)
df_test = df_labels_selected

start_time = time.time()
print('\nStart testing...')

for i, row in df_test.iterrows():
    img_path = row['image']
    img_pil = Image.open(img_path)
    img_tensor = transform(img_pil).unsqueeze_(0)
    inp = Variable(img_tensor)
    inp = inp.to(device)
    res = resnet(inp)
    index = res.data.cpu().numpy().argmax()
   
    img_id = os.path.split(img_path)[1].replace('.jpg', '')
    breed_id = row['breed_id']
    df_infer.loc[len(df_infer)] = [img_id, breed_id, index]

print('Testing time: {:10f} minutes'.format((time.time()-start_time)/60))    

print(); print(df_infer.info())
print(); print(df_infer.head())

cuda:0

Start testing...
Testing time:   1.032762 minutes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1777 entries, 0 to 1776
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         1777 non-null   object
 1   breed_id   1777 non-null   object
 2   inference  1777 non-null   object
dtypes: object(3)
memory usage: 55.5+ KB
None

                                 id breed_id inference
0  003df8b8a8b05244b1d920bb6cf451f9        8         6
1  0042188c895a2f14ef64a918ed9c7b64        0         6
2  00693b8bc2470375cc744a6391d397ec        1        11
3  00bee065dcec471f26394855c5c2f3de       13        11
4  013f8fdf6d638c7bb042f5f17e8a9fdc       11        11


In [11]:
from datetime import datetime

currDT = datetime.now()
currStr = currDT.strftime("%Y%m%d-%H%M%S")
fname_infer = 'Inference_{}.csv'.format(currStr)

df_infer.to_csv(join(OutPath, fname_infer), index=False)