In [1]:
from __future__ import print_function, division

# import os, sys
import copy
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
from mpl_toolkits.axes_grid1 import ImageGrid
from os import listdir
from os.path import join
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms, utils

In [2]:
ProcPath = r'D:\GitWork\dog_breed\processed'
print("Processed path: '{}'".format(ProcPath))

Processed path: 'D:\GitWork\dog_breed\processed'


In [3]:
csv_proc_breeds = 'breeds_processed.csv'

df_breeds = pd.read_csv(join(ProcPath, csv_proc_breeds))
print('\nBreeds info:'); print(df_breeds.info())
print('\nBreeds head:'); print(df_breeds.head())

# Get most popular breeds
NumClasses = 16

selected_breeds = list(df_breeds['breed'][:NumClasses] )
selected_bids = list(df_breeds['breed_id'][:NumClasses] )

print('\nSelected breeds: [\n  {}\n]'.format('\n  '.join(selected_breeds)))
print('\nSelected breed ids:', selected_bids)

df_breeds_selected = df_breeds[df_breeds['breed'].isin(selected_breeds)]
print(); print(df_breeds_selected)


Breeds info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   breed_id  120 non-null    int64 
 1   breed     120 non-null    object
 2   count     120 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.9+ KB
None

Breeds head:
   breed_id                 breed  count
0         0    scottish_deerhound    126
1         1           maltese_dog    117
2         2          afghan_hound    116
3         3           entlebucher    115
4         4  bernese_mountain_dog    114

Selected breeds: [
  scottish_deerhound
  maltese_dog
  afghan_hound
  entlebucher
  bernese_mountain_dog
  shih-tzu
  great_pyrenees
  pomeranian
  basenji
  samoyed
  airedale
  tibetan_terrier
  leonberg
  cairn
  beagle
  japanese_spaniel
]

Selected breed ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

    breed_id                 breed  count
0          0

In [4]:
fname = 'train_data.npz'

load_data = np.load(join(ProcPath, fname), allow_pickle=True)
df1 = pd.DataFrame.from_dict({item: load_data[item] for item in load_data.files})
print(df1.info())
print(df1.head())

col_names = df1.columns
print(); print(col_names)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   images  1421 non-null   object
 1   labels  1421 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 16.8+ KB
None
                                              images  labels
0  D:\GitWork\dog_breed\data\raw\train\003df8b8a8...       8
1  D:\GitWork\dog_breed\data\raw\train\0042188c89...       0
2  D:\GitWork\dog_breed\data\raw\train\00693b8bc2...       1
3  D:\GitWork\dog_breed\data\raw\train\00bee065dc...      13
4  D:\GitWork\dog_breed\data\raw\train\013f8fdf6d...      11

Index(['images', 'labels'], dtype='object')


In [5]:
fname = 'train_data.npz'

load_data = np.load(join(ProcPath, fname), allow_pickle=True)
column_names = load_data.files

df = pd.DataFrame(data=[load_data[x] for x in column_names]).T
df.columns = load_data.files

print(df.info())
print(); print(df.head)

df_delected = df[df['labels'].isin(selected_bids)]
print(); print(df_delected.info())
print(); print(df_delected.head)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   images  1421 non-null   object
 1   labels  1421 non-null   object
dtypes: object(2)
memory usage: 22.3+ KB
None

<bound method NDFrame.head of                                                  images labels
0     D:\GitWork\dog_breed\data\raw\train\003df8b8a8...      8
1     D:\GitWork\dog_breed\data\raw\train\0042188c89...      0
2     D:\GitWork\dog_breed\data\raw\train\00693b8bc2...      1
3     D:\GitWork\dog_breed\data\raw\train\00bee065dc...     13
4     D:\GitWork\dog_breed\data\raw\train\013f8fdf6d...     11
...                                                 ...    ...
1416  D:\GitWork\dog_breed\data\raw\train\cbcd264608...      3
1417  D:\GitWork\dog_breed\data\raw\train\cbd8e6c8f9...      2
1418  D:\GitWork\dog_breed\data\raw\train\cbe63371eb...     11
1419  D:\GitWork\dog_breed\data\raw\train\c

In [6]:
fname = 'valid_data.npz'

load_data = np.load(join(ProcPath, fname), allow_pickle=True)
column_names = load_data.files

df = pd.DataFrame(data=[load_data[x] for x in column_names]).T
df.columns = load_data.files

print(df.info())
print(); print(df.head)

df_delected = df[df['labels'].isin(selected_bids)]
print(); print(df_delected.info())
print(); print(df_delected.head)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   images  356 non-null    object
 1   labels  356 non-null    object
dtypes: object(2)
memory usage: 5.7+ KB
None

<bound method NDFrame.head of                                                 images labels
0    D:\GitWork\dog_breed\data\raw\train\cc77de33c6...      5
1    D:\GitWork\dog_breed\data\raw\train\ccb75b5d00...      5
2    D:\GitWork\dog_breed\data\raw\train\ccbf2d7da8...     10
3    D:\GitWork\dog_breed\data\raw\train\ccc369e93d...      6
4    D:\GitWork\dog_breed\data\raw\train\ccd907aaac...     14
..                                                 ...    ...
351  D:\GitWork\dog_breed\data\raw\train\ffa4e1bf95...      8
352  D:\GitWork\dog_breed\data\raw\train\ffc532991d...     11
353  D:\GitWork\dog_breed\data\raw\train\ffcde16e7d...     10
354  D:\GitWork\dog_breed\data\raw\train\ffcffab7e4...  