In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from glob import glob
from src import train_to_implicit, implicit_to_species_aggregate
import pickle

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Class-name lookup stored as a pickled object inside a .npy file.
# NOTE(review): allow_pickle=True executes arbitrary pickle payloads on load;
# only use this with a file from a trusted source.
class_names = np.load("../data/class_names.npy", allow_pickle=True)

# One row per training image; later cells read the `label` column from it.
train_data = pd.read_csv("../data/train_images.csv")

In [None]:
# Invert the stored mapping: class id -> sub-species name.
# Only the text after the last "." of each stored key is kept as the name
# (assumes keys look like "<prefix>.<Sub_Species_Name>" -- TODO confirm).
subspecies = {
    class_id: full_name.split(".")[-1]
    for full_name, class_id in class_names.item().items()
}

# Group class ids by species, where the species is the last "_"-separated
# token of the sub-species name. The comparison is on the exact token rather
# than a substring test, so a species name that merely occurs inside another
# name can no longer pull unrelated classes into its group.
species = {}
for class_id, subspecies_name in subspecies.items():
    species.setdefault(subspecies_name.split("_")[-1], []).append(class_id)

## Get Class Distribution

In [None]:
# Number of training images per class label, ordered by label.
label_counts = train_data.label.value_counts().sort_index()
bar_positions = np.arange(len(label_counts))

# Widen the figure with the number of classes so the rotated names stay legible.
fig, ax = plt.subplots(1, 1, figsize=(max(6.4, 0.2 * len(label_counts)), 4.8))
ax.bar(bar_positions, label_counts.to_numpy())
ax.set_ylabel("Frequency")
ax.set_xlabel("Sub-Species")

# Pin one tick per bar BEFORE naming the ticks: set_xticklabels on its own
# assigns the names to whatever ticks the automatic locator picked, so the
# names would not line up with the classes they describe.
ax.set_xticks(bar_positions)
# Labels without an entry in `subspecies` fall back to the raw label value
# (assumes train_data.label holds the same ids as the class_names values --
# TODO confirm).
ax.set_xticklabels(
    [subspecies.get(label, str(label)) for label in label_counts.index],
    rotation=60,
    ha="right",
)

plt.show()

## Check Image Properties

In [None]:
# Collect the pixel dimensions of every training image.
widths = []
heights = []

# Without recursive=True, "**" behaves like "*", so this only matches .jpg
# files sitting directly in the folder (not in sub-folders).
for img_path in glob("../data/train_images/train_images/**.jpg"):
    # Image.open keeps the underlying file open until it is closed; the
    # context manager releases the handle as soon as the size has been read.
    with Image.open(img_path) as im:
        width, height = im.size
    widths.append(width)
    heights.append(height)

# Fail with a clear message instead of a ZeroDivisionError when nothing matched
# (e.g. if the images are no longer directly inside this folder).
if not widths:
    raise FileNotFoundError(
        "No .jpg files found directly in ../data/train_images/train_images"
    )

# Mean image size, rounded to whole pixels.
AVG_HEIGHT = round(sum(heights) / len(heights))
AVG_WIDTH = round(sum(widths) / len(widths))

In [None]:
# Scatter every image's (width, height) and overlay the rounded mean size in
# red so it can be compared against the overall spread.
fig, ax = plt.subplots()
ax.scatter(widths, heights)
ax.scatter(AVG_WIDTH, AVG_HEIGHT, color="r", label="Average")
ax.set_xlabel("Width")
ax.set_ylabel("Height")
ax.legend()
plt.show()

## Process Data

In [None]:
# Reorganise the training images into an "implicit" folder structure for the
# tf pipeline, using the rows of train_data.
# NOTE(review): presumably this creates one sub-folder per sub-species label
# and changes files on disk, so re-running may not be idempotent -- confirm
# against src.train_to_implicit.
train_to_implicit("../data/train_images/train_images", train_data)

In [25]:
# Aggregate the implicit (per-sub-species) folder structure into a per-species
# structure, driven by the `species` mapping (species name -> list of class ids).
# NOTE(review): presumably expects the sub-species folder layout to exist
# already and modifies files on disk -- confirm against
# src.implicit_to_species_aggregate.
implicit_to_species_aggregate("../data/train_images/train_images", species)

100%|██████████| 70/70 [02:12<00:00,  1.90s/it]


In [None]:
# Persist the species -> class-id mapping next to the notebook so it can be
# reloaded later without recomputing it.
with open("mapping.pickle", "wb+") as mapping_file:
    pickle.dump(species, mapping_file)