### This notebook creates a subset of the Happywhale training images and sets them up for use with `ImageDataGenerator`

If we decide to use the full set, it will be easy to adapt this code to structure all of the images for the data generator.

In [10]:
import os
import random
import pandas as pd
import shutil

In [11]:
# Show the current working directory: every data path used below is relative to it
!pwd

/courses/EDS232/whales/whales-deep-learning


#### Creating file structure

In [12]:
# Load the label table shipped with the training images (one row per image),
# then pull out the image file names and the distinct species labels.
# The species labels double as the class sub-directory names created below.
df = pd.read_csv('data/train.csv')
sp_names = df['species'].unique()
image_names = df['image']

In [13]:
# Source directory holding every training image, and the two destination roots
# for the sampled training / validation subsets.
train_dir = 'data/train_images/'
train_subset_dir = 'data/subset/train_subset/'
validation_subset_dir = 'data/subset/validation_subset/'

# Build <split>/<species>/ for every species in both splits.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so that
#   * the missing parent `data/subset/` is created on a fresh checkout, and
#   * re-running the cell repairs a partially created tree instead of skipping
#     everything because the train root already exists.
for split_dir in (train_subset_dir, validation_subset_dir):
    os.makedirs(split_dir, exist_ok=True)
    for species in sp_names:
        os.makedirs(os.path.join(split_dir, species), exist_ok=True)

In [14]:
# Size of the population to sample from. The sampled numbers are used further
# down as row labels of `df`, so the population must be the rows of the label
# table rather than the number of entries found in the image directory (the two
# can differ if the directory holds stray or missing files).
train_nsamples = len(df)

In [15]:
# Draw 5000 distinct indices with a fixed seed so the subset is reproducible:
# the first 4000 become the training subset, the remaining 1000 the validation subset.
seed_value = 71993
n_train_images, n_validation_images = 4000, 1000

random.seed(seed_value)
sample_indices = random.sample(range(train_nsamples), n_train_images + n_validation_images)
train_indices, validation_indices = sample_indices[:n_train_images], sample_indices[n_train_images:]

#### Copying files into new directory structure

In [16]:
# Rows of the label table belonging to each split. Note that
# `filter(items=..., axis=0)` keeps only labels present in df's index, so an
# index that is not a row of `df` is dropped silently rather than raising.
train_subset_df = df.filter(items = train_indices, axis=0)
validation_subset_df = df.filter(items = validation_indices, axis=0)


def _split_has_images(split_dir):
    """Return True if any species folder under `split_dir` already contains a file."""
    for sp in sp_names:
        sp_dir = os.path.join(split_dir, sp)
        if os.path.isdir(sp_dir) and os.listdir(sp_dir):
            return True
    return False


def _copy_split(subset_df, split_dir):
    """Copy every image listed in `subset_df` from `train_dir` into `split_dir/<species>/`."""
    for image_name, species in zip(subset_df['image'], subset_df['species']):
        src = os.path.join(train_dir, image_name)
        dst = os.path.join(split_dir, species, image_name)
        shutil.copyfile(src, dst)


# Copy a split only when none of its species folders holds a file yet, so
# re-running the cell does not redo the (slow) copy. All species folders are
# checked, not just the first one, because a species may legitimately receive
# zero sampled images.
for subset_df, split_dir in ((train_subset_df, train_subset_dir),
                             (validation_subset_df, validation_subset_dir)):
    if not _split_has_images(split_dir):
        _copy_split(subset_df, split_dir)


#### Looking at the representation of each species in the subset

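Not super well balanced, but neither is the full set. We can manually incorporate more of the underrepresented species if we want, but that sounds like a pain. Note also that a few labels look like misspellings of other classes (e.g. `kiler_whale` vs `killer_whale`, `bottlenose_dolpin` vs `bottlenose_dolphin`), so those species end up split across two folders.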

In [17]:
# Report how many training-subset images landed in each species folder
for sp in sp_names:
    image_count = len(os.listdir(os.path.join(train_subset_dir, sp)))
    print(f'{sp}: {image_count}')

melon_headed_whale: 125
humpback_whale: 590
false_killer_whale: 246
bottlenose_dolphin: 738
beluga: 606
minke_whale: 121
fin_whale: 115
blue_whale: 357
gray_whale: 79
southern_right_whale: 73
common_dolphin: 28
kiler_whale: 85
pilot_whale: 18
dusky_dolphin: 256
killer_whale: 127
long_finned_pilot_whale: 15
sei_whale: 37
spinner_dolphin: 135
bottlenose_dolpin: 90
cuviers_beaked_whale: 28
spotted_dolphin: 35
globis: 8
brydes_whale: 9
commersons_dolphin: 12
white_sided_dolphin: 16
short_finned_pilot_whale: 30
rough_toothed_dolphin: 5
pantropic_spotted_dolphin: 8
pygmy_killer_whale: 5
frasiers_dolphin: 3
