# This Notebook

In this notebook we will create a DF that contains all the images with columns for their path, Image ID and labels of various forms for training. We will take our Fish DF, go to the folder location, make a column with a list of all the file names per species in that row, then explode the DF on the filename column. Then we can get a look at our image distributions and create our train test val split.

In [1]:
import os, shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
fish_DF = pd.read_pickle('./files/fishspecies_reefguide_info_3.pkl')

In [3]:
fish_DF.head()

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html
1,Rypticus bistrispinus,Serranidae,Freckled Soapfish,Soapfishes,"(7.5, 13.0)","(3.0, 21.0)","[Caribbean, Bahamas, South Florida, Brazil]",spe/3537,
2,Apogon binotatus,Apogonidae,Barred Cardinalfish,Cardinalfishes,"(0.0, 10.0)","(1.0, 45.0)","[Caribbean, Bahamas, South Florida]",spe/3595,belted-cardinalfish.html
3,Lutjanus mahogoni,Lutjanidae,Mahogany Snapper,Snappers,"(18.0, 30.0)","(6.0, 18.0)","[Caribbean, Bahamas, Florida, Gulf of Mexico]",spe/3690,mahogany-snapper.html
4,Lutjanus synagris,Lutjanidae,Lane Snapper,Snappers,"(20.0, 30.0)","(2.0, 40.0)","[Caribbean, Bahamas, Florida, Gulf of Mexico, ...",spe/3692,lane-snapper.html


In [4]:
fish_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   scientific_name    263 non-null    object
 1   scientific_family  263 non-null    object
 2   common_name        263 non-null    object
 3   common_family      263 non-null    object
 4   size_range_cm      263 non-null    object
 5   depth_range_m      263 non-null    object
 6   geodist            263 non-null    object
 7   smithsonian_href   263 non-null    object
 8   stj_href           152 non-null    object
dtypes: object(9)
memory usage: 18.6+ KB


In [5]:
def get_species_images_paths(scientific_name,scientific_family):

    path_prefix = 'E:/LargeDatasets/SpeciesID-Images/'
    folder_path = path_prefix + scientific_family + '/' + scientific_name.replace(' ','_') + '_'
    image_names_list = os.listdir(folder_path)

    image_paths_list = []
    for name in image_names_list:
        image_paths_list.append(folder_path+'/'+name)

    return image_paths_list

In [6]:
fish_DF.loc[:2]

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html
1,Rypticus bistrispinus,Serranidae,Freckled Soapfish,Soapfishes,"(7.5, 13.0)","(3.0, 21.0)","[Caribbean, Bahamas, South Florida, Brazil]",spe/3537,
2,Apogon binotatus,Apogonidae,Barred Cardinalfish,Cardinalfishes,"(0.0, 10.0)","(1.0, 45.0)","[Caribbean, Bahamas, South Florida]",spe/3595,belted-cardinalfish.html


In [7]:
fish_DF['image_paths'] = pd.Series(dtype=object)
fish_DF['image_paths'] = fish_DF.apply(lambda x: get_species_images_paths(x.scientific_name, x.scientific_family), axis = 1)

In [8]:
fish_DF.head()

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href,image_paths
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,[E:/LargeDatasets/SpeciesID-Images/Labridae/Ha...
1,Rypticus bistrispinus,Serranidae,Freckled Soapfish,Soapfishes,"(7.5, 13.0)","(3.0, 21.0)","[Caribbean, Bahamas, South Florida, Brazil]",spe/3537,,[E:/LargeDatasets/SpeciesID-Images/Serranidae/...
2,Apogon binotatus,Apogonidae,Barred Cardinalfish,Cardinalfishes,"(0.0, 10.0)","(1.0, 45.0)","[Caribbean, Bahamas, South Florida]",spe/3595,belted-cardinalfish.html,[E:/LargeDatasets/SpeciesID-Images/Apogonidae/...
3,Lutjanus mahogoni,Lutjanidae,Mahogany Snapper,Snappers,"(18.0, 30.0)","(6.0, 18.0)","[Caribbean, Bahamas, Florida, Gulf of Mexico]",spe/3690,mahogany-snapper.html,[E:/LargeDatasets/SpeciesID-Images/Lutjanidae/...
4,Lutjanus synagris,Lutjanidae,Lane Snapper,Snappers,"(20.0, 30.0)","(2.0, 40.0)","[Caribbean, Bahamas, Florida, Gulf of Mexico, ...",spe/3692,lane-snapper.html,[E:/LargeDatasets/SpeciesID-Images/Lutjanidae/...


In [9]:
expanded_fish_DF = fish_DF.explode('image_paths', ignore_index=True)

In [10]:
expanded_fish_DF.head()

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href,image_paths
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
1,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
2,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
3,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
4,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...


In [11]:
expanded_fish_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7006 entries, 0 to 7005
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   scientific_name    7006 non-null   object
 1   scientific_family  7006 non-null   object
 2   common_name        7006 non-null   object
 3   common_family      7006 non-null   object
 4   size_range_cm      7006 non-null   object
 5   depth_range_m      7006 non-null   object
 6   geodist            7006 non-null   object
 7   smithsonian_href   7006 non-null   object
 8   stj_href           5121 non-null   object
 9   image_paths        7006 non-null   object
dtypes: object(10)
memory usage: 547.5+ KB


In [12]:
expanded_fish_DF.image_paths.nunique()

7006

Hmm.. some duplicate images (There were, see below, but I fixed it already)!

In [13]:
expanded_fish_DF.loc[expanded_fish_DF.image_paths.duplicated(keep=False)]

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href,image_paths


Okay, looks like I missed deleting some pixel.gifs from STJ and also there were some errors with how things were saved from STJ when there was a second species. I will manually look at the duplicates and decide which pictures go where as well as delete the undesirables.

In [14]:
fish_DF['image_paths'] = pd.Series(dtype=object)
fish_DF['image_paths'] = fish_DF.apply(lambda x: get_species_images_paths(x.scientific_name, x.scientific_family), axis = 1)
expanded_fish_DF = fish_DF.explode('image_paths', ignore_index=True)

In [15]:
expanded_fish_DF.head()

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href,image_paths
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
1,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
2,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
3,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
4,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...


In [16]:
expanded_fish_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7006 entries, 0 to 7005
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   scientific_name    7006 non-null   object
 1   scientific_family  7006 non-null   object
 2   common_name        7006 non-null   object
 3   common_family      7006 non-null   object
 4   size_range_cm      7006 non-null   object
 5   depth_range_m      7006 non-null   object
 6   geodist            7006 non-null   object
 7   smithsonian_href   7006 non-null   object
 8   stj_href           5121 non-null   object
 9   image_paths        7006 non-null   object
dtypes: object(10)
memory usage: 547.5+ KB


In [17]:
expanded_fish_DF.image_paths.nunique()

7006

Perfect!

In [18]:
expanded_fish_DF.scientific_family.value_counts()

Serranidae         779
Labridae           461
Scaridae           427
Pomacentridae      407
Gobiidae           388
                  ... 
Albulidae           15
Cynoglossidae       11
Paralichthyidae     11
Fistulariidae       11
Rhincodontidae       9
Name: scientific_family, Length: 62, dtype: int64

In [19]:
expanded_fish_DF.common_family.value_counts()

Wrasses           461
Parrotfishes      427
Gobies            388
Groupers          328
Damselfishes      318
                 ... 
Soldierfishes      14
Tonguefishes       11
Cornetfishes       11
Sand Flounders     11
Whale Sharks        9
Name: common_family, Length: 68, dtype: int64

Hmm.. so the reason I was checking this is that I was thinking about training the first model on either common or scientific family. If there was drastically less of one I would probably choose that, but it looks like there is about the same amount. I suspect I can reduce the number of common families a bit as well, plus I imagine the common family name is better correlated with the fishes looks than the scientific family, making it so that we might get slightly better results with our classifier. Right now I am imagining having an ensemble, maybe a predictor of family and then a predictor of species, but at the very least we can develop our model and check for impacts on performance based on training methodology before going into species ID from scratch or by transfer learning.

In [20]:
expanded_fish_DF.common_family.value_counts().index

Index(['Wrasses', 'Parrotfishes', 'Gobies', 'Groupers', 'Damselfishes',
       'Grunts', 'Hamlets', 'Angelfishes', 'Boxfishes', 'Snappers', 'Jacks',
       'Moray Eels', 'Filefishes', 'Labrisomid Blennies', 'Seabasses',
       'Cardinalfishes', 'Tube Blennies', 'Porcupinefishes', 'Scorpionfishes',
       'Butterflyfishes', 'Pufferfishes', 'Drums', 'Surgeonfishes',
       'Jawfishes', 'Lefteye Flounders', 'Chromis', 'Squirrelfishes',
       'Triggerfishes', 'Porgies', 'Requiem Sharks', 'Nurse Sharks',
       'Combtooth Blennies', 'Lionfishes', 'Chubs', 'Lizardfishes', 'Bigeyes',
       'Soapfishes', 'Remoras', 'Snake Eels', 'Barracudas', 'Goatfishes',
       'Trumpetfishes', 'Eagle Rays', 'Basslets', 'Toadfishes',
       'Round Stingrays', 'Flying Gurnards', 'Sweepers', 'Seahorses',
       'Tarpons', 'Needlefishes', 'Batfishes', 'Halfbeaks', 'Tilefishes',
       'Triplefin Blennies', 'Spadefishes', 'Dragonets', 'Hawkfishes',
       'Garden Eels', 'Mojarras', 'Bonnetmouths', 'Mackerels a

For example, I think I might try to reduce all the blennies into a singular common family just to start out with. We can also combine the two flounder categories, the three shark categories, and the three eel categories.

In [21]:
def family_translator(common_family):

    family_dict = {
        'Labrisomid Blennies' : 'Blennies',
        'Tube Blennies' : 'Blennies',
        'Combtooth Blennies' : 'Blennies',
        'Triplefin Blennies' : 'Blennies',
        'Lefteye Flounders' : 'Flounders',
        'Sand Flounders' : 'Flounders',
        'Whale Sharks' : 'Sharks',
        'Requiem Sharks' : 'Sharks',
        'Nurse Sharks' : 'Sharks',
        'Moray Eels' : 'Eels',
        'Garden Eels' : 'Eels',
        'Snake Eels' : 'Eels',
        'Eagle Rays' : 'Rays',
        'Round Stingrays' : 'Rays',
    }

    if common_family in family_dict.keys():
        common_family = family_dict.get(common_family)

    return common_family

In [22]:
expanded_fish_DF.common_family = expanded_fish_DF.common_family.apply(lambda x: family_translator(x))

In [23]:
expanded_fish_DF.common_family.value_counts().index

Index(['Wrasses', 'Parrotfishes', 'Blennies', 'Gobies', 'Groupers',
       'Damselfishes', 'Grunts', 'Eels', 'Hamlets', 'Angelfishes', 'Boxfishes',
       'Snappers', 'Jacks', 'Filefishes', 'Sharks', 'Cardinalfishes',
       'Seabasses', 'Porcupinefishes', 'Scorpionfishes', 'Butterflyfishes',
       'Pufferfishes', 'Drums', 'Flounders', 'Surgeonfishes', 'Jawfishes',
       'Chromis', 'Squirrelfishes', 'Triggerfishes', 'Porgies', 'Rays',
       'Lionfishes', 'Chubs', 'Lizardfishes', 'Bigeyes', 'Soapfishes',
       'Remoras', 'Barracudas', 'Trumpetfishes', 'Goatfishes', 'Basslets',
       'Toadfishes', 'Flying Gurnards', 'Sweepers', 'Seahorses', 'Tarpons',
       'Batfishes', 'Needlefishes', 'Halfbeaks', 'Tilefishes', 'Spadefishes',
       'Dragonets', 'Hawkfishes', 'Bonnetmouths', 'Mackerels and Tunas',
       'Mojarras', 'Bonefishes', 'Soldierfishes', 'Tonguefishes',
       'Cornetfishes'],
      dtype='object')

In [24]:
expanded_fish_DF.common_family.value_counts()

Wrasses                461
Parrotfishes           427
Blennies               409
Gobies                 388
Groupers               328
Damselfishes           318
Grunts                 309
Eels                   281
Hamlets                252
Angelfishes            245
Boxfishes              231
Snappers               219
Jacks                  216
Filefishes             208
Sharks                 149
Cardinalfishes         143
Seabasses              143
Porcupinefishes        140
Scorpionfishes         129
Butterflyfishes        129
Pufferfishes           117
Drums                  114
Flounders              107
Surgeonfishes          106
Jawfishes              103
Chromis                 89
Squirrelfishes          86
Triggerfishes           85
Porgies                 76
Rays                    75
Lionfishes              61
Chubs                   61
Lizardfishes            58
Bigeyes                 57
Soapfishes              56
Remoras                 54
Barracudas              51
T

I'm seeing that there are a lot of blenny within each category so in the future I may undo that grouping but for now we will leave it. So we are definitely seeing that there is pretty significant class imbalance so that will be something to consider in the future. It looks like our floor for the number of images within a category is roughly 10. We can always augment the train set in the future, but we won't be able to do so for our val and test sets, so it begs the question how will we balance having enough images of a species to train while also giving it a couple chances during validation and test. I think what I want to do is do a 80-10-10 split, but require that the minimum pictures in a val and test set be 2, so it will actually modulate to like a 60-20-20 split for the categories where we have less pictures.

In [25]:
train, test, split = np.array_split(expanded_fish_DF[:10].to_numpy(),3)

In [26]:
expanded_fish_DF.common_family.unique()

array(['Wrasses', 'Soapfishes', 'Cardinalfishes', 'Snappers',
       'Lizardfishes', 'Angelfishes', 'Grunts', 'Parrotfishes',
       'Tilefishes', 'Eels', 'Tarpons', 'Boxfishes', 'Gobies', 'Groupers',
       'Basslets', 'Seabasses', 'Hamlets', 'Butterflyfishes',
       'Pufferfishes', 'Surgeonfishes', 'Jacks', 'Sharks', 'Jawfishes',
       'Porcupinefishes', 'Drums', 'Bigeyes', 'Damselfishes', 'Chromis',
       'Hawkfishes', 'Seahorses', 'Mackerels and Tunas', 'Blennies',
       'Remoras', 'Rays', 'Squirrelfishes', 'Scorpionfishes',
       'Bonefishes', 'Porgies', 'Sweepers', 'Cornetfishes', 'Batfishes',
       'Triggerfishes', 'Chubs', 'Toadfishes', 'Filefishes', 'Flounders',
       'Goatfishes', 'Bonnetmouths', 'Lionfishes', 'Needlefishes',
       'Spadefishes', 'Trumpetfishes', 'Flying Gurnards', 'Tonguefishes',
       'Soldierfishes', 'Dragonets', 'Halfbeaks', 'Mojarras',
       'Barracudas'], dtype=object)

In [27]:
pd.DataFrame(train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
1,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
2,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
3,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...


In [28]:
def split_stratified_into_train_val_test(df_in, stratify_col):
    
    categories = df_in[stratify_col].unique()
    columns = df_in.columns
    train_df = pd.DataFrame(columns=columns)
    val_df = pd.DataFrame(columns=columns)
    test_df = pd.DataFrame(columns=columns)

    for cat in categories:
        cat_df = df_in.loc[df_in[stratify_col] == cat]
        if len(cat_df) <= 15:
            train_frac, val_frac, test_frac = 0.7, 0.15, 0.15
        else:
            train_frac, val_frac, test_frac = 0.8, 0.1, 0.1
        
        train_arr, val_arr, test_arr = np.array_split(cat_df.to_numpy(),
                                                      [int((1-(val_frac+test_frac))*len(cat_df)),
                                                                         int((1-test_frac)*len(cat_df))])
        
        train_df = pd.concat([train_df,pd.DataFrame(train_arr,columns = columns)])
        val_df = pd.concat([val_df,pd.DataFrame(val_arr,columns = columns)])
        test_df = pd.concat([test_df,pd.DataFrame(test_arr,columns = columns)])
        

    return train_df, val_df, test_df

In [29]:
train_expanded, val_expanded, test_expanded = split_stratified_into_train_val_test(expanded_fish_DF,'common_family')

In [30]:
train_expanded.head()

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href,image_paths
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
1,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
2,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
3,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
4,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...


In [31]:
train_expanded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5572 entries, 0 to 39
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   scientific_name    5572 non-null   object
 1   scientific_family  5572 non-null   object
 2   common_name        5572 non-null   object
 3   common_family      5572 non-null   object
 4   size_range_cm      5572 non-null   object
 5   depth_range_m      5572 non-null   object
 6   geodist            5572 non-null   object
 7   smithsonian_href   5572 non-null   object
 8   stj_href           4183 non-null   object
 9   image_paths        5572 non-null   object
dtypes: object(10)
memory usage: 478.8+ KB


In [32]:
val_expanded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 704 entries, 0 to 4
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   scientific_name    704 non-null    object
 1   scientific_family  704 non-null    object
 2   common_name        704 non-null    object
 3   common_family      704 non-null    object
 4   size_range_cm      704 non-null    object
 5   depth_range_m      704 non-null    object
 6   geodist            704 non-null    object
 7   smithsonian_href   704 non-null    object
 8   stj_href           486 non-null    object
 9   image_paths        704 non-null    object
dtypes: object(10)
memory usage: 60.5+ KB


In [33]:
test_expanded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 730 entries, 0 to 5
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   scientific_name    730 non-null    object
 1   scientific_family  730 non-null    object
 2   common_name        730 non-null    object
 3   common_family      730 non-null    object
 4   size_range_cm      730 non-null    object
 5   depth_range_m      730 non-null    object
 6   geodist            730 non-null    object
 7   smithsonian_href   730 non-null    object
 8   stj_href           452 non-null    object
 9   image_paths        730 non-null    object
dtypes: object(10)
memory usage: 62.7+ KB


In [34]:
train_expanded.common_family.value_counts()

Wrasses                368
Parrotfishes           341
Blennies               327
Gobies                 310
Groupers               262
Damselfishes           254
Grunts                 247
Eels                   224
Hamlets                201
Angelfishes            196
Boxfishes              184
Snappers               175
Jacks                  172
Filefishes             166
Sharks                 119
Cardinalfishes         114
Seabasses              114
Porcupinefishes        112
Scorpionfishes         103
Butterflyfishes        103
Pufferfishes            93
Drums                   91
Flounders               85
Surgeonfishes           84
Jawfishes               82
Chromis                 71
Triggerfishes           68
Squirrelfishes          68
Porgies                 60
Rays                    60
Lionfishes              48
Chubs                   48
Lizardfishes            46
Bigeyes                 45
Soapfishes              44
Remoras                 43
Barracudas              40
T

In [35]:
val_expanded.common_family.value_counts()

Wrasses                46
Parrotfishes           43
Blennies               41
Gobies                 39
Groupers               33
Damselfishes           32
Grunts                 31
Eels                   28
Hamlets                25
Angelfishes            24
Boxfishes              23
Jacks                  22
Snappers               22
Filefishes             21
Sharks                 15
Porcupinefishes        14
Cardinalfishes         14
Seabasses              14
Butterflyfishes        13
Scorpionfishes         13
Pufferfishes           12
Surgeonfishes          11
Flounders              11
Drums                  11
Jawfishes              10
Chromis                 9
Squirrelfishes          9
Triggerfishes           8
Porgies                 8
Rays                    7
Bigeyes                 6
Lionfishes              6
Chubs                   6
Soapfishes              6
Lizardfishes            6
Barracudas              5
Remoras                 5
Trumpetfishes           4
Basslets    

In [36]:
test_expanded.common_family.value_counts()

Wrasses                47
Parrotfishes           43
Blennies               41
Gobies                 39
Groupers               33
Damselfishes           32
Grunts                 31
Eels                   29
Hamlets                26
Angelfishes            25
Boxfishes              24
Jacks                  22
Snappers               22
Filefishes             21
Sharks                 15
Seabasses              15
Cardinalfishes         15
Porcupinefishes        14
Scorpionfishes         13
Butterflyfishes        13
Pufferfishes           12
Drums                  12
Flounders              11
Surgeonfishes          11
Jawfishes              11
Triggerfishes           9
Chromis                 9
Squirrelfishes          9
Porgies                 8
Rays                    8
Chubs                   7
Lionfishes              7
Barracudas              6
Remoras                 6
Bigeyes                 6
Soapfishes              6
Lizardfishes            6
Trumpetfishes           5
Goatfishes  

In [37]:
train_expanded.image_paths.values[0]

'E:/LargeDatasets/SpeciesID-Images/Labridae/Halichoeres_bivittatus_/3894_13014.jpg'

In [38]:
train_expanded.to_pickle('./files/train_set_1.pkl')
val_expanded.to_pickle('./files/val_set_1.pkl')
test_expanded.to_pickle('./files/test_set_1.pkl')

## Reorganising files for AWS upload

In [2]:
train_expanded = pd.read_pickle('./files/train_set_1.pkl')
val_expanded = pd.read_pickle('./files/val_set_1.pkl')
test_expanded = pd.read_pickle('./files/test_set_1.pkl')

In [3]:
train_expanded.head()

Unnamed: 0,scientific_name,scientific_family,common_name,common_family,size_range_cm,depth_range_m,geodist,smithsonian_href,stj_href,image_paths
0,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
1,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
2,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
3,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...
4,Halichoeres bivittatus,Labridae,Slippery Dick,Wrasses,"(12.0, 20.0)","(2.0, 12.0)","[Caribbean, Bahamas, Florida, Bermuda, Gulf of...",spe/3894,slippery-dick-wrasse.html,E:/LargeDatasets/SpeciesID-Images/Labridae/Hal...


In [8]:
train_expanded.image_paths.values[0]

'E:/LargeDatasets/SpeciesID-Images/Labridae/Halichoeres_bivittatus_/3894_13014.jpg'

In [11]:
train_expanded.image_paths.values[0].split('/')[-1]

'3894_13014.jpg'

In [14]:
train_expanded['common_family'].nunique()

59

In [12]:
# for every file in each dataset
# maintain test, val, or train designation
# move to folder eg. head/train/common-name

# make new root
new_root_dir = 'E:\LargeDatasets\SpeciesID-Images-ForAWS'
os.mkdir(new_root_dir)

# make train, val, test subdivisions
subset_names = ['train','val','test']
for d in subset_names:
    new_dir = os.path.join(new_root_dir,d)
    os.mkdir(new_dir)

# add photos, further subdividing by common name
for common_family in list(test_expanded.common_family.unique()):

    new_dir = os.path.join(new_root_dir,'test',common_family)
    os.mkdir(new_dir)

    common_family_images = test_expanded[test_expanded.common_family == common_family]

    for origin in common_family_images.image_paths.values:
        filename = origin.split('/')[-1]
        destination = os.path.join(new_root_dir,'test',common_family,filename)
        shutil.copy(origin, destination)

In [None]:
for common_family in list(train_expanded.common_family.unique()):

    new_dir = os.path.join(new_root_dir,'train',common_family)
    os.mkdir(new_dir)

    common_family_images = train_expanded[train_expanded.common_family == common_family]

    for origin in common_family_images.image_paths.values:
        filename = origin.split('/')[-1]
        destination = os.path.join(new_root_dir,'train',common_family,filename)
        shutil.copy(origin, destination)

In [13]:
for common_family in list(val_expanded.common_family.unique()):

    new_dir = os.path.join(new_root_dir,'val',common_family)
    os.mkdir(new_dir)

    common_family_images = val_expanded[val_expanded.common_family == common_family]

    for origin in common_family_images.image_paths.values:
        filename = origin.split('/')[-1]
        destination = os.path.join(new_root_dir,'val',common_family,filename)
        shutil.copy(origin, destination)