# Dataset Creation - Classification
Sep 15 2020: Created

In [1]:
# import libraries
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# set directories
# Resolve the project root only once per kernel session. os.chdir("../") is
# relative to the current working directory, which this cell itself changes,
# so without the guard every re-run of this cell would climb one level higher.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../")
os.chdir(PROJECT_ROOT) # change current working directory to root directory of project
brazil_amld_interim = "./data/interim/brazil_amld"
image_classification = "./data/processed/image_classification"
dataset = "dataset01"

## Extract Relevant Info for Dataset 

In [3]:
# load the interim metadata table; every column is read as a string so that
# zero-padded fields keep their leading zeros
brazil_amld_csv = f"{brazil_amld_interim}/brazil_amld.csv"
df_brazil_amld = pd.read_csv(brazil_amld_csv, dtype="str")

In [4]:
# keep image rows only, and discard rows whose species_binary is "Exclude";
# the index is renumbered from 0 afterwards
is_image = df_brazil_amld["data_type"].eq("image")
is_not_excluded = df_brazil_amld["species_binary"].ne("Exclude")
df_brazil_amld_image = df_brazil_amld.loc[is_image & is_not_excluded].reset_index(drop=True)

In [5]:
# keep columns at positions 0-9 and 12-19 (positions 10 and 11 are dropped)
selected_column_positions = list(range(10)) + list(range(12, 20))
df_imgclass_dataset01 = df_brazil_amld_image.iloc[:, selected_column_positions]

In [6]:
# generate ds_id: the row index turned into a zero-padded, 4-character string column
# - rename_axis returns a new frame instead of renaming the Index object in place,
#   so a frame that shares that Index is not silently modified
# - dropping a pre-existing ds_id column first keeps the cell safe to re-run
df_imgclass_dataset01 = (
    df_imgclass_dataset01
    .drop(columns="ds_id", errors="ignore")
    .rename_axis("ds_id")
    .reset_index()
)
df_imgclass_dataset01["ds_id"] = df_imgclass_dataset01["ds_id"].astype("str").str.zfill(4)

In [7]:
# preview the dataset frame (displayed as the cell's last expression)
df_imgclass_dataset01

Unnamed: 0,ds_id,file_id,cam_trap,species,num_animal,year,month,day,hour,minute,second,file_type,data_type,image_id,num_animal_new,file_path_new,file_name_new,species_binary,species_category
0,0000,0000,SSAMLD2_2,Guerlinguetus,01,2019,04,11,08,48,26,jpg,image,0000,01,./data/interim/brazil_amld/image/Guerlinguetus,0000_SSAMLD2_2_Guerlinguetus_01_2019_04_11_08_...,Animal,Rodents
1,0001,0002,SSAMLD2_2,Guerlinguetus,01,2019,04,11,08,48,24,jpg,image,0001,01,./data/interim/brazil_amld/image/Guerlinguetus,0001_SSAMLD2_2_Guerlinguetus_01_2019_04_11_08_...,Animal,Rodents
2,0002,0003,SSAMLD2_2,Guerlinguetus,01,2019,04,11,08,48,20,jpg,image,0002,01,./data/interim/brazil_amld/image/Guerlinguetus,0002_SSAMLD2_2_Guerlinguetus_01_2019_04_11_08_...,Animal,Rodents
3,0003,0004,SSAMLD2_2,Guerlinguetus,01,2019,04,11,08,48,22,jpg,image,0003,01,./data/interim/brazil_amld/image/Guerlinguetus,0003_SSAMLD2_2_Guerlinguetus_01_2019_04_11_08_...,Animal,Rodents
4,0004,0006,SSAMLD2_2,Guerlinguetus,01,2019,03,27,07,26,16,jpg,image,0004,01,./data/interim/brazil_amld/image/Guerlinguetus,0004_SSAMLD2_2_Guerlinguetus_01_2019_03_27_07_...,Animal,Rodents
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,1392,1965,CT03,CanisLupusFamiliaris,01,2018,08,16,13,30,21,jpg,image,1447,01,./data/interim/brazil_amld/image/CanisLupusFam...,1447_CT03_CanisLupusFamiliaris_01_2018_08_16_1...,Animal,Canines
1393,1393,1968,CT03,CanisLupusFamiliaris,01,2018,08,16,13,29,57,jpg,image,1448,01,./data/interim/brazil_amld/image/CanisLupusFam...,1448_CT03_CanisLupusFamiliaris_01_2018_08_16_1...,Animal,Canines
1394,1394,1970,CT03,Ghost,01,2018,08,15,20,44,18,jpg,image,1449,01,./data/interim/brazil_amld/image/Ghost,1449_CT03_Ghost_01_2018_08_15_20_44_18.jpg,Ghost,Ghost
1395,1395,1972,CT03,Ghost,01,2018,08,19,15,30,41,jpg,image,1450,01,./data/interim/brazil_amld/image/Ghost,1450_CT03_Ghost_01_2018_08_19_15_30_41.jpg,Ghost,Ghost


## Train-val-test split

In [8]:
# train, val, test split
# 20% of all rows are held out for test; 12.5% of the remaining rows (about
# 10% of all rows) become validation. Both splits are stratified on
# species_binary and share one fixed seed.
split_seed = 0
df_train_val, df_test = train_test_split(
    df_imgclass_dataset01,
    test_size=0.2,
    stratify=df_imgclass_dataset01["species_binary"],
    random_state=split_seed,
)
df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.125,
    stratify=df_train_val["species_binary"],
    random_state=split_seed,
)

In [9]:
# save split info into df
df_train = df_train.assign(split = "train")
df_val = df_val.assign(split = "val")
df_test = df_test.assign(split = "test")
# append splits back together (train, then val, then test; original index labels kept)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df_imgclass_dataset01_split = pd.concat([df_train, df_val, df_test])

In [12]:
# generate file path for dataset
# class label: "Ghost" -> 0, "Animal" -> 1 (replace leaves any other value unchanged)
df_imgclass_dataset01_split['label'] = df_imgclass_dataset01_split.species_binary.replace({"Ghost":0, "Animal":1})
# destination folder per row: <image_classification>/<dataset>/<split>/<label>
# built with vectorized string concatenation instead of a row-wise apply, which
# is faster and also behaves correctly on an empty frame
df_imgclass_dataset01_split['file_path_ds'] = (
    "/".join((image_classification, dataset)) + "/"
    + df_imgclass_dataset01_split['split']
    + "/"
    + df_imgclass_dataset01_split['label'].astype(str)
)

In [None]:
# export df to csv at <image_classification>/<dataset>/<dataset>.csv
# The original concatenation omitted the "/" between image_classification and
# dataset, and the target folder is not created before this cell runs.
dataset_dir = "/".join((image_classification, dataset))
os.makedirs(dataset_dir, exist_ok=True)
df_imgclass_dataset01_split.to_csv(dataset_dir + f"/{dataset}.csv", index=False)

## Create Dataset

In [13]:
# build a {row index -> {column name -> value}} mapping for the copy step
dataset01_dict = df_imgclass_dataset01_split.to_dict(orient="index")

In [14]:
# copy every image from its current folder (file_path_new) into its
# split/label folder of the dataset (file_path_ds), keeping the file name
for record in dataset01_dict.values():
    target_dir = record['file_path_ds']
    os.makedirs(target_dir, exist_ok=True)
    source_file = record['file_path_new'] + '/' + record['file_name_new']
    target_file = target_dir + "/" + record['file_name_new']
    shutil.copyfile(source_file, target_file)