## Data Preprocessing

#### Steps:

<div> 
&#8211; Load all images with quality classification<br>
&#8211; Filter data to only include images classified higher than a threshold <br>
&#8211; Resize images so they are the correct size for inputting into the CNN <br>
&#8211; Save a directory containing the filtered and resized images <br>

</div>

In [2]:
import os
import cv2
import numpy as np
import pandas as pd
import py_files 
from py_files import load_data, bird_info, const
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load images and their labels from the "birds" folder using the project helper.
# NOTE(review): judging by the recorded output, the helper prints each class
# sub-folder name while loading — confirm against py_files.load_data.
images, labels = load_data.load_data_from_folder("birds")

Loading images....
0_common_blackbird
1_house_sparrow
2_great_tit
3_wood_pigeon
4_europen_robin
5_eurasian_magpie
6_eurasian_blue_tit
7_common_starling
8_european_goldfinch
9_long_tailed_tit


In [4]:
# Load quality classification - images and classification have a matching index
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code if the .npy file is untrusted. If the stored array is
# plain numeric, the flag can be dropped — TODO confirm the stored dtype.
image_quality = np.load("image_quality_dir/all_birds.npy", allow_pickle=True) 

In [5]:
# Convert the loaded sequences to NumPy arrays.
# labels_np and image_quality_np feed the DataFrame columns built in the next
# cell; images_np is not referenced by any later cell visible in this notebook.
# NOTE(review): if the images differ in shape, np.array(images) cannot form a
# regular array (recent NumPy versions raise ValueError) — TODO confirm.
images_np = np.array(images)
labels_np = np.array(labels)
image_quality_np = np.array(image_quality)

In [6]:
# Build one table with a row per image: pixel data, class label, quality score.
# The 'Images' column is filled from the original `images` sequence (not
# images_np), so each cell holds one image. Rows line up by position, relying
# on images, labels and quality scores sharing the same ordering.
pd_data=pd.DataFrame(columns=['Images', 'Labels', 'Image_Quality'])
pd_data['Images']=images
pd_data['Labels']=labels_np
pd_data['Image_Quality']=image_quality_np

In [7]:
# Preview the first rows to check the three columns were populated.
pd_data.head()

Unnamed: 0,Images,Labels,Image_Quality
0,"[[[210, 206, 197], [210, 207, 198], [188, 185,...",0,2
1,"[[[63, 94, 34], [63, 94, 37], [68, 96, 45], [6...",0,3
2,"[[[13, 19, 9], [3, 8, 1], [8, 13, 6], [7, 9, 4...",0,4
3,"[[[131, 137, 137], [133, 138, 141], [134, 139,...",0,2
4,"[[[219, 230, 232], [219, 230, 232], [219, 230,...",0,3


In [8]:
def filter_data_by_image_quality(quality_boundary, data=None):
    """Return the rows whose 'Image_Quality' is at most ``quality_boundary``.

    Parameters
    ----------
    quality_boundary : number
        Inclusive upper bound; rows with ``Image_Quality <= quality_boundary``
        are kept.
    data : pandas.DataFrame, optional
        Frame to filter. It must have an 'Image_Quality' column. When omitted,
        the notebook-level ``pd_data`` frame is used, as before.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, keeping their original index
        labels. Returning a copy means callers can add or overwrite columns
        without writing into (or getting chained-assignment warnings about)
        the source frame.
    """
    if data is None:
        data = pd_data
    keep = data['Image_Quality'] <= quality_boundary
    return data[keep].copy()

In [9]:
# Keep only the rows whose Image_Quality is <= 2.
# NOTE(review): the boundary 2 is a magic number; it presumably corresponds to
# the "002" suffix of the .npy file saved at the end of the notebook — confirm.
filtered_df=filter_data_by_image_quality(2)

def resize_images(df, size=(256, 256)):
    """Resize every image in ``df['Images']`` and return them as a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Images' column whose cells are image arrays accepted by
        ``cv2.resize``.
    size : tuple of int, optional
        Target size passed to ``cv2.resize`` as ``dsize`` (OpenCV interprets it
        as (width, height)). Defaults to (256, 256).

    Returns
    -------
    list
        The resized images in row order; an empty list for an empty frame.
    """
    # Iterate over the column values directly instead of looking each row up by
    # index label, so repeated index labels cannot hand cv2 a Series.
    return [cv2.resize(image, size) for image in df['Images']]
# Replace the stored images with their resized versions. assign() builds a new
# frame instead of writing a column into filtered_df in place, which avoids
# chained assignment on a frame that was derived by filtering pd_data.
filtered_df = filtered_df.assign(Images=resize_images(filtered_df))

In [10]:
# Quality scores of the filtered rows, copied out as a standalone NumPy array.
image_quality_np_f = np.array(filtered_df.loc[:, 'Image_Quality'])

In [12]:
# Save directory to be used to train CNN
# Each image is written to
#   filtered_birds/<label>_<name>/<label>_<name>_img_<row index>.jpeg
# where <name> is entry [1] of const.BIRDS_DICT[label], lower-cased with spaces
# replaced by underscores (presumably the species name — confirm in const).
for index_f, img, label_index in zip(
    filtered_df.index, filtered_df['Images'], filtered_df['Labels']
):
    path = const.BIRDS_DICT[label_index][1]
    path = path.replace(' ', '_').lower()
    path = str(label_index) + "_" + path
    class_dir = "filtered_birds/" + path
    # makedirs also creates the parent "filtered_birds" folder when it is
    # missing, and exist_ok avoids the check-then-create race.
    os.makedirs(class_dir, exist_ok=True)
    file_path = class_dir + "/" + path + "_img_" + str(index_f) + ".jpeg"
    # cv2.imwrite reports failure through its boolean return value rather than
    # an exception, so check it to avoid silently producing a partial dataset.
    if not cv2.imwrite(file_path, img):
        raise OSError("Failed to write image: " + file_path)

In [13]:
# Persist the quality scores of the filtered rows (image_quality_np_f).
# NOTE(review): the "002" in the file name presumably encodes the quality
# boundary used for filtering — confirm and keep the two in sync.
np.save("image_quality_dir/filtered_birds_002.npy", image_quality_np_f)