## EDA and data cleaning

This notebook is only for data inspection and cleaning; it doesn't apply any modeling to the dataset.

#### The following methods are applied:  
- Load the csv file into a dataframe
- Inspect number of images per dog
- Inspect most common dog breeds in the dataset
- Load image dimensions (width and height) and calculate aspect ratio for all images

#### Data cleaning
- Filter out images that are invalid, or too small (width or height < 100 pixels)
- Filter out images with outlier aspect ratio (keep images with aspect ratio between 0.75 and 1.33)
- Resize all images to 80x80 pixels
- Convert all images to greyscale
- Save the result to a pickle (`.pkl`) file in the working directory


**Note**: this notebook takes ~40 minutes to run!

In [None]:
# basic imports
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

# plotting imports
from matplotlib import pyplot as plt 
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns 
sns.set()

# image libs imports
from PIL import Image

# global variables
# Dataset layout under /kaggle/input:
# Root folder: /kaggle/input/dog-breed-photos/
# CSV file   : /kaggle/input/dog-breed-photos/dog_breed_photos.csv
# Images     : /kaggle/input/dog-breed-photos/dog_breed_photos/dog_breed_photos/<image>
# image_folder keeps its trailing slash because later cells build file paths by
# plain string concatenation (image_folder + <image file name>).
image_folder = '/kaggle/input/dog-breed-photos/dog_breed_photos/dog_breed_photos/'

In [None]:
# Read the image metadata CSV, then preview five randomly chosen rows
# (the fixed random_state makes the preview repeatable).
csv_path = '/kaggle/input/dog-breed-photos/dog_breed_photos.csv'
df = pd.read_csv(csv_path)
df.sample(n=5, random_state=13)

In [None]:
# Histogram of photos per dog: the largest ImageNumber recorded for each DogId
# is used as that dog's photo count.
images_per_dog = df.groupby('DogId')['ImageNumber'].max()
ax = images_per_dog.hist(bins=10)
_ = ax.set(
    xticks=range(1, 11),
    xlabel='# Images per dog',
    ylabel='# Dogs',
    title='Histogram showing distribution of the number of images per dog',
)

In [None]:
# Most common breeds
# Name the index and the counts explicitly instead of renaming whatever columns
# reset_index() happens to produce: pandas < 2.0 yields ('index', 'Breed') but
# pandas >= 2.0 yields ('Breed', 'count'), so a rename keyed on the old names
# labels the breed names as 'Dogs' and leaves the counts as 'count'.
# NOTE(review): value_counts() counts rows; if each row is one photo, 'Dogs' is
# really a photo count — confirm the intended label.
(
    df['Breed']
    .value_counts()
    .rename_axis('Most Common Breeds')
    .reset_index(name='Dogs')
    .head(10)
)

In [None]:
# Preview a fixed random sample of nine photos in a 3x3 grid, one breed label per panel
df_plot = df.sample(n=9, random_state=10).reset_index()
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))

for panel, breed_name, file_name in zip(ax.flat, df_plot['Breed'], df_plot['Image']):
    photo = Image.open(image_folder + file_name)
    panel.imshow(photo)
    # hide the ticks and use the x-label as a caption for the breed
    panel.set(xticks=[], yticks=[], xlabel=breed_name)
    panel.patch.set_edgecolor('black')

fig.suptitle('Sample photos in the dataset')
fig.tight_layout()

In [None]:
def get_dimensions(path):
    """Return the (width, height) of an image stored in ``image_folder``.

    Parameters
    ----------
    path : str
        File name of the image, relative to ``image_folder``.

    Returns
    -------
    tuple
        ``(width, height)`` as reported by PIL, or ``(0, 0)`` when the image
        cannot be opened, so that such rows can be filtered out afterwards.
    """
    try:
        # Only .size is needed; the context manager closes the file handle
        # immediately instead of leaving it open until garbage collection.
        with Image.open(image_folder + path) as image:
            return image.size
    except Exception:
        # Deliberately broad: any failure marks the image as invalid.
        return (0, 0)
    
# Look up the size of every image and store it in two new columns.
# progress_apply (registered by tqdm.pandas()) shows a progress bar for this slow step.
image_sizes = df['Image'].progress_apply(get_dimensions)
df[['Width', 'Height']] = image_sizes.tolist()

In [None]:
# Joint distribution of image width and height, drawn as a 2-d histogram
with sns.axes_style('white'):
    sns.jointplot(
        data=df,
        x="Width",
        y="Height",
        kind="hist",
        color='blue',
    )

In [None]:
# Aspect ratio = width / height; rows whose height is not positive get a ratio of 0
has_height = df['Height'] > 0
df['AspectRatio'] = np.where(has_height, df['Width'] / df['Height'], 0)

In [None]:
# Distribution of the aspect ratio over all images.
# Labels and a title are set so the figure can be read on its own, matching the
# images-per-dog histogram earlier in the notebook.
ratio_hist_ax = df['AspectRatio'].hist(bins=10)
_ = ratio_hist_ax.set(
    xlabel='Aspect ratio (width / height)',
    ylabel='# Images',
    title='Histogram of image aspect ratios',
)

In [None]:
# OUTLIERS in aspect ratio:
# box plot to reveal the images with an outlier aspect ratio
ratio_box_ax = df[['AspectRatio']].boxplot()
_ = ratio_box_ax.set(
    ylabel='Width / height',
    title='Aspect ratio of all images (before cleaning)',
)

## Data Cleaning decisions:


In [None]:
## 1. Remove invalid or small images

print("Shape before removing invalid images {}".format(df.shape))

# Keep only rows where both sides are at least 100 px. Unreadable images were
# recorded with a size of (0, 0), so this drops them together with tiny images.
wide_enough = df['Width'].ge(100)
tall_enough = df['Height'].ge(100)
df = df.loc[wide_enough & tall_enough].copy()

print("Shape after removing invalid images {}".format(df.shape))

In [None]:
## 2. Remove rows with too small, or too high aspect ratio
print("Shape before removing outlier aspect ratio {}".format(df.shape))

# The strict bounds 0.749 and 1.334 sit just outside 0.75 and 1.333..., so images
# that are exactly 3:4 or 4:3 are kept.
ratio_in_range = (df['AspectRatio'] > 0.749) & (df['AspectRatio'] < 1.334)
df = df[ratio_in_range].copy()

print("Shape after removing outlier aspect ratio {}".format(df.shape))

# check the boxplot of aspect ratio again to make sure we don't have outliers
# (assigned to _ to suppress the Axes repr, like the other plotting cells)
_ = df[['AspectRatio']].boxplot()

In [None]:
## 3. Resize images and convert to greyscale
def load_and_resize_image(path):
    """Load an image from ``image_folder``, shrink it to 80x80 and make it greyscale.

    Parameters
    ----------
    path : str
        File name of the image, relative to ``image_folder``.

    Returns
    -------
    numpy.ndarray
        The resized image converted to PIL mode ``"L"`` (single-channel
        greyscale) and wrapped in a NumPy array.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter
    # under its current name. The context manager closes the source file once
    # the resized copy has been produced.
    with Image.open(image_folder + path) as img:
        resized = img.resize((80, 80), Image.LANCZOS).convert(mode="L")
    return np.array(resized)

# Load, resize and grey-scale every remaining image; progress_apply draws a progress bar
df['data'] = df['Image'].progress_apply(load_and_resize_image)

## Saving clean data as pkl file

In [None]:
## save to working folder
# Persist the cleaned dataframe, including the per-image arrays in 'data', as a pickle
output_path = '/kaggle/working/dog_breed_photos_v1.pkl'
df.to_pickle(output_path)