# Data cleaning

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset
# folder used by the 'colab' loading method is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import os, sys
import cv2

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import preprocess_input

from tqdm import tqdm
from PIL import Image

In [33]:
def load_flowers_data(loading_method, seed=None):
    """Load the shark image dataset and split it into train/val/test sets.

    Despite the name, this loader reads the shark class folders listed in
    ``classes`` below (one sub-folder per class under the dataset root).
    At most 300 ``.jpg`` files per class are read, converted to RGB and
    resized to 224x224. Labels are one-hot encoded, the samples are shuffled,
    and the result is cut into test (first ~1/6), validation (next ~20%) and
    train (the remainder).

    Parameters
    ----------
    loading_method : str
        ``'colab'`` reads from the mounted Google Drive folder,
        ``'direct'`` reads from ``../raw_data/sharks/``.
    seed : int or None, optional
        Seed for the shuffle. ``None`` (default) keeps the original behaviour
        of drawing the permutation from NumPy's global random state.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test, num_classes)``
        where the ``X_*`` arrays hold the images, the ``y_*`` arrays hold the
        matching one-hot labels and ``num_classes`` is the number of entries
        in the class map.

    Raises
    ------
    ValueError
        If ``loading_method`` is neither ``'colab'`` nor ``'direct'``.
    """
    if loading_method == 'colab':
        data_path = '/content/drive/My Drive/shark-datasets/sharks'
    elif loading_method == 'direct':
        data_path = '../raw_data/sharks/'
    else:
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            "Unknown loading_method %r; expected 'colab' or 'direct'" % (loading_method,)
        )

    classes = {'basking': 0, 'blue': 1, 'hammerhead': 2, 'mako': 3, 'sand tiger': 4, 'tiger': 5, 'white' : 6,
               'blacktip': 7 , 'bull': 8, 'lemon':9 , 'nurse': 10, 'thresher': 11, 'whale': 12, 'whitetip': 13}
    imgs = []
    labels = []
    for cl, i in classes.items():
        class_dir = os.path.join(data_path, cl)
        # Match on the real file extension (case-insensitive) rather than a
        # substring, and sort so the 300-image cap picks the same files on
        # every run regardless of the filesystem's listing order.
        images_path = sorted(
            elt for elt in os.listdir(class_dir)
            if os.path.splitext(elt)[1].lower() == '.jpg'
        )
        for img in tqdm(images_path[:300]):
            path = os.path.join(class_dir, img)
            if not os.path.isfile(path):
                continue
            # The context manager releases the file handle as soon as the
            # pixel data has been copied into the converted image.
            with Image.open(path) as raw:
                image = raw.convert('RGB').resize((224, 224))
            imgs.append(np.array(image))
            labels.append(i)

    X = np.array(imgs)
    # Use the size of the class map, not the number of labels actually seen:
    # a class folder without images must not shrink the one-hot width.
    num_classes = len(classes)
    y = to_categorical(labels, num_classes)

    # Finally we shuffle:
    if seed is None:
        p = np.random.permutation(len(X))
    else:
        p = np.random.default_rng(seed).permutation(len(X))
    X, y = X[p], y[p]

    # Test set: first sixth; validation set: the following 20%; train: rest.
    first_split = int(len(imgs) / 6.)
    second_split = first_split + int(len(imgs) * 0.2)
    X_test, X_val, X_train = X[:first_split], X[first_split:second_split], X[second_split:]
    y_test, y_val, y_train = y[:first_split], y[first_split:second_split], y[second_split:]

    return X_train, y_train, X_val, y_val, X_test, y_test, num_classes

In [34]:
# Load every split from the Drive-hosted dataset in one go.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    num_classes,
) = load_flowers_data(loading_method="colab")

100%|██████████| 90/90 [00:07<00:00, 11.75it/s]
100%|██████████| 64/64 [00:03<00:00, 19.06it/s]
100%|██████████| 124/124 [00:08<00:00, 15.40it/s]
100%|██████████| 37/37 [00:02<00:00, 14.57it/s]
100%|██████████| 95/95 [00:05<00:00, 18.81it/s]
100%|██████████| 94/94 [00:08<00:00, 10.73it/s]
100%|██████████| 126/126 [00:08<00:00, 14.91it/s]
100%|██████████| 13/13 [00:00<00:00, 22.20it/s]
100%|██████████| 103/103 [00:08<00:00, 12.73it/s]
100%|██████████| 115/115 [00:04<00:00, 25.76it/s]
100%|██████████| 62/62 [00:03<00:00, 19.58it/s]
100%|██████████| 108/108 [00:05<00:00, 21.56it/s]
100%|██████████| 126/126 [00:10<00:00, 11.71it/s]
100%|██████████| 119/119 [00:06<00:00, 17.49it/s]


In [27]:
# Sanity check on the training images array: (samples, height, width, channels).
# NOTE(review): this cell's execution count (27) is older than the loader
# call's (34), so the displayed shape may be stale - re-run after loading.
X_train.shape

(849, 224, 224, 3)

In [36]:
# Run the VGG16 `preprocess_input` over each split, in train/val/test order.
# NOTE: the results are bound back to the same names, so executing this cell
# a second time would preprocess already-preprocessed arrays.
X_train, X_val, X_test = [
    preprocess_input(split) for split in (X_train, X_val, X_test)
]