# Construyendo un modelo de Deep Learning que diferencie perros de gatos desde cero

Pasos:
1. Descargamos nuestro conjunto de datos de: https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data y lo descomprimimos dentro del directorio `notebook/data/`
2. Exploración de datos
3. Preparación de datos
4. Entrenando modelo inicial usando una CNN + Datos Aumentados
5. Verificación manual del modelo

## 1. Descargamos y disponemos el conjunto de datos
Fuente: https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data

#### 1.1 Descargamos y descomprimimos

Al descargar y descomprimir debe verse así:
```
notebook/data/
│
├── dogs-vs-cats-redux-kernels-edition.zip
└── dogs-vs-cats-redux-kernels-edition/
    ├── sample_submission.csv
    ├── test.zip
    └── train.zip
```

#### 1.2 Descomprimimos `test.zip` y `train.zip` en `notebook/data/` y, finalmente, eliminamos tanto el directorio `notebook/data/dogs-vs-cats-redux-kernels-edition/` como el archivo `notebook/data/dogs-vs-cats-redux-kernels-edition.zip`

Al realizar esto quedamos con:
```
notebook/data/
│
├── test/
└── train/
```

Renombramos la carpeta `test/` a `unlabeled_test_data/`:
```
notebook/data/
│
├── unlabeled_test_data/
└── train/
```

## 2. Exploración de datos

In [None]:
import os
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from os.path import join

In [None]:
# Root directory that holds the dataset. It is a relative path, so it is
# resolved against the current working directory of the notebook.
DATA_PATH = 'data'

Separamos las imágenes de gatos y perros en directorios diferentes.

In [None]:
from tqdm import tqdm

# Per-class destination directories. os.makedirs also creates DATA_PATH
# itself when it is missing, and exist_ok=True makes the cell safe to re-run.
for label in ('dog', 'cat'):
    os.makedirs(join(DATA_PATH, label), exist_ok=True)

# Directory with the raw Kaggle training images. The trailing '' keeps the
# trailing separator, so on POSIX this is still "data/train/".
TRAIN_DATA_PATH = join(DATA_PATH, 'train', '')

for file in tqdm(os.listdir(TRAIN_DATA_PATH)):
    # Expected names look like "<label>.<id>.jpg", e.g. "cat.0.jpg".
    parts = file.split('.')
    if len(parts) != 3 or parts[0] not in ('dog', 'cat') or parts[2] != 'jpg':
        # Anything else (hidden files such as ".DS_Store", sub-directories,
        # ...) is left untouched. Blindly replacing '.' with '/' would turn
        # ".DS_Store" into the absolute path "/DS_Store", which makes
        # os.path.join discard DATA_PATH altogether.
        continue
    label, image_id, extension = parts
    os.rename(join(TRAIN_DATA_PATH, file),
              join(DATA_PATH, label, image_id + '.' + extension))
    # "data/train/cat.0.jpg"     ->   "data/cat/0.jpg"
    # "data/train/dog.1230.jpg"  ->   "data/dog/1230.jpg"

# os.rmdir only removes empty directories; keep the directory (and say so)
# when skipped entries are still inside instead of failing with OSError.
if os.listdir(TRAIN_DATA_PATH):
    print('Quedan archivos sin mover en', TRAIN_DATA_PATH)
else:
    os.rmdir(TRAIN_DATA_PATH)

Entendiendo la distribución de mis datos

In [None]:
# List the entries currently under DATA_PATH. As the last expression of the
# cell, its value is what the notebook shows as the cell output.
os.listdir(DATA_PATH)

In [None]:
# Number of files found in each per-class directory (cats first, then dogs).
total_cats, total_dogs = (
    len(os.listdir(join(DATA_PATH, label))) for label in ('cat', 'dog')
)
print("Número de imágenes de gatos: ", total_cats)
print("Número de imágenes de perros: ", total_dogs)

In [None]:
# Bar chart comparing how many images each class has.
counts_by_pet = {'Perro': total_dogs, 'Gato': total_cats}
pets = tuple(counts_by_pet)
counts = list(counts_by_pet.values())
y_pos = np.arange(len(pets))  # one bar position per class

plt.bar(y_pos, counts)
plt.xticks(y_pos, pets)  # label every bar with its class name
plt.title('Número de mascotas. Diagrama de barras.')
plt.ylabel('Total')

plt.show()

## 3. Preparación de datos

Creamos los directorios `train`, `test` y `validation` para entrenamiento

In [None]:
# Build the DATA_PATH/<split>/<class> tree used for training. os.makedirs
# creates the intermediate <split> directory as well, and exist_ok=True turns
# the call into a no-op for directories that already exist, so no separate
# (race-prone) existence check is needed.
for split in ('train', 'test', 'validation'):
    for label in ('dog', 'cat'):
        os.makedirs(join(DATA_PATH, split, label), exist_ok=True)

print('Directorios creados...')

Dividimos los datos de entrenamiento de la siguiente forma:
- Train - 80%
- Test - 10%
- Validation - 10%

In [None]:
# File names currently stored in each per-class directory (dogs, then cats).
list_of_dogs, list_of_cats = (
    os.listdir(join(DATA_PATH, label)) for label in ('dog', 'cat')
)

In [None]:
import random

# A fixed seed plus sorted file names make the split reproducible: os.listdir
# returns entries in arbitrary order, and an unseeded random.sample picks
# different files on every run. A private Random instance is used so the
# global random state is left alone.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)


def _sample_fraction(filenames, fraction):
    """Return a random sample holding int(len(filenames) * fraction) names."""
    ordered = sorted(filenames)
    return split_rng.sample(ordered, k=int(len(ordered) * fraction))


def _move_images(label, filenames, split):
    """Move each name from DATA_PATH/<label>/ to DATA_PATH/<split>/<label>/."""
    for filename in tqdm(filenames):
        os.rename(join(DATA_PATH, label, filename),
                  join(DATA_PATH, split, label, filename))


# Train: 80% of the images of every class.

train_dog = _sample_fraction(list_of_dogs, 0.8)
_move_images('dog', train_dog, 'train')

train_cat = _sample_fraction(list_of_cats, 0.8)
_move_images('cat', train_cat, 'train')

print('Datos de entrenamiento creados...')

# Test: half of what is still left in the per-class directories.

list_of_dogs = os.listdir(join(DATA_PATH, 'dog'))
list_of_cats = os.listdir(join(DATA_PATH, 'cat'))

test_dog = _sample_fraction(list_of_dogs, 0.5)
_move_images('dog', test_dog, 'test')

test_cat = _sample_fraction(list_of_cats, 0.5)
_move_images('cat', test_cat, 'test')

print('Datos de prueba creados...')

# Validation: everything that remains after the two moves above.

list_of_dogs = os.listdir(join(DATA_PATH, 'dog'))
list_of_cats = os.listdir(join(DATA_PATH, 'cat'))

_move_images('dog', list_of_dogs, 'validation')
_move_images('cat', list_of_cats, 'validation')

# Both per-class directories are empty now, so os.rmdir can remove them.
os.rmdir(join(DATA_PATH, 'cat'))
os.rmdir(join(DATA_PATH, 'dog'))

print('Datos de validación creados...')

## 4. Entrenando modelo inicial usando una CNN + Datos Aumentados

Ver notebook `notebook/model_training.ipynb`

## 5. Verificación manual del modelo

In [None]:
def display_image(image_path):
    """Read the image stored at `image_path` and show it with matplotlib."""
    plt.imshow(plt.imread(image_path))
    plt.show()

def translate_pred(prediction: np.ndarray) -> "tuple[str, float]":
    """Turn a raw model output into a (label, confidence percentage) pair.

    Args:
        prediction: Array-like indexed as ``prediction[0][0]``, e.g.
            ``array([[X]], dtype=float32)``. Values above 0.5 are read as
            "Dog"; anything else (0.5 included) as "Cat".

    Returns:
        ``(label, confidence)`` where ``confidence`` is a plain float
        percentage for the chosen label.

    Output examples:
        - ("Dog", 70.8)
        - ("Cat", 90.2)
    """
    # Convert once to a built-in float so the result does not carry the
    # array's dtype (e.g. numpy.float32).
    probability = float(prediction[0][0])
    if probability > 0.5:
        return "Dog", probability * 100
    return "Cat", (1 - probability) * 100

In [None]:
from tensorflow import keras
# Load the saved CNN from DATA_PATH/model/cnn_model (presumably written by
# the training notebook -- confirm) and show the names of its metrics.
model_dir = join(DATA_PATH, 'model', 'cnn_model')
cnn_model = keras.models.load_model(model_dir)
cnn_model.metrics_names

In [None]:
def model_predict(image_uri: str, target_size=(32, 32)):
    """Show an image and print the class that `cnn_model` predicts for it.

    Args:
        image_uri: Path of the image file to classify.
        target_size: ``(height, width)`` the image is resized to while it is
            loaded. NOTE(review): the 32x32 default is presumably the input
            size the model was trained with -- confirm against the training
            notebook.

    Prints ``It's a <Dog|Cat> (<confidence>%)``; nothing is returned.
    """
    # Use the Keras that ships with TensorFlow, i.e. the same one that loaded
    # `cnn_model`, instead of mixing in the standalone `keras` package.
    from tensorflow.keras.preprocessing.image import img_to_array, load_img

    display_image(image_uri)

    # Load the image already resized to the requested size.
    my_image = load_img(image_uri, target_size=target_size)

    # Add a leading batch axis and apply the 1/255 rescale directly; a data
    # generator is not needed to scale a single image.
    img_arr = np.expand_dims(img_to_array(my_image), axis=0)
    preprocessed_img = img_arr * (1. / 255)

    prediction = cnn_model.predict(preprocessed_img)
    animal_kind, confidence = translate_pred(prediction)
    print(f"It's a {animal_kind} ({confidence:.2f}%)")

In [None]:
# Classify one of the unlabeled test images; the path is built from DATA_PATH
# so it follows the data directory used by the rest of the notebook.
model_predict(join(DATA_PATH, 'unlabeled_test_data', '1.jpg'))