# **EVA** IMAGE CLASSIFIER

![img_ppal](https://www.innovacion-tecnologia.com/wp-content/uploads/2020/09/Historia-de-los-humanoides.jpg)

## *ÍNDICE:* 
---
0. INTRODUCCIÓN
1. **DEEP LEARNING**
    - 2.1. Preparación y limpieza de datos
    - 2.2. Feature Engineering
    - 2.3. Modelado
2. RESULTADOS
    - 3.1. Visualización y reporting de los resultados
    - 3.2. Creación de un pipeline para el flujo automatizado
---

# ***DEEP LEARNING***

---

### Librerías

In [1]:
import os
import pandas as pd
import shutil
from skimage.io import imread
import cv2
import random 
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
import seaborn as sns
import tensorflow as tf

from PIL import Image

# batch ingestion of pics without pickle
from tensorflow.keras.preprocessing import image_dataset_from_directory

from tensorflow.keras.applications import VGG16, VGG19,Xception, InceptionResNetV2, DenseNet121, ResNet50 

from tensorflow.keras import Model, layers, optimizers, metrics, losses
from tensorflow.keras.models import Sequential, load_model

#from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, MaxPooling2D, Flatten # Input, Conv2D, 
#from tensorflow.keras.layers import Layer

# optimization
from tensorflow.keras.optimizers import Adam #, SGD
from tensorflow.keras.losses import SparseCategoricalCrossentropy, categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications import VGG16, VGG19,Xception, InceptionResNetV2, DenseNet121, ResNet50 

# metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.metrics import plot_precision_recall_curve
from itertools import chain # to flatten the real labels array from validation set
import json # to save in a file metrics
#from datetime import datetime # to name results

In [2]:
import pickle

In [3]:
#import yolo5

In [4]:
!pip install tensorflow_addons
import tensorflow_addons as tfa
from keras import backend as K

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard>=2.7
  Downloading typeguard-3.0.1-py3-none-any.whl (30 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.19.0 typeguard-3.0.1


#### Funciones propias 
(reubicar posteriormente)

In [5]:
def define_subfolders(df: pd.DataFrame, class_names: dict, input_path: str, output_path: str,img_train_path: str, ds_img_path: str = 'path_img'):
    '''
    Objective: Create subfolder for each class within the training set folder

    args.
    ---
    df: pd.DataFrame; the training dataset

    class_names: dict; key define the class name and value refers to the integer label

    input_path: str; path where is the image set/ set
    
    output_path: str; path to send the image

    ret.
    ---
    None
    '''
    for k,v in class_names.items():
        image_list = list(df[df["label"]==v][ds_img_path])
        new_path_folder = img_train_path+output_path+"/"+k+"/"
        os.makedirs(new_path_folder, exist_ok=True)
        for i in image_list:
            image = i.split("/")[-1]
            old_path = img_train_path+input_path+"/"+image
            new_path = new_path_folder + image
            #print(k, old_path, "\n", new_path, "\n\n")
            shutil.copy(old_path, new_path)

In [6]:

def resize(s, input_path, input_path_res, class_names,csv_str='failed_resized.csv'): 
  
  img_not_resized = pd.DataFrame(columns=['folder_path','img_path'])
  for c in class_names:
    folder = input_path +"/"+ c
    pic_list = os.listdir(folder)
    folder_output = input_path_res +"/"+ c

    for p in pic_list:
      im_path = folder + "/" + p
      #print(im_path)
      try:
        im = Image.open(im_path)
        size = (s, int(im.size[1] * s / im.size[0]))
        im=im.convert('RGB')
        im_resized = im.resize(size, Image.ANTIALIAS)
        pname, ext = os.path.splitext(p)
        
        im_resized.save(folder_output + "/" + pname+'.jpeg',"JPEG")
      except:       
        img_not_resized=img_not_resized.append({'folder_path': folder,
                                                'img_path': pname+'.jpeg'},ignore_index=True)
        print('Fallo en: '+folder_output + "/" + pname +'.jpeg',"JPEG")
      
  img_not_resized.to_csv(IMG_TRAIN_PATH+csv_str)
  return img_not_resized

In [7]:

def resize2(s, input_path, input_path_res, class_names,csv_str='failed_resized.csv'): 
  
  img_not_resized = pd.DataFrame(columns=['folder_path','img_path'])
  for c in class_names:
    folder = input_path +"/"+ c
    pic_list = os.listdir(folder)
    folder_output = input_path_res +"/"+ c

    for p in pic_list:
      im_path = folder + "/" + p
      #print(im_path)
      try:
        im = Image.open(im_path)
        size = (s, s)
        im=im.convert('RGB')
        im_resized = im.resize(size, Image.ANTIALIAS)
        pname, ext = os.path.splitext(p)
        
        im_resized.save(folder_output + "/" + pname+'.jpeg',"JPEG")
      except:       
        img_not_resized=img_not_resized.append({'folder_path': folder,
                                                'img_path': pname+'.jpeg'},ignore_index=True)
        print('Fallo en: '+folder_output + "/" + pname +'.jpeg',"JPEG")
      
  img_not_resized.to_csv(IMG_TRAIN_PATH+csv_str)
  return img_not_resized

In [None]:
def resize_test(df,s,folder=REL_PATH+'/data/all_imgs', input_path_res = REL_PATH+'/data/test'): 
  
  pic_list = df['path_img']
  folder_output = input_path_res 
  fails_listpath = []
  for p in pic_list:
    im_path = folder + "/" + p

    im = Image.open(im_path)
    size = (s, s)
    im=im.convert('RGB')
    try:
      im_resized = im.resize(size, Image.ANTIALIAS)
      pname, ext = os.path.splitext(p)
      
      im_resized.save(folder_output + "/" + pname+'.jpeg',"JPEG")
    except:
      fails_listpath.append(p)  
      

In [8]:
def copy_folder_structure(src_folder, dst_folder):
    """
    Copia la estructura de la carpeta src_folder en la carpeta dst_folder
    sin copiar los archivos que contienen
    """
    for root, dirs, _ in os.walk(src_folder):
        for directory in dirs:
            new_dir_path = os.path.join(root.replace(src_folder, dst_folder), directory)
            os.makedirs(new_dir_path, exist_ok=True)

In [9]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
    
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

### Definición de constantes

In [41]:
# relative path
# REL PATH=os.getcwd()
REL_PATH = '/content/drive/MyDrive/ColabNotebooks/HACKATON/hackaton_nuwe_oracle/reto_1/NuweHackatonOracle_DL_Clf_reto1'

# paths
IMG_TRAIN_PATH = REL_PATH + '/data/' # + example_path (from csv train)
IMG_TEST_PATH = REL_PATH + '/data/test' # + example_path (from csv test)
TRAIN_PATH = REL_PATH + '/data/train.csv'
TEST_PATH = REL_PATH + '/data/test.csv'

IMG_TRAIN_PATH_RES = REL_PATH + '/data/train_modified/train_resized_WXH'
IMG_TRAIN_PATH_RES2 = REL_PATH + '/data/train_modified/train_resized_WXW'

# processing
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64
IMAGE_CHANNELS = 3
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)
BATCH_SIZE = 32
EPOCHS = 10

In [None]:
%pwd

'/content'

In [None]:
os.chdir(REL_PATH)

In [None]:
ls

0_Introduccion.ipynb  1b_DL_optimizing.ipynb  [0m[01;34msetup[0m/
1a_DL_baseline.ipynb  [01;34mdata[0m/                   [01;34mutility[0m/


### Preparación y limpieza de los datos

In [11]:
ds_train=pd.read_csv(TRAIN_PATH)

In [12]:
categories = ['burger','chicken','donut','fries','hotdog','pizza','sandwich','potatoes']

In [13]:
class_names={str(i)+'-'+categories[i]: i for i in range(len(categories))}

In [14]:
class_names

{'0-burger': 0,
 '1-chicken': 1,
 '2-donut': 2,
 '3-fries': 3,
 '4-hotdog': 4,
 '5-pizza': 5,
 '6-sandwich': 6,
 '7-potatoes': 7}

In [15]:
df_train=ds_train.copy()

In [16]:
df_train[['path_img_folder', 'path_img']] = df_train['path_img'].str.split('/', expand=True)

In [17]:
df_train.head()

Unnamed: 0,idx_train,path_img,label,path_img_folder
0,0,bc7696f4-1460-4d0b-a63d-f84b3be4da0f.jpeg,0,all_imgs
1,1,f8d50663-60d8-4da5-a8b8-79f954aec503.jpeg,2,all_imgs
2,2,51df0f29-758b-4741-ab74-a0ff8e21c044.jpeg,4,all_imgs
3,3,f61b81d3-3b79-4162-b4d6-4f1b39518c4c.jpeg,0,all_imgs
4,4,b21e0668-bd09-4794-9e90-da8ecffc4c1c.jpeg,0,all_imgs


In [None]:
#define_subfolders(df_train, class_names, 'all_imgs', 'train',IMG_TRAIN_PATH, 'path_img')

In [None]:
#len(os.listdir(IMG_TRAIN_PATH+'train'+'/0-burger'))

In [None]:
1396*8

11168

In [18]:
train_img_path = IMG_TRAIN_PATH + 'train'


In [None]:
copy_folder_structure(train_img_path,IMG_TRAIN_PATH_RES2)

In [None]:
class_names.keys()

dict_keys(['0-burger', '1-chicken', '2-donut', '3-fries', '4-hotdog', '5-pizza', '6-sandwich', '7-potatoes'])

In [None]:
fails=resize(IMAGE_WIDTH, train_img_path, IMG_TRAIN_PATH_RES, class_names)



In [None]:
fails2=resize2(IMAGE_WIDTH, train_img_path, IMG_TRAIN_PATH_RES2, class_names)



In [None]:
#fails.head()

In [None]:
# train set
# https://www.tensorflow.org/tutorials/images/classification
train_ds = image_dataset_from_directory(
    IMG_TRAIN_PATH_RES2,
    class_names = class_names.keys(),
    labels='inferred',
    # label_mode='categorical',
    validation_split=0.3,
    subset="training",
    seed=42,
    image_size= IMAGE_SIZE,
    batch_size= BATCH_SIZE,
    color_mode='rgb'
)

Found 11189 files belonging to 8 classes.
Using 7833 files for training.


In [None]:
# validation set (use the same seed and split)
val_ds = image_dataset_from_directory( # utilizar el de validación como test
    IMG_TRAIN_PATH_RES2,
    labels='inferred',
    # label_mode='categorical',
    class_names=class_names.keys(),
    validation_split=0.3,
    subset="validation",
    seed=42,
    image_size= IMAGE_SIZE,
    batch_size= BATCH_SIZE,
    color_mode='rgb'
)

Found 11189 files belonging to 8 classes.
Using 3356 files for validation.


In [None]:
num_classes = len(class_names)

In [None]:
# https://www.tensorflow.org/guide/keras/preprocessing_layers
data_augmentation = Sequential([
  layers.RandomFlip("horizontal_and_vertical"),
  layers.RandomRotation(0.2),
  layers.RandomZoom(0.005),
])

In [None]:
num_classes = 8

# Define the model
model_baseline6 = Sequential([
  data_augmentation,
  layers.Rescaling(1./255),
  
  layers.Conv2D(16, kernel_size=(3,3), padding='same', activation='relu'),
  layers.MaxPooling2D(pool_size=(2,2)),
  layers.Conv2D(32, kernel_size=(3,3), padding='same', activation='relu'),
  layers.MaxPooling2D(pool_size=(2,2)),
  layers.Conv2D(64, kernel_size=(3,3), padding='same', activation='relu'),
  layers.MaxPooling2D(pool_size=(2,2)),
  layers.Dropout(0.2),
  layers.Flatten(input_shape=(64, 64, 3)),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes, activation='softmax')
  ])
# Define the optimizer
optimizer = optimizers.Adam()

# Define the loss function
loss_fn = losses.SparseCategoricalCrossentropy()


# Compile the model
model_baseline6.compile(optimizer=optimizer, loss=losses.SparseCategoricalCrossentropy(), metrics=['accuracy',precision_m,recall_m,f1_m])

# Train the model
history6 = model_baseline6.fit(train_ds,
  validation_data=val_ds,
  epochs=EPOCHS, verbose=1)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model_baseline6.save(IMG_TRAIN_PATH+'models/model6.h5')

In [None]:
# https://www.tensorflow.org/guide/keras/preprocessing_layers
data_augmentation = Sequential([
  layers.RandomFlip("horizontal_and_vertical"),
  layers.RandomRotation(0.15),
  layers.RandomZoom(0.05),
])

num_classes = 8

# Define the model
model_baseline7 = Sequential([
  #data_augmentation,
  layers.Rescaling(1./255),
  
  layers.Conv2D(16, kernel_size=(3,3), padding='same', activation='relu'),
  layers.MaxPooling2D(pool_size=(2,2)),
  layers.Conv2D(32, kernel_size=(3,3), padding='same', activation='relu'),
  layers.MaxPooling2D(pool_size=(2,2)),
  layers.Conv2D(64, kernel_size=(3,3), padding='same', activation='relu'),
  layers.MaxPooling2D(pool_size=(2,2)),
  layers.Dropout(0.2),
  layers.Flatten(input_shape=(64, 64, 3)),
  layers.Dense(128, activation='relu'),
  layers.Dense(64, activation='relu'),
  layers.Dense(num_classes, activation='softmax')
  ])
# Define the optimizer
optimizer = optimizers.Adam()

# Define the loss function
loss_fn = losses.SparseCategoricalCrossentropy()


# Compile the model
model_baseline7.compile(optimizer=optimizer, loss=losses.SparseCategoricalCrossentropy(), 
                        metrics=['accuracy'])

# Train the model
history7 = model_baseline7.fit(train_ds, shuffle=True,
  validation_data=val_ds,
  epochs=EPOCHS, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model_baseline7.save(IMG_TRAIN_PATH+'models/model7.h5')

#### **TRANSFER LEARNING VGG16**

In [None]:
def vgg19_vgg16(data_augmentation, base_model, dropout_layers: bool, dropout_position: str, dropout_percent: float, num_classes):
  '''
  Objective: both architectures has the same top model, so we'll built.

  args.
  ---
  
  ret.
  ---
  
  '''
  pre_trained = Sequential()

  if data_augmentation:
    #data augmentation
    pre_trained.add(data_augmentation)
    pre_trained.add(layers.Rescaling(1./255))
       
                                                                 
  # vgg16 (Functional)          
  pre_trained.add(base_model)

  # Freeze the layers 
  for layer in pre_trained.layers:
      layer.trainable = False

  # i had to insert this layer when using the data augmentation layer in order to avoid dimension errors with VGG16 and 19
  pre_trained.add(layers.GlobalAveragePooling2D())

  # adding top model with/without dropout in first/middle position
  # top moddel for vgg19-16 are a flatten layer, 2 dense layers of 4096n and pred layer 
  pre_trained.add(layers.Flatten())
  if dropout_layers is True:
      if dropout_position=="first":
          pre_trained.add(layers.Dropout(dropout_percent)) 
          pre_trained.add(layers.Dense(4096,activation=('relu')))

      elif dropout_position == "middle":                  
          pre_trained.add(layers.Dense(4096,activation=('relu')))
          pre_trained.add(layers.Dropout(dropout_percent)) 
      
  else: 
    pre_trained.add(layers.Dense(4096,activation=('relu')))
  
  pre_trained.add(layers.Dense(4096,activation=('relu')))
  pre_trained.add(layers.Dense(num_classes,activation=('softmax')))

  return pre_trained


In [None]:
base_model_vgg19 = VGG19(include_top=False, weights='imagenet', input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3), classes = len(class_names), classifier_activation='softmax')
tl_model1 = vgg19_vgg16(data_augmentation=data_augmentation,  
                                                   base_model= base_model_vgg19, 
                                                   dropout_layers=False,  
                                                   dropout_position= None,  
                                                   dropout_percent=None,  
                                                   num_classes=len(class_names))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Compile the model
tl_model1.compile(optimizer=optimizers.Adam(), loss=losses.SparseCategoricalCrossentropy(), 
                        metrics=['accuracy', f1_m])

# Train the model
historytl1 = tl_model1.fit(train_ds, shuffle=True,
  validation_data=val_ds,
  epochs=EPOCHS, verbose=1)

Epoch 9/10
 11/245 [>.............................] - ETA: 9:18 - loss: 1.2438 - accuracy: 0.5597 - f1_m: 2.4843

KeyboardInterrupt: ignored

In [None]:
tl_model1.save(IMG_TRAIN_PATH+'models/model_tl1.h5')

## **Predicción**

In [19]:
ds_test = pd.read_csv(TEST_PATH)

In [20]:
ds_test.head()

Unnamed: 0,idx_test,path_img
0,0,all_imgs/ec4c75ba-e064-4ed0-a7b1-3ed899424110....
1,1,all_imgs/114fb94d-3223-4d95-b6d1-f6a6ac206ea2....
2,2,all_imgs/a80e7e27-a25a-40aa-a65c-a29bfa172b3d....
3,3,all_imgs/7848d24e-207a-4236-9cd7-df78122d8787....
4,4,all_imgs/57f726bc-d141-42fb-88fa-60d554499277....


In [21]:
df_predictions = ds_test.copy()

In [29]:
df_predictions[['path_img_folder', 'path_img']] = df_predictions['path_img'].str.split('/', expand=True)

In [30]:
df_predictions.head()

Unnamed: 0,idx_test,path_img,path_img_folder
0,0,ec4c75ba-e064-4ed0-a7b1-3ed899424110.jpeg,all_imgs
1,1,114fb94d-3223-4d95-b6d1-f6a6ac206ea2.jpeg,all_imgs
2,2,a80e7e27-a25a-40aa-a65c-a29bfa172b3d.jpeg,all_imgs
3,3,7848d24e-207a-4236-9cd7-df78122d8787.jpeg,all_imgs
4,4,57f726bc-d141-42fb-88fa-60d554499277.jpeg,all_imgs


In [42]:
IMG_TEST_PATH

'/content/drive/MyDrive/ColabNotebooks/HACKATON/hackaton_nuwe_oracle/reto_1/NuweHackatonOracle_DL_Clf_reto1/data/test'

In [49]:
len(os.listdir(IMG_TEST_PATH))

4797

In [44]:
len(df_predictions)

4797

In [48]:
resize_test(df_predictions,64)



In [58]:
df_predictions.head()

Unnamed: 0,idx_test,path_img,path_img_folder
0,0,ec4c75ba-e064-4ed0-a7b1-3ed899424110.jpeg,all_imgs
1,1,114fb94d-3223-4d95-b6d1-f6a6ac206ea2.jpeg,all_imgs
2,2,a80e7e27-a25a-40aa-a65c-a29bfa172b3d.jpeg,all_imgs
3,3,7848d24e-207a-4236-9cd7-df78122d8787.jpeg,all_imgs
4,4,57f726bc-d141-42fb-88fa-60d554499277.jpeg,all_imgs


In [60]:
predictions =[]

selected_model = load_model(IMG_TRAIN_PATH+'/models/model7.h5')
for i in range(len(df_predictions)):
  im_path = IMG_TEST_PATH + "/" + df_predictions["path_img"][i]
  img = cv2.imread(im_path)
  img = np.reshape(img,[1,64,64,3])
  predictions.append(selected_model.predict(img))
df_predictions['predictions'] = predictions


  



In [67]:
REL_PATH

'/content/drive/MyDrive/ColabNotebooks/HACKATON/hackaton_nuwe_oracle/reto_1/NuweHackatonOracle_DL_Clf_reto1'

In [77]:
pred_max=[]
for i in range(len(df_predictions)):
  pred_max.append(np.argmax(df_predictions['predictions'][i]))

df_predictions['predictions_max']=pred_max

In [79]:
data = {"target": {str(df_predictions['idx_test'][i]):int(df_predictions['predictions_max'][i]) for i in range(len(df_predictions))}}

with open(REL_PATH + '/predictions.json', 'w') as f:
    json.dump(data, f)

f.close()