<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prototyping-DeepFace" data-toc-modified-id="Prototyping-DeepFace-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prototyping DeepFace</a></span><ul class="toc-item"><li><span><a href="#1.-Mount-Google-Storage-Bucket" data-toc-modified-id="1.-Mount-Google-Storage-Bucket-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>1. Mount Google Storage Bucket</a></span></li><li><span><a href="#2.-Transfer-Learning" data-toc-modified-id="2.-Transfer-Learning-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>2. Transfer Learning</a></span></li></ul></li></ul></div>

# Prototyping DeepFace 

## 1. Mount Google Storage Bucket 

In [1]:
# Authenticate the current Colab user; this runs before the gcsfuse mount
# further down so the storage bucket can be accessed from this runtime.
from google.colab import auth
auth.authenticate_user()

In [2]:
# Install gcsfuse from Google's apt repository:
#   1. register the gcsfuse package source,
#   2. import the repository signing key,
#   3. refresh the package index,
#   4. install the gcsfuse package.
# NOTE(review): the source line pins the `gcsfuse-bionic` repository; confirm
# it still matches the distribution of the runtime this notebook is run on.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  87482      0 --:--:-- --:--:-- --:--:-- 87482
OK
39 packages can be upgraded. Run 'apt list --upgradable' to see them.
The following NEW packages will be installed:
  gcsfuse
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 11.3 MB of archives.
After this operation, 24.0 MB of additional disk space will be used.
Selecting previously unselected package gcsfuse.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../gcsfuse_0.37.0_amd64.deb ...
Unpacking gcsfuse (0.37.0) ...
Setting up gcsfuse (0.37.0) ...


In [3]:
# Create the mount point and mount the bucket on it. `-p` keeps the cell
# re-runnable: plain `mkdir` errors out when `data` already exists.
# `--implicit-dirs` is passed so that directory-like object prefixes in the
# bucket are exposed as directories under the mount point.
!mkdir -p data
!gcsfuse --implicit-dirs diversity-in-cinema-735 data 

2021/11/23 18:59:51.311335 Start gcsfuse/0.37.0 (Go version go1.17.2) for app "" using mount point: /content/data
2021/11/23 18:59:51.320934 Opening GCS connection...
2021/11/23 18:59:51.881226 Mounting file system "diversity-in-cinema-735"...
2021/11/23 18:59:51.913663 File system has been successfully mounted.


In [12]:
# imports
import numpy as np
import pandas as pd
import os

from PIL import Image


from tensorflow.keras import layers, models, Sequential 
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## 2. Transfer Learning

In [7]:
# build deepface model using published architecture
# https://www.cs.toronto.edu/~ranzato/publications/taigman_cvpr14.pdf

def load_model(weights_path="data/model/model_weights/DeepFace_weights.h5"):
    """Build the DeepFace convolutional base and load pretrained weights into it.

    The stack consists of a convolution (C1), a max-pool (M2), a second
    convolution (C3) and three locally connected layers (L4-L6), expecting
    152x152 RGB input. The fully connected head of the published network
    (Flatten `F0`, Dense-4096 `F7`, Dropout-0.5 `D0`, Dense-8631 softmax `F8`)
    is intentionally left out so a task-specific classifier can be stacked on
    top of the returned model.

    Parameters
    ----------
    weights_path : str, optional
        Path to the HDF5 weights file. Defaults to the file on the mounted
        bucket. Pretrained weights: https://github.com/swghosh/DeepFace/releases

    Returns
    -------
    Sequential
        The headless model with the pretrained weights loaded.
    """
    model = Sequential()
    model.add(layers.Convolution2D(32, (11, 11), activation='relu', name='C1', input_shape=(152, 152, 3)))
    model.add(layers.MaxPooling2D(pool_size=3, strides=2, padding='same', name='M2'))
    model.add(layers.Convolution2D(16, (9, 9), activation='relu', name='C3'))
    model.add(layers.LocallyConnected2D(16, (9, 9), activation='relu', name='L4'))
    model.add(layers.LocallyConnected2D(16, (7, 7), strides=2, activation='relu', name='L5'))
    model.add(layers.LocallyConnected2D(16, (5, 5), activation='relu', name='L6'))

    # Weights are loaded by layer name, so the names above (C1 ... L6) must not
    # be changed. `skip_mismatch=True` is passed because the weights file was
    # produced for the full network, whose head layers do not exist here.
    model.load_weights(weights_path, skip_mismatch=True, by_name=True)

    return model

In [8]:
def set_nontrainable_layers(model):
    """Mark `model` as non-trainable and hand the same object back.

    The `trainable` attribute is switched off on the object that is passed in
    (i.e. on the model as a whole, not on a subset of its layers); the model
    is modified in place and returned for convenient chaining.
    """
    model.trainable = False
    return model

In [64]:
def add_last_layers(model, n_classes=14, hidden_units=1000):
    """Stack a new classification head on top of a feature-extractor model.

    The head is Flatten -> Dense(`hidden_units`, relu) ->
    Dense(`n_classes`, softmax). This function does not change the
    `trainable` state of `model`; freeze it beforehand if that is desired.

    Parameters
    ----------
    model : keras model
        The base network whose output feeds the new head.
    n_classes : int, optional
        Width of the softmax output layer (default 14).
    hidden_units : int, optional
        Width of the intermediate dense layer (default 1000).

    Returns
    -------
    Sequential
        A new, uncompiled model wrapping `model` followed by the head.
    """
    head = [
        layers.Flatten(),
        layers.Dense(hidden_units, activation="relu"),
        layers.Dense(n_classes, activation="softmax"),
    ]
    return Sequential([model, *head])

In [65]:
def build_model():
    """Assemble and compile the transfer-learning classifier.

    Loads the pretrained base, marks it non-trainable, appends the new head
    and compiles the result with Adam (learning rate 1e-4), categorical
    cross-entropy loss and accuracy as the reported metric.
    """
    backbone = set_nontrainable_layers(load_model())
    classifier = add_last_layers(backbone)

    classifier.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

In [66]:
def encode_labels(y):
    """Return `y` encoded with a freshly fitted `LabelEncoder`.

    The encoder is fitted on `y` itself and then discarded; only the
    transformed labels are returned.
    """
    encoder = LabelEncoder().fit(y)
    return encoder.transform(y)

In [58]:
def get_labels(csv_path="data/data/training_data/fairface_label_train.csv"):
    """Read the label CSV and add a combined `target` column.

    Parameters
    ----------
    csv_path : str, optional
        Location of the label file. Defaults to the training label file on
        the mounted bucket. The file must provide `gender` and `race` columns.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a `target` column holding "<gender>-<race>".
    """
    df = pd.read_csv(csv_path)

    # One class per (gender, race) combination.
    df["target"] = df["gender"] + "-" + df["race"]

    return df

In [59]:
def random_balanced_sample(dataframe, sample_size=5000, balance_on="target", random_state=None):
    """Draw a random subsample containing the same number of rows per class.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to sample from.
    sample_size : int, optional
        Requested total number of rows. Each class contributes
        ``int(sample_size / n_classes)`` rows, so the result can be slightly
        smaller than `sample_size` when it is not a multiple of the class count.
    balance_on : str, optional
        Column whose distinct values define the classes.
    random_state : optional
        Forwarded to `DataFrame.sample` for every class; pass a seed to make
        the draw reproducible. The default (None) keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The per-class samples concatenated in order of first class appearance.
        An empty input frame yields an empty frame.

    Raises
    ------
    ValueError
        Raised by `DataFrame.sample` when a class has fewer rows than the
        per-class quota (sampling is without replacement).
    """
    classes = dataframe[balance_on].unique()
    if len(classes) == 0:
        # Nothing to draw from; keep the columns but return no rows.
        return dataframe.iloc[0:0]

    per_class = int(sample_size / len(classes))

    samples = [
        dataframe[dataframe[balance_on] == label].sample(per_class, random_state=random_state)
        for label in classes
    ]
    return pd.concat(samples, axis=0)


In [61]:
from tqdm import tqdm

def get_training_data(sample_size=5000, image_size=(152, 152)):
    """Load a class-balanced image sample and split it into train/val/test.

    Parameters
    ----------
    sample_size : int, optional
        Total number of images requested from the balanced sampler.
    image_size : tuple of int, optional
        (width, height) every image is resized to; the default matches the
        152x152 input of the DeepFace base model.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``. 30% of the
        sample is held out as test set and 30% of the remainder as validation
        set. Labels are one-hot encoded.
    """
    # Paths in the label file's `file` column are joined onto this directory.
    data_root = "data/data/training_data"

    labels = get_labels()
    labels = random_balanced_sample(labels, sample_size=sample_size, balance_on="target")

    img_list = []
    for image_name in tqdm(labels["file"].values):
        img_path = os.path.join(data_root, image_name)
        # Close each file as soon as it is decoded instead of leaving
        # thousands of handles open on the mounted bucket.
        with Image.open(img_path) as image:
            # Force three channels so that every array has the same shape
            # and the list stacks into a single 4-D array below.
            img_list.append(np.array(image.convert("RGB").resize(image_size)))

    X = np.array(img_list)
    print(len(X))

    y = labels["target"].values
    y_cat = to_categorical(encode_labels(y))

    X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.3)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [62]:
# Build the train / validation / test arrays; the number printed below is the
# total count of images that were loaded before splitting.
X_train, y_train, X_val, y_val, X_test, y_test = get_training_data()

4998


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Patience must be smaller than the number of epochs, otherwise the callback
# can never stop the run (the original used patience=20 with epochs=20).
es = EarlyStopping(patience=5, restore_best_weights=True)

model = build_model()
model.fit(x=X_train,
          y=y_train,
          validation_data=(X_val, y_val),
          batch_size=16,
          epochs=20,
          callbacks=[es],
          verbose=2)

Epoch 1/20
