# HCML project

## Finetuning VGG16

In [None]:
!pip install git+https://github.com/rcmalli/keras-vggface.git
!pip install keras_applications

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.models import Model, Sequential
from keras.applications.vgg16 import VGG16
from keras_vggface.vggface import VGGFace

from keras.callbacks import ModelCheckpoint, EarlyStopping

import numpy as np
import os
import cv2
from more_itertools import chunked
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Seed passed to the dataset split and to the random augmentation layers.
SEED = 78
# Number of images per batch (used by prepare_data and when chunking test images).
BATCH_SIZE = 64
# Side length, in pixels, that every image is resized to (images are square).
IMG_SIZE = 256 

In [None]:
def prepare_data(ds, data_aug):
    """Rescale an unbatched image dataset, optionally augment it, and batch it.

    Args:
        ds: Unbatched ``tf.data.Dataset`` of ``(image, label)`` pairs.
            Assumes pixel values are in [0, 255] -- TODO confirm for new sources.
        data_aug: If truthy, the rescaled dataset is mixed with three augmented
            views of itself (rotated, brightness-shifted, and both). Elements
            are drawn at random from the four views.

    Returns:
        The dataset with images multiplied by 1/255, batched into groups of
        ``BATCH_SIZE`` and prefetched.
    """
    normalization = layers.Rescaling(1./255)
    ds = ds.map(lambda x, y: (normalization(x), y))

    if data_aug:
        # value_range matches the [0, 1] range produced by the rescaling above.
        brightness = layers.RandomBrightness((-0.3, 0.3), value_range=(0., 1.), seed=SEED)
        rotation = layers.RandomRotation(0.2, seed=SEED)

        rotated_ds = ds.map(lambda x, y: (rotation(x), y))
        brightness_ds = ds.map(lambda x, y: (brightness(x), y))
        rotated_brightness_ds = ds.map(lambda x, y: (brightness(rotation(x)), y))

        # Seed the sampler as well; otherwise the interleaving order of the
        # four views differs from run to run even though the layers are seeded.
        ds = tf.data.Dataset.sample_from_datasets(
            [ds, rotated_ds, brightness_ds, rotated_brightness_ds],
            seed=SEED,
        )

    # Prefetch so input preprocessing overlaps with the training step.
    return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
# Arguments shared by the training and validation subsets. Both calls must use
# the same directory, validation_split and seed so the two subsets are
# complementary.
heysem_train_dir = "/kaggle/input/heysem-sorted-data/sorted_data/train"
heysem_split_args = {
    "validation_split": 0.3,
    "label_mode": 'binary',
    "seed": SEED,
    "image_size": (IMG_SIZE, IMG_SIZE),
    "batch_size": None,  # stay unbatched here; prepare_data() does the batching
}

train_ds = tf.keras.utils.image_dataset_from_directory(
    heysem_train_dir, subset="training", **heysem_split_args
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    heysem_train_dir, subset="validation", **heysem_split_args
)

# Rescale and batch; no augmentation is applied to either subset.
train_ds = prepare_data(train_ds, data_aug=False)
val_ds = prepare_data(val_ds, data_aug=False)

# NOTE (original author): the output printed by this cell is incorrect and
# there is no verbose option to change it.

In [None]:
# VGGFace (VGG16 architecture) base without its classification head.
# The input shape follows IMG_SIZE so it cannot drift away from the size the
# datasets are resized to.
vgg = VGGFace(model='vgg16', include_top=False, pooling='avg',
              input_shape=(IMG_SIZE, IMG_SIZE, 3))

# Freeze all layers except the last 5; only those (plus the new head) are trained.
for layer in vgg.layers[:-5]:
    layer.trainable = False

# Base model followed by a small binary classification head with a single
# sigmoid output unit.
model = keras.Sequential([
    vgg,
    keras.layers.Dense(64, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

In [None]:
# Stop when validation accuracy has not improved for 10 epochs.
# - mode='max': accuracy is a higher-is-better metric (made explicit instead of
#   relying on 'auto' inferring it from the metric name).
# - restore_best_weights=True: roll back to the best epoch when stopping early,
#   so the model saved afterwards is not the one from `patience` epochs later.
early = EarlyStopping(monitor='val_binary_accuracy',
                      patience=10,
                      verbose=1,
                      mode='max',
                      restore_best_weights=True)

hist = model.fit(x=train_ds,
                 validation_data=val_ds,
                 epochs=40,
                 callbacks=[early]
                )

In [None]:
# Persist the model trained above to the Kaggle working directory (.h5 file).
model.save('/kaggle/working/vggface_v3_0.h5')

## Model on balanced data

In [None]:
# Arguments shared by the training and validation subsets of the balanced
# data. Both calls must use the same directory, validation_split and seed so
# the two subsets are complementary.
balanced_train_dir = "/kaggle/input/balanced-data/balanced_data/train"
balanced_split_args = {
    "validation_split": 0.3,
    "label_mode": 'binary',
    "seed": SEED,
    "image_size": (IMG_SIZE, IMG_SIZE),
    "batch_size": None,  # stay unbatched here; prepare_data() does the batching
}

balanced_train_ds = tf.keras.utils.image_dataset_from_directory(
    balanced_train_dir, subset="training", **balanced_split_args
)
balanced_val_ds = tf.keras.utils.image_dataset_from_directory(
    balanced_train_dir, subset="validation", **balanced_split_args
)

# Rescale and batch; no augmentation is applied to either subset.
balanced_train_ds = prepare_data(balanced_train_ds, data_aug=False)
balanced_val_ds = prepare_data(balanced_val_ds, data_aug=False)

# NOTE (original author): the output printed by this cell is incorrect and
# there is no verbose option to change it.

In [None]:
# VGGFace (VGG16 architecture) base without its classification head.
# The input shape follows IMG_SIZE so it cannot drift away from the size the
# datasets are resized to.
bal_vgg = VGGFace(model='vgg16', include_top=False, pooling='avg',
                  input_shape=(IMG_SIZE, IMG_SIZE, 3))

# Freeze all layers except the last 5; only those (plus the new head) are trained.
for layer in bal_vgg.layers[:-5]:
    layer.trainable = False

# Base model followed by a small binary classification head with a single
# sigmoid output unit.
balanced_model = keras.Sequential([
    bal_vgg,
    keras.layers.Dense(64, activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(1, activation='sigmoid'),
])

balanced_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

In [None]:
# Stop when validation accuracy has not improved for 10 epochs.
# - mode='max': accuracy is a higher-is-better metric (made explicit instead of
#   relying on 'auto' inferring it from the metric name).
# - restore_best_weights=True: roll back to the best epoch when stopping early,
#   so the model saved afterwards is not the one from `patience` epochs later.
early = EarlyStopping(monitor='val_binary_accuracy',
                      patience=10,
                      verbose=1,
                      mode='max',
                      restore_best_weights=True)

balanced_hist = balanced_model.fit(x=balanced_train_ds,
                 validation_data=balanced_val_ds,
                 epochs=40,
                 callbacks=[early]
                )

In [None]:
# Persist the model trained on the balanced data to the Kaggle working directory (.h5 file).
balanced_model.save('/kaggle/working/vggface_balanced_v1_0.h5')

## Assessing performance

In [None]:
# Select which model/history the evaluation cells below operate on: they read
# the generic names `model` and `hist`.
# NOTE(review): this rebinds `model` and `hist`, replacing the objects from the
# first (unbalanced) training run; `balanced_hist` only exists if the balanced
# training cell was run in this kernel session.
model = keras.models.load_model("/kaggle/working/vggface_balanced_v1_0.h5")

hist = balanced_hist

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(hist.history["binary_accuracy"])
plt.plot(hist.history['val_binary_accuracy'])
plt.title("Model accuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
# Exactly one legend entry per plotted curve (loss is plotted separately below).
plt.legend(["Accuracy", "Validation Accuracy"])
plt.show()

# Training vs. validation loss per epoch.
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve is the validation loss, not a test-set loss.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Annotation table with (at least) the VideoName, Ethnicity and Gender columns
# used further down.
meta_csv_path = "/kaggle/input/meta-data/eth_gender_anno_all.xlsx - eth_gender_trait_annotations_de.csv"
meta_data = pd.read_csv(meta_csv_path)

# Recode gender value 2 as 0 so the column uses the same 0/1 encoding as the
# model predictions.
meta_data = meta_data.assign(Gender=meta_data["Gender"].replace({2: 0}))

# The ethnicity groups are clearly unequal in size.
meta_data["Ethnicity"].value_counts()

In [None]:
# test_ds = tf.keras.utils.image_dataset_from_directory(
#     "/kaggle/input/sorted-heysem-dataset/sorted_data/test",
#     label_mode='binary',
#     seed=SEED,
#     image_size=(IMG_SIZE, IMG_SIZE),
#     batch_size=None,
# #     batch_size=BATCH_SIZE
# )
# test_ds = prepare_data(test_ds, training=False)

In [None]:
def load_and_preprocess_image(image_path):
    """Load one image from disk and preprocess it for the model.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A float32 array of shape ``(IMG_SIZE, IMG_SIZE, 3)`` in RGB channel
        order with values scaled to [0, 1].

    Raises:
        ValueError: If OpenCV cannot read an image from ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"cv2 could not read an image from {image_path!r}")

    # OpenCV decodes to BGR, whereas the tf.keras training pipeline yields RGB;
    # convert so inference sees the same channel order as training.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

    # Same 1/255 scaling as prepare_data(); float32 halves memory vs. float64.
    return img.astype(np.float32) / 255.0

In [None]:
# src = "/kaggle/input/heysem-sorted-data/sorted_data/test/"
src = "/kaggle/input/balanced-data/balanced_data/test/"

# Sort the listings so the file order (and thus the prediction order) is the
# same on every run; os.listdir returns entries in arbitrary order.
women_paths = [os.path.join(src, "women/", file) for file in sorted(os.listdir(src + "women/"))]
men_paths = [os.path.join(src, "men/", file) for file in sorted(os.listdir(src + "men/"))]

# Images are loaded in the same order as the path lists, so index i of
# *_data corresponds to index i of *_paths.
women_data = [load_and_preprocess_image(path) for path in women_paths]
men_data = [load_and_preprocess_image(path) for path in men_paths]

# chunked() returns a one-shot iterator; materialise it so the prediction
# cells can be re-run without silently iterating over nothing.
women_batches = list(chunked(women_data, BATCH_SIZE))
men_batches = list(chunked(men_data, BATCH_SIZE))

In [None]:
# Run the model over the women test images, one chunk at a time.
w_predictions = []
for batch in women_batches:
    dataset = tf.data.Dataset.from_tensor_slices(batch).batch(BATCH_SIZE)

    # The model has a single sigmoid output: outputs below 0.5 are encoded as 1,
    # all others as 0.
    # NOTE(review): this inverted threshold presumably aligns the predictions
    # with the recoded `Gender` column in meta_data -- confirm the class order.
    preds = model.predict(dataset) < 0.5
    w_predictions.append(preds.astype(int))

In [None]:
# Run the model over the men test images, one chunk at a time.
m_predictions = []
for batch in men_batches:
    dataset = tf.data.Dataset.from_tensor_slices(batch).batch(BATCH_SIZE)

    # The model has a single sigmoid output: outputs below 0.5 are encoded as 1,
    # all others as 0.
    # NOTE(review): this inverted threshold presumably aligns the predictions
    # with the recoded `Gender` column in meta_data -- confirm the class order.
    preds = model.predict(dataset) < 0.5
    m_predictions.append(preds.astype(int))

In [None]:
# Flatten the per-chunk prediction arrays into one flat list per group,
# keeping the first (only) value of every prediction row.
w_preds = [
    sample_pred[0]
    for chunk_preds in w_predictions
    for sample_pred in chunk_preds
]
m_preds = [
    sample_pred[0]
    for chunk_preds in m_predictions
    for sample_pred in chunk_preds
]

In [None]:
# Maps the name extracted from each file path to
# (prediction, ethnicity, gender) taken from the annotation table.
results_dict = {}

# Women first, then men. Each group is zipped on its own so predictions stay
# paired with the paths of the same group.
for group_preds, group_paths in ((w_preds, women_paths), (m_preds, men_paths)):
    for pred, path in zip(group_preds, group_paths):
        # Characters [-23:-4] of the path are used as the lookup key into
        # the VideoName column (the last 4 characters are dropped).
        video_name = path[-23:-4]
        match = meta_data[meta_data["VideoName"] == video_name]

        # First matching annotation row; raises IndexError if there is none.
        ethnicity, gender = match[["Ethnicity", "Gender"]].values[0]

        results_dict[video_name] = (pred, ethnicity, gender)

In [None]:
# The dict keys become columns, so transpose to get one row per key with the
# three tuple fields as columns.
df = pd.DataFrame(results_dict).transpose()
df.columns = ["gender_prediction", "ethnicity", "gender_true"]

print(df.shape)
df.head()

In [None]:
# index to ethnicity according to original dataset
i2e = {1: "Asian", 2: "Caucasian", 3: "African-American"}

# Create the output folder here so savefig() does not depend on a later cell
# having been run first.
plot_dir = "/kaggle/working/balanced_plots"
os.makedirs(plot_dir, exist_ok=True)

# (title prefix, file stem, rows) for the overall result and for every ethnicity.
groups = [("Overal", "overall", df)]
groups += [(name, name, df.loc[df['ethnicity'] == code]) for code, name in i2e.items()]

for title, stem, subset in groups:
    if len(subset) == 0:
        # Avoid a division by zero when a group has no samples.
        print(f"No samples for {title}; skipping plot.")
        continue

    acc = round(sum(subset['gender_prediction'] == subset['gender_true'])/len(subset), 3)

    # Fix the label set so the matrix is always 2x2, even if a subset happens
    # to contain only one of the two classes.
    cm = confusion_matrix(subset["gender_true"], subset["gender_prediction"], labels=[0, 1])
    disp = ConfusionMatrixDisplay(cm)
    disp.plot()
    plt.title(f"{title}, acc={acc}")
    plt.savefig(f"{plot_dir}/{stem}.png")
    plt.show()

In [None]:
!mkdir /kaggle/working/balanced_plots

In [None]:
# Ethnicity distribution of the training images.
# NOTE(review): this cell rebinds `src`, `results_dict` and `df`, replacing the
# test-set results built earlier in the notebook.
src = "/kaggle/input/sorted-heysem-dataset/sorted_data/train/"
train_women_paths = [os.path.join(src, "woman/", name) for name in os.listdir(src + "woman/")]
train_men_paths = [os.path.join(src, "man/", name) for name in os.listdir(src + "man/")]

# Maps the name extracted from each file path to (ethnicity, gender) taken
# from the annotation table; women are processed first, then men.
results_dict = {}
for path in train_women_paths + train_men_paths:
    # Characters [-23:-4] of the path are used as the lookup key into the
    # VideoName column (the last 4 characters are dropped).
    video_name = path[-23:-4]
    match = meta_data[meta_data["VideoName"] == video_name]

    # First matching annotation row; raises IndexError if there is none.
    ethnicity, gender = match[["Ethnicity", "Gender"]].values[0]
    results_dict[video_name] = (ethnicity, gender)

# Keys become columns, so transpose to get one row per key.
df = pd.DataFrame(results_dict).transpose()
df.columns = ["ethnicity", "gender_true"]

df['ethnicity'].value_counts()

In [None]:
# Ethnicity code -> readable name.
i2e = {
    1: "Asian",
    2: "Caucasian",
    3: "African-American",
}

## Archive
Storing some code snippets that we might still use.

In [None]:
# TO PLOT IMAGE
# count = 0 
# # for batch in all_ds[2]:
# for batch in b:

#     count += 1
#     img = batch[0].numpy()#.astype("uint8")
#     plt.imshow(img)
#     plt.show()
    
#     if count > 10: break
        

        # output_path = '/kaggle/working/output'

# checkpoint = ModelCheckpoint(filepath=output_path, 
#                              monitor='val_acc', 
#                              verbose=1, 
#                              save_best_only=True, 
#                              save_weights_only=False, 
#                              mode='auto', 
#                              period=1)


# test_ds = tf.keras.utils.image_dataset_from_directory(
#     "/kaggle/input/sorted-heysem-dataset/sorted_data/test",
#     label_mode='binary',
#     seed=SEED,
#     image_size=(IMG_SIZE, IMG_SIZE),
#     batch_size=None,
# #     batch_size=BATCH_SIZE
# )
# test_ds = prepare_data(test_ds, training=False)

# preds = model.predict(test_ds) >= 0.5 # because sigmoid output of one node
# preds = preds.astype(int)
    