<a href="https://colab.research.google.com/github/notsrujangupta/KYC-Verificaiton/blob/main/Face_Matching_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations and Imports


In [None]:
!pip install git+https://github.com/rcmalli/keras-vggface.git

In [None]:
pip install Keras-Applications

## Import Libraries


In [None]:
import numpy as np
import os

from matplotlib import pyplot
from numpy import asarray
from scipy.spatial.distance import cosine

import tensorflow
tensorflow.__version__
from tensorflow import keras

from keras.preprocessing import image
from keras_vggface.vggface import VGGFace
from keras_vggface import utils
from keras import optimizers
from keras.models import Model, Sequential
from keras.preprocessing.image import ImageDataGenerator

import random
random.seed(0)

In [None]:
class IdentityMetadata():
    """Locator for a single image stored as ``<base>/<name>/<file>``."""

    def __init__(self, base, name, file):
        # Keep the three path components separately so each stays accessible.
        self.base, self.name, self.file = base, name, file

    def image_path(self):
        """Return the three stored components joined into one filesystem path."""
        components = (self.base, self.name, self.file)
        return os.path.join(*components)

    def __repr__(self):
        # The printable form of an entry is simply its image path.
        return self.image_path()
    
def load_metadata(path):
    """Collect an IdentityMetadata entry for every JPEG found under ``path``.

    ``path`` is scanned one level deep: each immediate sub-directory is listed
    and every ``.jpg`` / ``.jpeg`` file inside it (any letter case) becomes one
    entry, i.e. the expected layout is ``<path>/<name>/<file>``.

    Args:
        path: Directory whose immediate sub-directories are scanned.

    Returns:
        A numpy array of IdentityMetadata objects, ordered by sub-directory
        name and then by file name.
    """
    metadata = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # stable between runs, which matters when folders are paired by position.
    for name in sorted(os.listdir(path)):
        identity_dir = os.path.join(path, name)
        # Stray files next to the sub-directories (e.g. .DS_Store) would make
        # the inner os.listdir raise NotADirectoryError, so skip them.
        if not os.path.isdir(identity_dir):
            continue
        for file in sorted(os.listdir(identity_dir)):
            # Case-insensitive comparison so that ".JPG"/".JPEG" are included.
            ext = os.path.splitext(file)[1].lower()
            if ext in ('.jpg', '.jpeg'):
                metadata.append(IdentityMetadata(path, name, file))
    return np.array(metadata)

def img_embd(img_path):
    """Load an image file and turn it into an input batch for the VGGFace model.

    The image is resized to 224x224, converted to a float32 array, given a
    leading batch axis of size 1 and passed through keras_vggface's
    ``preprocess_input`` (version 2).

    Args:
        img_path: Path of the image file to load.

    Returns:
        The preprocessed array with a leading batch axis of size 1.
    """
    img = image.load_img(img_path, target_size=(224, 224))
    embd_temp = image.img_to_array(img).astype(np.float32)
    embd_temp = np.expand_dims(embd_temp, axis=0)
    # preprocess_input subtracts per-channel means that are defined on the
    # 0-255 pixel scale, so the pixels must NOT be divided by 255 beforehand:
    # doing so makes every image collapse to (almost) the same negative constant.
    embd_temp = utils.preprocess_input(embd_temp, version=2)
    return embd_temp

def emb_vec(arr):
    """Return the first row of ``model.predict(arr)``.

    Uses the notebook-level ``model`` variable, which must be defined before
    this function is called.
    """
    predictions = model.predict(arr)
    return predictions[0]

def distance_from_embd(vec1,vec2):
    """Return the cosine distance (scipy's ``cosine``) between two vectors."""
    cosine_distance = cosine(vec1, vec2)
    return cosine_distance

def distance(path1, path2):
    """Return the cosine distance between the embeddings of two image files.

    Both images are first preprocessed with ``img_embd``; each result is then
    embedded with ``emb_vec`` and the two vectors are compared with ``cosine``.
    """
    # Preprocess both images before embedding either of them.
    inputs = [img_embd(img_path) for img_path in (path1, path2)]
    first_vec, second_vec = [emb_vec(batch) for batch in inputs]
    return cosine(first_vec, second_vec)

In [None]:
# Colab-only: mount Google Drive at /gdrive and make it the working directory.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

# Model/Data Work

## Defining ROOT_PATH and checking distance metric

In [None]:
# Fill these in before running the rest of the notebook.
# ROOT_PATH: root path of all image folders/files after data generation is complete
# (smile data with photos of photos of smile photos). It is combined with folder and
# file names by plain string concatenation, so it should end with a path separator.
ROOT_PATH = ""   # TODO: set the root path
file1_path = ""  # TODO: file path of the first test image (appended to ROOT_PATH)
file2_path = ""  # TODO: file path of the second test image (appended to ROOT_PATH)

In [None]:
# VGGFace ResNet-50 without its top layers and with average pooling; emb_vec reads
# this notebook-level `model` variable.
model = VGGFace(model='resnet50', include_top=False, input_shape=(224, 224, 3), pooling='avg')


In [None]:
# Sanity check of the distance metric on the two sample files configured above.
distance(f"{ROOT_PATH}{file1_path}", f"{ROOT_PATH}{file2_path}")

## Step 0: Initial Data Work

In [None]:
"""Put labelled data in 3 folders: "Correct", "Correct_fake" and "Incorrect" with phone images of people in the "Correct" folder and the images in "Correct_fake" such that images of images of people are in the corresponding order as their "Correct" values. All the other images go into "Incorrect". 
Corrupt files checkers: 
1. BadPeggy (Download: https://www.coderslagoon.com, Github Source Code: https://github.com/llaith-oss/BadPeggy) to check for corrupted image files in the folders easily. 
2. ImageMagick (Download: https://imagemagick.org/script/download.php)"""

## Step 1: Data Pre-Processing


### Creating imagelist, Visualisation

In [None]:
# One metadata array per labelled folder under ROOT_PATH.
meta_correct, meta_fake, meta_incorrect = (
    load_metadata(ROOT_PATH + folder)
    for folder in ("Correct", "Correct_fake", "Incorrect")
)

### Image Augmentation


In [None]:
# could hypothetically try to extract face with MTCNN, not sure if it's needed though. Perhaps it's relevant how the edges of the face meet the background?

def _preprocess_all(metadata):
    """Run img_embd on every metadata entry and return the results as a list.

    Each element keeps the leading batch axis added by img_embd, so it can be
    handed to emb_vec one item at a time.
    """
    return [img_embd(entry.image_path()) for entry in metadata]


array_correct = _preprocess_all(meta_correct)
np.savez(ROOT_PATH+"array_correct.npz", array_correct)

array_fake = _preprocess_all(meta_fake)
np.savez(ROOT_PATH+"array_fake.npz", array_fake)

array_incorrect = _preprocess_all(meta_incorrect)
np.savez(ROOT_PATH+"array_incorrect.npz", array_incorrect)


# Change n, m and re-run to preview a few images and get an idea of what may need to be done.
n = 3
m = 51
previews = [("Correct", meta_correct, n), ("Correct_fake", meta_fake, n), ("Incorrect", meta_incorrect, m)]
fig, axes = pyplot.subplots(1, len(previews), figsize=(12, 4))
for ax, (title, meta, index) in zip(axes, previews):
    ax.axis("off")
    if index < len(meta):
        # imshow needs pixel data, so load the file the metadata entry points to.
        ax.imshow(image.load_img(meta[index].image_path()))
        ax.set_title(f"{title}[{index}]")
    else:
        ax.set_title(f"{title}[{index}] (index out of range)")


#GANs could be pretty useful for the next part.
imgen = ImageDataGenerator(rotation_range=30, zoom_range=[0.2,0.5], width_shift_range=0.3, height_shift_range=0.3, shear_range=0.2, horizontal_flip=True, fill_mode="nearest", brightness_range = [0.1,1.0])

aug_root = ROOT_PATH+"Correct_Aug"
num = 0
for entry in meta_correct:
    # Augment the raw pixels rather than the img_embd output: the latter has
    # already been through preprocess_input and would be saved with distorted colours.
    raw_pixels = image.img_to_array(image.load_img(entry.image_path(), target_size=(224, 224)))
    # load_metadata expects <root>/<name>/<file>, so mirror the sub-directory layout.
    out_dir = os.path.join(aug_root, entry.name)
    os.makedirs(out_dir, exist_ok=True)
    aug_iter = imgen.flow(np.expand_dims(raw_pixels, axis=0), batch_size=1, save_to_dir=out_dir, save_prefix=f"{num}_image", save_format="jpg")
    # flow() only builds an iterator; an augmented image is generated and
    # written to save_to_dir when a batch is actually drawn from it.
    next(aug_iter)
    num += 1

# Note: re-running this cell adds further augmented files to Correct_Aug.
meta_aug = load_metadata(aug_root)
array_aug = _preprocess_all(meta_aug)

np.savez(ROOT_PATH+"array_aug.npz", array_aug)




## Step 2: pairplots, pairgrids

This section is entirely hypothetical. Depending on what other information we have about the data (IDs etc.), we could load it with the usual pd.read_csv, build pairplots/pairgrids, and see what new information we may or may not gather about specific situations — e.g. in a pairplot we could set the hue to the area of the face and see what happens. This is usually not going to be very worthwhile, but it just might be. Ultimately it is less important than the images themselves, since the primary work here is with those images.

## Step 3: Model + Training

In [None]:

model = VGGFace(model='resnet50', include_top=False, input_shape=(224, 224, 3), pooling='avg')


def _embed_all(arrays):
    """Embed every preprocessed image batch in ``arrays`` with emb_vec.

    An item whose embedding fails is reported and replaced by a zero vector so
    that positions stay aligned with the input. The zero vector has the same
    length as the model output, so the result is always rectangular.

    Returns:
        A float32 array with one row per input item.
    """
    embedding_dim = model.output_shape[-1]
    embeddings = []
    for index, img in enumerate(arrays):
        try:
            embedding_vector = emb_vec(img)
        except Exception as err:
            # Best effort: keep going, but make the failure visible.
            print(f"Embedding failed for item {index}: {err!r}; using a zero vector")
            embedding_vector = np.zeros(embedding_dim, dtype=np.float32)
        embeddings.append(embedding_vector)
    # reshape keeps an empty input two-dimensional: (0, embedding_dim).
    return np.asarray(embeddings, dtype=np.float32).reshape(-1, embedding_dim)


embeddings_correct = _embed_all(array_correct)
np.savez(ROOT_PATH+"embeddings_correct.npz", embeddings_correct)

embeddings_fake = _embed_all(array_fake)
np.savez(ROOT_PATH+"embeddings_fake.npz", embeddings_fake)

embeddings_incorrect = _embed_all(array_incorrect)
np.savez(ROOT_PATH+"embeddings_incorrect.npz", embeddings_incorrect)

embeddings_aug = _embed_all(array_aug)
np.savez(ROOT_PATH+"embeddings_aug.npz", embeddings_aug)


Not sure how to figure out an appropriate threshold.

In [None]:
"""distance function gives us all the info we need about the distances between the images themselves. I'll probably need to work with real data before I can talk about something more specific
"""


In [None]:
# Stack the four embedding sets along the first axis: correct, fake, incorrect, augmented.
arr = np.concatenate([
    embeddings_correct,
    embeddings_fake,
    embeddings_incorrect,
    embeddings_aug,
])

In [None]:
# Square zero matrix with one row and one column per entry of arr.
dist_arr = np.zeros((len(arr),) * 2)

In [None]:
for i in range(len(arr)):
    for j in range(i):  # each unordered pair is computed only once to save time
        pair_distance = distance_from_embd(arr[i], arr[j])
        # Mirror the value: leaving the upper triangle at 0 would look like a
        # perfect match (distance 0) in any later threshold analysis.
        dist_arr[i, j] = pair_distance
        dist_arr[j, i] = pair_distance

In [None]:
# Show dist_arr as the cell output.
dist_arr

In [None]:
"""This give us all the distances from each image to the other. We can use this information to create a good threshold. I'm trying to avoid Deep Learning here because it feels like unnecessary computation, but if we had no option, we could always use it for finding a good threshold."""