# Face Recognition Using mlrun with OpenCV And scikit-learn
 A complete pipeline of data processing, model training and serving function deployment.

### Install mlrun and kubeflow pipelines

In [1]:
# Install the mlrun package with pip through a shell escape.
# NOTE(review): no version is pinned, so the installed release depends on when this cell runs.
!pip install mlrun



### Restart jupyter kernel after initial installations

### Install dependencies for the code and set config 

After installing the dependencies locally, you may need to restart the Jupyter kernel before the packages can be imported successfully.

In [2]:
# nuclio: ignore
import nuclio

If the following packages are already installed locally, change the magic command below to `%%nuclio cmd -c`.

In [None]:
%%nuclio cmd
pip install cmake
pip install dlib
pip install face_recognition
pip install opencv-contrib-python
pip install imutils
pip install scikit-learn
pip install pandas
pip install joblib
pip install v3io_frames

In [None]:
# Base image used when the function image is built; the build log further down
# starts with "FROM python:3.6-jessie".
%nuclio config spec.build.baseImage = "python:3.6-jessie"

### Declare global variables and perform necessary imports 

In [3]:
# Root directory of the image dataset; the code below uses its 'input',
# 'processed' and 'label_pending' sub-directories.
DATA_PATH = '/User/demos/demos/faces/dataset/'
# Directory holding run artifacts (idx2name.csv is read from here; it is also used as out_path for runs).
ARTIFACTS_PATH = '/User/demos/demos/faces/artifacts/'

In [4]:
import os
import shutil
import zipfile
from urllib.request import urlopen
from io import BytesIO
import cv2
import face_recognition
from imutils import paths
import joblib
from sklearn.linear_model import LogisticRegression
from mlrun.artifacts import TableArtifact
import pandas as pd
import numpy as np
import datetime
import random
import string
import v3io_frames as v3f

### Import and define mlrun functions for the pipeline 

In [5]:
# nuclio: ignore
from mlrun import new_function, code_to_function, NewTask, mount_v3io
import kfp
from kfp import dsl

In [6]:
def encode_images(context):
    """Compute face encodings for the labeled images and store them in a KV table.

    Steps:
      1. Make sure the 'processed' and 'label_pending' directories exist under
         DATA_PATH. If DATA_PATH + 'input' does not exist, download and extract
         a small sample dataset into it.
      2. Extend the class-name -> index mapping (read from
         ARTIFACTS_PATH + 'idx2name.csv' when present) with every new
         sub-directory of 'input', and log it as the 'idx2name' artifact.
      3. For every image under 'input', detect faces, compute their encodings,
         and move the image to 'processed'.
      4. Write all encodings, labels and details to the KV table through the
         v3io frames client and log the table path as the 'encodings_path'
         artifact.

    Images that cannot be decoded, or whose parent folder is not a known class,
    are reported through the context logger and left in place.

    :param context: mlrun execution context, used for logging and artifacts.
    """
    client = v3f.Client("framesd:8081", container="users")
    encodings_table = 'iguazio/demos/demos/faces/artifacts/encodings'
    input_dir = DATA_PATH + 'input'

    os.makedirs(DATA_PATH + 'processed', exist_ok=True)
    os.makedirs(DATA_PATH + 'label_pending', exist_ok=True)

    # If no train images exist in the predefined path we will train the model on a small dataset of movie actresses
    if not os.path.exists(input_dir):
        # Download first: if the request fails, 'input' is not created and the
        # next run retries instead of finding an empty directory.
        with urlopen('https://iguazio-public.s3.amazonaws.com/roy-actresses/Actresses.zip') as resp:
            archive = BytesIO(resp.read())
        os.makedirs(input_dir)
        with zipfile.ZipFile(archive, 'r') as zip_ref:
            zip_ref.extractall(input_dir)

    if os.path.exists(input_dir + '/__MACOSX'):
        shutil.rmtree(input_dir + '/__MACOSX')

    idx_file_path = ARTIFACTS_PATH + "idx2name.csv"
    if os.path.exists(idx_file_path):
        idx2name_df = pd.read_csv(idx_file_path)
    else:
        idx2name_df = pd.DataFrame(columns=['value', 'name'])

    # creates a mapping of classes (person's names) to target value;
    # only directories count as classes, so stray files are not registered
    new_classes_names = [
        f for f in os.listdir(input_dir)
        if '.ipynb' not in f
        and os.path.isdir(os.path.join(input_dir, f))
        and f not in idx2name_df['name'].values
    ]

    initial_len = len(idx2name_df)
    final_len = initial_len + len(new_classes_names)
    for i in range(initial_len, final_len):
        idx2name_df.loc[i] = {'value': i, 'name': new_classes_names.pop()}

    name2idx = idx2name_df.set_index('name')['value'].to_dict()

    # log name to index mapping into mlrun context
    context.log_artifact(TableArtifact('idx2name', df=idx2name_df), src_path='idx2name.csv')

    # generates a list of paths to labeled images
    imagePaths = [f for f in paths.list_images(input_dir) if '.ipynb' not in f]
    knownEncodings = []
    knownLabels = []
    fileNames = []
    urls = []
    for (i, imagePath) in enumerate(imagePaths):
        print("[INFO] processing image {}/{}".format(i + 1, len(imagePaths)))
        # extracts label (person's name) of the image from its parent folder
        name = imagePath.split(os.path.sep)[-2]
        if name not in name2idx:
            context.logger.info('skipping {}: folder {} is not a known class'.format(imagePath, name))
            continue

        # prepares to relocate image after extracting features
        file_name = imagePath.split(os.path.sep)[-1]
        new_path = DATA_PATH + 'processed/' + file_name

        # cv2.imread returns None (it does not raise) when the file cannot be decoded
        image = cv2.imread(imagePath)
        if image is None:
            context.logger.info('skipping {}: image could not be read'.format(imagePath))
            continue

        # converts image format to RGB for compatibility with face_recognition library
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # detects coordinates of faces bounding boxes
        boxes = face_recognition.face_locations(rgb, model='hog')

        # computes embeddings for detected faces
        encodings = face_recognition.face_encodings(rgb, boxes)

        # this code assumes that a person's folder in the dataset does not contain an image with a face other than his own
        for enc in encodings:
            # each encoding gets its own key: the label plus a random 5-character suffix
            file_name = name + '_' + ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
            knownEncodings.append(enc)
            knownLabels.append([name2idx[name]])
            fileNames.append(file_name)
            urls.append(new_path)

        # move image to processed images directory
        shutil.move(imagePath, new_path)

    # saves computed encodings to avoid repeating computations
    df_x = pd.DataFrame(knownEncodings, columns=['c' + str(i).zfill(3) for i in range(128)]).reset_index(drop=True)
    df_y = pd.DataFrame(knownLabels, columns=['label']).reset_index(drop=True)
    df_details = pd.DataFrame([['initial training']*3]*len(df_x), columns=['imgUrl', 'camera', 'time'])
    df_details['time'] = [datetime.datetime.utcnow()]*len(df_x)
    df_details['imgUrl'] = urls
    data_df = pd.concat([df_x, df_y, df_details], axis=1)
    data_df['fileName'] = fileNames

    client.write(backend='kv', table=encodings_table, dfs=data_df, index_cols=['fileName'])

    # the logged artifact is a small text file whose content is the table path
    with open('encodings_path.txt', 'w+') as f:
        f.write(encodings_table)
    context.log_artifact('encodings_path', src_path=f.name)
    os.remove('encodings_path.txt')

In [7]:
def train(context, processed_data, model_name='model.bst'):
    """Fit a logistic-regression classifier on the stored encodings and log it.

    :param context:        mlrun execution context, used for logging and artifacts.
    :param processed_data: input whose ``url`` points to a text file containing
                           the path of the KV table that holds the encodings.
    :param model_name:     local file name the fitted model is dumped to before
                           it is logged as the 'model' artifact.
    """
    context.logger.info('Client')
    frames_client = v3f.Client("framesd:8081", container="users" )

    # The input file holds nothing but the table path.
    with open(processed_data.url, 'r') as path_file:
        table_path = path_file.read()

    encodings_df = frames_client.read(backend="kv", table=table_path, reset_index=False)

    # Feature columns are named c000 .. c127; the target column is 'label'.
    feature_columns = ['c' + str(idx).zfill(3) for idx in range(128)]
    features = encodings_df[feature_columns].values
    targets = encodings_df['label'].values

    classifier = LogisticRegression(multi_class='ovr', solver='lbfgs')
    classifier.fit(features, targets)

    context.logger.info('Save model')
    # Dump the model locally, log it into the mlrun context, then remove the local copy.
    joblib.dump(classifier, model_name)
    context.log_artifact('model', src_path=model_name, labels={'framework': 'sklearn_classifier'})
    os.remove(model_name)

In [7]:
# nuclio: end-code

In [9]:
# Serving function: a nuclio function created from the code in
# ./nuclio-face-prediction.ipynb, configured with an HTTP trigger (2 workers)
# and with the v3io mount applied.
serving_function = code_to_function(name='recognize-faces', 
                                      filename='./nuclio-face-prediction.ipynb',
                                      runtime='nuclio')
serving_function.with_http(workers=2).apply(mount_v3io())

<mlrun.runtimes.function.RemoteRuntime at 0x7f59c04285f8>

### Test pipeline functions locally

In [10]:
# Run the encode_images handler in a local function; artifacts go to ARTIFACTS_PATH.
# The run's outputs are used as the input of the training task below.
task = NewTask(handler=encode_images, out_path=ARTIFACTS_PATH)
run = new_function().run(task)

[mlrun] 2020-02-09 11:29:30,864 log artifact idx2name at /User/demos/demos/faces/artifacts/idx2name.csv, size: 98, db: N
[INFO] processing image 1/99
[INFO] processing image 2/99
[INFO] processing image 3/99
[INFO] processing image 4/99
[INFO] processing image 5/99
[INFO] processing image 6/99
[INFO] processing image 7/99
[INFO] processing image 8/99
[INFO] processing image 9/99
[INFO] processing image 10/99
[INFO] processing image 11/99
[INFO] processing image 12/99
[INFO] processing image 13/99
[INFO] processing image 14/99
[INFO] processing image 15/99
[INFO] processing image 16/99
[INFO] processing image 17/99
[INFO] processing image 18/99
[INFO] processing image 19/99
[INFO] processing image 20/99
[INFO] processing image 21/99
[INFO] processing image 22/99
[INFO] processing image 23/99
[INFO] processing image 24/99
[INFO] processing image 25/99
[INFO] processing image 26/99
[INFO] processing image 27/99
[INFO] processing image 28/99
[INFO] processing image 29/99
[INFO] processing 

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...699838,0,Feb 09 11:29:28,completed,encode_images,host=jupyter-xi2ncuukdq-2the1-669987df4f-wvhvc,,,,idx2nameencodings_path


to track results use .show() or .logs() or in CLI: 
!mlrun get run a99fd36e64c54a0b891d616aa0699838  , !mlrun logs a99fd36e64c54a0b891d616aa0699838 
[mlrun] 2020-02-09 11:32:07,065 run executed, status=completed


In [11]:
# Run the train handler locally, feeding it the 'encodings_path' output of the
# encode_images run. The result is stored under its own name so the `train`
# handler function is not overwritten and this cell can be re-run.
task2 = NewTask(handler=train, inputs={'processed_data': run.outputs['encodings_path']}, out_path=ARTIFACTS_PATH)
train_run = new_function().run(task2)

[mlrun] 2020-02-09 11:32:20,885 Client
[mlrun] 2020-02-09 11:32:21,831 Save model
[mlrun] 2020-02-09 11:32:21,842 log artifact model at /User/demos/demos/faces/artifacts/model.bst, size: 6053, db: N



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...621fd5,0,Feb 09 11:32:20,completed,train,host=jupyter-xi2ncuukdq-2the1-669987df4f-wvhvc,processed_data,,,model


to track results use .show() or .logs() or in CLI: 
!mlrun get run f2f6fb1ce0dd43cd87502601b0621fd5  , !mlrun logs f2f6fb1ce0dd43cd87502601b0621fd5 
[mlrun] 2020-02-09 11:32:21,859 run executed, status=completed


### Create a function from notebook and build image
Building the image is expected to take a few minutes.

In [14]:
# Create an mlrun 'job' runtime function named 'face-recognition'.
# NOTE(review): no filename is passed, so this presumably converts the current notebook's code — confirm.
fn = code_to_function('face-recognition', runtime='job')

In [15]:
# Build the function image; the log below lists the generated build steps
# (base image plus the pip install commands).
fn.deploy()

[mlrun] 2020-02-09 11:40:35,870 database connection is not configured
[mlrun] 2020-02-09 11:40:35,871 building image (.mlrun/func-default-face-recognition-latest)
FROM python:3.6-jessie
RUN pip install cmake
RUN pip install dlib
RUN pip install face_recognition
RUN pip install opencv-contrib-python
RUN pip install imutils
RUN pip install sklearn
RUN pip install pandas
RUN pip install joblib
RUN pip install v3io_frames
RUN pip install mlrun

[mlrun] 2020-02-09 11:40:35,873 using in-cluster config.
[mlrun] 2020-02-09 11:40:35,914 Pod mlrun-build-face-recognition-b9zs2 created
..
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 


True

In [16]:
# Set the mlrun database path to the mlrun API service and apply the v3io mount to the job function.
# NOTE(review): the deploy log above reports "database connection is not configured",
# so this is set only after fn.deploy() has run — confirm that ordering is intended.
from mlrun import mlconf
mlconf.dbpath = 'http://mlrun-api:8080'
fn.apply(mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f59fc0c1cc0>

### Create pipeline

In [17]:
@dsl.pipeline(
    name='face recognition pipeline',
    description='Creates and deploys a face recognition model'
)
def face_recognition_pipeline():
    # Pipeline of three steps: encode images -> train the model -> deploy the serving function.
    # NOTE(review): with_code() presumably embeds the function source in its spec — confirm against mlrun docs.
    fn.with_code()

    # Step 1: compute face encodings; exposes the 'idx2name' and 'encodings_path' outputs.
    encode_step = fn.as_step(
        name='encode-images',
        handler='encode_images',
        out_path=ARTIFACTS_PATH,
        outputs=['idx2name', 'encodings_path'],
    ).apply(mount_v3io())

    # Step 2: train on the encodings table referenced by the previous step's output.
    train_inputs = {'processed_data': encode_step.outputs['encodings_path']}
    train_step = fn.as_step(
        name='train',
        handler='train',
        out_path=ARTIFACTS_PATH,
        outputs=['model'],
        inputs=train_inputs,
    ).apply(mount_v3io())

    # Step 3: deploy the serving function with the trained model registered as 'face_rec_v1'.
    served_models = {'face_rec_v1': train_step.outputs['model']}
    deploy_step = serving_function.deploy_step(project='default', models=served_models)
    

In [18]:
# Kubeflow Pipelines client for the 'default-tenant' namespace; used below to submit the pipeline run.
client = kfp.Client(namespace='default-tenant')

In [19]:
# For debugging purposes: compile the pipeline function into 'face_rec.yaml'.
kfp.compiler.Compiler().compile(face_recognition_pipeline, 'face_rec.yaml')

### Run pipeline

In [21]:
# The pipeline function takes no parameters, so the arguments dict is empty.
arguments = {}
# Submit the pipeline as run 'face_rec_1' under the 'face_rec' experiment.
run_result = client.create_run_from_pipeline_func(face_recognition_pipeline, arguments=arguments, run_name='face_rec_1', experiment_name='face_rec')