# Face Recognition Using mlrun with OpenCV And scikit-learn
 A complete pipeline of data processing, model training and serving function deployment

In [1]:
# nuclio: ignore
import nuclio

### Install mlrun and kubeflow pipelines

### Install dependencies for the code and set config 

After installing the dependencies locally, you may need to restart the Jupyter kernel before the packages can be imported successfully.

In [8]:
%%nuclio cmd
pip install mlrun
pip install kfp
pip install cmake
pip install dlib
pip install face_recognition
pip install opencv-contrib-python
pip install imutils
pip install scikit-learn
pip install pandas
pip install joblib
pip install v3io_frames



In [3]:
%nuclio config spec.build.baseImage = "python:3.6-jessie"

%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'


### Declare global variables and perform necessary imports 

In [2]:
# Root folder of the training dataset; encode_images reads the 'input' sub-folder under it
# (and populates it with a sample dataset when it does not exist).
DATA_PATH = '/User/demos/face-recognition/dataset/'
# Folder passed as out_path to the runs/pipeline steps; also where idx2name.csv is looked up.
ARTIFACTS_PATH = '/User/demos/face-recognition/artifacts/'

In [1]:
import os
import zipfile
from urllib.request import urlopen
from io import BytesIO
import cv2
import face_recognition
from imutils import paths
import joblib
from sklearn.linear_model import LogisticRegression
from mlrun.artifacts import TableArtifact
import pandas as pd
import numpy as np
import datetime
import random
import string
import v3io_frames as v3f

### Import and define mlrun functions for the pipeline 

In [3]:
# nuclio: ignore
from mlrun import new_function, code_to_function, NewTask, mount_v3io
import kfp
from kfp import dsl

In [22]:
def encode_images(context):
    """Encode the faces found in the labeled training images and store the result.

    Steps performed:
      1. If ``DATA_PATH + 'input'`` does not exist, download and extract a small
         sample dataset of movie actresses into it.
      2. Load the name-to-index mapping from ``ARTIFACTS_PATH + 'idx2name.csv'``
         (or start an empty one), append every new entry found in the input
         folder, and log it as the ``idx2name`` artifact.
      3. Detect faces in every image (HOG model) and compute their encodings;
         the label is the name of the folder that contains the image.
      4. Write encodings, labels and image details to the
         ``iguazio/demos/face-recognition/artifacts/encodings`` KV table and log
         a text file holding that table path as the ``encodings_path`` artifact.

    :param context: run context used to log the produced artifacts.
    """
    client = v3f.Client("framesd:8081", container="users")

    input_dir = DATA_PATH + 'input'
    encodings_table = 'iguazio/demos/face-recognition/artifacts/encodings'

    # If no train images exist in the predefined path we will train the model on a small dataset of movie actresses
    if not os.path.exists(input_dir):
        # Download and open the archive BEFORE creating the folder: otherwise a failed
        # download would leave an empty folder behind and every later run would skip
        # the download and train on nothing.
        with urlopen('https://iguazio-public.s3.amazonaws.com/roy-actresses/Actresses.zip') as resp:
            archive = BytesIO(resp.read())
        with zipfile.ZipFile(archive, 'r') as zip_ref:
            os.makedirs(input_dir)
            zip_ref.extractall(input_dir)

    # NOTE(review): the encodings table is not cleared before writing, and every row key
    # gets a fresh random suffix, so re-running this handler appears to add the same
    # images again under new keys - confirm whether the table should be deleted first.

    idx_file_path = ARTIFACTS_PATH + "idx2name.csv"
    if os.path.exists(idx_file_path):
        idx2name_df = pd.read_csv(idx_file_path)
    else:
        idx2name_df = pd.DataFrame(columns=['value', 'name'])

    # creates a mapping of classes (person's names) to target value;
    # only entries that are not already present in the stored mapping are added
    new_classes_names = [f for f in os.listdir(input_dir)
                         if '.ipynb' not in f and f not in idx2name_df['name'].values]

    initial_len = len(idx2name_df)
    final_len = initial_len + len(new_classes_names)
    for i in range(initial_len, final_len):
        idx2name_df.loc[i] = {'value': i, 'name': new_classes_names.pop()}

    # built before the 'value' column is converted to str below, so labels keep their original type
    name2idx = idx2name_df.set_index('name')['value'].to_dict()

    # log name to index mapping into mlrun context
    idx2name_df['value'] = idx2name_df['value'].astype('str')
    context.log_artifact(TableArtifact('idx2name', df=idx2name_df), src_path='idx2name.csv')

    # generates a list of paths to labeled images
    imagePaths = [f for f in paths.list_images(input_dir) if '.ipynb' not in f]
    knownEncodings = []
    knownLabels = []
    fileNames = []
    urls = []
    for (i, imagePath) in enumerate(imagePaths):
        print("[INFO] processing image {}/{}".format(i + 1, len(imagePaths)))
        # extracts label (person's name) of the image: the name of its parent folder
        name = imagePath.split(os.path.sep)[-2]

        image = cv2.imread(imagePath)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None rather than
            # raising; skip it instead of crashing in cvtColor.
            print("[WARN] could not read image {}, skipping".format(imagePath))
            continue

        # converts image format to RGB for compatibility with face_recognition library
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # detects coordinates of faces bounding boxes
        boxes = face_recognition.face_locations(rgb, model='hog')

        # computes embeddings for detected faces
        encodings = face_recognition.face_encodings(rgb, boxes)

        # this code assumes that a person's folder in the dataset does not contain an image
        # with a face other than their own: every detected face gets the folder's label
        for enc in encodings:
            # row key: label plus a random 5-character suffix
            file_name = name + '_' + ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
            knownEncodings.append(enc)
            knownLabels.append([name2idx[name]])
            fileNames.append(file_name)
            urls.append(imagePath)

    # saves computed encodings to avoid repeating computations
    df_x = pd.DataFrame(knownEncodings, columns=['c' + str(i).zfill(3) for i in range(128)]).reset_index(drop=True)
    df_y = pd.DataFrame(knownLabels, columns=['label']).reset_index(drop=True)
    # 'camera' keeps the placeholder value 'initial training'; 'time' and 'imgUrl' are overwritten below
    df_details = pd.DataFrame([['initial training'] * 3] * len(df_x), columns=['imgUrl', 'camera', 'time'])
    df_details['time'] = [datetime.datetime.utcnow()] * len(df_x)
    df_details['imgUrl'] = urls
    data_df = pd.concat([df_x, df_y, df_details], axis=1)
    data_df['fileName'] = fileNames

    client.write(backend='kv', table=encodings_table, dfs=data_df, index_cols=['fileName'])

    # the downstream step receives the table path through this small text artifact
    with open('encodings_path.txt', 'w+') as f:
        f.write(encodings_table)
    context.log_artifact('encodings_path', src_path=f.name)

In [25]:
def train(context, processed_data, model_name='model.bst'):
    """Train a logistic-regression face classifier on the stored encodings and log the model.

    :param context:        run context, used for logging and for registering the model artifact.
    :param processed_data: input item whose ``url`` points to a text file that contains the
                           path of the KV table holding the encodings.
    :param model_name:     file name the fitted model is dumped to before it is logged.
    """
    context.logger.info('Client')
    client = v3f.Client("framesd:8081", container="users")

    # The file content is used verbatim as a table name, so drop any surrounding
    # whitespace / trailing newline that would otherwise break the KV lookup.
    with open(processed_data.url, 'r') as f:
        table_path = f.read().strip()
    context.logger.info(f't: {table_path}')
    data_df = client.read(backend="kv", table=table_path, reset_index=False)

    # feature columns are named c000 ... c127
    feature_columns = ['c' + str(i).zfill(3) for i in range(128)]
    X_train = data_df[feature_columns].values
    y_train = data_df['label'].values

    # trains classifier
    model = LogisticRegression(multi_class='ovr', solver='lbfgs').fit(X_train, y_train)

    context.logger.info('Save model')
    # saves and logs model into mlrun context
    joblib.dump(model, model_name)
    context.log_artifact('model', src_path=model_name, labels={'framework': 'sklearn_classifier'})

In [17]:
# nuclio: end-code

In [7]:
# Serving function: created from the nuclio_face_prediction notebook with the 'nuclio' runtime.
# It is deployed later as the last step of the pipeline.
serving_function = code_to_function(
    name='recognize-faces',
    filename='./nuclio_face_prediction.ipynb',
    runtime='nuclio',
)
# HTTP trigger with 2 workers, plus the 'User' volume
serving_function.with_http(workers=2).add_volume('User', '~/')

<mlrun.runtimes.function.RemoteRuntime at 0x7fb05bade588>

### Test pipeline functions locally

In [23]:
# Run the encode_images handler locally; `run` is kept because the training task
# below consumes its 'encodings_path' output.
task = NewTask(handler=encode_images, out_path=ARTIFACTS_PATH)
run = new_function().run(task)

[mlrun] 2019-11-27 08:39:06,847 starting run encode_images uid=bddfab68ea20486698c8bc61f063be24  -> 
[INFO] processing image 1/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}
[INFO] processing image 2/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}
[INFO] processing image 3/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}
[INFO] processing image 4/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}
[INFO] processing image 5/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}
[INFO] processing image 6/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}
[INFO] processing image 7/7
{'Roy_Eisenstadt': 0, 'Avi_Asulin': 1}



uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...63be24,0,Nov 27 08:39:06,completed,encode_images,kind=handlerowner=iguaziohost=jupyter-23tddbwt3g-s1i3d-7bc94769b-c2mg5,,,,idx2nameencodings_path


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid bddfab68ea20486698c8bc61f063be24 
[mlrun] 2019-11-27 08:39:11,270 run executed, status=completed


In [24]:
# Run the train handler locally, feeding it the 'encodings_path' output of the previous run.
# The result is stored as `train_run` (not `train`) so the `train` handler function is not
# overwritten and this cell can be re-run safely.
task2 = NewTask(handler=train, inputs={'processed_data': run.outputs['encodings_path']}, out_path=ARTIFACTS_PATH)
train_run = new_function().run(task2)

[mlrun] 2019-11-27 08:40:07,305 starting run train uid=bf0a21c5e35b4459aede8700fddcdede  -> 
[mlrun] 2019-11-27 08:40:07,326 Client
[mlrun] 2019-11-27 08:40:07,330 t: iguazio/demos/face-recognition/artifacts/encodings
[mlrun] 2019-11-27 08:40:07,428 Dataset:                           c000      c001      c002      c003     c004   c005  \
fileName                                                                       
Roy_Eisenstadt_I2X86 -0.136545  0.102084  0.085755 -0.049162 -0.08464  0.028   

                          c006      c007      c008      c009  ...      c122  \
fileName                                                      ...             
Roy_Eisenstadt_I2X86 -0.074648  0.021552  0.161916 -0.024088  ... -0.109444   

                          c123      c124      c125      c126      c127  \
fileName                                                                 
Roy_Eisenstadt_I2X86 -0.086187  0.069905 -0.044374  0.092996  0.012606   

                                camera 

uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...dcdede,0,Nov 27 08:40:07,completed,train,kind=handlerowner=iguaziohost=jupyter-23tddbwt3g-s1i3d-7bc94769b-c2mg5,processed_data,,,model


type result.show() to see detailed results/progress or use CLI:
!mlrun get run --uid bf0a21c5e35b4459aede8700fddcdede 
[mlrun] 2019-11-27 08:40:07,463 run executed, status=completed


### Create a function from notebook and build image
Building the image is expected to take a few minutes.

In [26]:
# Create the 'face-recognition' function object; its handlers are used as pipeline steps below.
fn = code_to_function('face-recognition')

In [27]:
# Build the function's container image under the tag mlrun/face_recognition:latest.
fn.build(image='mlrun/face_recognition:latest')

[mlrun] 2019-11-27 08:42:04,930 building image (mlrun/face_recognition:latest)
FROM python:3.6-jessie
WORKDIR /run
RUN pip install mlrun
RUN pip install kfp
RUN pip install cmake
RUN pip install dlib
RUN pip install face_recognition
RUN pip install opencv-contrib-python
RUN pip install imutils
RUN pip install sklearn
RUN pip install pandas
RUN pip install joblib
RUN pip install v3io_frames
RUN pip install mlrun
ENV PYTHONPATH /run
[mlrun] 2019-11-27 08:42:04,931 using in-cluster config.
[mlrun] 2019-11-27 08:42:04,952 Pod mlrun-build-l9sjr created
...
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Resolved base name python:3.6-jessie to python:3.6-jessie 
[36mINFO[0m[0000] Downloading base image python:3.6-jessie     
[36mINFO[0m[0000] Error while retrieving image from cache: getting file info: stat /cache/sha256:0318d80cb241983eda20b905d77fa0bfb06e29e5aabf075c7941ea687f1c125a: no such file or directory 
[36mINFO[0m[0000] Downlo

<mlrun.runtimes.local.LocalRuntime at 0x7fb058336cf8>

### Create pipeline

In [28]:
@dsl.pipeline(
    name='face recognition pipeline',
    description='Creates and deploys a face recognition model'
)
def face_recognition_pipeline():
    # Pipeline layout: encode-images -> train -> deploy of the serving function.
    fn.with_code()

    # Step 1: compute the face encodings; exposes 'idx2name' and 'encodings_path'.
    encode_step = fn.as_step(
        name='encode-images',
        handler='encode_images',
        out_path=ARTIFACTS_PATH,
        outputs=['idx2name', 'encodings_path'],
    ).apply(mount_v3io())

    # Step 2: train on the table referenced by the encode step's 'encodings_path' output.
    train_step = fn.as_step(
        name='train',
        handler='train',
        out_path=ARTIFACTS_PATH,
        outputs=['model'],
        inputs={'processed_data': encode_step.outputs['encodings_path']},
    ).apply(mount_v3io())

    # Step 3: deploy the serving function with the trained model registered as 'face_rec_v1'.
    serving_function.deploy_step(
        project='default',
        models={'face_rec_v1': train_step.outputs['model']},
    )
    

In [29]:
# Kubeflow Pipelines client for the 'default-tenant' namespace; used below to submit the run.
client = kfp.Client(namespace='default-tenant')

In [30]:
# For debugging purposes: compile the pipeline into face_rec.yaml so the generated
# definition can be inspected. Submitting the run below does not use this file.
kfp.compiler.Compiler().compile(face_recognition_pipeline, 'face_rec.yaml')

### Run pipeline

In [32]:
# The pipeline function takes no parameters, so no arguments are passed.
arguments = {}
# Submit the pipeline as run 'face_rec_1' under the 'face_rec' experiment.
run_result = client.create_run_from_pipeline_func(face_recognition_pipeline, arguments=arguments, run_name='face_rec_1', experiment_name='face_rec')