# Face recognition using neural network features

In this task, you have to construct a face recognizer based on features extracted from a neural network. The task consists of two parts: image classification and video classification. In the first part you classify individual images, and in the second you deal with short video sequences.

In [None]:
#import os
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [None]:
from keras.models import Model, Sequential
from keras.layers import Flatten, Dense, Activation
from keras.layers import Convolution2D, MaxPooling2D
from keras import backend as K

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

In [None]:
%pylab inline
from matplotlib import pyplot as plt
import numpy as np
import cv2
import os
import cPickle as pickle
from copy import copy
from collections import Counter
from get_data import download, load_dataset, load_faces
from preprocessing import preprocess_images, preprocess_frames

In [None]:
def VGG_model(weight_path=None):
    """Build the VGG-style face network and optionally load its weights.

    The network takes 224x224x3 inputs and consists of five convolutional
    blocks (2, 2, 3, 3 and 3 conv layers with 64, 128, 256, 512 and 512
    filters, each block closed by 2x2 max pooling) followed by three dense
    layers of 4096, 4096 and 500 units, each with a ReLU activation layer.

    weight_path: path to a weights file, or None to keep the initial
        weights. A missing file is fetched with `download` before loading.

    Returns the Sequential model.
    """
    # (number of filters, number of conv layers) for blocks 1..5
    block_specs = [(64, 2), (128, 2), (256, 3), (512, 3), (512, 3)]
    # (dense layer name, units, name of the ReLU layer that follows it)
    # NOTE(review): the last activation is named 'prob' but is a ReLU, not a
    # softmax -- kept as is; confirm this matches the pretrained weights.
    dense_specs = [('fc6', 4096, 'fc6/relu'),
                   ('fc7', 4096, 'fc7/relu'),
                   ('fc8', 500, 'prob')]

    model = Sequential()

    for block_no, (n_filters, n_convs) in enumerate(block_specs, 1):
        for conv_no in range(1, n_convs + 1):
            conv_kwargs = {
                'activation': 'relu',
                'padding': 'same',
                'name': 'conv%d_%d' % (block_no, conv_no),
            }
            # Only the very first layer declares the input shape.
            if block_no == 1 and conv_no == 1:
                conv_kwargs['input_shape'] = (224, 224, 3)
            model.add(Convolution2D(n_filters, (3, 3), **conv_kwargs))
        model.add(MaxPooling2D((2, 2), strides=(2, 2), name='pool%d' % block_no))

    model.add(Flatten(name='flatten'))
    for dense_name, n_units, relu_name in dense_specs:
        model.add(Dense(n_units, name=dense_name))
        model.add(Activation('relu', name=relu_name))

    if weight_path is None:
        return model

    if not os.path.exists(weight_path):
        download(weight_path)
    model.load_weights(weight_path)
    return model

Loading the data for the first time can take a long time (especially the deep network weights), because the data is first downloaded from the Internet.

Here is the data you will work with. All the images contain a face with some background. 

In [None]:
# Load the still-image split and report how many samples each part holds.
(x_train, y_train, x_test, y_test) = load_dataset('images')
# Sorted distinct training labels; used later to map class indices to names.
classes = np.unique(y_train)
print(' '.join(['%d' % len(x_train), '\ttraining images']))
print(' '.join(['%d' % len(x_test), '\ttesting images']))

In [None]:
# Reference pictures of the people; top_5_visualization looks them up by class
# label (people_faces[classes[l]]) to draw the top-5 candidates.
people_faces = load_faces()

In [None]:
def visualize(data, labels, function = lambda x:x, n_cols = 5, n_rows=1):
    """Show a grid of randomly chosen samples with their labels as titles.

    data: indexable collection of images.
    labels: indexable collection of labels aligned with `data`.
    function: applied to each chosen sample before it is drawn
        (identity by default).
    n_cols, n_rows: grid size; n_cols * n_rows samples are drawn at random
        (with replacement, so a sample may appear more than once).
    """
    # Use plt.figure explicitly: a bare `figure` only exists when the
    # notebook namespace has been populated by `%pylab inline`.
    plt.figure(figsize=(3 * n_cols, 3 * n_rows))
    chosen = np.random.randint(len(data), size=n_cols * n_rows)
    for n, i in enumerate(chosen):
        plt.subplot(n_rows, n_cols, n + 1)
        plt.axis('off')
        plt.imshow(function(data[i]))
        plt.title(labels[i])
    plt.show()

This is what the data looks like.

In [None]:
# Draw one row of random raw training images, then one row of random raw
# testing images, each titled with its label.
visualize(x_train,y_train)
visualize(x_test,y_test)

You have to implement the preprocessing function in the cell below.
Given an image as input, this function should detect the face in it, find the facial keypoints, and then crop and normalize the image
according to these keypoints. The output should contain only the aligned face and should be a tensor of shape (1, 224, 224, 3).

In [None]:
def preprocess(img):
    """Turn one raw image into network input by delegating to preprocess_images.

    The actual work happens in the imported `preprocess_images` helper. Per
    the task description the result should be the aligned face as a tensor
    of shape (1, 224, 224, 3) -- confirm against that helper.
    """
    aligned_face = preprocess_images(img)
    return aligned_face

#### Visualization of preprocessing

In [None]:
# Show random training images after preprocessing; [0] takes the first entry
# of the preprocess() result so imshow receives a single image.
visualize(x_train,y_train, function = lambda x:preprocess(x)[0])

The neural network is already trained on another face dataset. You should use this network as a feature extractor to get descriptors of the faces. You can choose any hidden layer (or several layers) to extract features from, and any classification method.

In [None]:
# Build the network and load the weights from 'vgg_face_500.h5'
# (VGG_model downloads the file first when it is not present locally).
model = VGG_model('vgg_face_500.h5')

Here is an example of using the network as a feature extractor. The input tensor must have shape (n_images, 224, 224, 3), so you can pass several images at once and get their face descriptors of shape (n_images, n_components).

In [None]:
# Feature-extraction sub-models built so far: layer name -> (model, sub-model).
_layer_extractors = {}


def get_layer_output(images, layer = 'fc7'):
    """Return the activations of one layer of the global `model` for `images`.

    images: array of shape (n_images, 224, 224, 3).
    layer: name of the layer whose output is returned ('fc7' by default).

    Returns whatever `predict` of the truncated model yields for `images`.
    Raises AssertionError when `images` does not have the expected shape.
    """
    assert len(images.shape)==4, 'Wrong input dimentionality!'
    assert images.shape[1:]==(224,224,3), 'Wrong input shape!'

    # This function is called once per batch / per video, so the truncated
    # model is built once per layer and reused instead of being rebuilt on
    # every call. The source model is stored next to it so the entry is
    # refreshed if the global `model` is re-created.
    cached = _layer_extractors.get(layer)
    if cached is None or cached[0] is not model:
        extractor = Model(model.input, model.get_layer(layer).output)
        cached = (model, extractor)
        _layer_extractors[layer] = cached

    return cached[1].predict(images)

In [None]:
# Smoke test of the feature extractor: resize the first training image to the
# network input size, add a leading batch axis and print the descriptor shape.
img = cv2.resize(x_train[0], (224, 224))
img = img.reshape(1, 224, 224, 3)
out = get_layer_output(img)
print(out.shape)

You have to implement two functions in the cell below.
The function "classify" should return the name of the most probable person shown in the image, and
"predict_proba" should return the list of probabilities. Initially these functions return random results; you should change that.


In [None]:
#all training images preprocessed
train_out = preprocess(x_train[0])
for im in x_train[1:]:
    train_out = np.concatenate((train_out, preprocess(im)), axis = 0)    

In [None]:
import time

# Neural features of all training images, computed in batches of 100.
t0 = time.time()
batch_size = 100
feature_batches = []
n_processed = 0
# Stepping over the start offsets visits only non-empty batches, so no empty
# slice reaches the network when the image count is a multiple of the batch
# size, and no integer division is needed.
for start in range(0, len(train_out), batch_size):
    res = get_layer_output(train_out[start:start + batch_size])
    feature_batches.append(res)
    n_processed += len(res)
    print(n_processed)  # progress: number of images processed so far
# Single concatenation instead of re-copying the growing array per batch.
features = np.concatenate(feature_batches, axis=0)
print(features.shape)
print(time.time() - t0)

In [None]:
# Nearest-neighbour classifier over the training descriptors. With
# n_neighbors=1 each prediction is the label of the single closest training
# descriptor.
kNN = KNeighborsClassifier(n_neighbors=1)
kNN.fit(features, y_train)

In [None]:
#all testing images preprocessed
test_out = preprocess(x_test[0])
for im in x_test[1:]:
    test_out = np.concatenate((test_out, preprocess(im)), axis = 0)

In [None]:
#neural features of all testing images
t0 = time.time()
last = len(test_out)/100 + 1
test_features = get_layer_output(test_out[:100])
for i in range(1,last):
    res = get_layer_output(test_out[100*i:100*i+100])
    test_features = np.concatenate((test_features, res), axis = 0)
    print test_features.shape
print time.time()-t0

In [None]:
def classify(features):
    """Predict the person for one descriptor using the fitted `kNN`.

    features: a single feature vector; it is wrapped in a list so the
        classifier receives a one-sample batch.

    Returns the output of kNN.predict for that batch.
    NOTE(review): this is a length-1 array, not a bare label -- callers that
    need the name itself must take element [0].
    """
    one_sample_batch = [features]
    return kNN.predict(one_sample_batch)
    
def predict_proba(img, img_id=0):
    """Return the kNN class probabilities for one raw image.

    img: raw image; it is preprocessed and run through the network here.
    img_id: unused; accepted so the function matches the
        prediction_function(sample, index) call made by top_5_visualization.

    Returns the first (only) row of kNN.predict_proba for the image.
    """
    aligned_face = preprocess(img)
    descriptor = get_layer_output(aligned_face)
    class_probabilities = kNN.predict_proba(descriptor)
    return class_probabilities[0]

Let us check the accuracy of your classification. Sometimes it is more convenient to classify a block of images simultaneously, so you can change this script if you need to. In any case, you have to produce a list with the prediction for each of the testing images.

In [None]:
# Classify every testing descriptor, store the predictions and report accuracy.
# The loop variable is deliberately not called `features`: that name holds the
# training feature matrix, and overwriting it here would break a later re-run
# of the kNN fitting cell.
labels = [classify(test_feature) for test_feature in test_features]
# `with` guarantees the result file is flushed and closed.
with open('result_images.pickle', 'wb') as result_file:
    pickle.dump(labels, result_file)
print('Classification accuracy:\t%3f' % accuracy_score(labels, y_test))

### Visualization of the classification

In [None]:
def top_5_visualization(test_data, test_labels, classes,prediction_function, is_video = False, n_images = 3):
    """Plot random test samples next to the five most probable people.

    Each row shows the request sample followed by the reference pictures
    (from the global `people_faces`) of the highest-scoring classes; the
    picture of the true person gets a green frame when it is among them.

    test_data: indexable collection of images, or of frame sequences when
        `is_video` is true (then the first frame is shown as the request).
    test_labels: true labels, indexed like `test_data`.
    classes: class names; position l corresponds to entry l of the vector
        returned by `prediction_function`.
    prediction_function: called as prediction_function(sample, index); must
        return an array of per-class scores.
    is_video: whether the samples are frame sequences.
    n_images: number of random samples (rows) to draw.
    """
    # Use plt.figure explicitly: a bare `figure` only exists when the
    # notebook namespace has been populated by `%pylab inline`.
    plt.figure(figsize=(18, 10))
    for n, i in enumerate(np.random.randint(len(test_data), size=n_images)):
        plt.subplot(n_images, 6, 6 * n + 1)
        plt.axis('off')
        request = test_data[i][0] if is_video else test_data[i]
        plt.imshow(request)
        plt.title('Request')
        preds = prediction_function(test_data[i], i)
        # Indices of the (up to) five largest scores, best first.
        top_classes = preds.argsort()[-1:-6:-1]

        for j, l in enumerate(top_classes):
            plt.subplot(n_images, 6, 6 * n + j + 2)
            plt.axis('off')
            picture = copy(people_faces[classes[l]])
            plt.title('Top-%d' % (j + 1))
            if test_labels[i] == classes[l]:
                # OpenCV points are (x, y) = (column, row), while shape[:2]
                # is (rows, columns); swap them so the frame spans the whole
                # picture even when it is not square.
                height, width = picture.shape[:2]
                cv2.rectangle(picture, (0, 0), (width, height), (0, 250, 0), 15)
            plt.imshow(picture)

In [None]:
# Top-5 view for three random testing images, scored by predict_proba.
top_5_visualization(x_test, y_test, classes,prediction_function=predict_proba)

## Face recognition in video

Now you have to classify faces in video sequences. Each sequence contains about 125 frames with a face depicted in each frame. You should detect the face, find the keypoints and normalize the images as in the previous task (you can use the same preprocess function). To classify the whole video you can combine the predictions for its frames any way you want (averaging, voting, etc.).

Training data is in the same format as in the first task: separate images, each depicting a face. Testing data is a dictionary: the keys are video ids and the values are lists of frames.

In [None]:
# Load the video split and report how many samples each part holds.
(video_train, train_labels, video_test, test_labels) = load_dataset('video')
# Sorted distinct training labels; used later to map class indices to names.
video_classes = np.unique(train_labels)
print(' '.join(['%d' % len(video_train), '\ttraining images']))
print(' '.join(['%d' % len(video_test), '\ttesting videos']))

You have to implement two functions in the cell below.
The function "classify" should return the name of the most probable person in the video, and
"predict_proba" should return the list of probabilities. Initially these functions return random results; you should change that.

Hint: while preprocessing video frames, you do not have to run the face detector on every frame; you can run it every few frames and interpolate the face detections for the frames in between.

In [None]:
#all training images preprocessed
v_train_out = preprocess(video_train[0])
for im in video_train[1:]:
    v_train_out = np.concatenate((v_train_out, preprocess(im)), axis = 0)

In [None]:
#neural features for all training images
t0 = time.time()
last = len(v_train_out)/100 + 1
v_features = get_layer_output(v_train_out[:100])
for i in range(1,last):
    res = get_layer_output(v_train_out[100*i:100*i+100])
    v_features = np.concatenate((v_features, res), axis = 0)
    print v_features.shape
print time.time()-t0

In [None]:
# Scale every training descriptor to unit Euclidean length, then fit a
# 1-nearest-neighbour classifier (Minkowski power p=2) on the result.
descriptor_norms = np.linalg.norm(v_features, axis=1, keepdims=True)
v_features = v_features / descriptor_norms
neigh = KNeighborsClassifier(p=2, n_neighbors=1)
neigh.fit(v_features, train_labels)

In [None]:
#all frames of testing videos preprocessed
with open('frame_bboxes.pickle','r') as f:
    bboxes = pickle.load(f)
    
v_test_out = {}
for v in video_test:
    print v
    v_out = preprocess_frames(video_test[v][0],v,0, bboxes)
    for i in range(1,len(video_test[v])):
        v_out = np.concatenate((v_out, preprocess_frames(video_test[v][i],v,i, bboxes)), axis = 0)
    v_test_out[v] = v_out

In [None]:
#neural features of all the frames of all testing videos
t0=time.time()
video_descs = {}
for v in v_test_out:
    print v    
    fit_data = v_test_out[v]
    layer_outs = get_layer_output(fit_data)
    video_descs[v] = layer_outs/np.linalg.norm(layer_outs, axis=1, keepdims=True)
print time.time()-t0

In [None]:
def classify_video(video_features):
    """Label a video by majority vote over its per-frame predictions.

    video_features: one descriptor per frame, passed as a batch to the
        fitted `neigh` classifier.

    Returns the label predicted for the largest number of frames.
    """
    frame_votes = Counter(neigh.predict(video_features))
    winner, _ = frame_votes.most_common(1)[0]
    return winner
    
def predict_proba_video(video,video_id):
    """Return the share of frames voting for each class in one video.

    video: sequence of frames.
    video_id: id of the video; passed to preprocess_frames together with the
        frame index and the global `bboxes`.

    Returns a float array of length len(video_classes) whose entry k is the
    fraction of frames whose most probable class (per `neigh`) is k.
    """
    # Preprocess every frame, then stack once instead of growing the array
    # frame by frame.
    v_out = np.concatenate(
        [preprocess_frames(frame, video_id, i, bboxes)
         for i, frame in enumerate(video)],
        axis=0)
    v_desc = get_layer_output(v_out)
    # `neigh` is fitted on unit-length descriptors and classify_video is fed
    # unit-length descriptors, so queries are normalised the same way here.
    # For the current 1-NN on unit-length training data this does not change
    # the nearest neighbour; it keeps the pipeline consistent if k changes.
    v_desc = v_desc / np.linalg.norm(v_desc, axis=1, keepdims=True)
    prob = neigh.predict_proba(v_desc)

    # One vote per frame for its most probable class, counted in one pass.
    voting = np.argmax(prob, axis=1)
    vote_counts = np.bincount(voting, minlength=len(video_classes))
    return vote_counts / float(len(voting))

In [None]:
# Classify every testing video, store the predictions and report accuracy.
# Predictions are compared with test_labels position by position, so the
# videos are visited in sorted id order rather than in dict iteration order.
# NOTE(review): this assumes the video ids are the positions 0..n-1 in
# test_labels (top_5_visualization indexes both with the same integer) --
# confirm against load_dataset.
video_ids = sorted(video_descs)
video_labels = [classify_video(video_descs[video_id]) for video_id in video_ids]
# `with` guarantees the result file is flushed and closed.
with open('result_video.pickle', 'wb') as result_file:
    pickle.dump(video_labels, result_file)

print('Classification accuracy:\t%3f' % accuracy_score(video_labels, test_labels))

In [None]:
# Top-5 view for three random testing videos; is_video=True makes the first
# frame of each video the displayed request image.
top_5_visualization(video_test, test_labels, video_classes, predict_proba_video, is_video=True)