<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Retrieve-Movie-Frames" data-toc-modified-id="Retrieve-Movie-Frames-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Retrieve Movie Frames</a></span></li><li><span><a href="#Extracting-Faces" data-toc-modified-id="Extracting-Faces-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Extracting Faces</a></span></li><li><span><a href="#Classify-Each-face" data-toc-modified-id="Classify-Each-face-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Classify Each face</a></span></li><li><span><a href="#Generate-Overview-for-one-Movie" data-toc-modified-id="Generate-Overview-for-one-Movie-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Generate Overview for one Movie</a></span></li></ul></div>

In [1]:
import sys
sys.path.insert(0,'..')

In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import mtcnn

from cv2 import imread
from cv2 import CascadeClassifier

In [2]:
from diversity_in_cinema.cnn_model import predict_face
from diversity_in_cinema.data import get_images
from diversity_in_cinema.scraper import get_movies

# Retrieve Movie Frames

In [107]:
# choose movie name
movie_list = list(get_movies().keys())
movie_list[420]

'Man of Steel (2013)'

In [108]:
df_movie = get_images(movie_list[420], frame_interval=4)

In [110]:
df_movie.head()

Unnamed: 0,title,frame_no,img_array
0,Man of Steel (2013),1,"[[[98, 91, 72], [96, 90, 68], [94, 88, 66], [9..."
1,Man of Steel (2013),5,"[[[80, 83, 66], [82, 85, 68], [84, 87, 68], [8..."
2,Man of Steel (2013),9,"[[[67, 71, 57], [66, 70, 56], [69, 73, 59], [6..."
3,Man of Steel (2013),13,"[[[53, 57, 43], [54, 58, 44], [47, 51, 37], [4..."
4,Man of Steel (2013),17,"[[[2, 2, 2], [2, 2, 2], [2, 2, 2], [2, 2, 2], ..."


In [115]:
df_movie.to_csv("man_of_steel_frames.csv")

# Extracting Faces

In [145]:
def extract_face_mtcnn(dataframe):
    
    """
    Function which given a dataset of frame numbers and frame arrays
    returns a dataframe of faces detected in each frame
    """

    detector = mtcnn.MTCNN()   
    
    face_dict = {}
    
    print("Extracting faces...")
    
    for frames, frame_id in zip(dataframe["img_array"], tqdm(dataframe["frame_no"])):
        # print bounding box for each detected face
        bboxes = detector.detect_faces(frames)
        
        faces_list = []
        
        if len(bboxes) > 0:
            
            # extract all faces in image
            for box in bboxes:
                
                x1, y1, width, height = box["box"]
        
                if (width * height) >= 10_000:
                    x2, y2 = x1 + width, y1 + height
                    face = frames[y1:y2, x1:x2]

                    # resize
                    faces_list.append(face)

        face_dict[frame_id] = faces_list
        
        faces_df = pd.DataFrame(data={"frame":list(face_dict.keys()),
                                      "faces": list(face_dict.values())})
        
    return faces_df

In [146]:
def extract_face_opencv(dataframe):
    
    """
    Function which given a dataset of frame numbers and frame arrays
    returns a dataframe of faces detected in each frame
    """
    

    classifier = CascadeClassifier('haarcascade_frontalface_default.xml')
        
    face_dict = {}
    
    print("Extracting faces...")
    
    for frames, frame_id in zip(dataframe["img_array"], tqdm(dataframe["frame_no"])):
        
        # perform face detection
        bboxes = classifier.detectMultiScale(np.array(frames, dtype='uint8'))
        
        # print bounding box for each detected face

        faces_list = []
        if len(bboxes) > 0:
            # extract all faces in image
            for box in bboxes:

                x1, y1, width, height = box

                if (width * height) >= 10_000:
                    x2, y2 = x1 + width, y1 + height
                    face = frames[y1:y2, x1:x2]

                    # resize
                    faces_list.append(face)

        face_dict[frame_id] = faces_list
        
        faces_df = pd.DataFrame(data={"frame":list(face_dict.keys()),
                                      "faces": list(face_dict.values())})
        
        return faces_df

# Classify Each face 

In [147]:
# every rwo is a frame and every frame has n faces

def classify_faces(dataframe):
    
    """
    A function which takes in a dataframe of frame number and the
    extracted face images in each frame as an array
    
    """
    
    df_list = []
    i = 0
    for frame, faces in zip(dataframe["frame"], tqdm(dataframe["faces"])):

        frame_list = []
        gender_list = []
        race_list = []
        face_id_list = []

        results = predict_face(faces)

        for faces in results.values():
            gender = faces["gender"]
            race = faces["dominant_race"]

            frame_list.append(frame)
            gender_list.append(gender)
            race_list.append(race)

            face_id_list.append(i)

            i += 1

        df = pd.DataFrame(data={"frame_number":frame_list,
                                "face_id":face_id_list,
                                "gender":gender_list,
                                "race":race_list})
        df_list.append(df) 
        
    return pd.concat(df_list)

# Generate Overview for one Movie

In [None]:
faces_df = extract_face_mtcnn(df_movie)
faces_df

Extracting faces...


  2%|▊                                      | 83/3951 [01:17<1:00:16,  1.07it/s]

In [89]:
df_classified = classify_faces(faces_df)

In [91]:
df_classified["gender"].value_counts()

Man      954
Woman    222
Name: gender, dtype: int64

In [92]:
df_classifyed["race"].value_counts()

white              800
asian              103
latino hispanic     98
middle eastern      96
black               72
indian               7
Name: race, dtype: int64

In [95]:
df_classifyed[df_classifyed["race"] == "white"]

Unnamed: 0,frame_number,face_id,gender,race
0,31.0,0.0,Man,white
0,51.0,1.0,Man,white
0,131.0,7.0,Man,white
2,141.0,10.0,Man,white
3,141.0,11.0,Man,white
...,...,...,...,...
1,9731.0,1166.0,Woman,white
4,9731.0,1169.0,Man,white
5,9731.0,1170.0,Man,white
2,9741.0,1173.0,Woman,white


In [94]:
df_classifyed[df_classifyed["frame_number"] == 2221]

Unnamed: 0,frame_number,face_id,gender,race
0,2221.0,212.0,Man,white
1,2221.0,213.0,Man,indian
2,2221.0,214.0,Man,white
