In [1]:
import os
import cv2
import numpy as np
import glob
from tqdm import tqdm
import pandas as pd

In [39]:
# Model file locations and the label sets used by the predictors.
# NOTE(review): for gender and age the *_MODEL / *_PROTO names are swapped
# relative to their contents: GENDER_MODEL / AGE_MODEL hold the .prototxt
# and GENDER_PROTO / AGE_PROTO hold the .caffemodel. The loading cell passes
# them as (MODEL, PROTO), so the .prototxt still ends up as the first
# argument of cv2.dnn.readNetFromCaffe. FACE_PROTO / FACE_MODEL are named
# the conventional way round.
model_dir = '/g100_scratch/userexternal/pbose000/mentalism/img_prediction/cv_models'
GENDER_MODEL = os.path.join(model_dir, 'deploy_gender.prototxt')
GENDER_PROTO = os.path.join(model_dir,'gender_net.caffemodel')
AGE_MODEL = os.path.join(model_dir, 'deploy_age.prototxt')
AGE_PROTO = os.path.join(model_dir,'age_net.caffemodel')
FACE_PROTO = os.path.join(model_dir,'deploy.prototxt')
FACE_MODEL =  os.path.join(model_dir,'res10_300x300_ssd_iter_140000_fp16.caffemodel')

# Per-channel mean passed to cv2.dnn.blobFromImage for the age and gender nets.
MODEL_MEAN_VALUES = (78.4263377603, 87.7689143744, 114.895847746)

# Index i of the gender net output is reported as GENDER_LIST[i].
GENDER_LIST = ['Male', 'Female']
# Labels of the four merged age groups. The strings are used verbatim as
# labels and looked up with AGE_INTERVALS.index, so the inconsistent spacing
# in '(20,29)' must stay exactly as written.
AGE_INTERVALS = ['(0, 19)', '(20,29)', '(30, 39)', '(40, 100)']


In [3]:
# Collect the image file paths of every data split, keyed by split name.
img_dir = '/g100_work/IscrC_mental/data/user_classification/images/'
imgs = {
    split: glob.glob(os.path.join(img_dir, split) + '/*')
    for split in ('train', 'test', 'de')
}

In [4]:
# Load the three Caffe networks through OpenCV's dnn module.
# Face detector: network definition first, weights second.
face_net = cv2.dnn.readNetFromCaffe(FACE_PROTO, FACE_MODEL)
# Gender and age classifiers. NOTE(review): here the *_MODEL constants hold
# the .prototxt and the *_PROTO constants hold the .caffemodel, so the
# argument order below still passes the .prototxt first.
gender_net = cv2.dnn.readNetFromCaffe(GENDER_MODEL, GENDER_PROTO)
age_net = cv2.dnn.readNetFromCaffe(AGE_MODEL, AGE_PROTO)

In [5]:
# Frame size limits.
# frame_width is the maximum width: the predict_* functions downscale any
# image wider than this before face detection.
frame_width = 1280
# frame_height is only referenced by a commented-out cv2.resize call in
# predict_gender, so it currently has no effect.
frame_height = 720

In [6]:
def get_faces(frame, confidence_threshold=0.5):
    """Detect faces in an image with the global ``face_net`` detector.

    Args:
        frame: image array whose first two dimensions are (height, width),
            e.g. the result of ``cv2.imread``.
        confidence_threshold: only detections with a confidence strictly
            greater than this value are kept.

    Returns:
        A list of ``(start_x, start_y, end_x, end_y)`` integer pixel boxes.
        Each box is widened by 10 pixels on every side and clipped to the
        frame; boxes that have no area left after clipping are dropped, so
        every returned box yields a non-empty crop of ``frame``.
    """
    frame_h, frame_w = frame.shape[:2]
    # convert the frame into a blob to be ready for NN input
    blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300), (104, 177.0, 123.0))
    face_net.setInput(blob)
    # Each detection is a row of 7 values. Reshape instead of np.squeeze so
    # that a single detection still gives a 2-D (1, 7) array rather than a
    # 1-D array that cannot be indexed as [row, column].
    detections = np.asarray(face_net.forward()).reshape(-1, 7)
    # Box coordinates are relative; scale them to pixel coordinates.
    scale = np.array([frame_w, frame_h, frame_w, frame_h])
    margin = 10  # widen the box a little
    faces = []
    for detection in detections:
        confidence = detection[2]
        if not (confidence > confidence_threshold):
            continue
        start_x, start_y, end_x, end_y = (detection[3:7] * scale).astype(int)
        # Clip the widened box to the frame: lower bound 0 for the start
        # corner, frame size for the end corner.
        start_x = max(int(start_x) - margin, 0)
        start_y = max(int(start_y) - margin, 0)
        end_x = min(int(end_x) + margin, frame_w)
        end_y = min(int(end_y) + margin, frame_h)
        # A box lying outside the frame would give an empty crop, which the
        # classifiers cannot turn into a blob; skip it.
        if end_x <= start_x or end_y <= start_y:
            continue
        faces.append((start_x, start_y, end_x, end_y))
    return faces

In [7]:
def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    """Resize ``image`` to a target width or height, keeping its aspect ratio.

    If neither ``width`` nor ``height`` is given the image is returned
    unchanged. If ``width`` is given it determines the scale (``height`` is
    then ignored); otherwise ``height`` does. ``inter`` is forwarded to
    ``cv2.resize`` as the interpolation method.
    """
    src_h, src_w = image.shape[:2]

    # nothing requested -> hand back the original image
    if width is None and height is None:
        return image

    if width is not None:
        # scale by the requested width
        scale = width / float(src_w)
        target_size = (width, int(src_h * scale))
    else:
        # only the height was requested
        scale = height / float(src_h)
        target_size = (int(src_w * scale), height)

    return cv2.resize(image, target_size, interpolation = inter)

In [8]:
def predict_gender(input_path: str):
    """Predict the gender of the faces showing in the image.

    Args:
        input_path: path of the image file to classify.

    Returns:
        ``{'faces': n, 'pred': [label, score]}`` where ``n`` is the number of
        faces that were classified, ``label`` is an entry of ``GENDER_LIST``
        and ``score`` its confidence. With several faces the per-face results
        are combined by ``handle_multiple_results_gender``. When no face is
        found, or the image cannot be read, the fallback ``['Male', 0.5]`` is
        returned with ``faces == 0``.
    """
    img = cv2.imread(input_path)
    results = []
    if img is None:
        # cv2.imread signals an unreadable file by returning None; treat it
        # like an image without faces instead of failing on img.copy().
        import warnings
        warnings.warn(f"could not read image: {input_path}")
    else:
        frame = img
        # downscale very wide images before detection
        if frame.shape[1] > frame_width:
            frame = image_resize(frame, width=frame_width)
        for start_x, start_y, end_x, end_y in get_faces(frame):
            face_img = frame[start_y: end_y, start_x: end_x]
            if face_img.size == 0:
                # an empty crop cannot be converted into a blob
                continue
            # scalefactor=1.0 -> no scaling after mean subtraction;
            # size is the spatial input size given to the network;
            # mean is subtracted from every channel;
            # swapRB=False -> channels are passed in the order cv2.imread
            # produced them (presumably MODEL_MEAN_VALUES is given in that
            # same order -- TODO confirm).
            blob = cv2.dnn.blobFromImage(image=face_img, scalefactor=1.0, size=(
                227, 227), mean=MODEL_MEAN_VALUES, swapRB=False, crop=False)
            gender_net.setInput(blob)
            gender_preds = gender_net.forward()
            best = gender_preds[0].argmax()
            results.append([GENDER_LIST[best], gender_preds[0][best]])

    n_faces = len(results)
    if n_faces == 0:
        pred = ['Male', 0.5]
    elif n_faces > 1:
        pred = handle_multiple_results_gender(results)
    else:
        pred = results[0]
    return {'faces': n_faces, 'pred': pred}


def handle_multiple_results_gender(results):
    """Combine per-face gender results into a single prediction.

    Each entry of ``results`` is ``[label, score]``. The score is turned into
    a probability of 'Male' (``score`` for 'Male', ``1 - score`` otherwise),
    the probabilities are averaged, and ``['Male', p]`` is returned when the
    mean ``p`` exceeds 0.5, else ``['Female', 1 - p]``.
    """
    male_prob_total = 0
    for entry in results:
        label, score = entry[0], entry[1]
        male_prob_total += score if label == 'Male' else 1 - score
    mean_male_prob = male_prob_total / len(results)
    if mean_male_prob > 0.5:
        return ['Male', mean_male_prob]
    return ['Female', 1 - mean_male_prob]


In [9]:
def predict_age(input_path: str):
    """Predict the age interval of the faces showing in the image.

    Args:
        input_path: path of the image file to classify.

    Returns:
        ``{'faces': n, 'pred': [label, score]}`` where ``n`` is the number of
        faces that were classified, ``label`` is an entry of
        ``AGE_INTERVALS`` and ``score`` the score of that interval. With
        several faces the per-face scores are combined by
        ``handle_multiple_results_age``. When no face is found, or the image
        cannot be read, the fallback ``[AGE_INTERVALS[3], 0.25]`` is returned
        with ``faces == 0``.
    """
    img = cv2.imread(input_path)
    results = []
    if img is None:
        # cv2.imread signals an unreadable file by returning None; treat it
        # like an image without faces instead of failing on img.copy().
        import warnings
        warnings.warn(f"could not read image: {input_path}")
    else:
        frame = img
        # downscale very wide images before detection
        if frame.shape[1] > frame_width:
            frame = image_resize(frame, width=frame_width)
        for start_x, start_y, end_x, end_y in get_faces(frame):
            face_img = frame[start_y: end_y, start_x: end_x]
            if face_img.size == 0:
                # an empty crop cannot be converted into a blob
                continue
            blob = cv2.dnn.blobFromImage(
                image=face_img, scalefactor=1.0, size=(227, 227),
                mean=MODEL_MEAN_VALUES, swapRB=False
            )
            age_net.setInput(blob)
            # merge the network's outputs into the four AGE_INTERVALS groups
            results.append(convert_age_interval(age_net.forward()[0]))

    n_faces = len(results)
    if n_faces == 0:
        pred = [AGE_INTERVALS[3], 0.25]
    elif n_faces > 1:
        pred = handle_multiple_results_age(results)
    else:
        top = results[0].argmax()
        pred = [AGE_INTERVALS[top], results[0][top]]
    return {'faces': n_faces, 'pred': pred}


def handle_multiple_results_age(results):
    """Average per-face age scores and return the winning interval.

    ``results`` is a sequence of equally long score vectors, one per face.
    Returns ``[label, score]`` with the ``AGE_INTERVALS`` label that has the
    highest mean score and that mean score.
    """
    score_sum = np.zeros(len(results[0]))
    for face_scores in results:
        score_sum += face_scores
    mean_scores = score_sum / len(results)
    winner = mean_scores.argmax()
    return [AGE_INTERVALS[winner], mean_scores[winner]]


def convert_age_interval(age_preds):
    """Collapse eight age scores into four groups.

    Entries 0-3 are summed into the first group, entry 4 and entry 5 are kept
    as the second and third group, and entries 6-7 are summed into the last
    group. The result is returned as a numpy array of length four.
    (Presumably the groups line up with AGE_INTERVALS -- TODO confirm against
    the age network's class definition.)
    """
    first_group = age_preds[0] + age_preds[1] + age_preds[2] + age_preds[3]
    second_group = age_preds[4]
    third_group = age_preds[5]
    last_group = age_preds[6] + age_preds[7]
    return np.array([first_group, second_group, third_group, last_group])

In [10]:
def predict_age_all_classes(input_path: str):
    """Predict the scores of all four age intervals for the faces in the image.

    Args:
        input_path: path of the image file to classify.

    Returns:
        ``{'faces': n, 'pred': scores}`` where ``n`` is the number of faces
        that were classified and ``scores`` holds one value per entry of
        ``AGE_INTERVALS``. With several faces the per-face scores are
        averaged by ``handle_multiple_results_age_classes``. When no face is
        found, or the image cannot be read, the uniform fallback
        ``[0.25, 0.25, 0.25, 0.25]`` is returned with ``faces == 0``.
    """
    img = cv2.imread(input_path)
    results = []
    if img is None:
        # cv2.imread signals an unreadable file by returning None; treat it
        # like an image without faces instead of failing on img.copy().
        import warnings
        warnings.warn(f"could not read image: {input_path}")
    else:
        frame = img
        # downscale very wide images before detection
        if frame.shape[1] > frame_width:
            frame = image_resize(frame, width=frame_width)
        for start_x, start_y, end_x, end_y in get_faces(frame):
            face_img = frame[start_y: end_y, start_x: end_x]
            if face_img.size == 0:
                # an empty crop cannot be converted into a blob
                continue
            blob = cv2.dnn.blobFromImage(
                image=face_img, scalefactor=1.0, size=(227, 227),
                mean=MODEL_MEAN_VALUES, swapRB=False
            )
            age_net.setInput(blob)
            # merge the network's outputs into the four AGE_INTERVALS groups
            results.append(convert_age_interval(age_net.forward()[0]))

    n_faces = len(results)
    if n_faces == 0:
        pred = [0.25, 0.25, 0.25, 0.25]
    elif n_faces > 1:
        pred = handle_multiple_results_age_classes(results)
    else:
        pred = results[0]
    return {'faces': n_faces, 'pred': pred}


def handle_multiple_results_age_classes(results):
    """Return the element-wise mean of the per-face age score vectors.

    ``results`` is a non-empty sequence of equally long score vectors; the
    mean is returned as a numpy array of the same length.
    """
    score_sum = np.zeros(len(results[0]))
    for face_scores in results:
        score_sum += face_scores
    return score_sum / len(results)

In [13]:
# Run the three predictors on every test image, keyed by user id.
# The user id is the part of the file name before its first '.', as an int.
# Note: each predictor reads the image and runs face detection on its own,
# so every image is decoded and scanned three times.
predictions = {}
for image_path in tqdm(imgs['test']):
    file_name = image_path.split('/')[-1]
    user_id = int(file_name.split('.')[0])
    predictions[user_id] = {
        'gender': predict_gender(image_path),
        'age': predict_age(image_path),
        'age_classes': predict_age_all_classes(image_path),
    }

100%|██████████| 1138/1138 [02:59<00:00,  6.35it/s]


In [14]:
# Flatten the nested prediction dicts into one record per user.
preds = [
    {
        'user_id': user_id,
        'pred_faces': user_pred['gender']['faces'],
        'pred_gender_label': user_pred['gender']['pred'][0],
        'pred_gender_prob': user_pred['gender']['pred'][1],
        'pred_age_label': user_pred['age']['pred'][0],
        'pred_age_prob': user_pred['age']['pred'][1],
        'pred_age_0_19_prob': user_pred['age_classes']['pred'][0],
        'pred_age_20_29_prob': user_pred['age_classes']['pred'][1],
        'pred_age_30_39_prob': user_pred['age_classes']['pred'][2],
        'pred_age_40_100_prob': user_pred['age_classes']['pred'][3],
    }
    for user_id, user_pred in predictions.items()
]

In [15]:
#convert predictions to dataframe
df_preds = pd.DataFrame.from_records(preds)
df_preds['pred_is_male_label']=df_preds['pred_gender_label'].apply(lambda x: True if x=='Male' else False)

In [301]:
# Persist the test-set predictions so the evaluation below can be re-run
# without repeating the image inference.
df_preds.to_pickle('/g100_work/IscrC_mental/data/user_classification/trained_models/cv_models.pkl')

In [28]:
# Reload the saved test-set predictions (overwrites df_preds in memory).
# NOTE: unpickling executes code from the file; only load pickles you wrote.
df_preds = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/trained_models/cv_models.pkl')

In [29]:
#load test data
test_data = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/user_age_gender_location_test_set.pkl')

In [30]:
def convert_age_to_cat(age):
    """Map a numeric age to its AGE_INTERVALS label.

    Ages up to 19, 29 and 39 fall into the first three intervals; every
    larger age falls into the last one. The last branch no longer requires
    ``age >= 40``, so non-integer ages between 39 and 40 get a label instead
    of None (consistent with how e.g. 19.5 already maps to the second
    interval). A missing age (NaN) still yields None.
    """
    # NaN is the only value that differs from itself; it has no category.
    if age != age:
        return None
    if age<=19:
        return AGE_INTERVALS[0]
    if age<=29:
        return AGE_INTERVALS[1]
    if age<=39:
        return AGE_INTERVALS[2]
    return AGE_INTERVALS[3]

# Ground-truth age category of every labelled user.
test_data['age_cat'] = test_data['age'].apply(convert_age_to_cat)

In [31]:
# Attach the image-based predictions to the labelled users.
# how='inner' keeps only user_ids present in both frames; validate='1:1'
# makes pandas raise if user_id is duplicated on either side.
# NOTE: this overwrites test_data, so running the cell a second time merges
# the predictions into the already merged frame.
test_data = test_data.merge(df_preds,on='user_id',how='inner',validate='1:1')

In [32]:
#eval functions
from sklearn.metrics import accuracy_score, f1_score

In [36]:
# gender
# Row masks for the two face-count sub-samples.
at_least_one_face = test_data['pred_faces'] > 0
one_face = test_data['pred_faces'] == 1

# True / predicted "is male" flags as 0/1 for each sample.
y_test = test_data['is_male'].astype(int)
y_pred = test_data['pred_is_male_label'].astype(int)
y_test_n0 = test_data.loc[at_least_one_face, 'is_male'].astype(int)
y_pred_n0 = test_data.loc[at_least_one_face, 'pred_is_male_label'].astype(int)
y_test_1 = test_data.loc[one_face, 'is_male'].astype(int)
y_pred_1 = test_data.loc[one_face, 'pred_is_male_label'].astype(int)

separator = '-'*45
print('Gender prediction from images')
print(separator)
for sample_name, truth, guess in (
    ('All', y_test, y_pred),
    ('At least 1 face', y_test_n0, y_pred_n0),
    ('Exactly 1 face', y_test_1, y_pred_1),
):
    print(f"Sample: {sample_name}")
    print(f"Accuracy: {accuracy_score(truth,guess)}")
    print(f"F1: {f1_score(truth,guess,average='macro')}")
    print(separator)

Gender prediction from images
---------------------------------------------
Sample: All
Accuracy: 0.7598214285714285
F1: 0.7066038480003817
---------------------------------------------
Sample: At least 1 face
Accuracy: 0.7957124842370744
F1: 0.7717377398720682
---------------------------------------------
Sample: Exactly 1 face
Accuracy: 0.8060109289617486
F1: 0.7839307540721894
---------------------------------------------


In [34]:
def convert_age_cat(age_label):
    """Return the position of ``age_label`` within AGE_INTERVALS.

    Raises ValueError (from ``list.index``) for a label that is not one of
    the AGE_INTERVALS entries.
    """
    position = AGE_INTERVALS.index(age_label)
    return position

In [37]:
# age
# Row masks for the two face-count sub-samples.
at_least_one_face = test_data['pred_faces'] > 0
one_face = test_data['pred_faces'] == 1

# True / predicted age categories as AGE_INTERVALS indices for each sample.
y_test = test_data['age_cat'].apply(convert_age_cat)
y_pred = test_data['pred_age_label'].apply(convert_age_cat)
y_test_n0 = test_data.loc[at_least_one_face, 'age_cat'].apply(convert_age_cat)
y_pred_n0 = test_data.loc[at_least_one_face, 'pred_age_label'].apply(convert_age_cat)
y_test_1 = test_data.loc[one_face, 'age_cat'].apply(convert_age_cat)
y_pred_1 = test_data.loc[one_face, 'pred_age_label'].apply(convert_age_cat)

separator = '-'*45
print('Age prediction from images')
print(separator)
for sample_name, truth, guess in (
    ('All', y_test, y_pred),
    ('At least 1 face', y_test_n0, y_pred_n0),
    ('Exactly 1 face', y_test_1, y_pred_1),
):
    print(f"Sample: {sample_name}")
    print(f"Accuracy: {accuracy_score(truth,guess)}")
    print(f"F1: {f1_score(truth,guess,average='macro')}")
    print(separator)

Age prediction from images
---------------------------------------------
Sample: All
Accuracy: 0.3517857142857143
F1: 0.28103526583998184
---------------------------------------------
Sample: At least 1 face
Accuracy: 0.23203026481715006
F1: 0.20970163939312067
---------------------------------------------
Sample: Exactly 1 face
Accuracy: 0.22950819672131148
F1: 0.2087746534954376
---------------------------------------------


# Train data: predictions for the training images

In [14]:
# Run the three predictors on every training image, keyed by user id.
# The user id is the part of the file name before its first '.', as an int.
# Note: each predictor reads the image and runs face detection on its own,
# so every image is decoded and scanned three times.
predictions = {}
for image_path in tqdm(imgs['train']):
    file_name = image_path.split('/')[-1]
    user_id = int(file_name.split('.')[0])
    predictions[user_id] = {
        'gender': predict_gender(image_path),
        'age': predict_age(image_path),
        'age_classes': predict_age_all_classes(image_path),
    }

100%|██████████| 15516/15516 [35:08<00:00,  7.36it/s] 


In [15]:
# Flatten the nested prediction dicts into one record per user.
preds = [
    {
        'user_id': user_id,
        'pred_faces': user_pred['gender']['faces'],
        'pred_gender_label': user_pred['gender']['pred'][0],
        'pred_gender_prob': user_pred['gender']['pred'][1],
        'pred_age_label': user_pred['age']['pred'][0],
        'pred_age_prob': user_pred['age']['pred'][1],
        'pred_age_0_19_prob': user_pred['age_classes']['pred'][0],
        'pred_age_20_29_prob': user_pred['age_classes']['pred'][1],
        'pred_age_30_39_prob': user_pred['age_classes']['pred'][2],
        'pred_age_40_100_prob': user_pred['age_classes']['pred'][3],
    }
    for user_id, user_pred in predictions.items()
]

In [16]:
#convert predictions to dataframe
df_preds = pd.DataFrame.from_records(preds)
df_preds['pred_is_male_label']=df_preds['pred_gender_label'].apply(lambda x: True if x=='Male' else False)

In [18]:
# Persist the predictions for the training images.
df_preds.to_pickle('/g100_work/IscrC_mental/data/user_classification/trained_models/cv_models_train.pkl')

# German (de) data: predictions for the `de` images

In [11]:
# Run the three predictors on every image of the 'de' split, keyed by user id.
# The user id is the part of the file name before its first '.', as an int.
# Note: each predictor reads the image and runs face detection on its own,
# so every image is decoded and scanned three times.
predictions = {}
for image_path in tqdm(imgs['de']):
    file_name = image_path.split('/')[-1]
    user_id = int(file_name.split('.')[0])
    predictions[user_id] = {
        'gender': predict_gender(image_path),
        'age': predict_age(image_path),
        'age_classes': predict_age_all_classes(image_path),
    }

100%|██████████| 425/425 [01:12<00:00,  5.87it/s]


In [12]:
# Flatten the nested prediction dicts into one record per user.
preds = [
    {
        'user_id': user_id,
        'pred_faces': user_pred['gender']['faces'],
        'pred_gender_label': user_pred['gender']['pred'][0],
        'pred_gender_prob': user_pred['gender']['pred'][1],
        'pred_age_label': user_pred['age']['pred'][0],
        'pred_age_prob': user_pred['age']['pred'][1],
        'pred_age_0_19_prob': user_pred['age_classes']['pred'][0],
        'pred_age_20_29_prob': user_pred['age_classes']['pred'][1],
        'pred_age_30_39_prob': user_pred['age_classes']['pred'][2],
        'pred_age_40_100_prob': user_pred['age_classes']['pred'][3],
    }
    for user_id, user_pred in predictions.items()
]

In [13]:
#convert predictions to dataframe
df_preds = pd.DataFrame.from_records(preds)
df_preds['pred_is_male_label']=df_preds['pred_gender_label'].apply(lambda x: True if x=='Male' else False)

In [24]:
# Persist the predictions for the 'de' images.
df_preds.to_pickle('/g100_work/IscrC_mental/data/user_classification/trained_models/cv_models_de.pkl')

### Evaluation on the German (de) data

In [14]:
#load test data
test_data = pd.read_pickle('/g100_work/IscrC_mental/data/user_classification/german_data/data_for_models_german_data.pkl')

In [17]:
def convert_age_to_cat(age):
    """Map a numeric age to its AGE_INTERVALS label.

    Ages up to 19, 29 and 39 fall into the first three intervals; every
    larger age falls into the last one. The last branch no longer requires
    ``age >= 40``, so non-integer ages between 39 and 40 get a label instead
    of None (consistent with how e.g. 19.5 already maps to the second
    interval). A missing age (NaN) still yields None.
    """
    # NaN is the only value that differs from itself; it has no category.
    if age != age:
        return None
    if age<=19:
        return AGE_INTERVALS[0]
    if age<=29:
        return AGE_INTERVALS[1]
    if age<=39:
        return AGE_INTERVALS[2]
    return AGE_INTERVALS[3]

# Ground-truth age category of every labelled user.
test_data['age_cat'] = test_data['age'].apply(convert_age_to_cat)

In [19]:
# Attach the image-based predictions to the labelled users.
# how='inner' keeps only user_ids present in both frames; validate='1:1'
# makes pandas raise if user_id is duplicated on either side.
# NOTE: this overwrites test_data, so running the cell a second time merges
# the predictions into the already merged frame.
test_data = test_data.merge(df_preds,on='user_id',how='inner',validate='1:1')

In [25]:
# gender
# Row masks for the two face-count sub-samples.
at_least_one_face = test_data['pred_faces'] > 0
one_face = test_data['pred_faces'] == 1

# True / predicted "is male" flags as 0/1 for each sample.
y_test = test_data['is_male'].astype(int)
y_pred = test_data['pred_is_male_label'].astype(int)
y_test_n0 = test_data.loc[at_least_one_face, 'is_male'].astype(int)
y_pred_n0 = test_data.loc[at_least_one_face, 'pred_is_male_label'].astype(int)
y_test_1 = test_data.loc[one_face, 'is_male'].astype(int)
y_pred_1 = test_data.loc[one_face, 'pred_is_male_label'].astype(int)

separator = '-'*45
print('Gender prediction from images')
print(separator)
for sample_name, truth, guess in (
    ('All', y_test, y_pred),
    ('At least 1 face', y_test_n0, y_pred_n0),
    ('Exactly 1 face', y_test_1, y_pred_1),
):
    print(f"Sample: {sample_name}")
    print(f"Accuracy: {accuracy_score(truth,guess)}")
    print(f"F1: {f1_score(truth,guess,average='macro')}")
    print(separator)

Gender prediction from images
---------------------------------------------
Sample: All
Accuracy: 0.7929411764705883
F1: 0.6579351723128705
---------------------------------------------
Sample: At least 1 face
Accuracy: 0.7942238267148014
F1: 0.7088458205019454
---------------------------------------------
Sample: Exactly 1 face
Accuracy: 0.8
F1: 0.7149844000337295
---------------------------------------------


In [27]:
# age
# Row masks for the two face-count sub-samples.
at_least_one_face = test_data['pred_faces'] > 0
one_face = test_data['pred_faces'] == 1

# True / predicted age categories as AGE_INTERVALS indices for each sample.
y_test = test_data['age_cat'].apply(convert_age_cat)
y_pred = test_data['pred_age_label'].apply(convert_age_cat)
y_test_n0 = test_data.loc[at_least_one_face, 'age_cat'].apply(convert_age_cat)
y_pred_n0 = test_data.loc[at_least_one_face, 'pred_age_label'].apply(convert_age_cat)
y_test_1 = test_data.loc[one_face, 'age_cat'].apply(convert_age_cat)
y_pred_1 = test_data.loc[one_face, 'pred_age_label'].apply(convert_age_cat)

separator = '-'*45
print('Age prediction from images')
print(separator)
for sample_name, truth, guess in (
    ('All', y_test, y_pred),
    ('At least 1 face', y_test_n0, y_pred_n0),
    ('Exactly 1 face', y_test_1, y_pred_1),
):
    print(f"Sample: {sample_name}")
    print(f"Accuracy: {accuracy_score(truth,guess)}")
    print(f"F1: {f1_score(truth,guess,average='macro')}")
    print(separator)

Age prediction from images
---------------------------------------------
Sample: All
Accuracy: 0.34823529411764703
F1: 0.27895166016739537
---------------------------------------------
Sample: At least 1 face
Accuracy: 0.30324909747292417
F1: 0.2804223111343527
---------------------------------------------
Sample: Exactly 1 face
Accuracy: 0.3076923076923077
F1: 0.284134132078025
---------------------------------------------
