# Preprocessing Rekognition Data and Analysis

In [2]:
# Load the required libraries
import pandas as pd
from video_characteristics import *
import boto3
import botocore
import json
import sys
import os
import time
import numpy as np
import random
import pprint
import math
from IPython.display import Video
import matplotlib.pyplot as plt
from loading_s3_data import *

In [3]:
# Reset matplotlib's runtime configuration, then layer the notebook-wide
# typography settings on top.
import matplotlib as mpl

# Start from the library's built-in defaults ...
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['agg.path.chunksize'] = 0
# ... then re-apply whatever mpl.rc_params() returns (this may overwrite the
# chunksize set on the previous line).
mpl.rcParams.update(mpl.rc_params())

#plt.rcParams.update({'text.usetex': True})
# Keys are applied in insertion order: font size, LaTeX preamble, font family.
plt.rcParams.update({
    'font.size': 11,
    'text.latex.preamble': 'bold',
    'font.family': 'serif',
})

In [4]:
# Read the per-video feature table and preview its first rows.
features_csv_path = '../data/dataset_1_rekognition_features.csv'
df = pd.read_csv(features_csv_path)
df.head(n=5)

Unnamed: 0,video_key,child_id,ASD,age,gender,VideoDuration,VideoFrameRate,VideoFrameHeight,VideoFrameWidth,Confidence1,...,EyesClosedConf,EyeGlasses,EyeGlassesConf,SunGlasses,SunGlassesConf,AgeLow,AgeHigh,Gender,GenderConf,Size
0,%2B07758486393/1637011476385/GuessWhat.mp4,+07758486393__daisy,1,0.0,Female,57.4,29.9,720.0,960.0,99.5,...,95.0,False,89.6,False,94.2,24.3,32.5,Female,95.8,0.2644083061947145
1,%2B07758486393/1637011774510/GuessWhat.mp4,+07758486393__daisy,1,0.0,Female,57.4,29.9,720.0,960.0,99.8,...,n=0,n=0,n=0,n=0,n=0,n=0,n=0,n=0,n=0,n=0
2,%2B07758486393/1637011884828/GuessWhat.mp4,+07758486393__daisy,1,0.0,Female,57.4,29.9,720.0,960.0,99.6,...,94.7,False,97.0,False,99.9,18.3,27.0,Female,85.3,0.15995461829881003
3,%2B07766544436/1626976603665/GuessWhat.mp4,+07766544436__andrew,1,11.0,Male,57.1,30.0,720.0,960.0,99.5,...,86.2,False,97.1,False,100.0,10.6,18.3,Female,98.5,0.03090653051417859
4,%2B07766544436/1626976728265/GuessWhat.mp4,+07766544436__andrew,1,11.0,Male,57.4,30.0,720.0,960.0,97.3,...,84.4,False,96.1,False,100.0,11.7,19.7,Female,91.3,0.022503950494571684


In [5]:
print(len(df))
print(df.columns)

3113
Index(['video_key', 'child_id', 'ASD', 'age', 'gender', 'VideoDuration',
       'VideoFrameRate', 'VideoFrameHeight', 'VideoFrameWidth', 'Confidence1',
       'Confidence2', 'Sharpness', 'Brightness', 'NoFace1', 'NoFace2',
       'MultiFace1', 'MultiFace2', 'Pitch1', 'Pitch2', 'Roll1', 'Roll2',
       'Yaw1', 'Yaw2', 'EyesClosed', 'EyesClosedConf', 'EyeGlasses',
       'EyeGlassesConf', 'SunGlasses', 'SunGlassesConf', 'AgeLow', 'AgeHigh',
       'Gender', 'GenderConf', 'Size'],
      dtype='object')


The dataset contains 3113 videos in total and has the following columns:
`'video_key', 'child_id', 'ASD', 'age', 'gender', 'VideoDuration',
       'VideoFrameRate', 'VideoFrameHeight', 'VideoFrameWidth', 'Confidence1',
       'Confidence2', 'Sharpness', 'Brightness', 'NoFace1', 'NoFace2',
       'MultiFace1', 'MultiFace2', 'Pitch1', 'Pitch2', 'Roll1', 'Roll2',
       'Yaw1', 'Yaw2', 'EyesClosed', 'EyesClosedConf', 'EyeGlasses',
       'EyeGlassesConf', 'SunGlasses', 'SunGlassesConf', 'AgeLow', 'AgeHigh',
       'Gender', 'GenderConf', 'Size'`

## I. Preprocessing steps 

First, we drop the videos in which no face was detected (rows with a missing `Confidence1` value).

In [6]:
# Dropping videos with no faces: these are the rows whose Confidence1 is missing.
no_face_mask = df['Confidence1'].isna()
# Report how many rows are about to be removed.
print(int(no_face_mask.sum()))
df.dropna(subset=['Confidence1'], inplace=True)

30


A total of 30 videos are dropped.

In [7]:
# Convert every column except the identifier/categorical ones to a numeric
# dtype. Values that cannot be parsed as numbers (such as the 'n=0'
# placeholders present in the raw file) become NaN.
non_numeric_columns = ['video_key', 'child_id', 'gender', 'EyeGlasses', 'SunGlasses', 'Gender']
numeric_columns = df.columns.drop(non_numeric_columns)
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

In [8]:
# Thresholds
# Each threshold below defines when a video is considered "bad" for one
# quality metric; the resulting boolean masks are combined into `all_cond`,
# which marks the videos to exclude.
# NOTE(review): the "1"/"2" suffixes look like two variants of the same
# per-video metric -- confirm against the feature-extraction code.

# 0 to 100 higher is better
confidence_1_thresh = 95
confidence_2_thresh = 95
eyes_confidence_thresh = 75
sharpness_thresh = 4
brightness_thresh = 20
# 0 to 1 lower is better
no_face_1_thresh = 0.7
no_face_2_thresh = 0.7
multi_face_1_thresh = 0.7
multi_face_2_thresh = 0.7
eyes_closed_thresh = 0.7
# -180 to 180 lower magnitude is better
pitch_1_thresh = 45
pitch_2_thresh = 45
roll_1_thresh = 45
roll_2_thresh = 45
yaw_1_thresh = 45
yaw_2_thresh = 45
# 0 to 1 higher is better
size_thresh = 0.01


def _is_true_flag(column):
    """Return a boolean mask of the rows whose flag value is true.

    The comparison is done on the string form of each value so that both the
    text 'True' and an actual boolean True match. A plain ``== 'True'`` would
    silently match nothing for values that pandas parsed as booleans.
    """
    return column.astype(str) == 'True'


# Bad conditions
# Comparisons against NaN evaluate to False, so a row with a missing value for
# a metric is never flagged by that metric's condition.
confidence_1_cond = df['Confidence1'] < confidence_1_thresh
confidence_2_cond = df['Confidence2'] < confidence_2_thresh
eyes_confidence_cond = df['EyesClosedConf'] < eyes_confidence_thresh
sharpness_cond = df['Sharpness'] < sharpness_thresh
brightness_cond = df['Brightness'] < brightness_thresh
no_face_1_cond = df['NoFace1'] > no_face_1_thresh
no_face_2_cond = df['NoFace2'] > no_face_2_thresh
multi_face_1_cond = df['MultiFace1'] > multi_face_1_thresh
multi_face_2_cond = df['MultiFace2'] > multi_face_2_thresh
pitch_1_cond = df['Pitch1'].abs() > pitch_1_thresh
pitch_2_cond = df['Pitch2'].abs() > pitch_2_thresh
roll_1_cond = df['Roll1'].abs() > roll_1_thresh
roll_2_cond = df['Roll2'].abs() > roll_2_thresh
yaw_1_cond = df['Yaw1'].abs() > yaw_1_thresh
yaw_2_cond = df['Yaw2'].abs() > yaw_2_thresh
# NOTE(review): 'EyesClosed' is coerced to numeric in the earlier conversion
# cell; if its raw values are 'True'/'False' strings they are NaN by now and
# this condition can never fire (its reported prevalence is 0) -- TODO confirm.
eyes_closed_cond = df['EyesClosed'] > eyes_closed_thresh
eyeglasses_cond = _is_true_flag(df['EyeGlasses'])
sunglasses_cond = _is_true_flag(df['SunGlasses'])
size_cond = df['Size'] < size_thresh

# A video is excluded when ANY of the terms below holds. Paired metrics
# (confidence, no-face, pitch, roll, yaw) only count when both variants agree.
# NOTE(review): brightness_cond and multi_face_1_cond are computed and reported
# below but are not part of this filter -- confirm that this is intentional.
all_cond = (
    (confidence_1_cond & confidence_2_cond)
    | eyes_confidence_cond
    | sharpness_cond
    | (no_face_1_cond & no_face_2_cond)
    | multi_face_2_cond
    | (pitch_1_cond & pitch_2_cond)
    | (roll_1_cond & roll_2_cond)
    | (yaw_1_cond & yaw_2_cond)
    | eyes_closed_cond
    | eyeglasses_cond
    | sunglasses_cond
    | size_cond
)

# Prevalences
# (label, mask, text appended after the count to control blank lines)
prevalence_report = [
    ('Confidence 1', confidence_1_cond, ''),
    ('Confidence 2', confidence_2_cond, ''),
    ('Confidence 1 & 2', confidence_1_cond & confidence_2_cond, '\n'),
    ('Eyes confidence', eyes_confidence_cond, '\n'),
    ('Sharpness', sharpness_cond, ''),
    ('Brightness', brightness_cond, '\n'),
    ('No face 1', no_face_1_cond, ''),
    ('No face 2', no_face_2_cond, ''),
    ('No face 1 & 2', no_face_1_cond & no_face_2_cond, '\n'),
    ('Multi face 1', multi_face_1_cond, ''),
    ('Multi face 2', multi_face_2_cond, ''),
    ('Pitch 1', pitch_1_cond, ''),
    ('Pitch 2', pitch_2_cond, ''),
    ('Pitch 1 & 2', pitch_1_cond & pitch_2_cond, '\n'),
    ('Roll 1', roll_1_cond, ''),
    ('Roll 2', roll_2_cond, ''),
    ('Roll 1 & 2', roll_1_cond & roll_2_cond, '\n'),
    ('Yaw 1', yaw_1_cond, ''),
    ('Yaw 2', yaw_2_cond, ''),
    ('Yaw 1 & 2', yaw_1_cond & yaw_2_cond, '\n'),
    ('Eyes closed', eyes_closed_cond, ''),
    ('Eyeglasses', eyeglasses_cond, ''),
    ('Sunglasses', sunglasses_cond, '\n'),
    ('Size', size_cond, '\n\n'),
    ('All', all_cond, ''),
]
for _label, _mask, _trailer in prevalence_report:
    # Summing a boolean mask counts the rows that satisfy the condition.
    print(f'{_label}: {int(_mask.sum())}{_trailer}')

Confidence 1: 77
Confidence 2: 125
Confidence 1 & 2: 77

Eyes confidence: 13

Sharpness: 104
Brightness: 69

No face 1: 124
No face 2: 119
No face 1 & 2: 117

Multi face 1: 219
Multi face 2: 253
Pitch 1: 1
Pitch 2: 1
Pitch 1 & 2: 1

Roll 1: 71
Roll 2: 79
Roll 1 & 2: 68

Yaw 1: 9
Yaw 2: 18
Yaw 1 & 2: 9

Eyes closed: 0
Eyeglasses: 0
Sunglasses: 0

Size: 69


All: 576


In [9]:
# Create the S3 client and resource handles, both authenticated with the key
# pair returned by loading_credentials().
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY = loading_credentials()

s3_auth_kwargs = {
    'aws_access_key_id': AWS_ACCESS_KEY_ID,
    'aws_secret_access_key': AWS_SECRET_ACCESS_KEY,
}

client = boto3.client('s3', **s3_auth_kwargs)
resource = boto3.resource('s3', **s3_auth_kwargs)

In [10]:
# Display a sample video for a condition applied to dataset (used to tune the thresholds)
#
# Pick the condition to inspect by assigning one of the *_cond masks to `cond`.
# One matching row is drawn at random, its metrics are printed, and the video
# is downloaded from S3 and shown inline.

cond = size_cond

data_subset = df[cond]
if data_subset.empty:
    # DataFrame.sample() on an empty frame fails with an unhelpful message.
    raise ValueError('No video matches the selected condition; choose another `cond`.')
i = data_subset.sample().index[0]

# Read the sampled row once instead of doing one lookup per metric.
sample_row = df.loc[i]
video_key = sample_row['video_key']
confidence_1 = sample_row['Confidence1']
confidence_2 = sample_row['Confidence2']
eyes_confidence = sample_row['EyesClosedConf']
sharpness = sample_row['Sharpness']
brightness = sample_row['Brightness']
no_face_1 = sample_row['NoFace1']
no_face_2 = sample_row['NoFace2']
multi_face_1 = sample_row['MultiFace1']
multi_face_2 = sample_row['MultiFace2']
roll_1 = sample_row['Roll1']
roll_2 = sample_row['Roll2']
size = sample_row['Size']

print(f'Video key: {video_key}\n')
print(f'Confidence 1: {confidence_1}')
print(f'Confidence 2: {confidence_2}\n')
print(f'Eyes confidence: {eyes_confidence}\n')
print(f'Sharpness: {sharpness}')
print(f'Brightness: {brightness}\n')
print(f'No face 1: {no_face_1}')
print(f'No face 2: {no_face_2}\n')
print(f'Multi face 1: {multi_face_1}')
print(f'Multi face 2: {multi_face_2}\n')
print(f'Roll 1: {roll_1}')
print(f'Roll 2: {roll_2}\n')
print(f'Size: {size}')

# Only build the player when the download succeeded, so that a missing object
# never results in a stale 'sample.mp4' from an earlier run being displayed.
sample_video = None
try:
    resource.Bucket('headsup-du1r3b78fy').download_file(video_key, 'sample.mp4')
except botocore.exceptions.ClientError as e:
    # A 404 is tolerated but reported; any other client error is re-raised.
    if e.response['Error']['Code'] != "404":
        raise
    print(f'\nVideo not found in the bucket (404): {video_key}')
else:
    sample_video = Video('sample.mp4', width=400, height=400)

# Last expression of the cell: renders the player, or nothing when it is None.
sample_video

Video key: _REVIEWED/remote_participants/gibsonsusanm%40gmail.com/1561492594497/GuessWhat.mp4

Confidence 1: 98.5
Confidence 2: 98.5

Eyes confidence: 84.3

Sharpness: 6.3
Brightness: 25.8

No face 1: 0.31
No face 2: 0.29

Multi face 1: 0.01
Multi face 2: 0.02

Roll 1: -2.5
Roll 2: -3.1

Size: 0.006135078524880708


In [13]:
# Keep only the videos that triggered none of the exclusion conditions, write
# them to disk, and display the resulting table.
keep_mask = ~all_cond
data_cleaned = df[keep_mask]
data_cleaned.to_csv('../data/dataset_2.csv', index=False)
data_cleaned

Unnamed: 0,video_key,child_id,ASD,age,gender,VideoDuration,VideoFrameRate,VideoFrameHeight,VideoFrameWidth,Confidence1,...,EyesClosedConf,EyeGlasses,EyeGlassesConf,SunGlasses,SunGlassesConf,AgeLow,AgeHigh,Gender,GenderConf,Size
0,%2B07758486393/1637011476385/GuessWhat.mp4,+07758486393__daisy,1,0.0,Female,57.4,29.9,720.0,960.0,99.5,...,95.0,False,89.6,False,94.2,24.3,32.5,Female,95.8,0.264408
3,%2B07766544436/1626976603665/GuessWhat.mp4,+07766544436__andrew,1,11.0,Male,57.1,30.0,720.0,960.0,99.5,...,86.2,False,97.1,False,100.0,10.6,18.3,Female,98.5,0.030907
4,%2B07766544436/1626976728265/GuessWhat.mp4,+07766544436__andrew,1,11.0,Male,57.4,30.0,720.0,960.0,97.3,...,84.4,False,96.1,False,100.0,11.7,19.7,Female,91.3,0.022504
7,%2B11991436014/1567730951746/GuessWhat.mp4,+11991436014__Bernardo,1,3.0,Male,87.6,30.0,720.0,960.0,96.8,...,91.3,False,91.7,False,100.0,6.1,13.4,Female,93.6,0.406533
9,%2B13012528047/1587239689132/GuessWhat.mp4,+13012528047__dylan,1,8.0,Male,57.4,29.8,600.0,800.0,97.5,...,87.0,False,94.8,False,100.0,14.5,21.9,Male,93.9,0.031913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3105,yukster_uk%40yahoo.co.uk/1652896297952/GuessWh...,yukster_uk@yahoo.co.uk__Dylan,1,10.0,Male,57.1,30.0,768.0,1024.0,99.7,...,90.6,False,92.6,False,100.0,5.8,13.3,Male,99.7,0.016612
3106,yukster_uk%40yahoo.co.uk/1652896524105/GuessWh...,yukster_uk@yahoo.co.uk__Dylan,1,10.0,Male,57.3,30.0,768.0,1024.0,100.0,...,84.4,False,93.7,False,100.0,5.7,13.3,Male,92.1,0.055045
3107,yukster_uk%40yahoo.co.uk/1652896748522/GuessWh...,yukster_uk@yahoo.co.uk__Dylan,1,10.0,Male,57.3,30.0,768.0,1024.0,99.9,...,90.6,False,94.8,False,100.0,6.4,14.7,Male,99.0,0.018861
3110,yuliawidjaja@gmail.com/1631319678/GuessWhat.mp4,yuliawidjaja@gmail.com__Hayden,1,0.0,Male,92.5,30.0,360.0,480.0,98.3,...,92.0,False,91.3,False,100.0,5.9,13.3,Male,91.6,0.160503
