In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
import random
import cv2
import matplotlib.pyplot as plt
import re
import pickle 
from tqdm import tqdm
from deepface.modules import modeling, detection, preprocessing

In [2]:
# List every entry directly under the DataSet directory. As the output below
# shows, the listing mixes the split folders (Train / Test / Validation) with
# the split *.txt files and OS artifacts such as .DS_Store, so code that
# iterates over `dataset` has to filter out the non-folder entries itself.
dataset = os.listdir('../DAiSEE/DataSet/')
print(dataset)

['.DS_Store', 'Test', 'Train.txt', 'Validation.txt', 'Train', 'Test.txt', 'Validation']


In [3]:
# Load the per-clip label tables, one CSV per split. Each row carries a ClipID
# (video file name) and four affect columns (Boredom, Engagement, Confusion,
# Frustration), as shown in the displayed frame below.
test_csv = pd.read_csv('../DAiSEE/Labels/TestLabels.csv')
train_csv = pd.read_csv('../DAiSEE/Labels/TrainLabels.csv')
valid_csv = pd.read_csv('../DAiSEE/Labels/ValidationLabels.csv')
# Bare last expression: display the training labels as the cell output.
train_csv

Unnamed: 0,ClipID,Boredom,Engagement,Confusion,Frustration
0,1100011002.avi,0,2,0,0
1,1100011003.avi,0,2,0,0
2,1100011004.avi,0,3,0,0
3,1100011005.avi,0,3,0,0
4,1100011006.avi,0,3,0,0
...,...,...,...,...,...
5353,4599990246.avi,0,3,0,0
5354,4599990247.avi,0,3,0,0
5355,4599990248.avi,1,2,1,1
5356,4599990249.avi,0,3,0,0


In [4]:
def convert_int(s):
    """Convert a run of ASCII digits to ``int``; return anything else unchanged.

    Args:
        s: A string fragment (for example one piece of a file name).

    Returns:
        ``int(s)`` if ``s`` is non-empty and made only of the characters
        ``0``-``9``; otherwise ``s`` itself.
    """
    # str.isdigit() on its own is too permissive: it is True for characters
    # such as superscript two, for which int() raises ValueError, and for
    # non-ASCII decimal digits, which would turn a fragment that a
    # '[0-9]+' split treats as text into an int. Restricting the conversion
    # to ASCII digits avoids both problems.
    if s.isascii() and s.isdigit():
        return int(s)
    return s


def alphanum_key(s):
    """Return a sort key that orders strings in natural ("human") order.

    ``s`` is split on runs of the digits 0-9; the capturing group keeps the
    digit runs in the result. Every piece is then passed through
    ``convert_int``, so the key is a list of the pieces in their original
    order.
    """
    pieces = re.split('([0-9]+)', s)
    return list(map(convert_int, pieces))


def sort_nicely(l):
    """Sort the list ``l`` in place using ``alphanum_key`` as the sort key.

    Nothing is returned; the caller's list object is reordered.
    """
    # Slice assignment rewrites the contents of the same list object.
    l[:] = sorted(l, key=alphanum_key)

In [5]:
# Build the "imbalanced" pickles: Test_imb.pkl, Train_imb.pkl, Validation_imb.pkl.
#
# For every clip folder that has a row in the split's label CSV, exactly one
# frame is kept: the 150th extracted .jpg in natural file-name order. The frame
# is converted to grayscale, resized to 64x64 and paired with a binary label
# derived from the clip's `Confusion` level.
#
# Each pickle stores two parallel lists, [features_list, labels_list]:
#   features_list -- one 2-D grayscale array (64 x 64) per kept frame
#   labels_list   -- one int per kept frame (0 = not confused, 1 = confused)

# 0-based position of the single frame sampled from each clip.
IMB_FRAME_INDEX = 149
# (width, height) of the stored frames.
IMB_FRAME_SIZE = (64, 64)
# Confusion level -> binary label: levels 0/1 -> 0, levels 2/3 -> 1.
IMB_LABEL_BY_CONFUSION = {0: 0, 1: 0, 2: 1, 3: 1}
# Split folder name -> (label table, output pickle).
IMB_SPLITS = {
    'Test': (test_csv, 'Test_imb.pkl'),
    'Train': (train_csv, 'Train_imb.pkl'),
    'Validation': (valid_csv, 'Validation_imb.pkl'),
}

for ttv in dataset:
    split_dir = '../DAiSEE/DataSet/' + ttv + '/'
    # `dataset` also lists .DS_Store and the split *.txt files. Only folders
    # that have a known label table are processed, so a stray directory can
    # never be written out with another split's labels or file name.
    if ttv not in IMB_SPLITS or not os.path.isdir(split_dir):
        continue

    labels_csv, outfile_name = IMB_SPLITS[ttv]
    all_clips = labels_csv[['ClipID', 'Confusion']]
    # One dictionary lookup per clip instead of scanning the label table for
    # every folder. If a ClipID is repeated, its first row wins.
    confusion_by_clip = (
        all_clips.drop_duplicates('ClipID')
        .set_index('ClipID')['Confusion']
        .to_dict()
    )

    count = 0
    features_list = []
    labels_list = []
    skipped = []  # (path, reason) for every clip/frame that could not be used

    users = os.listdir(split_dir)
    for user in tqdm(users):
        user_dir = os.path.join(split_dir, user)
        if not os.path.isdir(user_dir):
            continue  # e.g. .DS_Store

        # Each entry of a user folder is itself a folder holding one clip's
        # video file together with its extracted .jpg frames.
        for extract in os.listdir(user_dir):
            clip_dir = os.path.join(user_dir, extract)
            if not os.path.isdir(clip_dir):
                continue

            content_list = os.listdir(clip_dir)
            # The video file name is what the label table calls ClipID.
            labelled = [name for name in content_list if name in confusion_by_clip]
            if not labelled:
                continue  # clip has no row in this split's label table

            confusion_val = confusion_by_clip[labelled[0]]
            label = IMB_LABEL_BY_CONFUSION.get(confusion_val)
            if label is None:
                # Unknown or missing level: drop the clip entirely so that
                # features_list and labels_list stay aligned.
                skipped.append((clip_dir, 'unexpected Confusion value'))
                continue

            # Keep only the frames (drops the video file) in natural order.
            img_list = [c for c in content_list if c.endswith('.jpg')]
            sort_nicely(img_list)
            if len(img_list) <= IMB_FRAME_INDEX:
                skipped.append((clip_dir, 'fewer frames than expected'))
                continue

            # Always a list, so membership is by exact file name rather than
            # the substring test that `item in some_string` would perform.
            chosen = [img_list[IMB_FRAME_INDEX]]
            for item in chosen:
                img_path = os.path.join(clip_dir, item)
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals an unreadable file by returning None.
                    skipped.append((img_path, 'unreadable image'))
                    continue

                img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                img_gray = cv2.resize(img_gray, IMB_FRAME_SIZE)

                # Feature and label are appended together, never separately.
                features_list.append(img_gray)
                labels_list.append(label)
                count += 1

    assert len(features_list) == len(labels_list), 'features/labels out of sync'
    with open(outfile_name, 'wb') as f:
        pickle.dump([features_list, labels_list], f)

    # Silent on a clean run; reports only when something was dropped.
    if skipped:
        print(ttv, '- skipped', len(skipped), 'item(s); first few:', skipped[:5])

100%|██████████| 22/22 [00:04<00:00,  5.15it/s]
100%|██████████| 71/71 [00:13<00:00,  5.25it/s]
100%|██████████| 22/22 [00:03<00:00,  6.02it/s]


In [8]:
# Build the oversampled pickles: Test.pkl, Train.pkl, Validation.pkl.
#
# For every clip folder that has a row in the split's label CSV, frames are
# sampled according to the clip's `Confusion` level so that the rarer, higher
# levels contribute more frames:
#   level 0 or 1 -> the single frame at index 149
#   level 2      -> every 27th frame (12 frames from a 300-frame clip)
#   level 3      -> every 4th frame  (75 frames from a 300-frame clip)
# Frames are converted to grayscale and resized to 128x128.
#
# Each pickle stores two parallel lists, [features_list, labels_list]:
#   features_list -- one 2-D grayscale array (128 x 128) per kept frame
#   labels_list   -- one int per kept frame (0 for level 0/1, 1 for level 2,
#                    2 for level 3)
#
# NOTE(review): the same oversampling is applied to Test and Validation as to
# Train. Confirm this is intended before comparing metrics across splits.

# 0-based position of the single frame taken from level-0/1 clips.
OVERSAMPLED_SINGLE_FRAME_INDEX = 149
# Confusion level -> slice step used to take several frames per clip.
OVERSAMPLED_FRAME_STEP = {2: 27, 3: 4}
# (width, height) of the stored frames.
OVERSAMPLED_FRAME_SIZE = (128, 128)
# Confusion level -> stored class label.
OVERSAMPLED_LABEL_BY_CONFUSION = {0: 0, 1: 0, 2: 1, 3: 2}
# Split folder name -> (label table, output pickle).
OVERSAMPLED_SPLITS = {
    'Test': (test_csv, 'Test.pkl'),
    'Train': (train_csv, 'Train.pkl'),
    'Validation': (valid_csv, 'Validation.pkl'),
}

for ttv in dataset:
    split_dir = '../DAiSEE/DataSet/' + ttv + '/'
    # `dataset` also lists .DS_Store and the split *.txt files. Only folders
    # that have a known label table are processed, so a stray directory can
    # never be written out with another split's labels or file name.
    if ttv not in OVERSAMPLED_SPLITS or not os.path.isdir(split_dir):
        continue

    labels_csv, outfile_name = OVERSAMPLED_SPLITS[ttv]
    all_clips = labels_csv[['ClipID', 'Confusion']]
    # One dictionary lookup per clip instead of scanning the label table for
    # every folder. If a ClipID is repeated, its first row wins.
    confusion_by_clip = (
        all_clips.drop_duplicates('ClipID')
        .set_index('ClipID')['Confusion']
        .to_dict()
    )

    count = 0
    features_list = []
    labels_list = []
    skipped = []  # (path, reason) for every clip/frame that could not be used

    users = os.listdir(split_dir)
    for user in tqdm(users):
        user_dir = os.path.join(split_dir, user)
        if not os.path.isdir(user_dir):
            continue  # e.g. .DS_Store

        # Each entry of a user folder is itself a folder holding one clip's
        # video file together with its extracted .jpg frames.
        for extract in os.listdir(user_dir):
            clip_dir = os.path.join(user_dir, extract)
            if not os.path.isdir(clip_dir):
                continue

            content_list = os.listdir(clip_dir)
            # The video file name is what the label table calls ClipID.
            labelled = [name for name in content_list if name in confusion_by_clip]
            if not labelled:
                continue  # clip has no row in this split's label table

            confusion_val = confusion_by_clip[labelled[0]]
            label = OVERSAMPLED_LABEL_BY_CONFUSION.get(confusion_val)
            if label is None:
                # Unknown or missing level: drop the clip entirely so that
                # features_list and labels_list stay aligned.
                skipped.append((clip_dir, 'unexpected Confusion value'))
                continue

            # Keep only the frames (drops the video file) in natural order.
            img_list = [c for c in content_list if c.endswith('.jpg')]
            sort_nicely(img_list)

            # `chosen` is always a list of file names, already in frame order.
            if confusion_val in OVERSAMPLED_FRAME_STEP:
                chosen = img_list[::OVERSAMPLED_FRAME_STEP[confusion_val]]
            elif len(img_list) > OVERSAMPLED_SINGLE_FRAME_INDEX:
                chosen = [img_list[OVERSAMPLED_SINGLE_FRAME_INDEX]]
            else:
                skipped.append((clip_dir, 'fewer frames than expected'))
                continue

            for item in chosen:
                img_path = os.path.join(clip_dir, item)
                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals an unreadable file by returning None.
                    skipped.append((img_path, 'unreadable image'))
                    continue

                img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                img_gray = cv2.resize(img_gray, OVERSAMPLED_FRAME_SIZE)

                # Feature and label are appended together, never separately.
                features_list.append(img_gray)
                labels_list.append(label)
                count += 1

    assert len(features_list) == len(labels_list), 'features/labels out of sync'
    with open(outfile_name, 'wb') as f:
        pickle.dump([features_list, labels_list], f)

    # Silent on a clean run; reports only when something was dropped.
    if skipped:
        print(ttv, '- skipped', len(skipped), 'item(s); first few:', skipped[:5])

100%|██████████| 22/22 [00:06<00:00,  3.61it/s]
100%|██████████| 71/71 [00:19<00:00,  3.58it/s]
100%|██████████| 22/22 [00:05<00:00,  4.27it/s]
