# Pain Data Preparation
This notebook prepares the pain dataset so that it can be used to successfully train a convolutional neural network. Preprocessing and data augmentation techniques such as greyscaling, histogram equalization, etc. are employed.

In [1]:
# Relevant imports
import os
import sys
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import math

# Put the parent directory on the import path so the project-local packages can be imported
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from Scripts import Data_Loader_Functions as DL
from Scripts import Image_Processor as IP

In [2]:
# Define folder paths (all located below the parent directory stored in `module_path`)
RAW_DATA = os.path.join(module_path, "Data", "Raw Data", "Pain")
# Output folder of the preprocessing step. Later cells read this name, so it has to be
# defined here for the notebook to run on a fresh kernel.
# NOTE(review): location inferred from the "Preprocessed Data" folder named in the notebook
# text and from the layout of the sibling paths - confirm against the actual directory tree.
PREPROCESSED_DATA = os.path.join(module_path, "Data", "Preprocessed Data", "Pain")
AUGMENTED_DATA = os.path.join(module_path, "Data", "Augmented Data", "Pain")
AUGMENTED_DATA_TWOSTEP = os.path.join(module_path, "Data", "Augmented Data", "Pain Two-Step Augmentation")
AUGMENTED_DATA_FLEXIBLE = os.path.join(module_path, "Data", "Augmented Data", "Flexible Augmentation")

## Create Folder Structure
First, we will duplicate the folder structure in "Raw Data" into "Preprocessed Data" and "Augmented Data".

In [None]:
# Duplicate the raw-data folder structure under every derived-data root
for target_root in (PREPROCESSED_DATA, AUGMENTED_DATA, AUGMENTED_DATA_TWOSTEP):
    DL.mirror_folder_structure(RAW_DATA, target_root)

## Explore Data

In [None]:
# Count pain vs. no-pain frames in the raw data (label column 4 is the pain level)
img_paths = np.array(DL.get_image_paths(RAW_DATA))
labels = np.array(DL.get_labels(img_paths))
pain_level = labels[:, 4].astype(int)
pain_labels = labels[pain_level > 0]
no_pain_labels = labels[pain_level == 0]
print("Pain Labels:", len(pain_labels))
print("No Pain Labels:", len(no_pain_labels))

In [None]:
# Count the entries (one per client) in each group folder, ignoring macOS '.DS_Store' files
g1_dir = os.path.join(RAW_DATA, "group_1")
g2_dir = os.path.join(RAW_DATA, "group_2")
g1_img_paths = list(filter(lambda entry: entry != '.DS_Store', os.listdir(g1_dir)))
g2_img_paths = list(filter(lambda entry: entry != '.DS_Store', os.listdir(g2_dir)))
print("Group 1:", len(g1_img_paths))
print("Group 2:", len(g2_img_paths))

In [None]:
# Build one label table per group, then count the distinct sessions of every person
label_columns = ['Person', 'Session', 'Culture', 'Frame', 'Pain']

g1_img_paths = np.array(DL.get_image_paths(os.path.join(RAW_DATA, "group_1")))
g2_img_paths = np.array(DL.get_image_paths(os.path.join(RAW_DATA, "group_2")))
g1_labels = np.array(DL.get_labels(g1_img_paths))
g2_labels = np.array(DL.get_labels(g2_img_paths))

df_1 = pd.DataFrame(g1_labels, columns=label_columns).astype(int)
df_1['Group'] = 1
df_2 = pd.DataFrame(g2_labels, columns=label_columns).astype(int)
df_2['Group'] = 2

df = pd.concat([df_1, df_2])
sess_num = (
    df.groupby(['Person', 'Group'])['Session']
    .nunique()
    .to_frame()
    .sort_values(['Group', 'Person'])
)
sess_num

In [None]:
# Mean number of distinct sessions per person, computed separately for each group
avg_sessions_g1 = df_1.groupby('Person')['Session'].nunique().mean()
avg_sessions_g2 = df_2.groupby('Person')['Session'].nunique().mean()
print("Average Sessions Group 1: {0:.2f}".format(avg_sessions_g1))
print("Average Sessions Group 2: {0:.2f}".format(avg_sessions_g2))

In [None]:
# Pain / No Pain per group
# Values are printed in the order the label announces: pain frames, no-pain frames, and the
# pain-to-no-pain ratio. Counting on the boolean masks avoids positional Series indexing.
for group_id in (1, 2):
    group_pain = df.loc[df['Group'] == group_id, 'Pain']
    n_pain = int((group_pain > 0).sum())
    n_no_pain = int((group_pain == 0).sum())
    # Guard against a group without any no-pain frames
    ratio = n_pain / n_no_pain if n_no_pain else float('nan')
    print("Group {} Pain/No Pain/Ratio: {} | {} | {:.2f}".format(group_id, n_pain, n_no_pain, ratio))

## Process Images
We will now process the images. Preprocessing includes converting to greyscale, and histogram equalization.

In [None]:
# Preprocess images
# Calls the project's bulk image processor with RAW_DATA, PREPROCESSED_DATA and the ".jpg"
# extension (presumably source folder, target folder and file filter - TODO confirm against
# Image_Processor). Per the notebook text, preprocessing covers greyscale conversion and
# histogram equalization.
IP.bulk_process_images(RAW_DATA, PREPROCESSED_DATA, ".jpg")

In [None]:
# Flip images and copy originals into the augmented data folder, one augmentation per pass
for augmentation in ("flip", "original"):
    IP.bulk_augment_images(PREPROCESSED_DATA, AUGMENTED_DATA_TWOSTEP, ".jpg", augmentation, "pain", label_threshold=-1)

In [None]:
# Rotate the flipped images first, then the originals; afterwards rename files so that the
# naming conventions stay consistent
for suffix in ("_flipped.jpg", "_original.jpg"):
    IP.bulk_augment_images(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, suffix, "rotate_crop", "pain", label_threshold=-1)
IP.bulk_rename_files(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, "_rotated", "_straight")

In [None]:
# Crop images to the same maximum width and height. The 10-degree rotation in the previous step
# cropped the rotated images down to (215, 215), so this is chosen as the max width/height.
# NOTE(review): the flexible-augmentation cell further down passes an additional ".jpg"
# argument to bulk_crop_images - confirm that the default used here selects the same files.
IP.bulk_crop_images(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, (215, 215))

In [None]:
# Downsample the augmented data of each group in turn
for group_name in ("group_1", "group_2"):
    DL.downsample_data(os.path.join(AUGMENTED_DATA_TWOSTEP, group_name))

## Split Group 2 Data
Split Group 2 data into 60% training data and 40% test data.

In [None]:
# Collect the group 2 labels and image paths in a DataFrame so rows can be filtered by label
numeric_cols = ['Person', 'Session', 'Culture', 'Frame', 'Pain']
text_cols = ['Trans_1', 'Trans_2', 'img_path']

img_paths = DL.get_image_paths(os.path.join(AUGMENTED_DATA_TWOSTEP, 'group_2'))
labels = DL.get_labels(img_paths)
df = pd.DataFrame(labels, columns=numeric_cols + ['Trans_1', 'Trans_2'])
df[numeric_cols] = df[numeric_cols].astype(int)
df['img_path'] = img_paths
df[text_cols] = df[text_cols].astype(str)

In [None]:
# Split Dataset into Train and Test: derive the three group 2 folders, then hand over to the loader
origin_path, train_path, test_path = (
    os.path.join(AUGMENTED_DATA_TWOSTEP, folder)
    for folder in ('group_2', 'group_2_train', 'group_2_test')
)
DL.move_train_test_data(df, origin_path, train_path, test_path)

In [None]:
# Verify that split is 40 / 60 by comparing the image counts of both folders
img_train = DL.get_image_paths(train_path)
img_test = DL.get_image_paths(test_path)
n_train, n_test = len(img_train), len(img_test)
n_total = n_test + n_train
print("Test: {0:.2f}".format(n_test / n_total))
print("Train: {0:.2f}".format(n_train / n_total))

## Split Dataset (Randomly)
Splitting the dataset into training data and test data, by sampling without replacement from the train_data

In [None]:
# Get all images and select 20% at random
# The paths are sorted and shuffled with a dedicated, seeded generator so that the hold-out
# selection is reproducible after a kernel restart and does not depend on (or disturb)
# NumPy's global random state.
TEST_FRACTION = 0.2
SPLIT_SEED = 123
img_paths = sorted(DL.get_image_paths(AUGMENTED_DATA_TWOSTEP))
np.random.RandomState(SPLIT_SEED).shuffle(img_paths)
split_idx = int(len(img_paths) * TEST_FRACTION)
img_paths_test = img_paths[:split_idx]

In [None]:
# Ensure that each client is represented with ~20% in the test data set:
# count images per client (label column 0) in the test selection and overall, then divide
clients_test = np.array(DL.get_labels(img_paths_test))[:, 0]
clients_total = np.array(DL.get_labels(img_paths))[:, 0]
client_ids_test, img_per_client_test = np.unique(clients_test, return_counts=True)
client_ids_total, img_per_client_total = np.unique(clients_total, return_counts=True)
img_per_client_test / img_per_client_total

In [None]:
# Ensure that the test set is balanced
# The share is computed from the test-set arrays built in this cell; the earlier
# `pain_labels` array describes the raw data and must not be used here.
all_test_labels = np.array(DL.get_labels(img_paths_test))
pain = np.array(img_paths_test)[all_test_labels[:, 4].astype(int) >= 1]
pain_test_labels = np.array(DL.get_labels(pain))
print("Test Pain Split:", len(pain_test_labels) / len(all_test_labels))

In [None]:
# Investigate the split for each client in the test set
# `all_test_labels` is the label array of the selected test images built in the balance check above.
DL.print_pain_split_per_client(all_test_labels)

In [None]:
# Move test data set into test folder
# The destination is a 'test' folder inside the directory reached by three
# os.path.dirname steps from the image path.
skipped = []
for src in img_paths_test:
    dest = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(src))), 'test')
    os.makedirs(dest, exist_ok=True)
    try:
        os.rename(src, os.path.join(dest, os.path.basename(src)))
    except FileNotFoundError:
        # Best effort: the source is gone (e.g. already moved by an earlier run of this
        # cell). Remember it so the omission is reported instead of silently ignored.
        skipped.append(src)

if skipped:
    print("Skipped {} of {} files that no longer exist at their source path".format(len(skipped), len(img_paths_test)))

# Flexible Data Augmentation
Code snippets for quickly moving doubly augmented data around. The main purpose is to ensure that the same image, in original or augmented form, is never used for training and testing at the same time.

### Step 1: Double-augment images

In [None]:
# Build the doubly augmented data set: mirror folders, flip, rotate, rename, crop
print("Mirror Folder Structure")
DL.mirror_folder_structure(RAW_DATA, AUGMENTED_DATA_FLEXIBLE)

# Flip images and copy originals into the augmented data folder
print("Flip Images")
for augmentation in ("flip", "original"):
    IP.bulk_augment_images(PREPROCESSED_DATA, AUGMENTED_DATA_FLEXIBLE, ".jpg", augmentation, "pain", label_threshold=-1)

# Rotate flipped images and originals, then rename so naming conventions stay consistent
print("Rotate Images")
for suffix in ("_flipped.jpg", "_original.jpg"):
    IP.bulk_augment_images(AUGMENTED_DATA_FLEXIBLE, AUGMENTED_DATA_FLEXIBLE, suffix, "rotate_crop", "pain", label_threshold=-1)
IP.bulk_rename_files(AUGMENTED_DATA_FLEXIBLE, AUGMENTED_DATA_FLEXIBLE, "_rotated", "_straight")

# Crop images to the same maximum width and height; the 10-degree rotation above cropped the
# rotated images down to (215, 215), so that size is used as the limit
print("Crop Images")
max_size = (215, 215)
IP.bulk_crop_images(AUGMENTED_DATA_FLEXIBLE, AUGMENTED_DATA_FLEXIBLE, max_size, ".jpg")

### Step 2: Reset Folder Structure

In [184]:
# Move all images into the "raw" subfolder of the flexible augmentation directory, so that
# the allocation steps below start from a single pool of files
# (behavior of the helper taken from its name and this note - TODO confirm in Data_Loader_Functions)
DL.reset_to_raw(AUGMENTED_DATA_FLEXIBLE)

In [185]:
# Delete all empty folders below the flexible augmentation directory
# (presumably the folders emptied by the reset above - TODO confirm)
DL.delete_empty_folders(AUGMENTED_DATA_FLEXIBLE)

### Step 3: Load DataFrame

In [29]:
# Get all image paths and corresponding labels into a dataframe
img_paths = np.array(DL.get_image_paths(AUGMENTED_DATA_FLEXIBLE))
labels = np.array(DL.get_labels(img_paths))
df = pd.DataFrame(labels, columns=['Person','Session','Culture','Frame','Pain', 'Trans_1', 'Trans_2'])
df[['Person','Session','Culture','Frame','Pain']] = df[['Person','Session','Culture','Frame','Pain']].astype(int)
df['img_path'] = img_paths
df[['Trans_1', 'Trans_2', 'img_path']] = df[['Trans_1', 'Trans_2', 'img_path']].astype(str)
df = df.sort_values(['Person', 'Session', 'Frame', 'Trans_1', 'Trans_2'], ascending=[True, True, True, False, False]).reset_index(drop=True)
# `temp_id` identifies the underlying frame shared by all augmented variants of an image.
# The parts are joined with a separator: plain concatenation is ambiguous (session 1 / frame 23
# and session 12 / frame 3 of the same person would both end in "123") and could merge
# different frames under one id.
df['temp_id'] = df['Person'].astype(str) + '_' + df['Session'].astype(str) + '_' + df['Frame'].astype(str)

#### Step 3.1: Remove Subject 101 from the data
Subject 101 only has negative examples "0" and will therefore show "0%" on metrics like "Recall" or "Precision", skewing output graphs.

In [191]:
# Show that subject 101 has no pain labels: the pain levels of that subject sum to zero
subject = 101
subject_pain_total = df.loc[df['Person'] == subject, 'Pain'].sum()
print("# Pain Labels Subject {} : ".format(subject), subject_pain_total)

# Pain Labels Subject 101 :  0


In [192]:
# Keep every row that does not belong to subject 101
df = df.loc[df['Person'].ne(101)]

### Step 4: Redistribute Data for Training
Run exactly one of the following subsections (Step 4.1 or Step 4.2), not both.

In [193]:
# Split the data into two groups: the persons listed in `group_1`, and everyone else
group_1 = [42, 47, 49, 66, 95, 97, 103, 106, 108, 121, 123, 124]
in_group_1 = df['Person'].isin(group_1)
df_1 = df[in_group_1]
df_2 = df[~in_group_1]

#### Step 4.1: Redistribute Naively
In this step, we will just downsample the data and put it into two groups, without accounting for potential duplicates in test and train data (e.g. "original" in train, and "flipped" in test).

In [158]:
# Downsample first group: keep all pain rows and draw an equally sized no-pain sample
df_1_pain_1 = df_1.loc[df_1['Pain'] > 0]
df_1_no_pain_all = df_1.loc[df_1['Pain'] == 0]
df_1_pain_0 = df_1_no_pain_all.sample(n=len(df_1_pain_1), random_state=123)
df_1_downsampled = pd.concat([df_1_pain_0, df_1_pain_1])

In [159]:
# Downsample second group: keep all pain rows and draw an equally sized no-pain sample
df_2_pain_1 = df_2.loc[df_2['Pain'] > 0]
df_2_no_pain_all = df_2.loc[df_2['Pain'] == 0]
df_2_pain_0 = df_2_no_pain_all.sample(n=len(df_2_pain_1), random_state=123)
df_2_downsampled = pd.concat([df_2_pain_0, df_2_pain_1])

#### Step 4.2: Redistribute - No Mutation Duplicates
In this step the data is split so that the same image in a mutated form is not in train and test data.

In [194]:
# Downsample first group: keep all pain rows and draw an equally sized no-pain sample
df_1_pain_1 = df_1.loc[df_1['Pain'] > 0]
df_1_no_pain_all = df_1.loc[df_1['Pain'] == 0]
df_1_pain_0 = df_1_no_pain_all.sample(n=len(df_1_pain_1), random_state=123)
df_1_downsampled = pd.concat([df_1_pain_0, df_1_pain_1])

In [195]:
# Split Pain Frames into Train and Test 60 / 40, drawing whole frame ids so that all
# variants of one frame land on the same side
np.random.seed(123)
ratio = 0.6

temp_ids_pain = df_2.loc[df_2['Pain'] > 0, 'temp_id'].unique()
n_train_ids = int(ratio * len(temp_ids_pain))
temp_ids_pain_train = np.random.choice(temp_ids_pain, n_train_ids, replace=False)
# Every pain id that was not drawn for training goes to the test side
temp_ids_pain_test = temp_ids_pain[~np.isin(temp_ids_pain, temp_ids_pain_train)]
df_2_pain_train = df_2[df_2['temp_id'].isin(temp_ids_pain_train)]
df_2_pain_test = df_2[df_2['temp_id'].isin(temp_ids_pain_test)]

In [196]:
# Split No-Pain frames into Train and Test, with the same number of Train / Test samples as
# the pain frames selected above.
# The statements draw from NumPy's global random state in sequence (seeded in the pain split
# cell), so their order determines the result and must not be changed.
temp_ids_no_pain = df_2[df_2['Pain'] == 0]['temp_id'].unique()
# Draw as many no-pain frame ids for training as there are pain training rows
temp_ids_no_pain_train = np.random.choice(temp_ids_no_pain, len(df_2_pain_train), replace=False)
# Test ids are drawn only from the ids not used for training, so no frame is on both sides
temp_ids_no_pain_test = np.random.choice(temp_ids_no_pain[np.isin(temp_ids_no_pain, temp_ids_no_pain_train) == False], len(df_2_pain_test), replace=False)
# Sample rows from the chosen ids down to the size of the matching pain set
df_2_pain_0_train = df_2[df_2['temp_id'].isin(temp_ids_no_pain_train)].sample(len(df_2_pain_train))
df_2_pain_0_test = df_2[df_2['temp_id'].isin(temp_ids_no_pain_test)].sample(len(df_2_pain_test))

In [197]:
# Stack the pain rows on top of the no-pain rows, separately for train and test
df_2_train = pd.concat([df_2_pain_train, df_2_pain_0_train], axis=0)
df_2_test = pd.concat([df_2_pain_test, df_2_pain_0_test], axis=0)

In [198]:
# Verify the split: shares, class counts per side, totals, and frame ids present on both sides
n_train, n_test = len(df_2_train), len(df_2_test)
n_total = n_test + n_train
train_no_pain = len(df_2_train[df_2_train['Pain'] == 0])
test_no_pain = len(df_2_test[df_2_test['Pain'] == 0])
train_pain = len(df_2_train[df_2_train['Pain'] > 0])
test_pain = len(df_2_test[df_2_test['Pain'] > 0])
shared_ids = sum(df_2_train['temp_id'].isin(df_2_test['temp_id']))

print("Train:          {:.0%} |".format(n_train / n_total),
      "Test:          {:.0%}".format(n_test / n_total), )
print("Train No Pain: {} |".format(train_no_pain), "Test No Pain: {}".format(test_no_pain))
print("Train Pain:    {} |".format(train_pain),  "Test Pain:    {}".format(test_pain))
print("Train Total:  {} |".format(n_train), "Test Total:   {}".format(n_test))
print()
print("Total:        {}".format(n_train + n_test))
print("----------------------------------------")
print("Duplicates:", shared_ids)

Train:          60% | Test:          40%
Train No Pain: 7396 | Test No Pain: 4936
Train Pain:    7396 | Test Pain:    4936
Train Total:  14792 | Test Total:   9872

Total:        24664
----------------------------------------
Duplicates: 0


### Step 5: Allocate Groups

In [199]:
def allocate_group(df, path):
    """Move every image listed in ``df['img_path']`` into the directory ``path``.

    Args:
        df: DataFrame with an ``img_path`` column holding the current file paths.
        path: Destination directory. It is created, including missing parent
            directories, if it does not exist yet.

    Raises:
        OSError: Propagated from ``os.rename``, e.g. when a listed file does not exist.
    """
    # makedirs with exist_ok handles both a missing parent and an existing destination
    os.makedirs(path, exist_ok=True)

    for f_path in df['img_path'].values:
        # Files keep their base name; only the containing directory changes
        os.rename(f_path, os.path.join(path, os.path.basename(f_path)))

In [200]:
# Allocate Group 1
# Moves the files of the downsampled group 1 frame into the "group_1" folder of the flexible
# augmentation directory. `group_1_path` is reused by the verification cell below.
group_1_path = os.path.join(AUGMENTED_DATA_FLEXIBLE, "group_1")
allocate_group(df_1_downsampled, group_1_path)

In [201]:
# Allocate Group 2 Train / Test: move each split into its own folder, train first
train_path, test_path = (
    os.path.join(AUGMENTED_DATA_FLEXIBLE, folder) for folder in ('group_2_train', 'group_2_test')
)

for split_df, split_path in ((df_2_train, train_path), (df_2_test, test_path)):
    allocate_group(split_df, split_path)

In [202]:
# Verify Success: file counts per folder, plus the number of pain images left in "raw"
raw_path = os.path.join(AUGMENTED_DATA_FLEXIBLE, 'raw')
print("Group 1:        {}".format(len(os.listdir(group_1_path))))
print("Group 2 Train:  {}".format(len(os.listdir(train_path))))
print("Group 2 Test:   {}".format(len(os.listdir(test_path))))
print("Raw:            {}".format(len(os.listdir(raw_path))))
# Clip every pain level to at most 1 so the sum counts images rather than pain intensity
raw_pain_levels = np.array(DL.get_labels(DL.get_image_paths(raw_path)))[:, 4].astype(int)
print("Raw Pain Img's: {}".format(np.sum(np.minimum(raw_pain_levels, 1))))

Group 1:        41416
Group 2 Train:  14792
Group 2 Test:   9872
Raw:            126344
Raw Pain Img's: 0


In [227]:
def print_experiment(experiment):
    """Print a bold banner line that announces the given experiment.

    The experiment title is framed by two equally long runs of dashes whose length is
    half of (130 minus the title length), rounded down; titles longer than 130
    characters get no dashes. Two blank lines precede the banner.
    """
    # A negative repeat count yields an empty string, so overly long titles are safe
    padding = "-" * ((130 - len(experiment)) // 2)
    print("\n\n\033[1m{} Experiment {} {}\033[0m".format(padding, experiment, padding))

In [228]:
# Print the banner for experiment 6 using the helper defined in the previous cell
exp = "6 - Centralized without pre-training"
print_experiment(exp)



[1m----------------------------------------------- Experiment 6 - Centralized without pre-training -----------------------------------------------[0m
