# Pain Data Preparation
This notebook prepares the pain dataset so that a convolutional neural network can be trained on it successfully. Preprocessing and data augmentation techniques such as greyscaling, histogram equalization, etc. are employed.

In [1]:
# Relevant imports
import os
import sys
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import math

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from Scripts import Data_Loader_Functions as DL
from Scripts import Image_Processor as IP

In [2]:
# Locations of the pain images at each pipeline stage, all below <module_path>/Data
RAW_DATA, PREPROCESSED_DATA, AUGMENTED_DATA, AUGMENTED_DATA_TWOSTEP = (
    os.path.join(module_path, "Data", stage_folder, dataset_folder)
    for stage_folder, dataset_folder in (
        ("Raw Data", "Pain"),
        ("Preprocessed Data", "Pain"),
        ("Augmented Data", "Pain"),
        ("Augmented Data", "Pain Two-Step Augmentation"),
    )
)

## Create Folder Structure
First, we will duplicate the folder structure in "Raw Data" into "Preprocessed Data" and "Augmented Data".

In [None]:
# Replicate the raw-data folder tree under each downstream data folder, in the original order
for target_root in (PREPROCESSED_DATA, AUGMENTED_DATA, AUGMENTED_DATA_TWOSTEP):
    DL.mirror_folder_structure(RAW_DATA, target_root)

## Explore Data

In [None]:
# Count pain vs. no-pain frames in the raw dataset (label column 4 holds the pain level)
img_paths = np.array(DL.get_image_paths(RAW_DATA))
labels = np.array(DL.get_labels(img_paths))
pain_levels = labels[:, 4].astype(int)
no_pain_labels = labels[pain_levels == 0]
pain_labels = labels[pain_levels > 0]
print("Pain Labels:", len(pain_labels))
print("No Pain Labels:", len(no_pain_labels))

In [None]:
# Count the entries (one per client) in each group folder, ignoring macOS '.DS_Store' files
group_1_dir = os.path.join(RAW_DATA, "group_1")
group_2_dir = os.path.join(RAW_DATA, "group_2")
g1_img_paths = [entry for entry in os.listdir(group_1_dir) if entry != '.DS_Store']
g2_img_paths = [entry for entry in os.listdir(group_2_dir) if entry != '.DS_Store']
print("Group 1:", len(g1_img_paths))
print("Group 2:", len(g2_img_paths))

In [None]:
# Build one label table per group, tag each with its group number, and count the
# distinct sessions of every (person, group) pair
label_columns = ['Person', 'Session', 'Culture', 'Frame', 'Pain']

g1_img_paths = np.array(DL.get_image_paths(os.path.join(RAW_DATA, "group_1")))
g1_labels = np.array(DL.get_labels(g1_img_paths))
df_1 = pd.DataFrame(g1_labels, columns=label_columns).astype(int)
df_1['Group'] = 1

g2_img_paths = np.array(DL.get_image_paths(os.path.join(RAW_DATA, "group_2")))
g2_labels = np.array(DL.get_labels(g2_img_paths))
df_2 = pd.DataFrame(g2_labels, columns=label_columns).astype(int)
df_2['Group'] = 2

df = pd.concat([df_1, df_2])
sessions_per_person = df.groupby(['Person', 'Group'])['Session'].nunique()
sess_num = pd.DataFrame(sessions_per_person).sort_values(['Group', 'Person'])
sess_num

In [None]:
# Mean number of distinct sessions per person, computed separately for each group
g1_mean_sessions = df_1.groupby('Person')['Session'].nunique().mean()
g2_mean_sessions = df_2.groupby('Person')['Session'].nunique().mean()
print("Average Sessions Group 1: {0:.2f}".format(g1_mean_sessions))
print("Average Sessions Group 2: {0:.2f}".format(g2_mean_sessions))

In [None]:
# Pain / No Pain per group
# The values are printed in the order the label announces them: pain frames, no-pain frames,
# then the pain-to-no-pain ratio. (The original printed the no-pain count first and never
# printed the ratio.)
for group_id in (1, 2):
    in_group = df['Group'] == group_id
    pain_count = int((in_group & (df['Pain'] > 0)).sum())
    no_pain_count = int((in_group & (df['Pain'] == 0)).sum())
    # Guard against a group without any no-pain frames
    ratio = pain_count / no_pain_count if no_pain_count else float('nan')
    print("Group {} Pain/No Pain/Ratio: {} | {} | {:.2f}".format(group_id, pain_count, no_pain_count, ratio))

## Process Images
We will now process the images. Preprocessing includes converting to greyscale, and histogram equalization.

In [None]:
# Preprocess images: run the project's bulk image processor from RAW_DATA into PREPROCESSED_DATA.
# According to this notebook's description, preprocessing covers greyscale conversion and
# histogram equalization. The ".jpg" argument presumably selects which files are processed --
# confirm against Image_Processor.bulk_process_images.
IP.bulk_process_images(RAW_DATA, PREPROCESSED_DATA, ".jpg")

In [None]:
# Write a flipped version and an original copy of every preprocessed ".jpg" into the
# two-step augmentation folder (flip first, then original, as before)
for augmentation in ("flip", "original"):
    IP.bulk_augment_images(PREPROCESSED_DATA, AUGMENTED_DATA_TWOSTEP, ".jpg", augmentation, "pain", label_threshold=-1)

In [None]:
# Apply the "rotate_crop" augmentation in place, first to the flipped images and then to the
# originals, and afterwards rename files so that naming conventions stay consistent
for file_suffix in ("_flipped.jpg", "_original.jpg"):
    IP.bulk_augment_images(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, file_suffix, "rotate_crop", "pain", label_threshold=-1)
IP.bulk_rename_files(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, "_rotated", "_straight")

In [None]:
# Crop images to same maximum width and height (10-degree rotation in previous step cropped rotated images 
# down to (215, 215), so this is chosen as a max width/height)
# NOTE(review): the rename call below is identical to the one that closes the previous cell;
# presumably it is a no-op when that cell has already run -- confirm and drop one of the two.
IP.bulk_rename_files(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, "_rotated", "_straight")
IP.bulk_crop_images(AUGMENTED_DATA_TWOSTEP, AUGMENTED_DATA_TWOSTEP, (215, 215))

In [None]:
# Downsample the augmented data, first the "training" folder and then the "test" folder
for split_name in ("training", "test"):
    DL.downsample_data(os.path.join(AUGMENTED_DATA_TWOSTEP, split_name))

## Split dataset by clients

In [None]:
# Build a label table for all augmented images: five integer label columns, two
# transformation columns and the image path (the latter three stored as strings)
int_columns = ['Person', 'Session', 'Culture', 'Frame', 'Pain']
str_columns = ['Trans_1', 'Trans_2', 'img_path']

img_paths = DL.get_image_paths(AUGMENTED_DATA_TWOSTEP)
labels = DL.get_labels(img_paths)
df = pd.DataFrame(labels, columns=int_columns + ['Trans_1', 'Trans_2'])
df[int_columns] = df[int_columns].astype(int)
df['img_path'] = img_paths
df[str_columns] = df[str_columns].astype(str)

## Split Dataset (Randomly)
Splitting the dataset into training data and test data, by sampling without replacement from the train_data

In [None]:
# Get all images and select 20% at random
TEST_FRACTION = 0.2
SPLIT_SEED = 42  # fixed seed so that the same shuffle is drawn on every run

img_paths = DL.get_image_paths(AUGMENTED_DATA_TWOSTEP)
# Shuffle in place with a dedicated seeded generator instead of the unseeded global one.
# NOTE(review): full reproducibility also requires get_image_paths to return the paths in a
# stable order -- confirm, or sort the paths before shuffling.
np.random.RandomState(SPLIT_SEED).shuffle(img_paths)
split_idx = int(len(img_paths)*TEST_FRACTION)
img_paths_test = img_paths[:split_idx]

In [None]:
# Share of each client's images that landed in the test selection (expected to be ~20%).
# Column 0 of the labels identifies the client. The element-wise division assumes that every
# client appears in the test selection, so that both count arrays line up.
test_clients = np.array(DL.get_labels(img_paths_test))[:, 0]
all_clients = np.array(DL.get_labels(img_paths))[:, 0]
img_per_client_test = np.unique(test_clients, return_counts=True)[1]
img_per_client_total = np.unique(all_clients, return_counts=True)[1]
img_per_client_test / img_per_client_total

In [None]:
# Ensure that the test set is balanced
# The printed share is computed from the test selection itself. The original divided the
# raw-dataset `pain_labels` by an undefined `all_labels`, which raises NameError.
all_test_labels = np.array(DL.get_labels(img_paths_test))
is_pain = all_test_labels[:, 4].astype(int) >= 1  # label column 4 holds the pain level
pain = np.array(img_paths_test)[is_pain]
pain_test_labels = np.array(DL.get_labels(pain))
print("Test Pain Split:", len(pain_test_labels) / len(all_test_labels))

In [None]:
# Investigate the split for each client in the test set.
# `all_test_labels` holds the labels of the images selected for the test set in the previous cell.
DL.print_pain_split_per_client(all_test_labels)

In [None]:
# Move test data set into test folder
missing_files = []
for src in img_paths_test:
    file = os.path.basename(src)
    # The 'test' folder is created next to the folder that lies two levels above the image's own folder
    dest = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(src))), 'test')
    if not os.path.isdir(dest):
        os.mkdir(dest)
    try:
        os.rename(src, os.path.join(dest, file))
    except FileNotFoundError:
        # Still best-effort (e.g. when the cell is re-run after files were moved),
        # but the skipped files are recorded instead of being ignored silently
        missing_files.append(src)

if missing_files:
    print("Skipped {} file(s) that were not found at their source path".format(len(missing_files)))

## Verify Images
In this part we check that the image augmentation had the desired results.

In [None]:
# Print the pain label distribution of the two-step augmented data folder.
# NOTE(review): `test` is assigned here but not used in this cell; the distribution is printed
# for the whole AUGMENTED_DATA_TWOSTEP folder -- confirm whether the test folder was intended.
test = os.path.join(AUGMENTED_DATA_TWOSTEP, "test")
print("DISTRIBUTION")
DL.print_pain_label_dist(AUGMENTED_DATA_TWOSTEP)

In [3]:
# Load the images and labels of the "training" and "test" folders into numpy arrays
PAIN_TRAIN, PAIN_TEST = (
    os.path.join(AUGMENTED_DATA_TWOSTEP, split_folder) for split_folder in ("training", "test")
)
train_data, train_labels, test_data, test_labels = DL.load_pain_data(PAIN_TRAIN, PAIN_TEST, None)

0 images processed
1000 images processed
2000 images processed
3000 images processed
4000 images processed
5000 images processed
6000 images processed
7000 images processed
8000 images processed
9000 images processed
10000 images processed
11000 images processed


In [None]:
# Flipped/original counts (second-to-last label column), first for train and then for test
for split_labels in (train_labels, test_labels):
    print(np.unique(split_labels[:, -2], return_counts=True))

In [None]:
# Show exemplary label: the first row of the training labels, to inspect the label layout
train_labels[0]

In [4]:
# Reduce Pain Labels down to 0/1
pain_label = 4       # index of the pain column within a label row
max_pain_level = 1   # passed as `max_pain` to the reduction helper

# The builtin `int` replaces `np.int`, which was a plain alias for it and has been removed
# from recent NumPy releases (deprecated in 1.20, removed in 1.24).
# NOTE(review): the train-label reduction stays disabled, as in the original, so `train_labels`
# keeps all label columns while `test_labels` is replaced by the reduced pain labels --
# confirm that this asymmetry is intended. Re-running this cell applies the column indexing
# to the already reduced `test_labels`.
# train_labels = DL.reduce_pain_label_categories(train_labels[:,pain_label].astype(int), max_pain=max_pain_level)
test_labels = DL.reduce_pain_label_categories(test_labels[:,pain_label].astype(int), max_pain=max_pain_level)

In [None]:
# Unique label values and their counts, first for train and then for test
for split_labels in (train_labels, test_labels):
    print(np.unique(split_labels, return_counts=True))

In [None]:
# Print the array shapes of the loaded data and labels
for caption, array in (
    ("Train Data: Shape", train_data),
    ("Train Labels: Shape", train_labels),
    ("Test Data: Shape", test_data),
    ("Test Labels: Shape", test_labels),
):
    print(caption, array.shape)

## Model Testing
Here we build a simple Keras model

In [5]:
def build_cnn(input_shape):
    """
    Build and return a simple, uncompiled CNN model for image classification.

    Configuration:
    Layer 1: Convolution Layer | Filters: 32  | Kernel Size: 5x5 | no activation argument
    Layer 2: Max Pooling Layer | Pool Size: 2x2
    Layer 3: Convolution Layer | Filters: 64  | Kernel Size: 5x5 | no activation argument
    Layer 4: Max Pooling Layer | Pool Size: 2x2
    Layer 5: Convolution Layer | Filters: 128 | Kernel Size: 5x5 | no activation argument
    Layer 6: Max Pooling Layer | Pool Size: 2x2
    Layer 7: Flatten Layer
    Layer 8: Dense Layer       | Neurons: 128 | Activation: Relu
    Layer 9: Dense Layer       | Neurons: 2   | Activation: Softmax

    The model is NOT compiled here: optimizer, loss and metrics are chosen by the caller
    through model.compile().

    The function reads the module-level names `models` and `layers`, so they must be
    defined before the function is called.

    NOTE(review): no activation is passed to the convolution layers, whereas an earlier
    version of this docstring listed Relu for them -- confirm that this is intended.

    :param input_shape:     image input shape (tuple), e.g. (28, 28, 1)

    :return:
        model               uncompiled tensorflow model
    """

    # Set up model type
    model = models.Sequential()

    # Add layers
    model.add(layers.Conv2D(32, (5, 5), input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (5, 5)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (5, 5)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(2, activation='softmax'))

    return model

In [21]:
# Imports
# `models` and `layers` are read by build_cnn, so this cell has to run before build_cnn is called.
import tensorflow as tf
models = tf.keras.models  # like 'from tensorflow.keras import models' (PyCharm import issue workaround)
layers = tf.keras.layers  # like 'from tensorflow.keras import layers' (PyCharm import issue workaround)
optimizers = tf.keras.optimizers  # like 'from tensorflow.keras import optimizers' (PyCharm import issue workaround)

In [27]:
# Metrics passed to model.compile() for evaluation, in this order:
# accuracy, recall, precision, precision-recall AUC, true positives, true negatives,
# false positives, false negatives.
# NOTE(review): in the outputs recorded in this notebook, TP + TN + FP + FN equals twice the
# number of evaluated samples (60 for 30 samples, 1000 for 500 samples). Presumably the
# confusion-matrix based metrics are computed over both softmax outputs instead of once per
# sample -- verify before relying on Recall / Precision / AUC / F1.
metrics = [
        'accuracy',
        tf.metrics.Recall(),
        tf.metrics.Precision(),
        tf.metrics.AUC(curve='PR'),
        tf.metrics.TruePositives(),
        tf.metrics.TrueNegatives(),
        tf.metrics.FalsePositives(),
        tf.metrics.FalseNegatives(),
    ]

In [14]:
# Build the CNN; the input shape is taken from the first test image
model = build_cnn(test_data[0].shape)

In [36]:
# Training loop (smoke test): trains and evaluates on the first few test samples
epochs = 2
n_samples = 30

# Label embedded in the log file name. `train_type` was never defined anywhere in the notebook
# and only existed as leftover kernel state, so the cell raised NameError on a fresh kernel.
# Adjust the value to describe the run being logged.
train_type = 'debug'

# Loss followed by the entries of `metrics`, in the order model.evaluate() returns them
result_columns = ['Loss', 'Accuracy', 'Recall', 'Precision', 'AUC', 'TP', 'TN', 'FP', 'FN']

results = []
for epoch in range(epochs):
    # Training: compiled with accuracy only, as in the original
    model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # `use_multiprocessing=True` was dropped: the option only applies to generator / Sequence
    # inputs and has no effect on in-memory arrays
    model.fit(test_data[:n_samples], test_labels[:n_samples], epochs=1, batch_size=32, validation_split=0)

    # Evaluating: recompile with the full metric list before evaluating
    model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=metrics)
    epoch_results = model.evaluate(test_data[:n_samples], test_labels[:n_samples], batch_size=1)
    results.append(epoch_results)

    # Rebuild the cumulative results table and write one log file per epoch
    df = pd.DataFrame(results, columns=result_columns)
    # F1 is the harmonic mean of precision and recall (NaN when both are zero)
    f1_score = 2*((df['Precision']*df['Recall'])/(df['Precision']+df['Recall']))
    df['F1_Score'] = f1_score
    df.to_csv('log_results_epoch-{}_type-{}.csv'.format(epoch, train_type))

Train on 30 samples
Train on 30 samples


In [37]:
# Display the results table assembled in the training loop (one row per epoch)
df

Unnamed: 0,Loss,Accuracy,Recall,Precision,AUC,TP,TN,FP,FN,F1_Score
0,0.65581,0.666667,0.5,0.566667,0.602681,17.0,13.0,13.0,17.0,0.53125
1,0.647472,0.7,0.5,0.566667,0.614824,17.0,13.0,13.0,17.0,0.53125


In [16]:
# Train for one epoch on the first 500 test samples.
# `use_multiprocessing=True` was dropped: the option only applies to generator / Sequence
# inputs and has no effect on in-memory arrays.
model.fit(test_data[:500], test_labels[:500], epochs=1, batch_size=32, validation_split=0)

Train on 500 samples


<tensorflow.python.keras.callbacks.History at 0x142378e10>

In [29]:
# Recompile with the full metric list and evaluate on the first 500 test samples, one sample
# per batch. The displayed list holds the loss followed by the values of the entries in `metrics`.
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=metrics)
model.evaluate(test_data[:500], test_labels[:500], batch_size=1)



[0.6754372404813767, 0.616, 0.5, 0.512, 0.6207885, 256.0, 244.0, 244.0, 256.0]