In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import copy
from sklearn.utils import shuffle

import random
from numpy.random import seed
from tensorflow.random import set_seed
# Fix the global seeds of NumPy (numpy.random.seed) and TensorFlow
# (tensorflow.random.set_seed) so random draws repeat from a fresh kernel.
seed(0)
set_seed(0)

# Data Processing: Class Combination Case

Read in the data to generate a pandas DataFrame. Generating it for the first time takes about five minutes; afterwards, loading the saved files should take about 10 seconds.

In [2]:
# --- Paths ---------------------------------------------------------------
datapath = '../data/Pose_Dataset/'  # root folder walked for the raw .json squat files
df_filename = '../data/all-data-combined-classes.pkl'  # where to save concatenated data
np_filename = '../data/np-data-combined-classes.npy'   # where the train/val/test arrays are stored

# --- Data shape ----------------------------------------------------------
nFrames = 300  # number of frames per squat

# --- Class names -> integer labels ---------------------------------------
# The position of a name in this list is its label. The spelling
# 'bad_innner_thigh' (three n's) is deliberate: it is used as a lookup key
# against the dataset folder names, so it must not be "corrected" here.
name_to_label = {
    class_name: class_index
    for class_index, class_name in enumerate([
        'bad_innner_thigh',
        'bad_back_round',
        'bad_back_warp',
        'bad_head',
        'bad_shallow',
        'bad_toe',
        'good',
    ])
}

# --- Error-analysis utility ----------------------------------------------
# Each inner list names classes that are merged into a single class, to see
# whether merging improves model accuracy. Use [] to merge nothing.
# Other groupings that were tried:
#   [['bad_innner_thigh', 'bad_head']]
#   [['bad_head', 'bad_toe']]
#   [['bad_back_warp', 'bad_back_round'], ['bad_head', 'bad_toe']]
classes_to_combine = [
    ['bad_back_warp', 'bad_back_round'],
]

## To regenerate the pickled/npy files, either delete them and re-run this cell (the `else` branch below then rebuilds them),
## or temporarily comment out the `if ... else:` lines and dedent the generation code by one level. Undo that change to load the saved data again.

if os.path.exists(df_filename) and os.path.exists(np_filename):
    # Cache hit: load the dataframe, then the six arrays in the same order
    # in which the generation branch writes them.
    df = pd.read_pickle(df_filename)
    with open(np_filename, 'rb') as f:
        X_train = np.load(f)
        y_train = np.load(f)
        X_val = np.load(f)
        y_val = np.load(f)
        X_test = np.load(f)
        y_test = np.load(f)

    # The saved y_* arrays hold the merged and renumbered labels, but
    # name_to_label is still the un-merged mapping at this point. Rebuild it
    # so that both code paths leave the same mapping behind: members of a
    # merged group share the label of the group's first class, and the
    # remaining label values are renumbered densely from 0.
    # NOTE(review): assumes the cache was generated with the current
    # classes_to_combine and that every class had at least one sample --
    # confirm, or delete the cache files to regenerate.
    merged_name_to_label = dict(name_to_label)
    for group in classes_to_combine:
        for class_name in group:
            merged_name_to_label[class_name] = name_to_label[group[0]]
    dense_index = {
        old: new
        for new, old in enumerate(sorted(set(merged_name_to_label.values())))
    }
    name_to_label = {
        class_name: dense_index[old]
        for class_name, old in merged_name_to_label.items()
    }
else:
    # Cache miss: build the dataframe and the numpy arrays from the raw files.

    # initialize arrays to fill in each loop iteration
    filenames = []  # path of every .json file found
    datas = []      # frame data of every file (feeds the dataframe)
    np_datas = []   # only the files whose array has the expected shape
    labels = []     # integer label for each entry of np_datas

    # read in each squat file
    for cur_dir, _, files in os.walk(datapath):
        for file in files:
            if not file.endswith('.json'):
                continue
            filename = os.path.join(cur_dir, file)
            filenames.append(filename)

            data = pd.read_json(filename).to_numpy()
            data = data[1, 0:nFrames]  # get data for frames. Note some files have 301 frames, truncate all to first nFrames
            datas.append(data)
            np_data = np.array([np.array(d) for d in data])
            # Keep only complete recordings: nFrames frames of 171 values each.
            if np_data.shape == (nFrames, 171):
                np_datas.append(np_data)
                # The class name is the second-to-last component of the
                # directory path; append a label only when data is appended.
                labels.append(name_to_label[cur_dir.split('/')[-2]])
        print(f"Done processing {cur_dir}")

    # move arrays into dataframe: one column per frame plus the filename
    datas = np.array(datas)  # convert to array so we can slice per frame
    d = {'filename': filenames}
    for i in range(nFrames):
        d[str(i)] = datas[:, i]
    df = pd.DataFrame(data=d)

    # add label column based on filename (also makes new naming for combined classes)
    new_classes = ['_or_'.join(classes) for classes in classes_to_combine]
    new_labels = []
    for i in range(len(df)):
        # Path component 3 is the class folder; this depends on the depth of datapath.
        current_label = df['filename'][i].split('/')[3]
        for j in range(len(classes_to_combine)):
            if current_label in classes_to_combine[j]:
                current_label = new_classes[j]
        new_labels.append(current_label)
    df['label'] = new_labels

    # save to pkl
    df.to_pickle(df_filename)

    # write data into numpy arrays and save
    np_datas = np.stack(np_datas)
    labels = np.array(labels)

    # separate data into classes: pre_data[i] / pre_labels[i] hold class i
    num_classes = len(name_to_label)
    pre_data = []
    pre_labels = []
    for i in range(num_classes):
        indices = np.where(labels == i)  # finding indices where particular class is found in labels
        pre_labels.append(labels[indices])
        pre_data.append(np_datas[indices])

    # Merge the requested classes. Every member of a group takes the label of
    # the group's first class, and only the first 1/len(group) of each
    # member's samples is kept.
    modified_name_to_label = copy.deepcopy(name_to_label)

    for i in range(len(classes_to_combine)):
        for j in classes_to_combine[i]:
            label = name_to_label[j]
            label_to_use = name_to_label[classes_to_combine[i][0]]  # choosing the first label in the group of classes to combine
            modified_name_to_label[j] = label_to_use

            # Set fraction = 1 to keep all samples instead of shrinking the
            # merged class to roughly the size of the un-merged classes.
            fraction = 1 / len(classes_to_combine[i])
            pre_data[label] = pre_data[label][0:int(fraction * len(pre_data[label]))]
            pre_labels[label] = [label_to_use] * int(fraction * len(pre_labels[label]))  # replacing original label with the new one based on the combined classes

    # recombine the pre-data and pre-labels
    np_datas_combined = []
    labels_combined = []

    for i in range(len(pre_labels)):
        np_datas_combined.extend(pre_data[i])
        labels_combined.extend(pre_labels[i])
    np_datas_combined = np.array(np_datas_combined)
    labels_combined = np.array(labels_combined)

    # shuffle the data (the combined arrays, not the original un-merged ones)
    np_datas, labels = shuffle(np_datas_combined, labels_combined)

    #modify name_to_label and labels to account for the renumbered classes
    def reorder(name_to_label, labels):
        """Renumber class labels so they form the dense range 0..k-1.

        The distinct values found in ``labels`` are ranked in ascending order
        and each value is replaced by its rank, both in the label array and
        in the name -> label mapping.

        Parameters
        ----------
        name_to_label : dict
            Maps class name to integer label. It is not modified; a new dict
            is returned.
        labels : array-like of int
            Label of every sample. Lists are accepted as well as arrays.

        Returns
        -------
        (dict, numpy.ndarray)
            The renumbered mapping and the renumbered labels (same shape and
            dtype as the input labels).

        Notes
        -----
        A name whose label does not occur in ``labels`` keeps its old value,
        which can then coincide with one of the new dense indices.
        """
        labels = np.asarray(labels)
        present_values = np.unique(labels)  # sorted distinct label values

        # old label value -> its rank among the values that actually occur
        dense_index = {old: new for new, old in enumerate(present_values.tolist())}
        renumbered_names = {
            name: dense_index.get(value, value)
            for name, value in name_to_label.items()
        }

        # Every label is present in present_values, so its insertion point in
        # that sorted array is exactly its rank.
        renumbered_labels = np.searchsorted(present_values, labels).astype(labels.dtype, copy=False)
        return renumbered_names, renumbered_labels
    
    # Renumber the merged labels densely and adopt the updated mapping.
    name_to_label, labels = reorder(modified_name_to_label, labels)

    # Random 80% / 10% / 10% split into train, validation and test sets.
    n_samples = np_datas.shape[0]
    train_end = int(n_samples * 0.8)
    val_end = int(n_samples * 0.9)
    shuffled_positions = np.random.choice(n_samples, n_samples, replace=False)
    train_indices, val_indices, test_indices = np.split(shuffled_positions, [train_end, val_end])

    X_train, y_train = np_datas[train_indices], labels[train_indices]
    X_val, y_val = np_datas[val_indices], labels[val_indices]
    X_test, y_test = np_datas[test_indices], labels[test_indices]

    # Write the six arrays back to back into one file; the loading branch
    # reads them in this same order.
    with open(np_filename, 'wb') as cache_file:
        for split_array in (X_train, y_train, X_val, y_val, X_test, y_test):
            np.save(cache_file, split_array)

Done processing ../data/Pose_Dataset/
Done processing ../data/Pose_Dataset/good
Done processing ../data/Pose_Dataset/good/1115_3djoints_index
Done processing ../data/Pose_Dataset/bad_toe
Done processing ../data/Pose_Dataset/bad_toe/1115_3djoints_index
Done processing ../data/Pose_Dataset/bad_shallow
Done processing ../data/Pose_Dataset/bad_shallow/1115_3djoints_index
Done processing ../data/Pose_Dataset/bad_innner_thigh
Done processing ../data/Pose_Dataset/bad_innner_thigh/1115_3djoints_index
Done processing ../data/Pose_Dataset/bad_back_round
Done processing ../data/Pose_Dataset/bad_back_round/1115_3djoints_index
Done processing ../data/Pose_Dataset/bad_back_warp
Done processing ../data/Pose_Dataset/bad_back_warp/1115_3djoints_index
Done processing ../data/Pose_Dataset/bad_head
Done processing ../data/Pose_Dataset/bad_head/1115_3djoints_index


Print number of examples in each class

In [3]:
# Number of dataframe rows per class name.
df_labels = df.label.unique()
for label in df_labels:
    print('%s: %i' % (label, (df['label'] == label).sum()))

nClasses = len(df_labels)
print('nClasses:', nClasses)

# Number of samples per integer label in each split.
for header, split_labels in (('\n Train', y_train), ('\n Val', y_val), ('\n Test', y_test)):
    print(header)
    for i in range(nClasses):
        print('%s: %i' % (i, np.count_nonzero(split_labels == i)))

good: 293
bad_toe: 295
bad_shallow: 319
bad_innner_thigh: 230
bad_back_warp_or_bad_back_round: 592
bad_head: 272
nClasses: 6

 Train
0: 177
1: 238
2: 217
3: 265
4: 229
5: 237

 Val
0: 29
1: 34
2: 23
3: 26
4: 31
5: 27

 Test
0: 23
1: 24
2: 32
3: 28
4: 35
5: 29


In [10]:
# One-hot encode the integer labels of every split. The results go into new
# *_onehot variables and y_train / y_val / y_test are left as they are, so
# this cell can be re-run safely.
y_train_onehot, y_val_onehot, y_test_onehot = (
    tf.keras.utils.to_categorical(split_labels, num_classes=nClasses)
    for split_labels in (y_train, y_val, y_test)
)

In [11]:
def _batched(features, onehot_targets, batch_size):
    """Pair features with one-hot targets as a tf.data.Dataset split into batches."""
    paired = tf.data.Dataset.from_tensor_slices((features, onehot_targets))
    return paired.batch(batch_size)


# Export train / val / test datasets once per batch size; the batch size is
# part of the output folder name.
for BATCH_SIZE in [16, 64, 128]:
    train_dset = _batched(X_train, y_train_onehot, BATCH_SIZE)
    val_dset = _batched(X_val, y_val_onehot, BATCH_SIZE)
    test_dset = _batched(X_test, y_test_onehot, BATCH_SIZE)

    tf.data.experimental.save(train_dset, f'../data/dsets_combined_classes/train_dset-{BATCH_SIZE}')
    tf.data.experimental.save(val_dset, f'../data/dsets_combined_classes/val_dset-{BATCH_SIZE}')
    tf.data.experimental.save(test_dset, f'../data/dsets_combined_classes/test_dset-{BATCH_SIZE}')

In [6]:
# # save datasets
# tf.data.experimental.save(train_dset,'../data/dsets_combined_classes/train_dset')
# tf.data.experimental.save(val_dset,'../data/dsets_combined_classes/val_dset')
# tf.data.experimental.save(test_dset,'../data/dsets_combined_classes/test_dset')

In [7]:
# print element_spec for input to loading model in other notebooks
# When the cells are run top to bottom, train_dset is the dataset from the
# last pass of the batching loop (BATCH_SIZE = 128). The output below shows
# the batch dimension as None.
train_dset.element_spec

(TensorSpec(shape=(None, 300, 171), dtype=tf.float64, name=None),
 TensorSpec(shape=(None, 6), dtype=tf.float32, name=None))

In [8]:
## Variables to use for the confusion matrices:
## df_labels holds the class names found in df's 'label' column, and
## name_to_label maps each class name to its integer label.
print(df_labels,'\n', name_to_label)



['good' 'bad_toe' 'bad_shallow' 'bad_innner_thigh'
 'bad_back_warp_or_bad_back_round' 'bad_head'] 
 {'bad_innner_thigh': 0, 'bad_back_round': 1, 'bad_back_warp': 1, 'bad_head': 2, 'bad_shallow': 3, 'bad_toe': 4, 'good': 5}


`display_labels` contains the same names as `df_labels`, but reordered so that they run from the smallest to the largest value in `name_to_label`.

ex. for df_labels = ['good' 'bad_toe' 'bad_shallow' 'bad_innner_thigh_or_bad_head'
 'bad_back_round' 'bad_back_warp']
 
 and name_to_label = {'bad_innner_thigh': 0, 'bad_back_round': 1, 'bad_back_warp': 2, 'bad_head': 0, 'bad_shallow': 3, 'bad_toe': 4, 'good': 5}
 
 display_labels = ['bad_innner_thigh_or_bad_head', 'bad_back_round', 'bad_back_warp', 'bad_shallow', 'bad_toe', 'good']

In [9]:
# Derive display_labels from the class configuration instead of hard-coding
# it. The previous hard-coded list named seven un-merged classes, which does
# not match the merged classes whenever classes_to_combine is non-empty.
#
# Rule: a merged group is shown as its member names joined with '_or_' (the
# same naming used for df['label']), and names are ordered by integer label.
# Group members are ranked by the label of the group's first class, so the
# result is the same whether name_to_label still holds the original values
# or the merged/renumbered ones.
# The names keep the dataset's own spelling (e.g. 'bad_innner_thigh').
merged_display_name = {}
group_head = {}
for group in classes_to_combine:
    for member in group:
        merged_display_name.setdefault(member, '_or_'.join(group))
        group_head.setdefault(member, group[0])

label_of_display_name = {}
for class_name in name_to_label:
    display_name = merged_display_name.get(class_name, class_name)
    effective_label = name_to_label[group_head.get(class_name, class_name)]
    label_of_display_name.setdefault(display_name, effective_label)

display_labels = sorted(label_of_display_name, key=label_of_display_name.get)

