In [4]:
import pandas as pd
from utils import contain_substr

def preprocess_skel(skel_csv, use_position=0, standardize=True):
    """Check whether one video's skeleton features stay inside a reference range.

    The per-frame feature table is read from ``skel_csv`` (indexed by its
    ``frame`` column), a fixed set of bookkeeping columns is removed, and only
    the columns for which ``contain_substr(column, keeps)`` is truthy are kept
    (presumably: the column name contains one of the ``keeps`` substrings --
    confirm against ``utils.contain_substr``).

    When ``standardize`` is truthy, each remaining value is compared with
    quantile bounds computed from ``sampled_skel_features_dec_6.csv``:

    * a column is "qualified" when more than ``int(n_frames * marked)`` of its
      values lie strictly between the ``low_quantile`` and ``upper_quantile``
      quantiles of the reference table;
    * the video passes when the share of qualified columns is greater than
      ``passed``.

    ``low_quantile``, ``upper_quantile``, ``marked`` and ``passed`` are NOT
    parameters: they are looked up as module-level globals at call time and
    must be defined before this function is called.

    Args:
        skel_csv: Path to a CSV of per-frame skeleton features.
        use_position: When truthy, columns matching ``'2D'`` are kept as well.
        standardize: When truthy, run the range check described above.

    Returns:
        1 if the video passes, 0 if it fails (a message naming the number of
        un-qualified columns is printed first). When ``standardize`` is falsy
        no check is run and the function falls through, returning ``None``.
    """
    skel_df = pd.read_csv(skel_csv, index_col='frame')
    skel_df = skel_df.drop(
        columns=['sync_time', 'raw_time', 'body', 'J1_dist_from_J1',
                 'J1_3D_rel_X', 'J1_3D_rel_Y', 'J1_3D_rel_Z'],
        errors='ignore',
    )
    if use_position:
        keeps = ['accel', 'speed', 'dist', 'interhand', '2D', 'rel']
    else:
        keeps = ['accel', 'speed', 'dist', 'interhand', 'rel']

    # Select the wanted columns in one pass instead of dropping the unwanted
    # ones one at a time in place.
    keep_mask = [bool(contain_substr(c, keeps)) for c in skel_df.columns]
    skel_df = skel_df.loc[:, keep_mask]

    if standardize:
        # load sampled skel features, 200 samples for each video.
        combined_runs = pd.read_csv('sampled_skel_features_dec_6.csv')
        upper_bound = combined_runs.quantile(upper_quantile)
        lower_bound = combined_runs.quantile(low_quantile)
        # Flag the values that fall strictly inside the reference range.
        # The flex methods align the bounds with the columns by label
        # (outer join); the bare `<` / `>` operators raise on pandas >= 2.0
        # whenever the two sets of labels are not identical.
        select_indices = (skel_df.lt(upper_bound, axis='columns')
                          & skel_df.gt(lower_bound, axis='columns'))
        # A column qualifies when enough of its frames are in range.
        qualified_columns = (select_indices.sum() > int(len(skel_df) * marked))
        if qualified_columns.sum() / len(qualified_columns) > passed:
            return 1
        print(f"Video {skel_csv} has {len(qualified_columns) - qualified_columns.sum()} un-qualified columns!!!")
        return 0
        # fill N/A

    #     skel_df = skel_df.ffill()
    #
    #     # standardize using global statistics
    #     select_indices = (combined_runs < combined_runs.quantile(upper_quantile)) & (combined_runs > combined_runs.quantile(low_quantile))
    #     combined_runs_q = combined_runs[select_indices]
    #     stats = combined_runs_q.describe().loc[['mean', 'std']]
    #     skel_df = (skel_df - stats.loc['mean', skel_df.columns]) / stats.loc['std', skel_df.columns]
    #
    # return skel_df

In [6]:
import glob
# Gather every per-video skeleton feature file.
skel_csvs = glob.glob('output/skel/*.csv')
# preprocess_skel yields 1 for a passing video and 0 otherwise, so the sum
# is the number of videos that pass with the current thresholds.
qualified_runs = sum(preprocess_skel(skel_csv) for skel_csv in skel_csvs)
qualified_runs

Video output/skel\1.1.3_kinect_skel_features.csv has 21 un-qualified columns!!!
Video output/skel\1.1.5_kinect_skel_features.csv has 23 un-qualified columns!!!




Video output/skel\1.2.10_kinect_skel_features.csv has 111 un-qualified columns!!!
Video output/skel\1.2.2_kinect_skel_features.csv has 119 un-qualified columns!!!
Video output/skel\1.2.4_kinect_skel_features.csv has 23 un-qualified columns!!!
Video output/skel\1.2.5_kinect_skel_features.csv has 43 un-qualified columns!!!
Video output/skel\1.2.6_kinect_skel_features.csv has 16 un-qualified columns!!!
Video output/skel\1.2.8_kinect_skel_features.csv has 19 un-qualified columns!!!
Video output/skel\1.3.5_kinect_skel_features.csv has 112 un-qualified columns!!!
Video output/skel\2.3.9_kinect_skel_features.csv has 16 un-qualified columns!!!
Video output/skel\2.4.5_kinect_skel_features.csv has 21 un-qualified columns!!!
Video output/skel\3.3.1_kinect_skel_features.csv has 26 un-qualified columns!!!
Video output/skel\3.3.2_kinect_skel_features.csv has 30 un-qualified columns!!!
Video output/skel\3.3.3_kinect_skel_features.csv has 25 un-qualified columns!!!
Video output/skel\3.3.4_kinect_skel_

In [9]:
low_quantile = 0.05
upper_quantile = 0.95
marked = 0.8 # if more than 80% of the feature falls inside the range, don't mark it
passed = 0.9 # if more than 90% of the # features are not marked, don't filter it
import numpy as np
from joblib import Parallel, delayed

# preprocess_skel reads `marked` and `passed` as module-level globals, so the
# sweep has to rebind those very names. Remember the configured values and
# put them back afterwards; otherwise the thresholds set above are silently
# replaced by the last grid point.
default_marked, default_passed = marked, passed
# np.arange with a float step produces values such as 0.7999999999999999;
# rounding gives the intended 0.5, 0.6, ..., 0.9 grid.
sweep_values = np.round(np.arange(0.5, 1, 0.1), 1)
sweep_rows = []
try:
    for marked in sweep_values:
        for passed in sweep_values:
            res = Parallel(n_jobs=16)(delayed(preprocess_skel)(skel_csv) for skel_csv in skel_csvs)
            qualified_runs = np.sum(res)
            sweep_rows.append({'marked': marked, 'passed': passed, 'qualified_count': qualified_runs})
finally:
    marked, passed = default_marked, default_passed
# Build the table once from the collected rows so the columns get numeric
# dtypes instead of the object dtype of an incrementally filled empty frame.
data = pd.DataFrame(sweep_rows, columns=['marked', 'passed', 'qualified_count'])

In [10]:
# Show the sweep table: one row per (marked, passed) pair with its qualified_count.
data


Unnamed: 0,marked,passed,qualified_count
0,0.2,0.5,148
1,0.2,0.6,144
2,0.2,0.7,144
3,0.2,0.8,144
4,0.2,0.9,144
5,0.4,0.5,146
6,0.4,0.6,144
7,0.4,0.7,144
8,0.4,0.8,144
9,0.4,0.9,143


In [33]:
# Reshape the sweep into a grid (rows: marked, columns: passed) and turn the
# counts into fractions.
# NOTE(review): 149 is presumably the total number of videos -- confirm it
# matches the number of files that were swept.
data_wide = (
    data
    .pivot(index=['marked'], columns="passed", values="qualified_count")
    .div(149)
)

In [34]:
# Show the pivoted grid (rows: marked, columns: passed) of qualified_count / 149.
data_wide

passed,0.5,0.6,0.7,0.8,0.9
marked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.2,0.993289,0.966443,0.966443,0.966443,0.966443
0.4,0.979866,0.966443,0.966443,0.966443,0.959732
0.6,0.966443,0.966443,0.966443,0.966443,0.946309
0.8,0.966443,0.966443,0.946309,0.85906,0.644295


In [35]:
import plotly.express as px
# Heat map of the values in data_wide; origin='lower' puts the first row at
# the bottom of the image.
fig = (
    px.imshow(data_wide, origin='lower')
    .update_xaxes(title="Percent of Features to Pass as a Video")
    .update_yaxes(title="Percent of Frames to Pass as a Feature")
)
fig.show()