In [1]:
# !conda install tqdm -y

In [2]:
# !pip install seaborn

In [3]:
import os
import ast
import glob
import math
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [4]:
# !pip install sktime

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, IsolationForest, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sktime.transformations.panel.rocket import MiniRocketMultivariate

In [6]:
from tsai.all import *

In [7]:
my_setup()

os              : Linux-5.14.0-503.11.1.el9_5.x86_64-x86_64-with-glibc2.34
python          : 3.12.8
tsai            : 0.4.1
fastai          : 2.8.2
fastcore        : 1.8.2
torch           : 2.5.1+cu124
device          : 1 gpu (['NVIDIA L40S'])
cpu cores       : 64
threads per cpu : 1
RAM             : 1007.15 GB
GPU memory      : [44.99, 44.99, 44.99, 44.99] GB


In [137]:
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

In [121]:
def extend_df(
        df: pd.DataFrame, 
        target_size: int
    ) -> pd.DataFrame:

    current_size = len(df)
    last_row = df.iloc[[-1]]
    repeats = target_size - current_size
    extension = pd.concat([last_row] * repeats, ignore_index=True)
    extension.index = range(current_size, target_size)
    
    return pd.concat([df, extension])

In [9]:
df_meta = pd.read_csv('../../sensor_fusion/df_meta_withJSON_withTimes_goodDetections_min2700.csv', index_col=0)

In [10]:
df_meta.head()

Unnamed: 0,study_id,premature,gestational_age_weeks,post_menstrual_age_days,adjusted_age_weeks,chronological_age_weeks,over3m,sex,diagnosis,y_true,...,dirname_json,start_time1,stop_time1,start_time2,stop_time2,num_frames_mkv,num_frames_json,num_frames_diff,num_scaled_snippets,fnames_scaled_snippets
0,592,0,38,294.0,40,4,0,1,Normal,0,...,../../sensor_fusion/pred_results_AE5K/592-aafcb787-M-28-2010221627,2700,5400,-1,-1,7196,7197,1,2,"['592-aafcb787-M-28-2010221627_df_scaled_snippet_idx0.csv', '592-aafcb787-M-28-2010221627_df_scaled_snippet_idx1.csv']"
1,1838,0,40,330.0,47,7,0,1,Normal,0,...,../../sensor_fusion/pred_results_AE5K/1838-1976fc8e-M-44-2010271018,3450,6150,-1,-1,7202,7203,1,2,"['1838-1976fc8e-M-44-2010271018_df_scaled_snippet_idx0.csv', '1838-1976fc8e-M-44-2010271018_df_scaled_snippet_idx1.csv']"
2,8015,0,39,314.0,43,5,0,1,Normal,0,...,../../sensor_fusion/pred_results_AE5K/8015-d6a22f17-M-39-2010291126,1500,4200,-1,-1,7196,7197,1,2,"['8015-d6a22f17-M-39-2010291126_df_scaled_snippet_idx0.csv', '8015-d6a22f17-M-39-2010291126_df_scaled_snippet_idx1.csv']"
3,8026,0,40,488.0,69,29,1,1,Normal,0,...,../../sensor_fusion/pred_results_AE5K/8026-0e213ab6-M-207-2010301654,960,3660,-1,-1,7196,7197,1,2,"['8026-0e213ab6-M-207-2010301654_df_scaled_snippet_idx0.csv', '8026-0e213ab6-M-207-2010301654_df_scaled_snippet_idx1.csv']"
4,24034,0,37,339.0,45,11,0,1,Normal,0,...,../../sensor_fusion/pred_results_AE5K/24034-15d9bab8-M-76-2011051205_M,3690,6390,-1,-1,7196,7197,1,2,"['24034-15d9bab8-M-76-2011051205_M_df_scaled_snippet_idx0.csv', '24034-15d9bab8-M-76-2011051205_M_df_scaled_snippet_idx1.csv']"


In [11]:
# df_meta.shape

In [163]:
df_meta_Normal = df_meta[df_meta['diagnosis']=='Normal']
df_meta_CP     = df_meta[df_meta['diagnosis']=='CP']

In [164]:
# df_meta_Normal.shape, df_meta_CP.shape

In [165]:
df_meta_CP_train, df_meta_CP_test = train_test_split(
    df_meta_CP, test_size=0.25
)

In [166]:
num_snippets_CP_train = int(df_meta_CP_train['num_scaled_snippets'].sum())

In [167]:
num_snippets_CP_test  = int(df_meta_CP_test['num_scaled_snippets'].sum())

In [168]:
num_snippets_CP_train, num_snippets_CP_test

(67, 20)

In [169]:
dfs_combined_CP_train = []

for i in tqdm( range( len(df_meta_CP_train) ) ):

    row = df_meta_CP_train.iloc[i]
    
    dirname_json = row['dirname_json']
    
    fnames_scaled_snippets_tmp  = ast.literal_eval( row['fnames_scaled_snippets'] )
    fnames_scaled_snippets_full = [ os.path.join( dirname_json, fname ) for fname in fnames_scaled_snippets_tmp ]

    for j, fname in enumerate(fnames_scaled_snippets_full[:]):

        fname_asterisk = fname.replace('.csv', '_*.csv')
        
        fnames_dfs = glob.glob( fname_asterisk )
        # fnames_dfs = [ fname for fname in glob.glob( fname_asterisk ) if 'pos_3d' in fname ]

        dfs = []
        for fname_df in fnames_dfs:
            df = pd.read_csv(fname_df, index_col=0)
            dfs.append(df)

        dfs_combined = pd.concat( dfs, axis=1 )
    
        if len(dfs_combined)==2700:
            dfs_combined_CP_train.append( dfs_combined )
            
        elif (len(dfs_combined) < 2700) & (len(dfs_combined) > 2680):
            dfs_combined_extended = extend_df( dfs_combined, 2700 )
            dfs_combined_CP_train.append( dfs_combined_extended )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:03<00:00,  7.37it/s]


In [170]:
# len(dfs_combined_CP_train)

In [171]:
# df_meta_CP[ df_meta_CP['over3m']==0 ]['num_scaled_snippets'].sum()

In [172]:
# df_meta_CP[ df_meta_CP['over3m']==1 ]['num_scaled_snippets'].sum()

In [173]:
dfs_combined_CP_test = []

for i in tqdm( range( len(df_meta_CP_test) ) ):

    row = df_meta_CP_test.iloc[i]
    
    dirname_json = row['dirname_json']

    fnames_scaled_snippets_tmp  = ast.literal_eval( row['fnames_scaled_snippets'] )
    fnames_scaled_snippets_full = [ os.path.join( dirname_json, fname ) for fname in fnames_scaled_snippets_tmp ]

    for j, fname in enumerate(fnames_scaled_snippets_full[:]):

        fname_asterisk = fname.replace('.csv', '_*.csv')
        
        fnames_dfs = glob.glob( fname_asterisk )
        # fnames_dfs = [ fname for fname in glob.glob( fname_asterisk ) if 'pos_3d' in fname ]

        dfs = []
        for fname_df in fnames_dfs:
            df = pd.read_csv(fname_df, index_col=0)
            dfs.append(df)

        dfs_combined = pd.concat( dfs, axis=1 )
    
        if len(dfs_combined)==2700:
            dfs_combined_CP_test.append( dfs_combined )
            
        elif (len(dfs_combined) < 2700) & (len(dfs_combined) > 2680):
            dfs_combined_extended = extend_df( dfs_combined, 2700 )
            dfs_combined_CP_test.append( dfs_combined_extended )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.12it/s]


In [174]:
len(dfs_combined_CP_test)

20

In [175]:
dfs_combined_Normal = []

for _, row in df_meta_Normal.sample( 
        num_snippets_CP_train + num_snippets_CP_test,
    ).iterrows():

    dirname_json = row['dirname_json']
    
    fnames_scaled_snippets_tmp  = ast.literal_eval( row['fnames_scaled_snippets'] )
    fnames_scaled_snippets_full = [ os.path.join( dirname_json, fname ) for fname in fnames_scaled_snippets_tmp ]

    for fname in fnames_scaled_snippets_full[:1]:

        fname_asterisk = fname.replace('.csv', '_*.csv')
        
        fnames_dfs = glob.glob( fname_asterisk )
        # fnames_dfs = [ fname for fname in glob.glob( fname_asterisk ) if 'pos_3d' in fname ]

        dfs = []
        for fname_df in fnames_dfs:
            df = pd.read_csv(fname_df, index_col=0)
            dfs.append(df)

        dfs_combined = pd.concat( dfs, axis=1 )
    
        if len(dfs_combined)==2700:
            dfs_combined_Normal.append( dfs_combined )

        elif (len(dfs_combined) < 2700) & (len(dfs_combined) > 2680):
            dfs_combined_extended = extend_df( dfs_combined, 2700 )
            dfs_combined_Normal.append( dfs_combined_extended )

In [176]:
dfs_combined_Normal_train = dfs_combined_Normal[:len(dfs_combined_CP_train)]

In [177]:
dfs_combined_Normal_test  = dfs_combined_Normal[len(dfs_combined_CP_train):]

In [178]:
# len(dfs_combined_Normal_train), len(dfs_combined_Normal_test)

In [179]:
dfs_combined_train = dfs_combined_CP_train + dfs_combined_Normal_train

In [180]:
dfs_combined_test  = dfs_combined_CP_test + dfs_combined_Normal_test

In [181]:
# len(dfs_combined_train), len(dfs_combined_test)

In [182]:
# dfs_combined_test[0]

In [184]:
### BREAK HERE ####

In [25]:
dfs_combined_CP_test = []

for i in tqdm( range( len(df_meta_CP_test) ) ):

    row = df_meta_CP_test.iloc[i]
    
    dirname_json = row['dirname_json']

    fnames_scaled_snippets_tmp  = ast.literal_eval( row['fnames_scaled_snippets'] )
    fnames_scaled_snippets_full = [ os.path.join( dirname_json, fname ) for fname in fnames_scaled_snippets_tmp ]

    for j, fname in enumerate(fnames_scaled_snippets_full[:]):

        fname_asterisk = fname.replace('.csv', '_*.csv')
        
        fnames_dfs = glob.glob( fname_asterisk )

        dfs = []
        for fname_df in fnames_dfs:
            df = pd.read_csv(fname_df, index_col=0)
            dfs.append(df)

        dfs_combined = pd.concat( dfs, axis=1 )
    
        if len(dfs_combined)==2700:
            dfs_combined_CP_test.append( dfs_combined )
            
        elif (len(dfs_combined) < 2700) & (len(dfs_combined) > 2680):
            dfs_combined_extended = extend_df( dfs_combined, 2700 )
            dfs_combined_CP_test.append( dfs_combined_extended )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.92it/s]


In [26]:
len(dfs_combined_CP_test)

23

In [27]:
dfs_combined_Normal = []

for _, row in df_meta_Normal.sample( 
        num_snippets_CP_train + num_snippets_CP_test,
    ).iterrows():

    dirname_json = row['dirname_json']
    
    fnames_scaled_snippets_tmp  = ast.literal_eval( row['fnames_scaled_snippets'] )
    fnames_scaled_snippets_full = [ os.path.join( dirname_json, fname ) for fname in fnames_scaled_snippets_tmp ]

    for fname in fnames_scaled_snippets_full[:1]:

        fname_asterisk = fname.replace('.csv', '_*.csv')
        
        fnames_dfs = glob.glob( fname_asterisk )

        dfs = []
        for fname_df in fnames_dfs:
            df = pd.read_csv(fname_df, index_col=0)
            dfs.append(df)

        dfs_combined = pd.concat( dfs, axis=1 )
    
        if len(dfs_combined)==2700:
            dfs_combined_Normal.append( dfs_combined )

        elif (len(dfs_combined) < 2700) & (len(dfs_combined) > 2680):
            dfs_combined_extended = extend_df( dfs_combined, 2700 )
            dfs_combined_Normal.append( dfs_combined_extended )

In [28]:
dfs_combined_Normal_train = dfs_combined_Normal[:len(dfs_combined_CP_train)]

In [29]:
dfs_combined_Normal_test  = dfs_combined_Normal[len(dfs_combined_CP_train):]

In [30]:
# len(dfs_combined_Normal_train), len(dfs_combined_Normal_test)

In [31]:
dfs_combined_train = dfs_combined_CP_train + dfs_combined_Normal_train

In [32]:
dfs_combined_test  = dfs_combined_CP_test + dfs_combined_Normal_test

In [33]:
# len(dfs_combined_train), len(dfs_combined_test)

In [34]:
### BREAK HERE ####

In [35]:
X_train = []
for dfs_combined in dfs_combined_train:
    X_train.append( dfs_combined.values )

In [36]:
y_train = [1] * len(dfs_combined_CP_train) + [0] * len(dfs_combined_Normal_train)

In [37]:
y_train_arr = np.asarray(y_train)

In [38]:
X_test = []
for dfs_combined in dfs_combined_test:
    X_test.append( dfs_combined.values )

In [39]:
y_test = [1] * len(dfs_combined_CP_test) + [0] * len(dfs_combined_Normal_test)

In [40]:
y_test_arr = np.asarray(y_test)

In [41]:
# len(y_train_arr), len(y_test_arr)

In [42]:
X_train_arr = np.asarray(X_train).transpose( (0,2,1 ) )

In [43]:
X_test_arr  = np.asarray(X_test).transpose( (0,2,1 ) )

In [44]:
# X_train_arr.shape, X_test_arr.shape

In [45]:
lms_using4 = [
    'left_wrist','right_wrist','left_ankle','right_ankle'
]

In [46]:
lms_using8 = [
    'left_wrist','right_wrist','left_ankle','right_ankle',
    'left_shoulder','right_shoulder','left_hip','right_hip'
]

In [47]:
lms_using12 = [
    'left_wrist','right_wrist','left_ankle','right_ankle',
    'left_shoulder','right_shoulder','left_hip','right_hip',
    'left_elbow','right_elbow','left_knee','right_knee'
]

In [48]:
#### CHANGE THIS ####
lms_using = lms_using4

In [49]:
# Decide which columns/features to use

In [50]:
feature_sets = set( [col.split('__')[0] for col in dfs_combined.columns] )

In [51]:
feature_sets

{'lm2dists_centroid_2d',
 'lm2dists_centroid_3d',
 'lm2dists_framewise_2d',
 'lm2dists_framewise_3d',
 'lm_pair2dists_2d',
 'lm_pair2dists_3d',
 'lm_triple2angles',
 'polar_coords',
 'pos_2d',
 'pos_3d'}

In [53]:
feature_set2col_indices = {}
for feature_set in feature_sets:
    
    print(f'feature_set={feature_set}')
    
    col_indices = []
    for i, col in enumerate(dfs_combined.columns):
        if feature_set in col:
            if feature_set not in ['lm_triple2angles', 'lm_pair2dists_2d', 'lm_pair2dists_3d', 'polar_coords']:
                lm = '_'.join( col.split('__')[-1].split('_')[:-1] )
                # print(i, feature_set, col, lm)
                if lm in lms_using:
                    col_indices.append(i)
            else:
               col_indices.append(i) 

    feature_set2col_indices[feature_set] = col_indices
    # print('***')

feature_set=lm_pair2dists_3d
feature_set=lm2dists_framewise_3d
feature_set=lm_pair2dists_2d
feature_set=lm2dists_centroid_2d
feature_set=pos_2d
feature_set=lm2dists_centroid_3d
feature_set=pos_3d
feature_set=polar_coords
feature_set=lm2dists_framewise_2d
feature_set=lm_triple2angles


In [54]:
feature_set2col_indices

{'lm_pair2dists_3d': [60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
 'lm2dists_framewise_3d': [36, 37, 38, 39],
 'lm_pair2dists_2d': [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
 'lm2dists_centroid_2d': [0, 1, 2, 3],
 'pos_2d': [118, 119, 120, 121, 130, 131, 132, 133],
 'lm2dists_centroid_3d': [12, 13, 14, 15],
 'pos_3d': [146, 147, 148, 149, 150, 151, 164, 165, 166, 167, 168, 169],
 'polar_coords': [80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109],
 'lm2dists_framewise_2d': [24, 25, 26, 27],
 'lm_triple2angles': [72, 73, 74, 75, 76, 77, 78, 79]}

In [79]:
col_indices = []
# for feature_set in feature_set2col_indices:
for feature_set in ['lm_pair2dists_3d']:
    col_indices_tmp = feature_set2col_indices[feature_set]
    for col_idx in col_indices_tmp:
        col_indices.append(col_idx)

In [80]:
len( col_indices )

12

In [81]:
skip = 5

In [82]:
X_train_arr_smaller = X_train_arr[:,col_indices,::skip]
# X_train_arr_smaller = X_train_arr[:,:,::skip]
# X_train_arr_smaller = X_train_arr[:,col_indices,:]
# X_train_arr_smaller = X_train_arr[:,:,:]

In [83]:
X_test_arr_smaller = X_test_arr[:,col_indices,::skip]
# X_test_arr_smaller = X_test_arr[:,:,::skip]
# X_test_arr_smaller = X_test_arr[:,col_indices,:]
# X_test_arr_smaller = X_test_arr[:,:,:]

In [84]:
X_train_arr_smaller.shape, X_test_arr_smaller.shape

((128, 12, 540), (46, 12, 540))

In [192]:
X = []
for dfs_combined in dfs_combined_train:
    X.append(dfs_combined.values)
for dfs_combined in dfs_combined_test:
    X.append(dfs_combined.values)

In [193]:
len(X)

174

In [194]:
X_arr = np.asarray(X)

In [195]:
X_arr.shape

(174, 2700, 170)

In [196]:
skip = 5

In [190]:
X_arr_smaller = X_arr[:,::skip,:]

In [191]:
X_arr_smaller.shape

(174, 540, 170)

In [97]:
dfs = []
for x_arr_smaller in X_arr_smaller:
    dfs.append( pd.DataFrame( x_arr_smaller, columns=dfs_combined.columns ) )

In [127]:
dfs_combined2 = pd.concat(dfs).reset_index()

In [128]:
dfs_combined2 = dfs_combined2.rename(columns={'index':'time'})

In [129]:
dfs_combined2['id'] = (dfs_combined2.index / 540).astype(int)

In [130]:
dfs_combined2.columns = [col.replace('__','-') for col in dfs_combined2.columns]

In [131]:
extracted_features = extract_features(dfs_combined2, column_id='id', column_sort='time')

Feature Extraction: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 160/160 [04:56<00:00,  1.85s/it]


In [132]:
extracted_features.shape

(174, 133110)

In [134]:
extracted_features.to_csv('./extracted_features.csv')

In [138]:
extracted_features_postImpute = impute(extracted_features)

In [None]:
y = 

In [None]:
features_filtered = select_features(extracted_features, y)