This is a live notebook with experimental code to develop functionality for feature extraction.

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import itertools
import json

import scipy.stats as spstats
# fourier transform
from scipy.fft import fft, ifft

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# from statsmodels.tsa.api import acf, graphics, pacf
from statsmodels.tsa.ar_model import AutoReg
# from statsmodels.tsa.ar_model import ar_select_order

import os
# List the entries of the current working directory as a quick sanity check
# of where the notebook is being run from.
print(os.listdir("."))


import re
# Regular-expression pattern matching a single ASCII digit character.
# NOTE(review): not used by any of the cells visible in this section.
res_digit = r'[0-9]'


['debug_windowing_data.ipynb', 'logs', 'feature_development.ipynb', 'data', 'merging_dataframes.ipynb', 'data_and_processing_description.ipynb', 'featured_eeg_prediction.ipynb', 'featured_prediction_random_forest.ipynb', 'window_timings_and_IBI.ipynb', 'gait_data_exploration.ipynb', 'feature_correlation_analysis.ipynb', 'featured_eeg_prediction_gradient_boosting.ipynb', 'Analyse_results.ipynb', '.ipynb_checkpoints', 'loading_e4_data.ipynb', 'featured_E4_prediction.ipynb', 'data_investigation_scratch.ipynb', 'grouping_and_crossvalidation.ipynb', 'featured_prediction_svm.ipynb']


In [2]:
# Work-around that makes the library in the parent folder available for imports.
# A cleaner solution is described by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# The first search-path entry is taken to be the folder holding this notebook.
thisdir = sys.path[0]
print(f"thisdir = {thisdir}")
parentdir = os.path.dirname(thisdir)
if parentdir in sys.path:
    print("Skipping adding parent direct to path (there already)")
else:
    print("Adding parent directory to python path")
    # Insert at position 1 so that thisdir stays the first search-path entry.
    sys.path.insert(1, parentdir)

print(f"sys.path =\n{sys.path}")



thisdir = /home/luke/git/external/predicament/notebooks
Adding parent directory to python path
sys.path =
['/home/luke/git/external/predicament/notebooks', '/home/luke/git/external/predicament', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/home/luke/.local/lib/python3.10/site-packages', '/usr/local/lib/python3.10/dist-packages', '/usr/lib/python3/dist-packages', '/usr/lib/python3.10/dist-packages']


In [3]:
## Make the relative path to the data directory valid from this notebook.
# The base data folder is overridden through the environment; this is
# presumably read when predicament.utils.config is imported in the next cell
# (confirm against that module), so it has to be set beforehand.
import os
os.environ.update({'PREDICAMENT_DATA_DIR': '../data'})

In [4]:
from predicament.utils.config import DREEM_EEG_CHANNELS
from predicament.utils.config import FEATURED_BASE_PATH
from predicament.utils.config import WINDOWED_BASE_PATH


from predicament.data.timeseries import create_participant_data_edf_only
from predicament.data.windowed import window_all_participants_data
from predicament.data.windowed import merge_condition_data
from predicament.data.partitioning import between_subject_cv_partition

from predicament.data.features import MAXIMAL_FEATURE_GROUP
from predicament.data.features import STATS_FEATURE_GROUP
from predicament.data.features import INFO_FEATURE_GROUP
from predicament.data.features import FREQ_FEATURE_GROUP
from predicament.data.features import convert_timeseries_to_features
from prepare_evaluation_data import load_dataframe_and_config


In [5]:
# Configuration objects describing the E4 data files, taken from the library config.
from predicament.utils.config import E4_CSV_FILES
from predicament.utils.config import E4_FULL_DIRPATHS
# Show the mapping as the cell output: keys such as 'VG_01' (presumably
# participant IDs) map to the directory holding that entry's E4 files.
E4_FULL_DIRPATHS

{'VG_01': '../data/CARE_HOME_DATA/./VG01/E4_8921_15_44/',
 'VG_03': '../data/CARE_HOME_DATA/./VG03/E4_9921_12_16/',
 'VG_05': '../data/CARE_HOME_DATA/./VG05/E4_9921_13_24/',
 'VG_06': '../data/CARE_HOME_DATA/./VG06/E4_51021_13_33/',
 'VG_07': '../data/CARE_HOME_DATA/./VG07/E4_51021_15_39/',
 'VG_08': '../data/CARE_HOME_DATA/./VG08/E4_71021_10_42/',
 'VG_09': '../data/CARE_HOME_DATA/./VG09/E4_11221_14_46/',
 'VG_10': '../data/CARE_HOME_DATA/./VG10/E4_31221_11_17/',
 'VH_01': '../data/CARE_HOME_DATA/./VH01/E4_61021_11_03/',
 'VH_02': '../data/CARE_HOME_DATA/./VH02/E4_61021_13_59/',
 'VH_03': '../data/CARE_HOME_DATA/./VH03/E4_11221_11_22/'}

## Gait data

In [None]:
# Read the gait table from disk, report its size and preview the first rows.
gait_csv_path = "../data/other_datasets/uci_multivariate_gait_data/gait.csv"
df = pd.read_csv(gait_csv_path)
print(f"df.shape = {df.shape}")
df.head()

In [None]:
# Collect the distinct values found in each index column of the gait table.
# These arrays are reused by later cells (array sizing, channel enumeration,
# the time axis of the plots).
subjects = df['subject'].unique()
conditions = df['condition'].unique()
legs = df['leg'].unique()
joints = df['joint'].unique()
replications = df['replication'].unique()
times = df['time'].unique()

# Report each set of values exactly once.
print(f"subjects = {subjects}")
print(f"conditions = {conditions}")
print(f"replications = {replications}")
print(f"legs = {legs}")
print(f"joints = {joints}")
print(f"times = {times}")



In [None]:
# Per-subject summary: number of rows and the largest value in the 'time' column.
for subject in subjects:
    subject_df = df.loc[df['subject'].eq(subject)]
    print(f"subject {subject}: has {subject_df.shape[0]} datapoints with {subject_df['time'].max()} time-points")
    
    
    

In [None]:
# Pick out one example recording: subject 1, condition 1, replication 1.
first_recording_mask = (
    df['subject'].eq(1) & df['condition'].eq(1) & df['replication'].eq(1))
s1_c1_r1_df = df.loc[first_recording_mask]

# Array dimensions used by the following cells: one channel per (leg, joint)
# pair and one column per distinct value of the 'time' column.
n_channels = len(legs) * len(joints)
T = len(times)

s1_c1_r1_df.head()

In [None]:
# Pack the example recording into an (n_channels, T) array with one row per
# (leg, joint) pair, in the order produced by itertools.product(legs, joints).
X111 = np.zeros((n_channels, T))
for c, (leg, joint) in enumerate(itertools.product(legs, joints)):
    channel_rows = s1_c1_r1_df.loc[
        s1_c1_r1_df['leg'].eq(leg) & s1_c1_r1_df['joint'].eq(joint)]
    X111[c, :] = channel_rows['angle'].to_numpy()


In [None]:
# the data looks like it has already been smoothed/filtered so
# we will not be applying any filtering to this dataset

# One subplot per (leg, joint) channel of the example recording.
# squeeze=False always returns a 2-D array of axes, so taking its single
# column gives a 1-D array even when there is only one channel.
fig, axs = plt.subplots(n_channels, 1, sharex=True, squeeze=False)
axs = axs[:, 0]
for c, (leg, joint) in enumerate(itertools.product(legs, joints)):
    axs[c].plot(times, X111[c, :])
    # label every panel so that the channel it shows can be identified
    axs[c].set_ylabel(f"leg {leg}\njoint {joint}")
axs[-1].set_xlabel("time")
fig.suptitle("Angle per channel: subject 1, condition 1, replication 1");


In [None]:
def get_timeseries(df, subject, condition, replication, legs=None, joints=None):
    """Extract one recording from the long-format table as a channel-by-time array.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with (at least) the columns 'subject', 'condition',
        'replication', 'leg', 'joint' and 'angle'.
    subject, condition, replication
        Values identifying the recording to extract.
    legs, joints : iterable, optional
        Values of the 'leg' / 'joint' columns to include. Each defaults to
        every distinct value found in the corresponding column of ``df``.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_channels, n_samples) holding the 'angle'
        values. Rows follow the order of ``itertools.product(legs, joints)``
        and values keep the row order they have in ``df``.

    Raises
    ------
    ValueError
        If no channel is requested, a requested channel has no rows for this
        recording, or the channels do not all have the same number of samples.
    """
    if legs is None:
        legs = df['leg'].unique()
    if joints is None:
        joints = df['joint'].unique()
    scr_df = df[
        (df['subject'] == subject)
        & (df['condition'] == condition)
        & (df['replication'] == replication)]
    # Gather every channel first so that the output can be sized from the
    # request and the data rather than from notebook-level globals.
    series = [
        scr_df[(scr_df['leg'] == leg) & (scr_df['joint'] == joint)]['angle'].to_numpy()
        for leg, joint in itertools.product(legs, joints)]
    lengths = {angles.size for angles in series}
    if len(lengths) != 1 or 0 in lengths:
        raise ValueError(
            f"Expected the same non-zero number of samples in every channel for "
            f"subject={subject}, condition={condition}, replication={replication}; "
            f"got lengths {sorted(lengths)}")
    X = np.zeros((len(series), lengths.pop()))
    for c, angles in enumerate(series):
        X[c, :] = angles
    return X

In [None]:
# Names of the features requested for every recording
# ('SMA' and 'EnergyBands' are deliberately left out).
feature_set = set([
    'Mean', 'SD', 'MAD', 'Max', 'Min',
    'Energy', 'IQR', 'Entropy',
    'arCoeff', 'Correlation', 'Hurst',
    'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis',
])
entropy_tol = 1
scr_cols = ['subject', 'condition', 'replication']
output_lol = []
# np.unique over the rows yields each distinct (subject, condition,
# replication) triple once.
for subject, condition, replication in np.unique(df[scr_cols].values, axis=0):
    X = get_timeseries(df, subject, condition, replication)
    features, feature_names = convert_timeseries_to_features(
        X, feature_set, entropy_tol=entropy_tol, hurst_kind='random_walk')
    # one output row: the identifying triple followed by the feature values
    row = np.concatenate(([subject, condition, replication], features))
    output_lol.append(row)
# column names come from the last call to convert_timeseries_to_features
all_columns = scr_cols + feature_names
output_df = pd.DataFrame(output_lol, columns=all_columns)
output_df

In [None]:
# Persist the feature table next to the source data. to_csv is called with
# its defaults, so the DataFrame index is written as the first column.
gait_features_csv_path = "../data/other_datasets/uci_multivariate_gait_data/gait_features.csv"
output_df.to_csv(gait_features_csv_path)