In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import itertools

import scipy.stats as spstats
# fourier transform
from scipy.fft import fft, ifft

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# from statsmodels.tsa.api import acf, graphics, pacf
from statsmodels.tsa.ar_model import AutoReg
# from statsmodels.tsa.ar_model import ar_select_order

import os
# List the entries of the kernel's current working directory.
print(os.listdir("."))



In [None]:
# This is a hack to make the library in the parent folder available for imports
# A better solution is by np8 here:
# https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
import os
import inspect

# Take the first sys.path entry as "this" directory and make sure its parent
# is importable, so packages that live one level up can be imported below.
thisdir = sys.path[0]
parentdir = os.path.dirname(thisdir)
print(f"thisdir = {thisdir}")

if parentdir in sys.path:
    print("Skipping adding parent direct to path (there already)")
else:
    print("Adding parent directory to python path")
    # Position 1 keeps the original first entry ahead of the parent folder.
    sys.path.insert(1, parentdir)

print(f"sys.path =\n{sys.path}")



In [None]:
# Read gait.csv into a DataFrame. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("../data/other_datasets/uci_multivariate_gait_data/gait.csv")
print(f"df.shape = {df.shape}")
# Trailing expression: the first rows are shown as the cell output.
df.head()

In [None]:
# Distinct values of every identifying column, in order of first appearance.
# These arrays drive the loops and array shapes in the cells below.
subjects = df['subject'].unique()
conditions = df['condition'].unique()
legs = df['leg'].unique()
joints = df['joint'].unique()
replications = df['replication'].unique()
times = df['time'].unique()

# Report each set of levels exactly once (joints used to be printed twice).
print(f"subjects = {subjects}")
print(f"conditions = {conditions}")
print(f"replications = {replications}")
print(f"joints = {joints}")
print(f"legs = {legs}")
print(f"times = {times}")



In [None]:
# Per-subject sanity check: number of rows and the largest time value.
for subject in subjects:
    subject_df = df.loc[df['subject'].eq(subject)]
    print(f"subject {subject}: has {subject_df.shape[0]} datapoints with {subject_df['time'].max()} time-points")
    
    
    

In [None]:
# Rows belonging to subject 1, condition 1, replication 1.
s1_c1_r1_df = df[
    df['subject'].eq(1)
    & df['condition'].eq(1)
    & df['replication'].eq(1)
]
# Number of distinct time values, and one channel per (leg, joint) pair.
T = len(times)
n_channels = legs.size * joints.size

s1_c1_r1_df.head()

In [None]:
# Channel-by-time matrix of joint angles: one row per (leg, joint) pair,
# in itertools.product(legs, joints) order.
X111 = np.zeros((n_channels, T))
for c, (leg, joint) in enumerate(itertools.product(legs, joints)):
    channel_mask = (s1_c1_r1_df['leg'] == leg) & (s1_c1_r1_df['joint'] == joint)
    X111[c] = s1_c1_r1_df.loc[channel_mask, 'angle'].to_numpy()


In [None]:
# the data looks like it has already been smoothed/filtered so
# we will not be applying any filtering to this dataset
# One stacked panel per (leg, joint) channel, sharing the time axis.
fig, axs = plt.subplots(n_channels, 1, sharex=True)
# With a single channel plt.subplots returns a bare Axes rather than an array;
# normalise so that axs[c] works for any n_channels >= 1.
axs = np.atleast_1d(axs)
for c,(leg, joint) in enumerate(itertools.product(legs,joints)):
    axs[c].plot(times, X111[c,:])
    # Label each panel with the channel it shows so the figure stands alone.
    axs[c].set_ylabel(f"leg {leg}\njoint {joint}")
axs[-1].set_xlabel("time")
fig.suptitle("angle per channel (subject 1, condition 1, replication 1)");


## Aggregate functions

Taken from:
Public Domain Dataset for Human Activity Recognition Using Smartphones, (Anguita et al., 2013)

| Function	|	Description |
|  :---:	|	:---:  |
| mean	|	Mean value |
| std	|	Standard deviation |
| mad	|	Median absolute deviation |
| max	|	Largest values in array |
| min	|	Smallest value in array |
| sma	|	Signal magnitude area |
| energy	|	Average sum of the squares |
| iqr	|	Interquartile range |
| entropy	|	Signal Entropy |
| arCoeff	|	Autoregression coefficients |
| correlation	|	Correlation coefficient |
| maxFreqInd	|	Largest frequency component |
| meanFreq	|	Frequency signal weighted average |
| skewness	|	Frequency signal Skewness |
| kurtosis	|	Frequency signal Kurtosis |
| energyBand	|	Energy of a frequency interval |
| angle	|	Angle between two vectors |


In [None]:
## sample entropy from Wikipedia
from itertools import combinations
from math import log

def construct_templates(timeseries_data:list, m:int=2):
    """Return every contiguous length-``m`` window of ``timeseries_data``.

    Windows are produced in order of their start index, one per start index
    from 0 up to ``len(timeseries_data) - m`` inclusive. If the series is
    shorter than ``m`` the result is an empty list.
    """
    last_start = len(timeseries_data) - m
    templates = []
    for start in range(last_start + 1):
        templates.append(timeseries_data[start:start + m])
    return templates

def get_matches(templates:list, r:float):
    """Count the unordered pairs of templates that match within tolerance ``r``.

    Every pair from ``combinations(templates, 2)`` is tested with ``is_match``.
    """
    n_matching_pairs = 0
    for first, second in combinations(templates, 2):
        if is_match(first, second, r):
            n_matching_pairs += 1
    return n_matching_pairs

def is_match(template_1:list, template_2:list, r:float):
    """Return True when every paired element differs by strictly less than ``r``.

    Elements are paired with ``zip``, so comparison stops at the shorter
    template; two empty templates therefore match.
    """
    for left, right in zip(template_1, template_2):
        # Written as "not (<)" so a NaN difference counts as a non-match.
        if not abs(left - right) < r:
            return False
    return True

def sample_entropy(timeseries_data:list, window_size:int, r:float):
    """Sample entropy ``-log(A / B)`` of a series.

    ``B`` is the number of matching template pairs of length ``window_size``
    and ``A`` the number of matching pairs of length ``window_size + 1``,
    both counted by ``get_matches`` with tolerance ``r``.

    Parameters
    ----------
    timeseries_data : sequence of numbers.
    window_size : template length ``m``.
    r : matching tolerance passed on to ``get_matches``.

    Returns
    -------
    float
        ``-log(A / B)`` when both counts are positive;
        ``nan`` when ``B == 0`` (the ratio is undefined);
        ``inf`` when ``A == 0`` but ``B > 0`` (the limit of ``-log`` at zero).
        Previously these two cases raised ``ZeroDivisionError`` and
        ``ValueError`` respectively.
    """
    B = get_matches(construct_templates(timeseries_data, window_size), r)
    if B == 0:
        # No matches at length m: A / B is undefined.
        return float("nan")
    A = get_matches(construct_templates(timeseries_data, window_size+1), r)
    if A == 0:
        # Matches at length m but none at m + 1: -log(0) diverges.
        return float("inf")
    return -log(A/B)


In [None]:
def correlations(X):
    """Return the pairwise correlation coefficients between the rows of ``X``.

    Takes the strict upper triangle of ``np.corrcoef(X)`` flattened row by
    row, i.e. (0,1), (0,2), ..., (1,2), ... so each unordered pair of rows
    appears once and the diagonal is left out.
    """
    corr_matrix = np.corrcoef(X)
    upper_triangle = []
    for row_index, row in enumerate(corr_matrix):
        upper_triangle.extend(row[row_index + 1:])
    return np.array(upper_triangle)

In [None]:

# Build the hand-rolled feature vector for X111 (one row per channel).
# Each feature family from the table above is computed per channel and
# appended to `features111` in table order.

# | mean | Mean value |
features111 = np.mean(X111,axis=1)
# | std | Standard deviation |
features111 = np.concatenate([features111, np.std(X111,axis=1)])
# | mad | Median absolute deviation |
features111 = np.concatenate([features111, spstats.median_abs_deviation(X111,axis=1)])
# | max | Largest values in array |
features111 = np.concatenate([features111, np.max(X111,axis=1)])
# | min | Smallest value in array |
features111 = np.concatenate([features111, np.min(X111,axis=1)])
print(f"After min = {features111.shape}")
# | sma | Signal magnitude area |
# not sure how useful for general signals
# see: https://en.wikipedia.org/wiki/Signal_magnitude_area
# seems similar to median absolute deviation
# | energy | Average sum of the squares |
features111 = np.concatenate([features111, np.mean(X111**2,axis=1)])
# | iqr | Interquartile range |
features111 = np.concatenate([features111, spstats.iqr(X111,axis=1)])
print(f"after IQR = {features111.shape}")
# | entropy | Signal Entropy |
# looks like sample entropy
# see: https://academic.oup.com/biomethods/article/4/1/bpz016/5634143
# see: https://www.mdpi.com/1099-4300/20/10/764
window_size = 10
# tolerance: 5% of the mean per-channel standard deviation
tol = 0.05*(np.mean(np.std(X111,axis=1)))
entropies = [
    sample_entropy(list(ts), window_size, tol) for ts in X111]
features111 = np.concatenate([features111, entropies])
print(f"after entropies = {features111.shape}")
# | arCoeff | Autoregression coefficients |
# using code from here:
# https://www.statsmodels.org/dev/examples/notebooks/generated/autoregressions.html
order = 3
# one column per channel; each column holds the order+1 fitted parameters
arCoeffs = np.empty((order+1, n_channels))
for c in range(n_channels):
    mod = AutoReg(X111[c,:], order, old_names=False)
    res = mod.fit()
    arCoeffs[:,c] = res.params
# row-major flatten: parameter index varies slowest, channel fastest
features111 = np.concatenate([features111, arCoeffs.flatten()])
print(f"After arCoeffs = {features111.shape}")
# | correlation | Correlation coefficient |
features111 = np.concatenate([features111, correlations(X111)])
print(f"After correlations = {features111.shape}")

# ---- Frequency-domain exploration (nothing below is appended to features111) ----
X111freq = fft(X111)
print(f"X111.shape = {X111.shape}")
print(f"X111freq.shape = {X111freq.shape}")
# Magnitude spectrum, shared by everything below.
X111mag = np.abs(X111freq)
# | maxFreqInd | Largest frequency component |
# argmax has to run on magnitudes: complex values are ordered lexicographically
# (real part first), so argmax on the raw spectrum does not pick the strongest
# bin. Bin 0 (the DC term) is included in the search.
X111maxFreqInd = np.argmax(X111mag, axis=1)
print(f"np.argmax(np.abs(X111freq)) = {X111maxFreqInd}")
# | meanFreq | Frequency signal weighted average |
# frequency bands are FFT bin indices (multiples of the base frequency), not dB
freqBands = np.arange(X111freq.shape[1]).reshape((1,-1))
print(f"freqBands = {freqBands}")
# using intensity as the magnitude which is the square of the absolute value of the signal
intensities = X111mag**2
# intensities are already real, so no np.real() is needed around the sums
X111meanFreq = np.sum(freqBands*intensities,axis=1)/np.sum(intensities, axis=1)
print(f"X111meanFreq = {X111meanFreq}")
# an alternative mean frequency is to use p=1 (manhattan norm)
X111meanFreqAlt = np.sum(freqBands*X111mag,axis=1)/np.sum(X111mag, axis=1)
print(f"X111meanFreqAlt = {X111meanFreqAlt}")
# NOTE(review): X111 is real-valued, so its two-sided spectrum is mirror-symmetric
# and both weighted means above are taken over the mirrored half as well.
# Consider restricting them to bins 0..N//2 before using them as features.

freq_fig, freq_ax = plt.subplots()
for i in range(X111freq.shape[0]):
    freq_ax.plot(freqBands.flatten(), X111mag[i], label=f"channel {i}")
freq_ax.set_xlabel("FFT bin index")
freq_ax.set_ylabel("magnitude")
freq_ax.set_title("Magnitude spectrum per channel")
freq_ax.legend()

# Still to do:
# | skewness | Frequency signal Skewness |
# | kurtosis | Frequency signal Kurtosis |
# | energyBand | Energy of a frequency interval |
# | angle | Angle between two vectors |

features111

In [None]:
# Interactive help on `res`, the result object left over from the last
# AutoReg fit in the feature cell above.
help(res)

## Things to explore

[Hurst exponent](https://www.mdpi.com/1099-4300/23/12/1672), Lyapunov exponent spectrum, Lempel-Ziv complexity


In [None]:
from predicament.data.features import convert_timeseries_to_features
# Feature families requested from the library implementation ('SMA' omitted).
feature_set = {
    'Mean', 'SD', 'MAD', 'Max', 'Min',
    'Energy', 'IQR', 'Entropy',
    'arCoeff', 'Correlation', 'MaxFreqInd', 'MeanFreq', 'FreqSkewness',
    'FreqKurtosis', 'EnergyBands',
}
# Same entropy tolerance as the hand-rolled cell: 5% of the mean channel std.
features111alt = convert_timeseries_to_features(
    X111,
    feature_set,
    entropy_tol=0.05 * np.std(X111, axis=1).mean(),
)
print(f"features111.shape = {features111.shape}")
print(f"features111alt.shape = {features111alt.shape}")

In [None]:
# Recompute the per-channel sample entropies on their own and show the
# tolerance that was used.
window_size = 10
tol = 0.05 * np.std(X111, axis=1).mean()
entropies = []
for ts in X111:
    entropies.append(sample_entropy(list(ts), window_size, tol))
tol

In [None]:
# Library result when only the 'Entropy' feature is requested.
convert_timeseries_to_features(X111, feature_set={'Entropy'})

In [None]:
# Mean of the per-channel standard deviations (the basis of the entropy tolerance).
np.std(X111, axis=1).mean()

In [None]:
# Positions where the hand-rolled features differ from the library's.
# The library vector is cut to the length of features111 itself rather than
# a hard-coded 87, so the check stays valid if the hand-rolled vector changes
# length.
np.where(~np.isclose(features111, features111alt[:features111.shape[0]]))