# How to Use Feature Extraction Toolbox

In [4]:
import pandas as pd

from package_name.feature_extraction import settings, extraction

In [5]:
# Location of the CSV file that holds the dataset
filename = "/home/scai9/feature_dataset/USCHAD_data.csv"

# Read the CSV and, in the same expression, keep only the rows whose
# "subject" column equals 1 so the examples run on a smaller test subset.
dataset = pd.read_csv(filename).loc[lambda frame: frame["subject"] == 1]

### Parameter Settings for Feature Extraction

There are three categories of features: Statistical, Spectral, and Time-Frequency. Each has a class holding the parameters required for feature calculation. A given configuration can be saved in either JSON or YAML format.

In [3]:
# Build one parameter object per feature category.
# NOTE(review): 25 is the single positional argument given to every params
# class; its meaning is not visible here, so confirm it against the settings module.
statistical_params = settings.StatisticalFeatureParams(25)
spectral_params = settings.SpectralFeatureParams(25)
time_freq_params = settings.TimeFrequencyFeatureParams(25)

# Round-trip the statistical parameters through a JSON file on disk
settings_file = "statistical_params.json"
statistical_params.to_json(settings_file)
statistical_params_2 = settings.StatisticalFeatureParams.from_json(settings_file)

# The reloaded object must report exactly the same settings as the one saved
saved_settings = statistical_params.get_settings_as_dict()
reloaded_settings = statistical_params_2.get_settings_as_dict()
assert saved_settings == reloaded_settings

### Data Format for Feature Calculators

The individual statistical feature calculator functions support univariate series inputs. See the example below.

In [2]:
from package_name.feature_extraction.statistical_feature_calculators import calculate_area_under_squared_curve

# Take the "accx" column as an array: the individual calculators
# operate on one univariate series at a time.
accx_values = dataset["accx"].values
area = calculate_area_under_squared_curve(accx_values)
print(f"Area Under Squared Curve: {area}")

NameError: name 'dataset' is not defined

### Using Feature Extraction Functions

In `package_name.feature_extraction.extraction` there are functions to extract features from each subcategory: Statistical, Spectral, and Time-Frequency. There is also a function available to extract features from all three categories. The `signal_name` parameter specifies a name for the dataset to be prepended to all sub-signal names in the resultant dataframe. The `njobs` parameter specifies the number of cores to use. A basic example of each is shown below.


In [5]:
# Statistical features for the accx/accy/accz columns of the DataFrame.
# `signal_name` is prepended to each sub-signal name in the result;
# `njobs=1` runs the extraction on a single core.
features = extraction.calculate_statistical_features(
    dataset,
    statistical_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)

# Show the first rows of the resulting feature table
preview = features.head()
print(preview)

Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64
Feature(s) sample_entropy will be excluded.
Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64
Feature(s) approximate_entropy will be excluded.


  return mode(signal)[0][0]


Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64
Feature(s) sample_entropy will be excluded.
Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64
Feature(s) approximate_entropy will be excluded.


  return mode(signal)[0][0]


Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64
Feature(s) sample_entropy will be excluded.
Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64
Feature(s) approximate_entropy will be excluded.


  return mode(signal)[0][0]


               mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
test_accx  0.742372        0.907159       0.702253          0.790657   
test_accy  0.188140        0.250165       0.106604          0.211222   
test_accz -0.158577        0.099436       0.027122         -0.152811   

           trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
test_accx           0.847635          0.863518           0.872033   
test_accy           0.213097          0.212477           0.210095   
test_accz          -0.148787         -0.144394          -0.141309   

           trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  \
test_accx          0.879215     0.936795               0.852483  ...   
test_accy          0.211006     0.311715               0.205384  ...   
test_accz         -0.142072     0.241382               0.162420  ...   

           hurst_exponent  adf_teststats  adf_pvalue  adf_usedlag  \
test_accx        0.803642      -2.638544    0.085290         54.0   
test_ac

In [6]:
# Spectral features for the same three columns, using the spectral parameter set.
features = extraction.calculate_spectral_features(
    dataset,
    spectral_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)

# Show the first rows of the resulting feature table
preview = features.head()
print(preview)

  valley_width_mode = mode(valley_widths)[0]


           spectral_centroid_order_1  spectral_centroid_order_2  \
test_accx                   5.141873                  38.604725   
test_accy                   5.759207                  46.973967   
test_accz                   5.790333                  46.604544   

           spectral_centroid_order_3  spectral_centroid_order_4  \
test_accx                 341.026915                3300.370039   
test_accy                 436.952883                4359.183368   
test_accz                 425.692532                4166.225257   

           spectral_centroid_order_5  spectral_variance  spectral_skewness  \
test_accx               33736.373902          12.165865           0.410417   
test_accy               45381.461031          13.805503           0.144313   
test_accz               42583.239419          13.076591           0.093072   

           spectral_kurtosis  median_frequency  spectral_flatness  ...  \
test_accx           2.116443          3.906250           0.703348  ...   
t

In [7]:
# Time-frequency features are computed on a slice of the dataset
# (rows 0 up to, but not including, 3000) rather than on the whole frame.
dataset_excerpt = dataset[0:3000]

features = extraction.calculate_time_frequency_features(
    dataset_excerpt,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)

# Show the first rows of the resulting feature table
preview = features.head()
print(preview)

  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return mode(signal)[0][0]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  return mode(signal)[0][0]
  return mode(signal)[0][0]
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return mode(signal)[0][0]
  return mode(signal)[0][0]
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return mode(signal)[0][0]
  return mode(signal)[0][0]
  return mode(signal)[0][0]
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return mode(signal)[0][0]
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return mode(signal)[0][0]
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return np.mean(np.abs(np.diff(signal) / signal[:-1]))
  return mode(signal)[0][0]
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(

           tkeo_mean  tkeo_geometric_mean  tkeo_harmonic_mean  \
test_accx  -0.000237             0.001189        1.536900e-14   
test_accy  -0.000109             0.000479        7.894626e-15   
test_accz  -0.000313             0.000374        6.615248e-15   

           tkeo_trimmed_mean_0.1  tkeo_trimmed_mean_0.15  \
test_accx              -0.000003                0.000027   
test_accy              -0.000037               -0.000036   
test_accz              -0.000003                0.000010   

           tkeo_trimmed_mean_0.2  tkeo_trimmed_mean_0.25  \
test_accx               0.000038                0.000048   
test_accy              -0.000035               -0.000040   
test_accz               0.000016                0.000014   

           tkeo_trimmed_mean_0.3  tkeo_mean_of_abs  \
test_accx               0.000051          0.008779   
test_accy              -0.000048          0.003708   
test_accz               0.000012          0.005751   

           tkeo_geometric_mean_of_abs  .

In [None]:
# All three feature categories in one call; each category receives its own
# parameter object. `njobs=6` requests six cores for this run.
features = extraction.calculate_all_features(
    dataset,
    statistical_params,
    spectral_params,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=6,
)

# Show the first rows of the combined feature table
preview = features.head()
print(preview)

Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64
Feature(s) sample_entropy will be excluded.
Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64

Feature(s) sample_entropy will be excluded.Feature(s) approximate_entropy will be excluded.


Feature(s) sample_entropy will be excluded.
Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and d

  return mode(signal)[0][0]
  return mode(signal)[0][0]
  return mode(signal)[0][0]


Arrays, DataFrames and Series are all acceptable input data formats. If the input is a DataFrame, the `columns` parameter specifies the columns to analyze (as seen in the previous examples). If not, it gives the ordered names of the components of the signal.

In [4]:
# 2D array input: the three columns are extracted as one NumPy array, and
# `columns` supplies the ordered names of its components.
xyz_array = dataset[["accx", "accy", "accz"]].values
print(xyz_array.shape)
features = extraction.calculate_statistical_features(
    xyz_array,
    statistical_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)

print(features.head())

# Series input: pass the pandas Series itself. The previous version passed
# `dataset["accx"].values`, which is a NumPy array, so a Series was never
# actually demonstrated. `columns` names the single component.
accx_series = dataset["accx"]
features = extraction.calculate_statistical_features(
    accx_series,
    statistical_params,
    columns=["accx"],
    signal_name="test",
    njobs=1,
)

print(features.head())

Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64
Feature(s) sample_entropy will be excluded.
Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64
Feature(s) approximate_entropy will be excluded.


  return mode(signal)[0][0]


               mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
test_accx  0.742372        0.907159       0.702253          0.790657   

           trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
test_accx           0.847635          0.863518           0.872033   

           trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  \
test_accx          0.879215     0.936795               0.852483  ...   

           hurst_exponent  adf_teststats  adf_pvalue  adf_usedlag  \
test_accx        0.803642      -2.638544     0.08529         54.0   

           has_duplicates  max_has_duplicates  min_has_duplicates  large_std  \
test_accx            True               False               False      False   

           lempel_ziv_complexity      cid_ce  
test_accx               0.076134  133.676732  

[1 rows x 149 columns]


### Extracting a Subset of Features

Optionally, a subset of the available features can be selected for extraction by passing a list of feature names through the `calculators` argument of the parameter classes.

In [8]:
# Rebuild each parameter object, this time listing by name the calculators to run.
statistical_params = settings.StatisticalFeatureParams(
    25,
    calculators=["mean", "mode", "std"],
)
spectral_params = settings.SpectralFeatureParams(
    25,
    calculators=["spectral_variance"],
)
# The restricted statistical parameters above are handed over as `tkeo_sf_params`.
time_freq_params = settings.TimeFrequencyFeatureParams(
    25,
    calculators=["tkeo_features"],
    tkeo_sf_params=statistical_params,
)

# Extract only the selected features from all three categories
features = extraction.calculate_all_features(
    dataset,
    statistical_params,
    spectral_params,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)
preview = features.head()
print(preview)

  return mode(signal)[0][0]


  return mode(signal)[0][0]


               mean       std      mode  spectral_variance  tkeo_mean  \
test_accx  0.742372  0.705045  0.864040          12.165865  -0.200183   
test_accy  0.188140  0.342796  0.509207          13.805503  -0.066683   
test_accz -0.158577  0.266645  0.125137          13.076591  -0.043469   

           tkeo_std  tkeo_mode  
test_accx  1.325616        0.0  
test_accy  0.307666        0.0  
test_accz  0.161682        0.0  
