# How to Use Feature Extraction Toolbox

In [1]:
import pandas as pd

from package_name.feature_extraction import settings, extraction

In [2]:
# Read the full dataset from CSV.
# NOTE(review): this is an absolute, machine-specific path — point it at your own copy of the data.
filename = "/home/scai9/feature_dataset/USCHAD_data.csv"
dataset = pd.read_csv(filename)

# Keep a small slice for the examples: the rows of subject 1 only,
# then the first 5000 of those rows by position.
dataset = dataset[dataset["subject"] == 1].iloc[:5000]

### Parameter Settings for Feature Extraction

There are three categories of features: Statistical, Spectral, and Time-Frequency. Each has a class holding the parameters required for feature calculation. A given configuration can be saved in either JSON or YAML format.

In [3]:
# Build one parameter object per feature category.
# NOTE(review): the positional 25 is presumably the sampling frequency — confirm against the settings API.
statistical_params = settings.StatisticalFeatureParams(25)
spectral_params = settings.SpectralFeatureParams(25)
time_freq_params = settings.TimeFrequencyFeatureParams(25)

# Round-trip the statistical parameters through a JSON file on disk.
params_path = "statistical_params.json"
statistical_params.to_json(params_path)
statistical_params_2 = settings.StatisticalFeatureParams.from_json(params_path)

# The reloaded configuration must match the one that was saved.
saved_settings = statistical_params.get_settings_as_dict()
reloaded_settings = statistical_params_2.get_settings_as_dict()
assert saved_settings == reloaded_settings

### Data Format for Feature Calculators

The individual statistical feature calculator functions support univariate series inputs. See the example below.

In [4]:
from package_name.feature_extraction.statistical_feature_calculators import (
    calculate_area_under_squared_curve,
)

# Call a single calculator directly on one univariate signal:
# the values of the "accx" column as an array.
accx_values = dataset["accx"].values
area = calculate_area_under_squared_curve(accx_values)
print(f"Area Under Squared Curve: {area}")

Area Under Squared Curve: 2488.43278078524


### Using Feature Extraction Functions

In `package_name.feature_extraction.extraction` there are functions to extract features from each category: Statistical, Spectral, and Time-Frequency. There is also a function available to extract features from all three categories at once. The `signal_name` parameter specifies a name for the dataset, which is prepended to all sub-signal names in the resultant dataframe. The `njobs` parameter specifies the number of cores to use. A basic example of each is shown below.


In [5]:
# Statistical features for a single column of the DataFrame.
# The result is indexed by "<signal_name>_<column>" (see the printed output).
features = extraction.calculate_statistical_features(
    dataset,
    statistical_params,
    columns=["accx"],
    signal_name="test",
    njobs=1,
)

print(features.head())

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


               mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
test_accx -0.704724             NaN            NaN         -0.709874   

           trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
test_accx          -0.711949         -0.714596          -0.717405   

           trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  \
test_accx         -0.719037     0.704724               0.703829  ...   

           hurst_exponent  adf_teststats  adf_pvalue  adf_usedlag  \
test_accx        1.104163      -3.348753    0.012835         32.0   

           has_duplicates  max_has_duplicates  min_has_duplicates  large_std  \
test_accx            True               False               False      False   

           lempel_ziv_complexity     cid_ce  
test_accx                 0.0598  34.857094  

[1 rows x 151 columns]


In [6]:
# Spectral features for the three accelerometer columns, one result row per column.
features = extraction.calculate_spectral_features(
    dataset,
    spectral_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)

print(features.head())

Error calculating feature(s) ['spectral_cumulative_frequency_below_threshold_0.5', 'spectral_cumulative_frequency_below_threshold_0.75']: index -1 is out of bounds for axis 0 with size 0
Feature(s) ['spectral_cumulative_frequency_below_threshold_0.5', 'spectral_cumulative_frequency_below_threshold_0.75'] will be excluded.


  valley_width_mode = mode(valley_widths)[0]


           spectral_centroid_order_1  spectral_centroid_order_2  \
test_accx                   1.881512                  14.266790   
test_accy                   1.079966                   7.049539   
test_accz                   1.918049                  13.556253   

           spectral_centroid_order_3  spectral_centroid_order_4  \
test_accx                 125.264091                1189.120649   
test_accy                  60.098733                 575.726582   
test_accz                 114.738415                1064.237861   

           spectral_centroid_order_5  spectral_variance  spectral_skewness  \
test_accx               11875.828418          10.726702           1.652527   
test_accy                5873.998937           5.883212           2.787550   
test_accz               10454.414019           9.877341           1.637948   

           spectral_kurtosis  median_frequency  spectral_flatness  ...  \
test_accx           4.448157          0.585938           0.303453  ...   
t

In [7]:
# Time-frequency features, computed on the first 3000 rows only.
first_rows = dataset.iloc[:3000]
features = extraction.calculate_time_frequency_features(
    first_rows,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)

print(features.head())

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


           tkeo_mean  tkeo_geometric_mean  tkeo_harmonic_mean  \
test_accx  -0.000237             0.001189        1.536900e-14   
test_accy  -0.000109             0.000479        7.894626e-15   
test_accz  -0.000313             0.000374        6.615248e-15   

           tkeo_trimmed_mean_0.1  tkeo_trimmed_mean_0.15  \
test_accx              -0.000003                0.000027   
test_accy              -0.000037               -0.000036   
test_accz              -0.000003                0.000010   

           tkeo_trimmed_mean_0.2  tkeo_trimmed_mean_0.25  \
test_accx               0.000038                0.000048   
test_accy              -0.000035               -0.000040   
test_accz               0.000016                0.000014   

           tkeo_trimmed_mean_0.3  tkeo_mean_of_abs  \
test_accx               0.000051          0.008779   
test_accy              -0.000048          0.003708   
test_accz               0.000012          0.005751   

           tkeo_geometric_mean_of_abs  .

In [None]:
# All three feature categories in a single call, spread over 6 jobs.
features = extraction.calculate_all_features(
    dataset,
    statistical_params,
    spectral_params,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=6,
)

print(features.head())

  avg = a.mean(axis, **keepdims_kw)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  ret = ret.dtype.type(ret / rcount)


Arrays, DataFrames and Series are all acceptable input data formats. If the input is a DataFrame, the `columns` parameter specifies the columns to analyze (as seen in the previous examples). Otherwise, it gives the ordered names of the components of the signal.

In [4]:
# 2D array input: one column per signal component; `columns` names the
# components in order because a bare array carries no column labels.
acc_columns = ["accx", "accy", "accz"]
acc_array = dataset[acc_columns].values
print(acc_array.shape)
features = extraction.calculate_statistical_features(
    acc_array,
    statistical_params,
    columns=acc_columns,
    signal_name="test",
    njobs=1,
)

print(features.head())

# Series input: pass the pandas Series itself rather than its `.values`
# array, so this example really exercises the Series code path.
features = extraction.calculate_statistical_features(
    dataset["accx"],
    statistical_params,
    columns=["accx"],
    signal_name="test",
    njobs=1,
)

print(features.head())

Error calculating feature(s) sample_entropy: Unable to allocate 34.1 GiB for an array with shape (39074, 39074, 3) and data type float64
Feature(s) sample_entropy will be excluded.
Error calculating feature(s) approximate_entropy: Unable to allocate 22.8 GiB for an array with shape (39075, 39075, 2) and data type float64
Feature(s) approximate_entropy will be excluded.


  return mode(signal)[0][0]


               mean  geometric_mean  harmonic_mean  trimmed_mean_0.1  \
test_accx  0.742372        0.907159       0.702253          0.790657   

           trimmed_mean_0.15  trimmed_mean_0.2  trimmed_mean_0.25  \
test_accx           0.847635          0.863518           0.872033   

           trimmed_mean_0.3  mean_of_abs  geometric_mean_of_abs  ...  \
test_accx          0.879215     0.936795               0.852483  ...   

           hurst_exponent  adf_teststats  adf_pvalue  adf_usedlag  \
test_accx        0.803642      -2.638544     0.08529         54.0   

           has_duplicates  max_has_duplicates  min_has_duplicates  large_std  \
test_accx            True               False               False      False   

           lempel_ziv_complexity      cid_ce  
test_accx               0.076134  133.676732  

[1 rows x 149 columns]


### Extracting a Subset of Features

Optionally, a subset of the available features can be selected for extraction through a list of feature names in the parameter classes.

In [3]:
# Restrict each category to a handful of calculators via `calculators`.
statistical_params = settings.StatisticalFeatureParams(
    25,
    calculators=["mean", "mode", "std"],
)
spectral_params = settings.SpectralFeatureParams(
    25,
    calculators=["spectral_variance"],
)
# The TKEO features reuse the statistical subset defined above.
time_freq_params = settings.TimeFrequencyFeatureParams(
    25,
    calculators=["tkeo_features"],
    tkeo_sf_params=statistical_params,
)

# Extract only the selected features for the three accelerometer columns.
features = extraction.calculate_all_features(
    dataset,
    statistical_params,
    spectral_params,
    time_freq_params,
    columns=["accx", "accy", "accz"],
    signal_name="test",
    njobs=1,
)
print(features.head())

               mean       std      mode  spectral_variance  tkeo_mean  \
test_accx -0.704724  0.033738 -0.725808          10.726702  -0.000193   
test_accy  0.583404  0.077468  0.577970           5.883212  -0.000084   
test_accz -0.506294  0.062115 -0.486109           9.877341  -0.000220   

           tkeo_std  tkeo_mode  
test_accx  0.016841        0.0  
test_accy  0.007834        0.0  
test_accz  0.012551        0.0  
