In [1]:
import pandas as pd
import numpy as np
import dask.distributed

import tsfresh
import tsfresh.utilities.distribution

#### Load the processed sensor data from `processed_raw_data.parquet`

This data was generated in `002_process-raw-data.ipynb`.

In [2]:
# Build the working frame in a single chain:
#   1. read the processed parquet file,
#   2. move `t_ms_rounded` and `segment_id` out of the index into regular
#      columns (the remaining index levels stay in place),
#   3. discard every row containing at least one NaN
#      (reported by the original author to be only about 3% of rows).
df = (
    pd.read_parquet('processed_raw_data.parquet')
    .reset_index(['t_ms_rounded', 'segment_id'])
    .dropna(how='any')
)

df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3216364 entries, (1600, 'A', 'walking', 1) to (1650, 'S', 'folding', 12)
Data columns (total 14 columns):
 #   Column         Dtype  
---  ------         -----  
 0   t_ms_rounded   int64  
 1   segment_id     int64  
 2   accel_phone_x  float64
 3   accel_phone_y  float64
 4   accel_phone_z  float64
 5   accel_watch_x  float64
 6   accel_watch_y  float64
 7   accel_watch_z  float64
 8   gyro_phone_x   float64
 9   gyro_phone_y   float64
 10  gyro_phone_z   float64
 11  gyro_watch_x   float64
 12  gyro_watch_y   float64
 13  gyro_watch_z   float64
dtypes: float64(12), int64(2)
memory usage: 355.8+ MB


In [3]:
# Preview the first nine rows of the cleaned frame.
df.head(n=9)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,t_ms_rounded,segment_id,accel_phone_x,accel_phone_y,accel_phone_z,accel_watch_x,accel_watch_y,accel_watch_z,gyro_phone_x,gyro_phone_y,gyro_phone_z,gyro_watch_x,gyro_watch_y,gyro_watch_z
subject_id,activity_id,activity_name,segment_sequence_number,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1600,A,walking,1,0,1,-0.364761,8.793503,1.055084,7.091625,-0.591667,8.195502,-0.85321,0.297226,0.890182,0.314944,-1.022277,-0.309962
1600,A,walking,1,50,1,-0.87973,9.768784,1.016998,4.972757,-0.158317,6.696732,-0.875137,0.015472,0.162231,0.387382,-0.618541,-0.048972
1600,A,walking,1,100,1,2.001495,11.10907,2.619156,3.25372,-0.191835,6.107758,-0.720169,0.388489,-0.284012,0.070999,-0.20948,-0.195978
1600,A,walking,1,150,1,0.450623,12.651642,0.184555,2.801216,-0.155922,5.997625,-0.57164,1.227402,-0.241669,0.037975,0.254976,-0.156563
1600,A,walking,1,200,1,-2.164352,13.928436,-4.422485,3.770868,-1.051354,7.731027,-0.380493,1.202835,-0.213135,0.073129,0.719431,-0.001035
1600,A,walking,1,250,1,-4.332779,13.361191,-0.718872,4.661511,0.169689,9.684695,-0.225784,0.558136,0.124481,-0.101574,1.082686,-0.134193
1600,A,walking,1,300,1,-0.319443,13.318359,-0.232025,6.145916,0.832883,11.003901,0.127808,0.380066,0.552887,-0.677882,1.176429,-0.211957
1600,A,walking,1,350,1,1.566452,9.515274,-0.017776,7.25922,-0.79278,11.485135,-0.513504,0.379852,0.332291,-0.16336,1.019835,0.303631
1600,A,walking,1,400,1,-0.323746,5.262665,0.322342,7.354988,-2.535759,11.454011,0.446365,-0.51059,-0.113098,0.412948,0.990008,0.994987


#### Use `tsfresh` to calculate features

In [18]:
%%time

# Run tsfresh feature extraction over the cleaned frame: rows sharing a
# `segment_id` are grouped into one time series and ordered by
# `t_ms_rounded`. No target vector is passed. The result has one row per
# segment id (see the `.info()` summary printed below).
features_df = tsfresh.extract_features(
    df,
    column_sort='t_ms_rounded',
    column_id='segment_id',
)

features_df.info()

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 60/60 [33:29<00:00, 33.49s/it]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10809 entries, 1 to 11138
Columns: 9444 entries, accel_phone_x__variance_larger_than_standard_deviation to gyro_watch_z__matrix_profile__feature_"75"__threshold_0.98
dtypes: float64(9444)
memory usage: 778.9 MB
Wall time: 35min 38s




In [19]:
# Show seven randomly chosen rows of the feature table. The fixed
# `random_state` makes the same rows appear on every Restart & Run All,
# so the displayed preview is reproducible.
features_df.sample(n=7, random_state=0)

Unnamed: 0,accel_phone_x__variance_larger_than_standard_deviation,accel_phone_x__has_duplicate_max,accel_phone_x__has_duplicate_min,accel_phone_x__has_duplicate,accel_phone_x__sum_values,accel_phone_x__abs_energy,accel_phone_x__mean_abs_change,accel_phone_x__mean_change,accel_phone_x__mean_second_derivative_central,accel_phone_x__median,...,gyro_watch_z__permutation_entropy__dimension_5__tau_1,gyro_watch_z__permutation_entropy__dimension_6__tau_1,gyro_watch_z__permutation_entropy__dimension_7__tau_1,gyro_watch_z__query_similarity_count__query_None__threshold_0.0,"gyro_watch_z__matrix_profile__feature_""min""__threshold_0.98","gyro_watch_z__matrix_profile__feature_""max""__threshold_0.98","gyro_watch_z__matrix_profile__feature_""mean""__threshold_0.98","gyro_watch_z__matrix_profile__feature_""median""__threshold_0.98","gyro_watch_z__matrix_profile__feature_""25""__threshold_0.98","gyro_watch_z__matrix_profile__feature_""75""__threshold_0.98"
620,0.0,0.0,0.0,1.0,-600.953201,1218.747082,0.06723,0.000138,-0.000168,-2.036819,...,4.493948,5.37254,5.596045,,0.669328,2.549272,1.785258,1.864947,1.583503,2.071668
202,0.0,0.0,0.0,1.0,1115.446351,4194.106098,0.111031,0.000288,0.000142,3.769562,...,4.236267,5.039053,5.40918,,0.897949,3.850784,1.853358,1.77115,1.550873,2.090641
2636,1.0,0.0,0.0,0.0,-877.759109,6328.696055,2.414102,-0.01914,0.00217,-2.634621,...,3.137897,3.823181,4.396022,,0.926674,3.640245,1.855424,1.636038,1.154958,2.586669
4957,1.0,0.0,0.0,0.0,920.624921,3473.970104,0.895322,-0.001396,0.000544,3.217819,...,3.036091,3.819922,4.469277,,0.904836,3.480074,1.730432,1.689701,1.331974,2.099079
7787,0.0,0.0,0.0,1.0,-2075.40602,14504.787405,0.040507,-0.000189,0.000147,-6.980301,...,4.009356,4.893539,5.363539,,0.757048,3.361426,1.720681,1.689373,1.306054,2.155172
5184,0.0,0.0,0.0,0.0,-218.935897,307.908402,0.300922,-0.007367,-0.000852,-0.855362,...,2.58151,3.172045,3.700343,,1.540977,6.692903,2.871212,2.72046,1.801677,3.316842
6495,0.0,0.0,0.0,1.0,-1127.04341,4282.72766,0.0267,-1.8e-05,-1.4e-05,-3.860214,...,4.01612,4.779529,5.208506,,1.437791,6.378184,3.665348,3.380794,2.576935,4.912287


#### Write `features_df` to disk

In [20]:
# Persist the extracted features. `index=True` stores the frame's index
# in the file alongside the feature columns, so it is available again
# when the file is read back.
features_df.to_parquet(
    path='tsfresh_features.parquet',
    index=True,
)

