In [1]:
import os, sys
import pandas as pd

# Vectorized Feature Extraction
## Construct a DataFrame from the Raw Data Files

In [2]:
# Each sub-directory of data/ holds the recordings of one activity; entries
# whose name contains '_MODEL' are left out.
folders = [
    entry
    for entry in os.listdir('data')
    if not os.path.isfile(os.path.join('data', entry)) and '_MODEL' not in entry
]

In [3]:
def parse_metadata_from_file_name(file_name):
    """Extract subject id, gender and timestamp from a recording's file name.

    The name is split on '-'. The last field, cut at '.txt', is the subject
    id; fields 1 through 6 are concatenated (no separator) into the timestamp.
    Gender is 'Male' when the subject id starts with 'm', otherwise 'Female'.

    Returns a ``(subject_id, gender, timestamp)`` tuple of strings.
    """
    fields = file_name.split('-')
    subject_id = fields[-1].split('.txt')[0]
    if subject_id[0] == 'm':
        gender = 'Male'
    else:
        gender = 'Female'
    timestamp = ''.join(fields[1:7])
    return subject_id, gender, timestamp

In [4]:
# Read every recording into its own frame, tag each row with the metadata
# parsed from the file name plus the activity folder it came from, and combine
# everything with a single concat at the end. Appending to a growing DataFrame
# inside the loop copies all previous rows on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
frames = []
for folder in folders:
    folder_path = os.path.join('data', folder)
    files = [f for f in os.listdir(folder_path)
             if os.path.isfile(os.path.join(folder_path, f))]
    for file_name in files:
        # NOTE(review): header=0 treats the first line of each file as a header
        # row (its values are then replaced by X/Y/Z below). If the raw files
        # have no header line, the first sample of every file is dropped --
        # confirm against the data before changing.
        raw_data = pd.read_csv(os.path.join(folder_path, file_name), sep=' ', header=0)
        raw_data.columns = ['X', 'Y', 'Z']
        subject_id, gender, timestamp = parse_metadata_from_file_name(file_name)
        raw_data['SubjectID'] = subject_id
        raw_data['Gender'] = gender
        raw_data['Timestamp'] = timestamp
        raw_data['ADL'] = folder
        frames.append(raw_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Parenthesised form prints the same text under Python 2 and Python 3.
print(data.shape)

(445690, 7)


In [5]:
# Persist the combined raw samples. A ',' separator and a header row are the
# to_csv defaults, so only the row index has to be switched off.
data.to_csv('data/adl_data.csv', index=False)

## The Groupby Operation

The pandas groupby operation lets you apply a function to subsets of a DataFrame without using a for loop. This is very similar to applying an aggregate function with a group by clause in SQL, if you are familiar with the database language. If you are not, now is a great time to start experimenting with set-based logic.

For the [ADL data set](https://archive.ics.uci.edu/ml/datasets/Dataset+for+ADL+Recognition+with+Wrist-worn+Accelerometer), we want to calculate a set of features for every axis in every file. We will apply a groupby operation that aggregates by timestamp because it is unique to every file. Since we want the other indicators (subject_id and gender) and the label (ADL) to be in the output, and these are all guaranteed to be constant for a given timestamp, we will include those columns in the groupby specification. The order does not matter.

In [6]:
# One group per recording: Timestamp identifies the file, and the remaining
# keys are constant within a file and are listed so they show up in the
# index of every aggregated result.
grouped_data = data.groupby(
    by=['SubjectID', 'Gender', 'Timestamp', 'ADL'],
)

Now we can easily generate a feature like the mean for every group in the set:

In [7]:
# Mean of each axis within every group; display the first 20 groups.
grouped_data[['X', 'Y', 'Z']].mean().head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,X,Y,Z
SubjectID,Gender,Timestamp,ADL,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
f1,Female,20110323103918,Standup_chair,16.760479,38.700599,40.868263
f1,Female,20110323104201,Standup_chair,18.016598,36.755187,40.937759
f1,Female,20110323104551,Standup_chair,19.256667,34.766667,42.506667
f1,Female,20110323104759,Standup_chair,18.295238,36.695238,40.701587
f1,Female,20110324094434,Comb_hair,31.016827,36.177885,39.520433
f1,Female,20110324094936,Standup_chair,15.892216,33.976048,38.952096
f1,Female,20110324095016,Sitdown_chair,23.974026,33.493506,47.272727
f1,Female,20110324095107,Walk,12.899059,36.757057,34.76219
f1,Female,20110324095211,Walk,12.460733,37.011518,34.107853
f1,Female,20110324100432,Pour_water,28.31026,40.707046,49.347342


With the agg() function, you can generate many different features in one pass through the original DataFrame. I will leave the verification of increased efficiency to you as an exercise.

In [8]:
# Several X-axis features in a single agg() call.
#
# Aggregations are referred to by name ('mean', 'std', ...) so pandas uses its
# own group-wise implementations and nothing has to be imported from numpy --
# `from numpy import min, max` would shadow the built-in min/max for the rest
# of the notebook. 'std' is pandas' sample standard deviation (ddof=1).
#
# A list of (output column, function) pairs is used instead of a
# {name: function} dict: dict-based renaming on a grouped Series is rejected by
# pandas >= 1.0, and the list keeps the columns in the order written here.
features = grouped_data['X'].agg([
    ('MeanX', 'mean'),
    ('StdX', 'std'),
    ('MinX', 'min'),
    ('MaxX', 'max'),
    ('LenX', len),
])

In [9]:
# Display the first 20 rows of the feature table.
features.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,MinX,MaxX,LenX,MeanX,StdX
SubjectID,Gender,Timestamp,ADL,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f1,Female,20110323103918,Standup_chair,11,32,167,16.760479,6.638887
f1,Female,20110323104201,Standup_chair,8,33,241,18.016598,8.690017
f1,Female,20110323104551,Standup_chair,10,36,300,19.256667,9.754724
f1,Female,20110323104759,Standup_chair,7,35,315,18.295238,9.104059
f1,Female,20110324094434,Comb_hair,6,63,832,31.016827,16.66957
f1,Female,20110324094936,Standup_chair,8,31,167,15.892216,6.802755
f1,Female,20110324095016,Sitdown_chair,8,31,154,23.974026,7.349312
f1,Female,20110324095107,Walk,0,38,1169,12.899059,6.191721
f1,Female,20110324095211,Walk,0,37,955,12.460733,5.367352
f1,Female,20110324100432,Pour_water,17,40,809,28.31026,4.895584


## Define Feature Extraction Functions

Now, we would like to do this for every accelerometer axis with only a single pass through the DataFrame. Defining a set of functions will allow us to do that while also maintaining control over the column names. This might seem like a lot of overhead for such readily available built-in functions as mean and std, but it will become very useful when you write your own feature extraction algorithms.

In [10]:
def Length(data):
    """Return the number of samples in a time series.

    `data` may be any object that supports ``len()``; the result is an int.
    """
    return len(data)

def Mean(data):
    """Return the mean of a time series.

    Delegates to ``data.mean()``, so `data` must provide that method
    (e.g. a pandas Series).
    """
    return data.mean()

def Std(data):
    """Return the standard deviation of a time series.

    Delegates to ``data.std()`` with its default arguments. For a pandas
    Series that is the sample standard deviation (ddof=1); a NumPy array
    would use ddof=0 instead.
    """
    return data.std()

def Min(data):
    """Return the minimum value of a time series.

    Delegates to ``data.min()``, so `data` must provide that method
    (e.g. a pandas Series).
    """
    return data.min()

def Max(data):
    """Return the maximum value of a time series.

    Delegates to ``data.max()``, so `data` must provide that method
    (e.g. a pandas Series).
    """
    return data.max()

In [11]:
# The same list of feature functions is applied to every accelerometer axis in
# one agg() call; the result has two column levels (axis, function name).
functions = [Mean, Std, Min, Max, Length]

features = grouped_data.agg({axis: functions for axis in ('X', 'Y', 'Z')})

features.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Y,Y,Y,Y,Y,X,X,X,X,X,Z,Z,Z,Z,Z
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Mean,Std,Min,Max,Length,Mean,Std,Min,Max,Length,Mean,Std,Min,Max,Length
SubjectID,Gender,Timestamp,ADL,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
f1,Female,20110323103918,Standup_chair,38.700599,1.788568,35,46,167,16.760479,6.638887,11,32,167,40.868263,5.888063,35,53,167
f1,Female,20110323104201,Standup_chair,36.755187,1.610585,29,41,241,18.016598,8.690017,8,33,241,40.937759,8.061241,32,58,241
f1,Female,20110323104551,Standup_chair,34.766667,3.373944,30,41,300,19.256667,9.754724,10,36,300,42.506667,6.873191,35,55,300
f1,Female,20110323104759,Standup_chair,36.695238,2.439551,27,43,315,18.295238,9.104059,7,35,315,40.701587,7.923992,32,54,315
f1,Female,20110324094434,Comb_hair,36.177885,10.425466,0,63,832,31.016827,16.66957,6,63,832,39.520433,7.095333,19,61,832
f1,Female,20110324094936,Standup_chair,33.976048,2.953356,23,37,167,15.892216,6.802755,8,31,167,38.952096,8.651769,24,54,167
f1,Female,20110324095016,Sitdown_chair,33.493506,3.074769,22,44,154,23.974026,7.349312,8,31,154,47.272727,8.76958,24,61,154
f1,Female,20110324095107,Walk,36.757057,2.735642,18,55,1169,12.899059,6.191721,0,38,1169,34.76219,4.277799,27,52,1169
f1,Female,20110324095211,Walk,37.011518,2.912216,29,54,955,12.460733,5.367352,0,37,955,34.107853,3.539814,27,53,955
f1,Female,20110324100432,Pour_water,40.707046,4.723227,29,56,809,28.31026,4.895584,17,40,809,49.347342,2.324375,38,61,809


## Put the Functions in a Module and Apply to the DataFrame

If you find yourself repeatedly analyzing time series data, e.g. for wearable sensor application development, this technique will come in handy over and over again. I have a feature function library (features.py) that you are welcome to use and add to: https://github.com/mhbuehler/feature_extraction

This is how you import and apply the functions:

In [12]:
# Import the feature functions by name rather than with `import *`, so it is
# visible which names come from features.py and that Mean, Std, Length, Min
# and Max from the module replace the versions defined earlier in this
# notebook.
# NOTE(review): assumes features.py exports all eight names below -- the
# wildcard import hid which of them actually came from the module; confirm.
from features import (Mean, Median, Std, IQR, Length, Min, Max,
                      DominantFrequency)

functions = [Mean, Median, Std, IQR, Length, Min, Max, DominantFrequency]

In [13]:
# Run the full function list over each axis in a single agg() call; columns
# come back as (axis, function name) pairs.
features = grouped_data.agg({axis: functions for axis in ('X', 'Y', 'Z')})

features.head(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Y,Y,Y,Y,Y,Y,Y,Y,X,X,X,X,X,Z,Z,Z,Z,Z,Z,Z,Z
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Mean,Median,Std,IQR,Length,Min,Max,DominantFrequency,Mean,Median,...,Max,DominantFrequency,Mean,Median,Std,IQR,Length,Min,Max,DominantFrequency
SubjectID,Gender,Timestamp,ADL,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
f1,Female,20110323103918,Standup_chair,38.700599,38.0,1.788568,1.0,167,35,46,0,16.760479,13.0,...,32,0,40.868263,38.0,5.888063,9.0,167,35,53,0
f1,Female,20110323104201,Standup_chair,36.755187,37.0,1.610585,2.0,241,29,41,0,18.016598,13.0,...,33,0,40.937759,35.0,8.061241,17.0,241,32,58,0
f1,Female,20110323104551,Standup_chair,34.766667,33.5,3.373944,7.0,300,30,41,0,19.256667,13.0,...,36,0,42.506667,38.0,6.873191,14.0,300,35,55,0
f1,Female,20110323104759,Standup_chair,36.695238,36.0,2.439551,2.0,315,27,43,0,18.295238,13.0,...,35,0,40.701587,35.0,7.923992,17.0,315,32,54,0
f1,Female,20110324094434,Comb_hair,36.177885,36.0,10.425466,7.5,832,0,63,0,31.016827,35.0,...,63,0,39.520433,38.0,7.095333,11.0,832,19,61,0
f1,Female,20110324094936,Standup_chair,33.976048,35.0,2.953356,3.0,167,23,37,0,15.892216,12.0,...,31,0,38.952096,35.0,8.651769,15.0,167,24,54,0
f1,Female,20110324095016,Sitdown_chair,33.493506,33.0,3.074769,1.0,154,22,44,0,23.974026,28.0,...,31,0,47.272727,52.0,8.76958,10.0,154,24,61,0
f1,Female,20110324095107,Walk,36.757057,36.0,2.735642,2.0,1169,18,55,0,12.899059,12.0,...,38,0,34.76219,34.0,4.277799,3.0,1169,27,52,0
f1,Female,20110324095211,Walk,37.011518,36.0,2.912216,3.0,955,29,54,0,12.460733,12.0,...,37,0,34.107853,34.0,3.539814,3.0,955,27,53,0
f1,Female,20110324100432,Pour_water,40.707046,41.0,4.723227,6.0,809,29,56,0,28.31026,29.0,...,40,0,49.347342,50.0,2.324375,3.0,809,38,61,0


## Create the Flat Feature File
Now let's reformat the index and column names and save it to CSV. This file will be the source of training examples for our machine learning models.

In [14]:
# Flatten the table for export: move the group keys out of the index into
# ordinary columns, then join each (axis, function) column pair into a single
# name, e.g. ('X', 'Mean') -> 'XMean'. The key columns have an empty second
# level and keep their plain name.
#
# The MultiIndex check makes the cell safe to re-run: once the columns are
# flat, a second reset_index would add a spurious 'index' column and the
# pair-unpacking below would fail on plain string names.
if isinstance(features.columns, pd.MultiIndex):
    features = features.reset_index()
    features.columns = ['{0}{1}'.format(a, b) if b else a for a, b in features.columns]

features.head(20)

Unnamed: 0,SubjectID,Gender,Timestamp,ADL,YMean,YMedian,YStd,YIQR,YLength,YMin,...,XMax,XDominantFrequency,ZMean,ZMedian,ZStd,ZIQR,ZLength,ZMin,ZMax,ZDominantFrequency
0,f1,Female,20110323103918,Standup_chair,38.700599,38.0,1.788568,1.0,167,35,...,32,0,40.868263,38.0,5.888063,9.0,167,35,53,0
1,f1,Female,20110323104201,Standup_chair,36.755187,37.0,1.610585,2.0,241,29,...,33,0,40.937759,35.0,8.061241,17.0,241,32,58,0
2,f1,Female,20110323104551,Standup_chair,34.766667,33.5,3.373944,7.0,300,30,...,36,0,42.506667,38.0,6.873191,14.0,300,35,55,0
3,f1,Female,20110323104759,Standup_chair,36.695238,36.0,2.439551,2.0,315,27,...,35,0,40.701587,35.0,7.923992,17.0,315,32,54,0
4,f1,Female,20110324094434,Comb_hair,36.177885,36.0,10.425466,7.5,832,0,...,63,0,39.520433,38.0,7.095333,11.0,832,19,61,0
5,f1,Female,20110324094936,Standup_chair,33.976048,35.0,2.953356,3.0,167,23,...,31,0,38.952096,35.0,8.651769,15.0,167,24,54,0
6,f1,Female,20110324095016,Sitdown_chair,33.493506,33.0,3.074769,1.0,154,22,...,31,0,47.272727,52.0,8.76958,10.0,154,24,61,0
7,f1,Female,20110324095107,Walk,36.757057,36.0,2.735642,2.0,1169,18,...,38,0,34.76219,34.0,4.277799,3.0,1169,27,52,0
8,f1,Female,20110324095211,Walk,37.011518,36.0,2.912216,3.0,955,29,...,37,0,34.107853,34.0,3.539814,3.0,955,27,53,0
9,f1,Female,20110324100432,Pour_water,40.707046,41.0,4.723227,6.0,809,29,...,40,0,49.347342,50.0,2.324375,3.0,809,38,61,0


In [15]:
# Write the feature table to CSV; index=False leaves the row index out of the file.
features.to_csv('data/features.csv', index=False)