In [None]:
import os, sys
import pandas as pd

# Tutorial: Vectorized Feature Extraction
## 1. Construct a DataFrame from the Raw Data Files

In [None]:
# Collect the non-file entries of 'data', skipping any entry whose name
# contains '_MODEL'.
folders = []
for entry in os.listdir('data'):
    if '_MODEL' in entry:
        continue
    if os.path.isfile(os.path.join('data', entry)):
        continue
    folders.append(entry)

In [None]:
def parse_metadata_from_file_name(file_name):
    """Extract the subject id, gender and timestamp encoded in a file name.

    The name is split on '-'. The last field, up to '.txt', is the subject id;
    fields 1 through 6 are concatenated into the timestamp string. The gender
    is 'Male' when the subject id starts with 'm' and 'Female' otherwise.

    Returns a (subject_id, gender, timestamp) tuple of strings.
    """
    fields = file_name.split('-')
    subject_id = fields[-1].split('.txt')[0]
    if subject_id[0] == 'm':
        gender = 'Male'
    else:
        gender = 'Female'
    timestamp = ''.join(fields[1:7])
    return subject_id, gender, timestamp

In [None]:
# Read every raw data file into its own frame, tag it with the metadata parsed
# from its file name and its folder name, then combine all frames once at the
# end. Appending to a DataFrame inside the loop copies every accumulated row
# on each iteration, and DataFrame.append no longer exists in pandas 2.x.
frames = []
for folder in folders:
    folder_path = os.path.join('data', folder)
    files = [f for f in os.listdir(folder_path) if
             os.path.isfile(os.path.join(folder_path, f))]
    for file_name in files:
        # NOTE(review): header=0 consumes the first line of each file as a
        # header row before the columns are renamed below. If the raw files
        # have no header line, that first sample is lost and
        # header=None, names=['X', 'Y', 'Z'] would keep it -- confirm against
        # the data files.
        raw_data = pd.read_csv(os.path.join(folder_path, file_name), sep=' ', header=0)
        raw_data.columns = ['X', 'Y', 'Z']
        subject_id, gender, timestamp = parse_metadata_from_file_name(file_name)
        raw_data['SubjectID'] = subject_id
        raw_data['Gender'] = gender
        raw_data['Timestamp'] = timestamp
        raw_data['ADL'] = folder  # the folder name serves as the label
        frames.append(raw_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found.
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Parenthesised form prints the same tuple under Python 2 and Python 3.
print(data.shape)

In [None]:
# Persist the combined raw samples as a comma-separated file with a header row
# and without the row index.
combined_csv_path = 'data/adl_data.csv'
data.to_csv(combined_csv_path, sep=',', header=True, index=False)

## 2. The Groupby Operation

The pandas groupby operation lets you apply a function to subsets of a DataFrame without using a for loop. If you are familiar with SQL, this is very similar to applying an aggregate function with a GROUP BY clause. If you are not, what better time to start experimenting with set-based logic!

For the ADL data set, we want to calculate a set of features for every axis in every file. We will apply a groupby operation that aggregates by timestamp because it is unique to every file. Since we want the other indicators (SubjectID and Gender) and the label (ADL) to be in the output, and these are all guaranteed to be constant for a given timestamp, we will include those columns in the groupby specification. The order of the keys does not change the computed values; it only determines the order of the index levels in the result.

In [None]:
# Group the samples by these four key columns; the keys become the index
# levels of every aggregated result.
group_keys = ['SubjectID', 'Gender', 'Timestamp', 'ADL']
grouped_data = data.groupby(group_keys)

Now we can easily generate a feature like the mean for every group in the set:

In [None]:
# Per-group mean of each axis column; display the first 20 groups.
axis_columns = ['X', 'Y', 'Z']
grouped_data[axis_columns].mean().head(20)

With the agg() function, you can generate many different features in one pass through the original DataFrame. I will leave the verification of increased efficiency to you as an exercise. :D

In [None]:
# Compute several summary statistics of the X axis for every group in one pass.
# The aggregations are named by string, so numpy's min/max do not have to be
# imported over the Python builtins. The columns are renamed afterwards because
# current pandas rejects a {new_name: function} dict passed to a single-column
# agg() ("nested renamer is not supported").
# 'std' is pandas' sample standard deviation (ddof=1); 'size' counts every row
# of a group, like len().
features = grouped_data['X'].agg(['mean', 'std', 'min', 'max', 'size']).rename(columns={
        'mean': 'MeanX',
        'std': 'StdX',
        'min': 'MinX',
        'max': 'MaxX',
        'size': 'LenX'
    })

In [None]:
# Preview the first 20 rows of the X-axis feature table.
features.head(n=20)

## 3. Define Feature Extraction Functions

Now, we would like to do this for every accelerometer axis with only a single pass through the DataFrame. Defining a set of functions will allow us to do that while also maintaining control over the column names. This might seem like a lot of overhead for such readily available built-in functions as mean and std, but it will become very useful when you write your own feature extraction algorithms.

In [None]:
def Length(data):
    """Count the samples in a time series.

    `data` is any sized container; the result is `len(data)`.
    """
    n_samples = len(data)
    return n_samples

def Mean(data):
    """Return the mean of a time series.

    Delegates to `data.mean()`, so `data` must provide that method
    (e.g. a pandas Series or a numpy array).
    """
    average = data.mean()
    return average

def Std(data):
    """Return the standard deviation of a time series.

    Delegates to `data.std()`; the degrees-of-freedom convention is whatever
    that object's own `std` method uses by default.
    """
    spread = data.std()
    return spread

def Min(data):
    """Return the smallest value of a time series.

    Delegates to `data.min()`, so `data` must provide that method.
    """
    smallest = data.min()
    return smallest

def Max(data):
    """Return the largest value of a time series.

    Delegates to `data.max()`, so `data` must provide that method.
    """
    largest = data.max()
    return largest

In [None]:
# Apply the same list of feature functions to each axis in a single agg() call;
# the result has one column per (axis, function name) pair.
functions = [Mean, Std, Min, Max, Length]
per_axis_functions = dict((axis, functions) for axis in ('X', 'Y', 'Z'))

features = grouped_data.agg(per_axis_functions)

features.head(20)

## 4. Put the Functions in a Module

If you find yourself repeatedly analyzing time series data or building applications for wearable sensors, this technique will come in handy over and over again. I have a feature function library (features.py) that I use in almost all of my machine learning projects. I invite you to use the repository and add to it: https://github.com/mhbuehler/feature_extraction

This is how you import and apply the functions:

In [None]:
# Import the feature functions by name rather than with a wildcard, so it is
# clear where each one comes from and nothing else in the notebook namespace
# is silently overwritten.
# NOTE(review): assumes features.py defines all eight names listed here,
# including the five that are also defined earlier in this notebook -- confirm.
from features import (Mean, Median, Std, IQR, Length, Min, Max,
                      DominantFrequency)

functions = [Mean, Median, Std, IQR, Length, Min, Max, DominantFrequency]

In [None]:
# Run the full function list over each axis in one agg() call.
axis_to_functions = dict((axis, functions) for axis in ('X', 'Y', 'Z'))
features = grouped_data.agg(axis_to_functions)

features.head(20)

Looks good. Now let's reformat the index and column names and save it to CSV. This file will be the source of training examples for our machine learning models.

In [None]:
# Flatten the result for export: move the group keys out of the index into
# ordinary columns and join each (axis, function) column pair into one name,
# e.g. ('X', 'Mean') -> 'XMean'. The group-key columns have an empty second
# level and keep their name unchanged.
# The MultiIndex check makes the cell safe to re-run: a second pass would
# otherwise add a stray 'index' column and then fail to unpack the
# already-flat string names.
if isinstance(features.columns, pd.MultiIndex):
    features = features.reset_index()
    features.columns = ['{0}{1}'.format(a, b) if b else a for a, b in features.columns]

features.head(20)

In [None]:
# Write the feature table to disk without the row index.
features_csv_path = 'data/features.csv'
features.to_csv(features_csv_path, index=False)