In [13]:
import pandas as pd
import numpy as np
import os
import random
from IPython.display import Image
from sklearn.preprocessing import MinMaxScaler
import data_processing.sliding_window as helper_functions

## 1. Preprocessing
This notebook will teach you how to preprocess a sensor-based Human Activity Recognition dataset.

Let's read in the data.

In [14]:
# Folder that holds the datasets, relative to the directory the notebook is started from.
# NOTE(review): adjust data_folder if the notebook is run from a different working directory.
data_folder = 'data'
# Only the file name belongs here: an absolute path would make os.path.join() drop
# data_folder entirely and tie the notebook to a single machine.
dataset = 'rwhar_3sbjs_data.csv'
# Column names are supplied explicitly, so pandas treats every line of the file as data.
data = pd.read_csv(os.path.join(data_folder, dataset),
                   names=['subject_id', 'acc_x', 'acc_y', 'acc_z', 'activity_label'])

#### 1.1 Cleaning

##### 1.1.1 Sensor Orientation

Whenever we are working with a multimodal dataset, i.e. a dataset that consists of data from different sensors,
we need to make sure that the orientations of the sensors match each other.



Depending on the circumstances we want to clean the data before we train our classifier.

It is very important to double-check whether the dataset contains **NaN values**. If it does,
make sure that the missing values are interpolated, since we want to keep the original sampling rate.


In [15]:
# Display the sensor-orientation figure; it is referenced by a relative URL, so the
# ../images folder must be reachable from the notebook's location.
Image(url="../images/pamap_skoda_orientation.png")

In [16]:
def getLastNonNaN(series, index, missingvalues=1):
    """Return the closest non-NaN value located before ``index``.

    Parameters
    ----------
    series : indexable sequence (e.g. a numpy array)
        The values to search; elements are tested with ``pd.isna``.
    index : int
        Position whose predecessors are searched.
    missingvalues : int, default 1
        Start value of the step counter; it is increased by one for every
        NaN that has to be skipped.

    Returns
    -------
    tuple
        ``(value, steps)`` where ``value`` is the first non-NaN element found
        when walking backwards from ``index - 1`` and ``steps`` is its distance
        to ``index`` (offset by the initial ``missingvalues``).

    Raises
    ------
    IndexError
        If every element before ``index`` is missing.
    """
    position = index - 1
    steps = missingvalues
    # Walk backwards iteratively: long NaN runs cannot hit the recursion limit, and
    # stopping at 0 prevents negative indices from silently wrapping around to the
    # end of the array.
    while position >= 0:
        candidate = series[position]
        if not pd.isna(candidate):
            return candidate, steps
        position -= 1
        # Count every skipped NaN so the caller gets the real distance.
        steps += 1
    raise IndexError("no non-NaN value found before index {}".format(index))


def getNextNonNaN(series, index, missingvalues=1):
    """Return the closest non-NaN value located after ``index``.

    The search starts at ``index + 1`` and moves forward one element at a
    time. The result is ``(value, steps)``: the first element for which
    ``pd.isna`` is false, and ``missingvalues`` increased by one for every
    NaN that was skipped. Running past the end of ``series`` surfaces
    whatever error the indexing operation raises.
    """
    cursor = index + 1
    steps = missingvalues
    while pd.isna(series[cursor]):
        cursor += 1
        steps += 1
    return series[cursor], steps


def replaceNaNValues(series, output_dtype='float'):
    """Fill the missing values of a one-dimensional series.

    Parameters
    ----------
    series : array-like
        One-dimensional data (list, numpy array, pandas Series). It is copied,
        the caller's object is never modified.
    output_dtype : {'float', 'int', 'string'}, default 'float'
        * ``'float'``: gaps are filled by linear interpolation between the
          neighbouring valid samples (a single missing sample becomes the mean
          of its two neighbours).
        * ``'int'`` / ``'string'``: gaps are filled with the nearest valid
          sample; on a tie the following sample wins.
        In both modes, leading and trailing gaps take the first / last valid
        value of the series.

    Returns
    -------
    pandas.DataFrame
        A single-column frame with the filled values, cast to ``output_dtype``.

    Raises
    ------
    ValueError
        If ``output_dtype`` is not supported, or if the series holds missing
        values but not a single valid one.
    """
    if output_dtype not in ('float', 'int', 'string'):
        # Raise instead of print + exit(0): terminating the interpreter from inside a
        # helper is not something a caller can recover from.
        raise ValueError("Please choose a valid output dtype. You can choose between float, int and string.")

    values = np.array(series)  # np.array() copies, so the input stays untouched
    missing = pd.isna(values)
    if not missing.any():
        # Nothing to fill (this also covers an empty series).
        return pd.DataFrame(values, dtype=output_dtype)

    valid_positions = np.flatnonzero(~missing)
    if valid_positions.size == 0:
        raise ValueError("Cannot replace NaN values in a series that contains no valid value.")
    gap_positions = np.flatnonzero(missing)

    if output_dtype == 'float':
        # np.interp interpolates linearly inside the data and clamps to the first/last
        # valid sample outside of it, which also handles leading and trailing gaps.
        values[gap_positions] = np.interp(gap_positions,
                                          valid_positions,
                                          values[valid_positions].astype(float))
    else:
        # For every gap, locate the valid sample before and after it. Distances are
        # measured against the ORIGINAL valid samples so that earlier fills cannot
        # distort the nearest-neighbour decision.
        following = np.searchsorted(valid_positions, gap_positions)
        last_slot = valid_positions.size - 1
        # Clipping maps leading/trailing gaps onto the first/last valid sample.
        previous_valid = valid_positions[np.clip(following - 1, 0, last_slot)]
        next_valid = valid_positions[np.clip(following, 0, last_slot)]
        take_previous = np.abs(gap_positions - previous_valid) < np.abs(next_valid - gap_positions)
        values[gap_positions] = values[np.where(take_previous, previous_valid, next_valid)]

    return pd.DataFrame(values, dtype=output_dtype)

# Simulate sensor dropouts: blank out randomly chosen rows of `data` (in place).
# A dedicated, seeded generator keeps the demo reproducible without touching the
# global random state.
rng = random.Random(42)
for _ in range(0, 1000):
    # randrange() excludes the upper bound. randint(1, data.shape[0]) could return
    # data.shape[0], a label that does not exist, and .loc would then append an extra
    # all-NaN row instead of overwriting an existing one.
    fill_index = rng.randrange(1, data.shape[0])
    data.loc[fill_index] = [np.nan, np.nan, np.nan, np.nan, np.nan]

# Fill the gaps column by column; the dtype selects the strategy used by replaceNaNValues.
subject_id = replaceNaNValues(data['subject_id'], output_dtype='int')
acc_x = replaceNaNValues(data['acc_x'], output_dtype='float')
acc_y = replaceNaNValues(data['acc_y'], output_dtype='float')
acc_z = replaceNaNValues(data['acc_z'], output_dtype='float')
activity_label = replaceNaNValues(data['activity_label'], output_dtype='string')


# Each call returns a single-column frame; put them side by side and restore the column names.
interpolated_data = pd.concat([subject_id, acc_x, acc_y, acc_z, activity_label], axis=1)
interpolated_data.columns = ['subject_id', 'acc_x', 'acc_y', 'acc_z', 'activity_label']


Be careful when cleaning noise or outliers from the data: this is only advisable if the noise/outliers are of no importance for the use case of our model.
#### 1.2 Resampling

Resampling is necessary if we work with data from different sensors that were not recorded with the same sampling rate.
To optimize the classifier, we need to align the sampling rates of the data with each other.
Resampling can be done by either up- or downsampling the data.


In [17]:
def interpolate(data, freq_old, freq_new):
    """Resample a one-dimensional signal from ``freq_old`` to ``freq_new``.

    The samples of ``data`` are placed on a time axis with spacing
    ``1 / freq_old``. A new, evenly spaced time axis with spacing
    ``1 / freq_new`` is built over (approximately) the same duration and the
    signal is evaluated on it by linear interpolation.

    Returns
    -------
    tuple
        ``(timestamps, values)``: the new time axis and the resampled signal.
    """
    sample_count = data.shape[0]
    old_times = np.arange(0, sample_count) / freq_old
    new_period = 1 / freq_new
    # Number of whole new-rate steps that fit into the recorded duration (rounded).
    step_count = round(old_times[-1] / new_period)
    new_times = np.linspace(old_times[0], step_count * new_period, step_count + 1)
    return new_times, np.interp(new_times, old_times, data)



#### 1.3 Normalizing
Normalizing is an important part of the preprocessing chain, but it can also be the reason for many mistakes.
Therefore it is important to choose the correct strategy for normalizing your dataset.

##### 1.3.1 How to normalize?



There are three possible ways to normalize correctly.
This step is a big pitfall, since beginners tend to normalize the whole vector at once.

Normalizing sensor-wise

In [26]:
# Sensor-wise scaling: the three axes are flattened into one single column, so the
# scaler uses ONE common minimum/maximum for acc_x, acc_y and acc_z.
# feature_range is documented as a tuple; scikit-learn releases that validate
# constructor parameters reject a list.
scaler_sensorwise = MinMaxScaler(feature_range=(-1, 1))

scaled_sensorwise = scaler_sensorwise.fit_transform(interpolated_data[["acc_x", "acc_y", "acc_z"]].values.reshape(-1,1))
scaled_sensorwise

array([[ 0.0193068 ],
       [ 0.51856662],
       [ 0.04323806],
       ...,
       [-0.05468475],
       [ 0.51707086],
       [ 0.01787214]])

before / after

Normalizing axis-wise

before / after


In [None]:
# Axis-wise scaling: MinMaxScaler scales every column independently, so a single fit on
# the three-column matrix yields per-axis minima/maxima. Refitting one scaler three
# times would keep only the acc_z statistics and make the scaler unusable for
# transforming or inverse-transforming the other axes later on.
# feature_range is documented as a tuple; scikit-learn releases that validate
# constructor parameters reject a list.
scaler_axiswise = MinMaxScaler(feature_range=(-1, 1))
scaled_axiswise = scaler_axiswise.fit_transform(interpolated_data[["acc_x", "acc_y", "acc_z"]].values)
# Keep the original (n_samples, 1) column shape of the per-axis results.
scaled_x = scaled_axiswise[:, [0]]
scaled_y = scaled_axiswise[:, [1]]
scaled_z = scaled_axiswise[:, [2]]


#### 1.4 Windowing
##### 1.4.1 Jumping/Sliding Window

https://docs.microsoft.com/en-us/azure/stream-analytics/stream-analytics-window-functions

In [None]:
# The sliding-window module is imported as `helper_functions`, so the function has to be
# accessed through that alias; the bare name is not defined in this notebook.
# NOTE(review): scaled_data_X and scaled_data_Y are not defined in any visible cell.
# They have to be built from the normalized sensor data and the activity labels before
# this cell can run. TODO confirm the expected layout against apply_sliding_window.
X_train, y_train = helper_functions.apply_sliding_window(scaled_data_X, scaled_data_Y,
                                                         sliding_window_size=25,
                                                         unit=None,
                                                         sampling_rate=50,
                                                         sliding_window_overlap=25,
                                                         )
