# Detecting Human Activities Through Smartphone Sensor - Feature engineering

## Import

In [1]:
import os
from tqdm import tqdm
import math
import pandas as pd
import numpy as np
import scipy.fftpack


## Global

In [2]:
# Layout string handed to tqdm as `bar_format` for every progress bar in this notebook.
PBAR_FORMAT='{desc:12} {percentage:3.0f}%|{bar:27}| [ {n:4d}/{total:4d}, {elapsed}<{remaining}{postfix} ]'
# Number of consecutive sensor samples grouped into one feature window.
# NOTE(review): compute_mfcc defaults to sampling_rate=20, which would make this 10 s — confirm.
WINDOW_SIZE=200

## Features

### Average

In [3]:
def get_avg(a):
    """Return the arithmetic mean of the samples in `a`.

    Uses `np.mean` (like `get_stddev` / `get_var` use their numpy
    counterparts) so that any array-like works, not only objects that
    expose a `.mean()` method.
    """
    return np.mean(a)

### Standard deviation

In [4]:
def get_stddev(a):
    """Return the standard deviation of `a` as computed by `np.std` with its defaults."""
    spread = np.std(a)
    return spread

### Variance

In [5]:
def get_var(a):
    """Return the variance of `a` as computed by `np.var` with its defaults."""
    variance = np.var(a)
    return variance



### Mean absolute deviation (ABSOLDEV)

In [6]:
def get_absoldev(a):
    """Return the mean absolute deviation of `a` from its own mean."""
    centre = np.mean(a)
    deviations = np.abs(a - centre)
    return np.mean(deviations)

### Cosine similarity (the "COS" features)

In [7]:
def get_cos(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    This is the dot product divided by the product of the two Euclidean
    norms, i.e. the cosine *similarity*. There is no guard for a vector of
    zero norm: the denominator is then zero.
    """
    numerator = np.dot(a, b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return numerator / denominator


### Correlation

In [8]:
def get_cor(a, b):
    """Return the correlation coefficient between `a` and `b`.

    `np.corrcoef` yields the 2x2 correlation matrix of the two inputs; the
    off-diagonal entry is the coefficient between them.
    """
    correlation_matrix = np.corrcoef(a, b)
    return correlation_matrix[0, 1]

### Resultant

In [9]:
def get_resultant(x, y, z):
    """Return the mean Euclidean magnitude of the (x, y, z) samples.

    Each matching triple is squared, summed and square-rooted; the
    resulting magnitudes are then averaged.
    """
    magnitudes = np.sqrt(x**2 + y**2 + z**2)
    return np.mean(magnitudes)

### Activity extractor

In [10]:
def calc_activity(arr, window_size, data_length):
    """Pick one label per window: the element at the start of each window.

    Returns an array of `data_length` entries with the same dtype as `arr`,
    where entry `i` is `arr[i * window_size]`.
    """
    labels = np.empty([data_length], dtype=arr.dtype)
    window_starts = [k * window_size for k in range(data_length)]
    for slot, first_index in enumerate(window_starts):
        labels[slot] = arr[first_index]
    return labels

### Binned distribution

In [11]:
def binned_distribution(data, num_bins=10):
    """Return the fraction of `data` falling into each of `num_bins` equal-width bins.

    The bins span the range from the minimum to the maximum of `data`.

    Parameters:
        data: 1-D sequence of numeric samples (must be non-empty, since its
            minimum and maximum are taken).
        num_bins: number of bins; defaults to 10, the previously
            hard-coded value.

    Returns:
        Array of `num_bins` fractions (bin count divided by `len(data)`).
    """
    data_min = np.min(data)
    data_max = np.max(data)
    # num_bins + 1 edges delimit num_bins bins.
    edges = np.linspace(data_min, data_max, num_bins + 1)
    counts, _ = np.histogram(data, bins=edges)
    # Normalise the counts to fractions of the window.
    return counts / len(data)


def calc_bindist(arr, bins, window_size, data_length):
    """Compute the binned distribution of every window of `arr`.

    Parameters:
        arr: 1-D signal.
        bins: number of bins per window.
        window_size: number of samples per window.
        data_length: number of windows to process.

    Returns:
        Array of shape (data_length, bins); row `i` holds the bin fractions
        of `arr[i * window_size:(i + 1) * window_size]`.
    """
    f = np.zeros([data_length, bins])
    for i in range(data_length):
        start = i * window_size
        # Forward `bins` so the row width always matches the allocation above
        # (previously only bins == 10 worked).
        f[i] = binned_distribution(arr[start:start + window_size], bins)
    return f

### MFCC

In [12]:
def compute_mfcc(signal, sampling_rate=20, num_mfcc=13, frame_size=200, hop_size=200, num_mels=40):
    """Compute min-max normalised MFCCs of a 1-D signal.

    Pipeline: split the signal into frames of `frame_size` samples every
    `hop_size` samples, apply a Hamming window, take the power spectrum,
    project it onto `num_mels` triangular mel filters, take the natural
    log, apply an orthonormal DCT and keep the first `num_mfcc`
    coefficients.

    Returns:
        Array laid out as (coefficient, frame). The whole matrix is rescaled
        with one global min-max normalisation, so its smallest entry is 0
        and its largest is 1.
    """
    # Framing: only complete frames are kept.
    frame_count = 1 + (len(signal) - frame_size) // hop_size
    frames = np.zeros((frame_count, frame_size))
    taper = np.hamming(frame_size)
    for row in range(frame_count):
        offset = row * hop_size
        frames[row] = signal[offset:offset + frame_size] * taper

    # Power spectrum of each frame.
    n_fft = frame_size
    spectrum = np.fft.rfft(frames, n=n_fft)
    power_spectrum = (1.0 / n_fft) * (np.abs(spectrum) ** 2)

    # Mel filter bank: filter centres are evenly spaced on the mel scale
    # between 0 Hz and the Nyquist frequency, then mapped to FFT bins.
    nyquist_mel = 2595 * np.log10(1 + (sampling_rate / 2) / 700)
    mel_points = np.linspace(0, nyquist_mel, num_mels + 2)
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)
    bin_points = np.floor((n_fft + 1) * hz_points / sampling_rate).astype(int)

    filter_bank = np.zeros((num_mels, n_fft // 2 + 1))
    edge_triples = zip(bin_points[:-2], bin_points[1:-1], bin_points[2:])
    for row, (left, peak, right) in enumerate(edge_triples):
        # Rising slope up to the centre bin, falling slope after it.
        filter_bank[row, left:peak] = np.linspace(0, 1, peak - left)
        filter_bank[row, peak:right] = np.linspace(1, 0, right - peak)

    mel_power = np.dot(power_spectrum, filter_bank.T)
    # Replace exact zeros so the logarithm below stays finite.
    mel_power = np.where(mel_power == 0, np.finfo(float).eps, mel_power)
    log_mel_power = np.log(mel_power)

    # DCT along the mel axis; keep the leading coefficients and put them on axis 0.
    cepstra = scipy.fftpack.dct(log_mel_power, axis=1, norm='ortho')
    mfccs = cepstra[:, :num_mfcc].T

    lowest = mfccs.min()
    return (mfccs - lowest) / (mfccs.max() - lowest)

### Utility for windowing

In [13]:
def calc(arr, window_size, data_length, op):
    """Apply `op` to each consecutive, non-overlapping window of `arr`.

    Returns a float array of `data_length` entries; entry `i` is
    `op(arr[i * window_size:(i + 1) * window_size])`.
    """
    results = np.zeros([data_length])
    for idx in range(data_length):
        window = slice(idx * window_size, (idx + 1) * window_size)
        results[idx] = op(arr[window])
    return results

def calc_2i(arr0, arr1, window_size, data_length, op):
    """Apply the two-argument `op` to matching windows of `arr0` and `arr1`.

    Returns a float array of `data_length` entries, one per window.
    """
    results = np.zeros([data_length])
    for idx in range(data_length):
        window = slice(idx * window_size, (idx + 1) * window_size)
        results[idx] = op(arr0[window], arr1[window])
    return results

def calc_3i(arr0, arr1, arr2, window_size, data_length, op):
    """Apply the three-argument `op` to matching windows of three signals.

    Returns a float array of `data_length` entries, one per window.
    """
    results = np.zeros([data_length])
    for idx in range(data_length):
        window = slice(idx * window_size, (idx + 1) * window_size)
        results[idx] = op(arr0[window], arr1[window], arr2[window])
    return results

## Extraction

In [14]:
# Output column order of the processed feature files (90 names).
#   ACTIVITY            activity label of the window
#   X0..X9, Y0..Y9,     fraction of the window's samples in each of 10 equal-width
#   Z0..Z9              bins, per axis
#   {X,Y,Z}AVG          average sensor value over the window
#   {X,Y,Z}ABSOLDEV     average absolute difference between each reading in the
#                       window and the window mean
#   {X,Y,Z}STANDDEV     standard deviation of the window's values
#   {X,Y,Z}VAR          variance of the window's values
#   {X,Y,Z}MFCC0..12    mel-frequency cepstral coefficients (13 per axis)
#   XYCOS, XZCOS, YZCOS cosine between the sensor values of each pair of axes
#   XYCOR, XZCOR, YZCOR correlation between the sensor values of each pair of axes
#   RESULTANT           average of sqrt(x^2 + y^2 + z^2) over the window
#   PARTICIPANT         participant identifier (categorical; 1600-1650 per the
#                       original note — TODO confirm against the raw data)
# NOTE(review): the hand-written list tagged STANDDEV and VAR with "***BUG!***"
# without further explanation — confirm what that referred to.
extracted_features = (
    ['ACTIVITY']
    + [axis + str(i) for axis in 'XYZ' for i in range(10)]
    + [axis + stat
       for stat in ('AVG', 'ABSOLDEV', 'STANDDEV', 'VAR')
       for axis in 'XYZ']
    + [axis + 'MFCC' + str(i) for axis in 'XYZ' for i in range(13)]
    + [pair + kind
       for kind in ('COS', 'COR')
       for pair in ('XY', 'XZ', 'YZ')]
    + ['RESULTANT', 'PARTICIPANT']
)

In [15]:
def _read_raw_sensor_file(path):
    """Load one header-less raw sensor file into a DataFrame with named columns."""
    return pd.read_csv(path,
                       names=['participant_id', 'activity_code', 'timestamp', 'x', 'y', 'z'],
                       index_col=None,
                       header=None)


def extract_feature(src_path, dst_path):
    """Turn every raw `.txt` file in `src_path` into a windowed feature CSV in `dst_path`.

    The function makes two passes over the files:

    1. Scan: count how many complete windows of `WINDOW_SIZE` samples each
       file holds and keep the minimum (`data_length`), which is printed.
    2. Parse: for each file compute the per-window features for the first
       `data_length` windows and write them, in `extracted_features` column
       order, to a file of the same name in `dst_path` (no header, no index).

    Parameters:
        src_path: directory containing the raw `.txt` files.
        dst_path: output directory; created if it does not exist.
    """
    os.makedirs(dst_path, exist_ok=True)
    filelist_train = sorted(name for name in os.listdir(src_path) if name.endswith(".txt"))

    # Pass 1: the shortest file decides how many windows every file contributes.
    window_counts = []
    for txt in tqdm(filelist_train, desc="Scanning", bar_format=PBAR_FORMAT):
        df = _read_raw_sensor_file(os.path.join(src_path, txt))
        window_counts.append(len(df) // WINDOW_SIZE)
    data_length = min(window_counts, default=0)

    print(data_length)

    single_axis_stats = (('AVG', get_avg),
                         ('VAR', get_var),
                         ('STANDDEV', get_stddev),
                         ('ABSOLDEV', get_absoldev))
    pairwise_stats = (('COS', get_cos), ('COR', get_cor))
    axis_pairs = (('X', 'Y'), ('Y', 'Z'), ('X', 'Z'))

    # Pass 2: feature computation and export.
    for txt in tqdm(filelist_train, desc="parsing data", bar_format=PBAR_FORMAT):
        df = _read_raw_sensor_file(os.path.join(src_path, txt))

        # The last field of each record carries a trailing ';'. Cast to str
        # first so the strip also works if pandas already parsed it as numeric.
        df['z'] = pd.to_numeric(df['z'].astype(str).str.strip(';'))
        axes = {'X': df['x'].to_numpy(),
                'Y': df['y'].to_numpy(),
                'Z': df['z'].to_numpy()}

        features = {}

        # AVG / VAR / STANDDEV / ABSOLDEV per axis
        for suffix, op in single_axis_stats:
            for axis, values in axes.items():
                features[axis + suffix] = calc(values, WINDOW_SIZE, data_length, op)

        # COS / COR per pair of axes
        for suffix, op in pairwise_stats:
            for first, second in axis_pairs:
                features[first + second + suffix] = calc_2i(
                    axes[first], axes[second], WINDOW_SIZE, data_length, op)

        features['RESULTANT'] = calc_3i(axes['X'], axes['Y'], axes['Z'],
                                        WINDOW_SIZE, data_length, get_resultant)

        for axis, values in axes.items():
            # X0-X9, Y0-Y9, Z0-Z9
            bin_fractions = calc_bindist(values, 10, WINDOW_SIZE, data_length)
            for i in range(10):
                features[axis + str(i)] = bin_fractions[:, i]

            # MFCC frames are tied to the feature window so both stay aligned
            # if WINDOW_SIZE changes; the MFCCs are computed over the whole
            # file and then truncated to the kept windows.
            mfcc = compute_mfcc(values, frame_size=WINDOW_SIZE, hop_size=WINDOW_SIZE)
            for i in range(13):
                features[axis + 'MFCC' + str(i)] = mfcc[i][0:data_length]

        # Takes the first `data_length` rows, not one row per window.
        # NOTE(review): equivalent only if a file holds a single participant — confirm.
        features['PARTICIPANT'] = df['participant_id'].to_numpy()[0:data_length]
        features['ACTIVITY'] = calc_activity(df['activity_code'].to_numpy(), WINDOW_SIZE, data_length)

        df_processed = pd.DataFrame(features)[extracted_features]
        df_processed.to_csv(os.path.join(dst_path, txt), encoding='utf-8', index=False, header=False)



In [16]:
# Run the feature extraction on the files under phone/accel
# (presumably phone accelerometer recordings, judging by the path).
# extract_feature prints the number of windows kept per file.
src_path="../dataset/raw/phone/accel/"
dst_path="../dataset/processed/phone/accel/"
extract_feature(src_path, dst_path)

Scanning     100%|███████████████████████████| [   51/  51, 00:03<00:00 ]


303


parsing data 100%|███████████████████████████| [   51/  51, 00:20<00:00 ]


In [17]:
# Run the feature extraction on the files under phone/gyro
# (presumably phone gyroscope recordings, judging by the path).
# extract_feature prints the number of windows kept per file.
src_path="../dataset/raw/phone/gyro/"
dst_path="../dataset/processed/phone/gyro/"
extract_feature(src_path, dst_path)

Scanning     100%|███████████████████████████| [   51/  51, 00:02<00:00 ]


285


parsing data 100%|███████████████████████████| [   51/  51, 00:18<00:00 ]


In [18]:
# Run the feature extraction on the files under watch/accel
# (presumably watch accelerometer recordings, judging by the path).
# extract_feature prints the number of windows kept per file.
src_path="../dataset/raw/watch/accel/"
dst_path="../dataset/processed/watch/accel/"
extract_feature(src_path, dst_path)

Scanning     100%|███████████████████████████| [   51/  51, 00:02<00:00 ]


300


parsing data 100%|███████████████████████████| [   51/  51, 00:18<00:00 ]


In [19]:
# Run the feature extraction on the files under watch/gyro
# (presumably watch gyroscope recordings, judging by the path).
# extract_feature prints the number of windows kept per file.
src_path="../dataset/raw/watch/gyro/"
dst_path="../dataset/processed/watch/gyro/"
extract_feature(src_path, dst_path)

Scanning     100%|███████████████████████████| [   51/  51, 00:02<00:00 ]


300


parsing data 100%|███████████████████████████| [   51/  51, 00:18<00:00 ]
