# Pre processing UCI-HAR

This notebook will pre processing the dataset UCI-HAR (https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones)

1. Load the uci dataset
2. Remove the overlap from samples
3. Remove the gravity appling a ButterWoeth filter
4. Resample the data from 50Hz to 20Hz
5. Change the time window from 2.56s to 3s
6. Change the accelerometer mensure from g to m/s²

In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go

## Filtering the accelerometer signal

In [2]:
from scipy import signal

def filtering(sig):
    
    sos = signal.butter(3, 0.3, 'low', fs=50, output='sos')
    filtered = signal.sosfilt(sos, sig)
    return filtered

def resample(sig, time_window):
    return signal.resample(sig, 20*time_window)

In [3]:
def transformar(X, y, user, sensor):
    """Essa função é responsável por pegar os dados do UCI-HAR
    (recortados em janelas de 2,56 segundos e com sobreposição de 50%)
    e transformá-los em janelas de 3 segundos sem sobreposição"""
    '''
    '''
    
    # Vamos ver quando ocorre uma nova amostra (ou seja, quando não ocorre sobreposição)
    aux = np.zeros(len(y)-1)
    for i in range(len(y)-1):
        aux[i] = np.all(X[i,64:]!=X[i+1,:64])
    ind = np.nonzero(aux)[0]+1
    ind = np.append([0],ind)
    ind = np.append(ind,[len(y)])
    
    # Agora vamos construir as capturas originais (sem sobreposição)
    sigs = []
    yy = []
    uuser = []
    for i in range(len(ind)-1):
        sig = X[ind[i]:ind[i+1],64:].reshape((ind[i+1]-ind[i])*64)
        sig = np.append(X[ind[i],:64],sig)
        
        if sensor < 3:
            sig_filtered = sig - filtering(sig) # Removing the gravity
            time_window = len(sig_filtered) // 50 # The time window is the 
            sig_resampled = resample(sig_filtered, time_window) # Resampling the signal

            sigs.append(sig_resampled)
            yy.append(y[ind[i]])
            uuser.append(user[ind[i]])
        else:
            time_window = len(sig) // 50 # The time window is the 
            sig_resampled = resample(sig, time_window) # Resampling the signal

            sigs.append(sig_resampled)
            yy.append(y[ind[i]])
            uuser.append(user[ind[i]])
    # Vamos construir as janelas de 3 segundos (lembrando que o UCI-HAR está amostrado a uma taxa de 20 Hz)
    new_time_window = 3
    sample_rate = 20
    points_per_new_window = new_time_window*sample_rate
    
    aux = np.zeros(len(sigs))
    for i in range(len(sigs)):
        aux[i] = len(sigs[i])
    Na = np.sum(aux//points_per_new_window, dtype=np.int64)
    mat = np.zeros((Na,points_per_new_window))
    yyy = np.zeros(Na)
    uuuser = np.zeros(Na)
    k = 0
    for i in range(len(sigs)):
        for j in range(len(sigs[i])//points_per_new_window):
            mat[k,:] = sigs[i][j*points_per_new_window:(j+1)*points_per_new_window]
            yyy[k] = yy[i]
            uuuser[k] = uuser[i]
            k+=1
    return mat, yyy, uuuser;

## Pre processing the train data

In [4]:
# Vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/train/"
dados = [None]*6
dados[0] = np.loadtxt(pasta+'Inertial Signals/total_acc_x_train.txt')
dados[1] = np.loadtxt(pasta+'Inertial Signals/total_acc_y_train.txt')
dados[2] = np.loadtxt(pasta+'Inertial Signals/total_acc_z_train.txt')
dados[3] = np.loadtxt(pasta+'Inertial Signals/body_gyro_x_train.txt')
dados[4] = np.loadtxt(pasta+'Inertial Signals/body_gyro_y_train.txt')
dados[5] = np.loadtxt(pasta+'Inertial Signals/body_gyro_z_train.txt')
y = np.loadtxt(pasta+'y_train.txt')
user = np.loadtxt(pasta+'subject_train.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1226
WALKING_UPSTAIRS: 1073
WALKING_DOWNSTAIRS: 986
SITTING: 1286
STANDING: 1374
LAYING: 1407


In [47]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

train = []
for i in range(6):
    mat, yyy, uuuser = transformar(dados[i], y, user, i)
    if type(train) == list:
        train = mat
    else:
        train = np.concatenate([train, mat], axis=1)
    print(train.shape)
train = np.column_stack((train,yyy.astype(int),uuuser.astype(int)))

(3049, 60)
(3049, 120)
(3049, 180)
(3049, 240)
(3049, 300)
(3049, 360)


In [48]:
df_train = pd.DataFrame(data=train)

In [49]:
colunas = ['accel-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['activity code']
colunas += ['user']
df_train.columns = colunas
df_train = df_train.astype({"activity code": int, "user": int})

# Removing the activity laying
df_train = df_train[df_train['activity code'].isin([1, 2, 3, 4, 5])]
df_train

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,0.709658,1.098709,0.974849,1.042785,0.988832,1.020784,0.979064,0.994043,0.956313,0.957692,...,0.001605,-0.000460,-0.001029,-0.000238,0.002616,0.006758,0.009952,-0.005709,5,1
1,-0.062805,-0.056817,-0.051473,-0.048221,-0.045523,-0.037635,-0.031380,-0.030902,-0.026130,-0.022408,...,-0.001788,-0.005908,-0.010234,-0.011284,-0.009106,-0.001114,-0.001296,-0.002068,5,1
2,-0.002409,-0.004017,-0.003232,-0.001114,-0.001560,-0.004402,-0.002449,-0.003568,-0.001499,-0.002786,...,0.000421,0.004503,0.002473,0.001083,0.007256,0.021702,0.007641,0.009008,5,1
3,-0.003230,0.000680,0.000479,0.000186,0.004959,-0.002651,0.000215,-0.001799,0.002436,0.001305,...,0.005918,-0.003793,-0.002657,0.001718,-0.007073,-0.012203,-0.009020,-0.008270,5,1
4,0.002240,0.000876,-0.002080,0.001666,0.000811,-0.000224,-0.001148,-0.002598,0.005442,-0.000696,...,0.000288,-0.001961,0.001915,0.002949,0.007039,-0.004036,-0.003410,-0.004388,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,-0.040509,0.051099,0.043308,0.131483,0.405293,0.583331,0.315966,0.023301,-0.052495,-0.145659,...,-0.396459,-0.545852,-0.702665,-0.833772,-0.654115,-0.355501,-0.202928,-0.085239,2,30
3045,0.714841,1.279840,1.410648,1.509643,1.230629,0.989366,0.834677,0.837818,0.574001,0.602336,...,-0.445656,-0.579521,-0.846560,-0.700226,-0.574337,-0.199706,0.069693,0.210308,2,30
3046,0.201737,0.463930,0.421784,0.134871,-0.079729,-0.105013,-0.213468,-0.364409,-0.369793,-0.380467,...,0.035472,0.292552,0.203059,-0.175440,0.281563,0.134684,0.217240,0.443779,2,30
3047,-0.249057,-0.391390,-0.204677,-0.254794,-0.239145,-0.136961,0.073090,0.282398,0.715923,0.157419,...,-0.232415,0.024815,0.065954,0.068590,0.115724,0.091088,0.309496,0.256306,2,30


In [None]:
print('Número de amostras por classe na formatação final:')
for i in np.unique(yyy):
    print(labels[i.astype(int)-1]+': '+str(np.sum(yyy==i)))

In [None]:
# Mudando a unidade de medida do acelerômetro para m/s² (Basta multiplicar por g = 9.81)
# g = 9.81
g = 9.81
df_train.iloc[:,:450] = df_train.iloc[:,:450]*g
df_train

## Pre processing the test data

In [None]:
# Agora vamos abrir os dados de treino do UCI-HAR
pasta = "UCI HAR Dataset/UCI HAR Dataset/test/"
dados = [None]*6
dados[0] = np.loadtxt(pasta+'Inertial Signals/total_acc_x_test.txt')
dados[1] = np.loadtxt(pasta+'Inertial Signals/total_acc_y_test.txt')
dados[2] = np.loadtxt(pasta+'Inertial Signals/total_acc_z_test.txt')
dados[3] = np.loadtxt(pasta+'Inertial Signals/body_gyro_x_test.txt')
dados[4] = np.loadtxt(pasta+'Inertial Signals/body_gyro_y_test.txt')
dados[5] = np.loadtxt(pasta+'Inertial Signals/body_gyro_z_test.txt')
y = np.loadtxt(pasta+'y_test.txt')
user = np.loadtxt(pasta+'subject_test.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING'
          , 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

In [None]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

test = []
for i in range(6):
    mat, yyy, uuuser = transformar(dados[i], y, user, i)
    if type(test) == list:
        test = mat
    else:
        test = np.concatenate([test, mat], axis=1)
    print(test.shape)
test = np.column_stack((test,yyy.astype(int),uuuser.astype(int)))

In [None]:
df_test = pd.DataFrame(data=test)

In [None]:
df_test.columns = colunas
df_test = df_test.astype({"activity code": int, "user": int})

# Removing the activity laying
df_test = df_test[df_test['activity code'].isin([1, 2, 3, 4, 5])]
df_test

In [None]:
print('Número de amostras por classe na formatação final:')
for i in np.unique(yyy):
    print(labels[i.astype(int)-1]+': '+str(np.sum(yyy==i)))

In [None]:
# Mudando a unidade de medida do acelerômetro para m/s² (Basta multiplicar por g = 9.81)
# g = 9.81
g = 9.81
df_test.iloc[:,:450] = df_test.iloc[:,:450]*g
df_test

In [None]:
df_train.to_csv('train.csv',index=False)
df_test.to_csv('test.csv',index=False)