# Pre processing UCI-HAR

This notebook will pre processing the dataset UCI-HAR (https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones)

1. Load the uci dataset
2. Remove the overlap from samples
3. Remove the gravity appling a ButterWoeth filter
4. Resample the data from 50Hz to 20Hz
5. Change the time window from 2.56s to 3s
6. Change the accelerometer mensure from g to m/s²

In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go

## Filtering the accelerometer signal

In [2]:
from scipy import signal

def filtering(sig):
    
    sos = signal.butter(3, 0.3, 'low', fs=50, output='sos')
    filtered = signal.sosfilt(sos, sig)
    return filtered

def resample(sig, time_window):
    return signal.resample(sig, 20*time_window)

In [3]:
def transformar(X, y, user, sensor, filter_the_signal=True):
    """Essa função é responsável por pegar os dados do UCI-HAR
    (recortados em janelas de 2,56 segundos e com sobreposição de 50%)
    e transformá-los em janelas de 3 segundos sem sobreposição"""
    '''
    '''
    
    # Vamos ver quando ocorre uma nova amostra (ou seja, quando não ocorre sobreposição)
    aux = np.zeros(len(y)-1)
    for i in range(len(y)-1):
        aux[i] = np.all(X[i,64:]!=X[i+1,:64])
    ind = np.nonzero(aux)[0]+1
    ind = np.append([0],ind)
    ind = np.append(ind,[len(y)])
    
    # Agora vamos construir as capturas originais (sem sobreposição)
    sigs = []
    yy = []
    uuser = []
    for i in range(len(ind)-1):
        sig = X[ind[i]:ind[i+1],64:].reshape((ind[i+1]-ind[i])*64)
        sig = np.append(X[ind[i],:64],sig)
        
        if sensor < 3:
            if filter_the_signal:
                sig_filtered = sig - filtering(sig) # Removing the gravity
            else:
                sig_filtered = sig
            time_window = len(sig_filtered) // 50 # The time window is the time of the total window
            sig_resampled = resample(sig_filtered, time_window) # Resampling the signal

            sigs.append(sig_resampled)
            yy.append(y[ind[i]])
            uuser.append(user[ind[i]])
        else:
            time_window = len(sig) // 50 # The time window is the time of the total window
            sig_resampled = resample(sig, time_window) # Resampling the signal

            sigs.append(sig_resampled)
            yy.append(y[ind[i]])
            uuser.append(user[ind[i]])
    # Vamos construir as janelas de 3 segundos (lembrando que o UCI-HAR está amostrado a uma taxa de 20 Hz)
    new_time_window = 3
    sample_rate = 20
    points_per_new_window = new_time_window*sample_rate
    
    aux = np.zeros(len(sigs))
    for i in range(len(sigs)):
        aux[i] = len(sigs[i])
    Na = np.sum(aux//points_per_new_window, dtype=np.int64)
    mat = np.zeros((Na,points_per_new_window))
    yyy = np.zeros(Na)
    uuuser = np.zeros(Na)
    k = 0
    for i in range(len(sigs)):
        for j in range(len(sigs[i])//points_per_new_window):
            mat[k,:] = sigs[i][j*points_per_new_window:(j+1)*points_per_new_window]
            yyy[k] = yy[i]
            uuuser[k] = uuser[i]
            k+=1
    return mat, yyy, uuuser;

## Pre processing the train data

In [4]:
# Vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/train/"
# dados = [None]*6
sensor = [
    'Inertial Signals/total_acc_x_train.txt', 
    'Inertial Signals/total_acc_y_train.txt', 
    'Inertial Signals/total_acc_z_train.txt', 
    'Inertial Signals/body_gyro_x_train.txt', 
    'Inertial Signals/body_gyro_y_train.txt', 
    'Inertial Signals/body_gyro_z_train.txt'
]
dados = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
# dados[0] = np.loadtxt(pasta+'Inertial Signals/total_acc_x_train.txt')
# dados[1] = np.loadtxt(pasta+'Inertial Signals/total_acc_y_train.txt')
# dados[2] = np.loadtxt(pasta+'Inertial Signals/total_acc_z_train.txt')
# dados[3] = np.loadtxt(pasta+'Inertial Signals/body_gyro_x_train.txt')
# dados[4] = np.loadtxt(pasta+'Inertial Signals/body_gyro_y_train.txt')
# dados[5] = np.loadtxt(pasta+'Inertial Signals/body_gyro_z_train.txt')
                    
y = np.loadtxt(pasta+'y_train.txt')
user = np.loadtxt(pasta+'subject_train.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1226
WALKING_UPSTAIRS: 1073
WALKING_DOWNSTAIRS: 986
SITTING: 1286
STANDING: 1374
LAYING: 1407


In [5]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

colunas = ['accel-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['activity code']
colunas += ['user']

In [6]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

train = []
for i in range(6):
    mat, yyy, uuuser = transformar(dados[i], y, user, i, filter_the_signal=False)
    if type(train) == list:
        train = mat
    else:
        train = np.concatenate([train, mat], axis=1)
    print(train.shape)
train = np.column_stack((train,yyy.astype(int),uuuser.astype(int)))

(3049, 60)
(3049, 120)
(3049, 180)
(3049, 240)
(3049, 300)
(3049, 360)


In [7]:
df_train = pd.DataFrame(data=train)

In [8]:
df_train.columns = colunas
df_train = df_train.astype({"activity code": int, "user": int})

# Removing the activity laying
df_train = df_train[df_train['activity code'].isin([1, 2, 3, 4, 5])]
df_train

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,1.017550,1.021611,1.018552,1.018257,1.019347,1.019592,1.019239,1.020283,1.020903,1.020657,...,0.001605,-0.000460,-0.001029,-0.000238,0.002616,0.006758,0.009952,-0.005709,5,1
1,1.018839,1.018934,1.021502,1.018748,1.018452,1.020407,1.023633,1.018406,1.020265,1.018635,...,-0.001788,-0.005908,-0.010234,-0.011284,-0.009106,-0.001114,-0.001296,-0.002068,5,1
2,1.020177,1.018852,1.019657,1.022014,1.021480,1.018843,1.020615,1.019679,1.021483,1.020366,...,0.000421,0.004503,0.002473,0.001083,0.007256,0.021702,0.007641,0.009008,5,1
3,1.016370,1.021254,1.020121,1.020826,1.024640,1.018054,1.019939,1.018973,1.022200,1.022139,...,0.005918,-0.003793,-0.002657,0.001718,-0.007073,-0.012203,-0.009020,-0.008270,5,1
4,1.021596,1.021918,1.017197,1.022672,1.020033,1.020768,1.018041,1.018400,1.024612,1.020318,...,0.000288,-0.001961,0.001915,0.002949,0.007039,-0.004036,-0.003410,-0.004388,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,0.945186,1.037966,1.026399,1.115456,1.385308,1.564512,1.293956,1.003946,0.926564,0.837592,...,-0.396459,-0.545852,-0.702665,-0.833772,-0.654115,-0.355501,-0.202928,-0.085239,2,30
3045,1.006686,1.206809,1.452206,1.487297,1.261462,0.992533,0.879638,0.873343,0.649326,0.680278,...,-0.445656,-0.579521,-0.846560,-0.700226,-0.574337,-0.199706,0.069693,0.210308,2,30
3046,1.258910,1.515128,1.469078,1.176960,0.959597,0.930599,0.820493,0.666827,0.660159,0.646789,...,0.035472,0.292552,0.203059,-0.175440,0.281563,0.134684,0.217240,0.443779,2,30
3047,0.727985,0.589146,0.777407,0.730529,0.746949,0.851375,1.060909,1.271284,1.703356,1.145747,...,-0.232415,0.024815,0.065954,0.068590,0.115724,0.091088,0.309496,0.256306,2,30


In [9]:
print('Número de amostras por classe na formatação final:')
for i in np.unique(yyy):
    print(labels[i.astype(int)-1]+': '+str(np.sum(yyy==i)))

Número de amostras por classe na formatação final:
WALKING: 506
WALKING_UPSTAIRS: 439
WALKING_DOWNSTAIRS: 395
SITTING: 544
STANDING: 575
LAYING: 590


In [10]:
df_train.iloc[:,:181]

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,accel-z-51,accel-z-52,accel-z-53,accel-z-54,accel-z-55,accel-z-56,accel-z-57,accel-z-58,accel-z-59,gyro-x-0
0,1.017550,1.021611,1.018552,1.018257,1.019347,1.019592,1.019239,1.020283,1.020903,1.020657,...,0.092465,0.098621,0.083744,0.081572,0.091025,0.089220,0.095104,0.094382,0.095215,0.024057
1,1.018839,1.018934,1.021502,1.018748,1.018452,1.020407,1.023633,1.018406,1.020265,1.018635,...,0.082314,0.087260,0.084563,0.083469,0.086823,0.090716,0.085884,0.079124,0.082201,0.012282
2,1.020177,1.018852,1.019657,1.022014,1.021480,1.018843,1.020615,1.019679,1.021483,1.020366,...,0.086665,0.084553,0.087942,0.088134,0.087042,0.085652,0.089638,0.078830,0.097809,-0.007669
3,1.016370,1.021254,1.020121,1.020826,1.024640,1.018054,1.019939,1.018973,1.022200,1.022139,...,0.076261,0.069967,0.071702,0.078302,0.080754,0.079804,0.073671,0.079196,0.071009,-0.003489
4,1.021596,1.021918,1.017197,1.022672,1.020033,1.020768,1.018041,1.018400,1.024612,1.020318,...,0.080049,0.075542,0.081946,0.079378,0.077898,0.085409,0.071605,0.073909,0.085251,0.009188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,0.945186,1.037966,1.026399,1.115456,1.385308,1.564512,1.293956,1.003946,0.926564,0.837592,...,0.136649,-0.020635,-0.051798,-0.113695,-0.054654,-0.148727,-0.178828,-0.178395,-0.148811,0.322930
3045,1.006686,1.206809,1.452206,1.487297,1.261462,0.992533,0.879638,0.873343,0.649326,0.680278,...,0.011327,-0.075889,-0.075525,-0.055996,-0.124729,-0.150101,-0.116391,-0.101789,-0.140159,-0.493072
3046,1.258910,1.515128,1.469078,1.176960,0.959597,0.930599,0.820493,0.666827,0.660159,0.646789,...,-0.122666,-0.138411,-0.121999,-0.225036,-0.409721,-0.256473,-0.239156,-0.188245,-0.161185,-0.517789
3047,0.727985,0.589146,0.777407,0.730529,0.746949,0.851375,1.060909,1.271284,1.703356,1.145747,...,-0.174029,-0.095994,-0.064758,-0.116504,-0.095521,-0.009466,0.093278,0.148427,0.269718,0.519843


In [11]:
# Mudando a unidade de medida do acelerômetro para m/s² (Basta multiplicar por g = 9.81)
# g = 9.81
g = 9.81
df_train.iloc[:,:180] = df_train.iloc[:,:180]*g
df_train

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,9.982164,10.022004,9.991999,9.989105,9.999790,10.002199,9.998735,10.008979,10.015062,10.012644,...,0.001605,-0.000460,-0.001029,-0.000238,0.002616,0.006758,0.009952,-0.005709,5,1
1,9.994811,9.995747,10.020938,9.993917,9.991015,10.010195,10.041842,9.990567,10.008804,9.992807,...,-0.001788,-0.005908,-0.010234,-0.011284,-0.009106,-0.001114,-0.001296,-0.002068,5,1
2,10.007941,9.994940,10.002836,10.025953,10.020718,9.994851,10.012237,10.003053,10.020745,10.009794,...,0.000421,0.004503,0.002473,0.001083,0.007256,0.021702,0.007641,0.009008,5,1
3,9.970586,10.018501,10.007389,10.014299,10.051722,9.987107,10.005598,9.996123,10.027783,10.027182,...,0.005918,-0.003793,-0.002657,0.001718,-0.007073,-0.012203,-0.009020,-0.008270,5,1
4,10.021855,10.025014,9.978700,10.032412,10.006522,10.013739,9.986978,9.990508,10.051446,10.009320,...,0.000288,-0.001961,0.001915,0.002949,0.007039,-0.004036,-0.003410,-0.004388,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,9.272278,10.182447,10.068972,10.942623,13.589870,15.347867,12.693709,9.848712,9.089591,8.216777,...,-0.396459,-0.545852,-0.702665,-0.833772,-0.654115,-0.355501,-0.202928,-0.085239,2,30
3045,9.875588,11.838798,14.246144,14.590387,12.374943,9.736751,8.629247,8.567498,6.369888,6.673528,...,-0.445656,-0.579521,-0.846560,-0.700226,-0.574337,-0.199706,0.069693,0.210308,2,30
3046,12.349903,14.863403,14.411655,11.545977,9.413645,9.129181,8.049041,6.541573,6.476163,6.344999,...,0.035472,0.292552,0.203059,-0.175440,0.281563,0.134684,0.217240,0.443779,2,30
3047,7.141534,5.779522,7.626360,7.166491,7.327571,8.351987,10.407518,12.471300,16.709918,11.239780,...,-0.232415,0.024815,0.065954,0.068590,0.115724,0.091088,0.309496,0.256306,2,30


## Pre processing the test data

In [12]:
# Agora vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/test/"
dados = [None]*6
sensor = [
    'Inertial Signals/total_acc_x_test.txt', 
    'Inertial Signals/total_acc_y_test.txt', 
    'Inertial Signals/total_acc_z_test.txt', 
    'Inertial Signals/body_gyro_x_test.txt', 
    'Inertial Signals/body_gyro_y_test.txt', 
    'Inertial Signals/body_gyro_z_test.txt'
]
dados = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y = np.loadtxt(pasta+'y_test.txt')
user = np.loadtxt(pasta+'subject_test.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING'
          , 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 496
WALKING_UPSTAIRS: 471
WALKING_DOWNSTAIRS: 420
SITTING: 491
STANDING: 532
LAYING: 537


In [13]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

test = []
for i in range(6):
    mat, yyy, uuuser = transformar(dados[i], y, user, i, filter_the_signal=False)
    if type(test) == list:
        test = mat
    else:
        test = np.concatenate([test, mat], axis=1)
    print(test.shape)
test = np.column_stack((test,yyy.astype(int),uuuser.astype(int)))

(1224, 60)
(1224, 120)
(1224, 180)
(1224, 240)
(1224, 300)
(1224, 360)


In [14]:
df_test = pd.DataFrame(data=test)

In [15]:
df_test.columns = colunas
df_test = df_test.astype({"activity code": int, "user": int})

# Removing the activity laying
df_test = df_test[df_test['activity code'].isin([1, 2, 3, 4, 5])]
df_test

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,1.013607,1.056876,0.988536,0.938032,0.997057,1.003726,1.001081,0.994299,0.986184,1.005377,...,0.020261,0.019462,0.031394,0.031187,0.028060,0.034830,0.026174,0.013201,5,2
1,0.992596,0.993008,0.996230,0.998100,0.991479,0.991127,0.990286,0.998678,0.995288,0.991508,...,-0.010148,-0.007775,-0.005448,-0.002262,-0.005768,-0.007097,-0.003873,-0.005801,5,2
2,0.992634,0.992242,0.989024,0.991420,0.987317,0.986897,0.992795,0.997789,0.994145,0.989724,...,-0.031461,-0.035463,-0.008491,-0.002806,0.007795,0.011203,0.003488,-0.004301,5,2
3,0.990792,0.992235,0.991534,0.993082,0.992708,0.995733,0.995692,0.992557,0.993635,0.990432,...,-0.010083,-0.005960,-0.006940,-0.009337,-0.009792,-0.001859,-0.004210,-0.000520,5,2
4,0.989140,0.988021,0.984173,0.988020,0.987089,0.987911,0.987954,0.991703,0.990024,0.986760,...,0.003836,0.001970,0.003919,0.005948,-0.002991,0.006746,0.007876,0.006598,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1219,0.747782,0.646831,0.800269,0.921769,1.030276,1.268128,1.625497,1.430911,1.062025,0.952009,...,0.117826,-0.216337,-0.498443,-0.221196,0.040862,0.122537,0.130663,0.001037,3,24
1220,0.960313,1.259010,1.269170,1.205525,1.372409,0.991980,0.810903,0.925131,0.924202,0.829489,...,-0.128649,0.097912,0.551742,0.356796,-0.128970,-0.070716,0.436967,-0.035641,2,24
1221,1.042803,0.932268,0.810695,0.946740,0.797250,0.735176,0.769677,0.850904,0.943064,1.039343,...,-0.147423,-0.101344,-0.070548,0.196283,0.036365,-0.383925,-0.499610,-0.496780,2,24
1222,0.960361,0.816116,0.641549,0.709523,0.873479,0.871080,1.033618,1.094586,1.131213,1.374917,...,-0.345504,-0.396574,-0.585145,-0.923041,-0.865991,-0.622471,-0.736700,-0.851594,2,24


In [16]:
print('Número de amostras por classe na formatação final:')
for i in np.unique(yyy):
    print(labels[i.astype(int)-1]+': '+str(np.sum(yyy==i)))

Número de amostras por classe na formatação final:
WALKING: 204
WALKING_UPSTAIRS: 189
WALKING_DOWNSTAIRS: 173
SITTING: 204
STANDING: 227
LAYING: 227


In [17]:
# Mudando a unidade de medida do acelerômetro para m/s² (Basta multiplicar por g = 9.81)
# g = 9.81
g = 9.81
df_test.iloc[:,:180] = df_test.iloc[:,:180]*g
df_test

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,9.943484,10.367951,9.697540,9.202097,9.781132,9.846554,9.820603,9.754075,9.674465,9.862748,...,0.020261,0.019462,0.031394,0.031187,0.028060,0.034830,0.026174,0.013201,5,2
1,9.737371,9.741410,9.773016,9.791364,9.726406,9.722953,9.714707,9.797028,9.763771,9.726692,...,-0.010148,-0.007775,-0.005448,-0.002262,-0.005768,-0.007097,-0.003873,-0.005801,5,2
2,9.737744,9.733894,9.702324,9.725832,9.685578,9.681457,9.739314,9.788311,9.752565,9.709193,...,-0.031461,-0.035463,-0.008491,-0.002806,0.007795,0.011203,0.003488,-0.004301,5,2
3,9.719673,9.733827,9.726947,9.742131,9.738465,9.768143,9.767742,9.736982,9.747564,9.716135,...,-0.010083,-0.005960,-0.006940,-0.009337,-0.009792,-0.001859,-0.004210,-0.000520,5,2
4,9.703464,9.692484,9.654740,9.692479,9.683348,9.691407,9.691824,9.728604,9.712137,9.680120,...,0.003836,0.001970,0.003919,0.005948,-0.002991,0.006746,0.007876,0.006598,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1219,7.335737,6.345408,7.850638,9.042558,10.107012,12.440331,15.946122,14.037241,10.418468,9.339211,...,0.117826,-0.216337,-0.498443,-0.221196,0.040862,0.122537,0.130663,0.001037,3,24
1220,9.420669,12.350890,12.450553,11.826199,13.463330,9.731320,7.954961,9.075540,9.066422,8.137290,...,-0.128649,0.097912,0.551742,0.356796,-0.128970,-0.070716,0.436967,-0.035641,2,24
1221,10.229895,9.145552,7.952917,9.287520,7.821026,7.212080,7.550534,8.347369,9.251462,10.195953,...,-0.147423,-0.101344,-0.070548,0.196283,0.036365,-0.383925,-0.499610,-0.496780,2,24
1222,9.421137,8.006098,6.293598,6.960425,8.568831,8.545294,10.139792,10.737891,11.097196,13.487932,...,-0.345504,-0.396574,-0.585145,-0.923041,-0.865991,-0.622471,-0.736700,-0.851594,2,24


In [18]:
df_train.to_csv('../../data_2/views/UCI-HAR/unbalanced_view_with_gravity_acc_9.81_train_test-resampled_20hz-v1/train.csv',index=False)
df_test.to_csv('../../data_2/views/UCI-HAR/unbalanced_view_with_gravity_acc_9.81_train_test-resampled_20hz-v1/test.csv',index=False)