# Pre processing UCI-HAR

This notebook will pre processing the dataset UCI-HAR (https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones)

1. Load the uci dataset
2. Change the accelerometer mensure from g to m/s²
3. Remove the overlap from samples
4. Remove the gravity appling a ButterWorth filter
5. Resample the data from 50Hz to 20Hz
6. Change the time window to 3s

In [1]:
import numpy as np
import random
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go
import random

## Preprecess the data

In [2]:
from scipy import signal

def filtering(sig):
    h = signal.butter(3, .3, 'hp', fs=50, output='sos')
    zi = signal.sosfilt_zi(h) * sig[:4].mean()
    sample_filtered, zo = signal.sosfilt(h, sig[:], zi=zi)
#     sample_filtered = signal.sosfiltfilt(h, sig)
    
    return sample_filtered

def resample(sig, time_window):
    return signal.resample(sig, 20*time_window)

def windowing(sigs, yy, uuser):
    
    # Vamos construir as janelas de 3 segundos (lembrando que o UCI-HAR está amostrado a uma taxa de 50 Hz)
    new_time_window = 3
    sample_rate = 20
    points_per_new_window = new_time_window*sample_rate
    
    aux = np.zeros(len(sigs))
    for i in range(len(sigs)):
        aux[i] = len(sigs[i])
    Na = np.sum(aux//points_per_new_window, dtype=np.int64)
    mat = np.zeros((Na,points_per_new_window))
    yyy = np.zeros(Na)
    uuuser = np.zeros(Na)
    k = 0
    for i in range(len(sigs)):
        for j in range(len(sigs[i])//points_per_new_window):
            mat[k,:] = sigs[i][j*points_per_new_window:(j+1)*points_per_new_window]
            yyy[k] = yy[i]
            uuuser[k] = uuser[i]
            k+=1
            
    return mat, yyy, uuuser;

def reconstrcut_the_signal(X, y, user, sensor, filter_the_signal=True, resample_the_signal=True):
    
    # Vamos ver quando ocorre uma nova amostra (ou seja, quando não ocorre sobreposição)
    aux = np.zeros(len(y)-1)
    for i in range(len(y)-1):
        aux[i] = np.all(X[i,64:]!=X[i+1,:64])
    ind = np.nonzero(aux)[0]+1
    ind = np.append([0],ind)
    ind = np.append(ind,[len(y)])
    
    # Agora vamos construir as capturas originais (sem sobreposição)
    sigs = []
    yy = []
    uuser = []
    for i in range(len(ind)-1):
        sig = X[ind[i]:ind[i+1],64:].reshape((ind[i+1]-ind[i])*64)
        sig = np.append(X[ind[i],:64],sig)
        
        new_sig = sig
        if sensor < 3:
            if filter_the_signal:
                new_sig = filtering(sig) # Removing the gravity

        time_window = len(new_sig) // 50 # The time window is the time of the total window
        sig_resampled = resample(new_sig, time_window) # Resampling the signal

        sigs.append(sig_resampled)
        yy.append(y[ind[i]])
        uuser.append(user[ind[i]])
        
    return windowing(sigs, yy, uuser)

def train_test_split(
    df: pd.DataFrame,
    users,
    activities,
    train_size=.70,
    validation_size=.10,
    test_size=.20,
    retries: int = 10,
    ensure_distinct_users_per_dataset: bool = True,
    seed: int = 0,
):
    n_users = len(users)
    random.seed(0)
    np.random.seed(0)
    
    for i in range(retries):
        # [start ---> train_size)
        random.shuffle(users)
        train_users = users[0:int(n_users * train_size)]
        # [train_size --> train_size+validation_size)
        validation_users = users[
            int(n_users * train_size):
            int(n_users * (train_size + validation_size))
        ]
        # [train_size+validation_size --> end]
        test_users = users[int(n_users * (train_size + validation_size)):]
        # iterate over user's lists, filter df for users in the respective list
        all_sets = [
            df[df["user"].isin(u)]
            for u in [train_users, validation_users, test_users]
        ]

        if not ensure_distinct_users_per_dataset:
            return all_sets

        # We must guarantee that all sets contains at least 1 sample from each activities listed
        oks = [set(s["activity code"]) == set(activities) for s in all_sets]
        if all(oks):
            # If all sets contains at least 1 sample for each activity, return train, val, test sets!
            return all_sets

    raise DatasetSplitError(
        "Does not found a 3 sets that contain the respective activities!"
    )

def balance_dataset_to_minimum(dataframe: pd.DataFrame, column: str = "activity code"
) -> pd.DataFrame:
    df_list = []
    random.seed(0)
    np.random.seed(0)
    
    histogram = dataframe.groupby(dataframe[column], as_index=False).size()
    for c in histogram[column]:
        temp = dataframe.loc[dataframe[column] == c]
        temp = temp.sample(n=histogram["size"].min())
        df_list.append(temp)
    return pd.concat(df_list)

## Preprocess the data with filter

In [3]:
# Vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/train/"
# dados = [None]*6
sensor = [
    'Inertial Signals/total_acc_x_train.txt', 
    'Inertial Signals/total_acc_y_train.txt', 
    'Inertial Signals/total_acc_z_train.txt', 
    'Inertial Signals/body_gyro_x_train.txt', 
    'Inertial Signals/body_gyro_y_train.txt', 
    'Inertial Signals/body_gyro_z_train.txt'
]
train_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_train = np.loadtxt(pasta+'y_train.txt')
user_train = np.loadtxt(pasta+'subject_train.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_train):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_train==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1226
WALKING_UPSTAIRS: 1073
WALKING_DOWNSTAIRS: 986
SITTING: 1286
STANDING: 1374
LAYING: 1407


In [4]:
# Agora vamos abrir os dados de teste do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/test/"
test_data = [None]*6
sensor = [
    'Inertial Signals/total_acc_x_test.txt', 
    'Inertial Signals/total_acc_y_test.txt', 
    'Inertial Signals/total_acc_z_test.txt', 
    'Inertial Signals/body_gyro_x_test.txt', 
    'Inertial Signals/body_gyro_y_test.txt', 
    'Inertial Signals/body_gyro_z_test.txt'
]
test_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_test = np.loadtxt(pasta+'y_test.txt')
user_test = np.loadtxt(pasta+'subject_test.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING'
          , 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_test):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_test==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 496
WALKING_UPSTAIRS: 471
WALKING_DOWNSTAIRS: 420
SITTING: 491
STANDING: 532
LAYING: 537


In [5]:
# Concatenate all the data

data = np.concatenate([train_data, test_data], axis=1)
y = np.concatenate([y_train, y_test], axis=0)
user = np.concatenate([user_train, user_test], axis=0)

print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1722
WALKING_UPSTAIRS: 1544
WALKING_DOWNSTAIRS: 1406
SITTING: 1777
STANDING: 1906
LAYING: 1944


In [6]:
for i in range(3):
    data[i] = 9.81 * data[i]

In [7]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

df = []
for i in range(6):
    mat, yyy, uuuser = reconstrcut_the_signal(data[i], y, user, i, filter_the_signal=True)
    if type(df) == list:
        df = mat
    else:
        df = np.concatenate([df, mat], axis=1)
    print(df.shape)
df = np.column_stack((df, yyy.astype(int), uuuser.astype(int)))

(4273, 60)
(4273, 120)
(4273, 180)
(4273, 240)
(4273, 300)
(4273, 360)


In [8]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

colunas = ['accel-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['activity code']
colunas += ['user']

In [9]:
new_data = pd.DataFrame(data=df, columns=colunas)

In [10]:
new_data = new_data.astype({"activity code": int, "user": int})

# Removing the activity laying
new_data = new_data[new_data['activity code'].isin([1, 2, 3, 4, 5])]
new_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,-0.014815,0.027706,-0.006349,-0.006567,0.003480,0.005404,0.000741,0.010252,0.013117,0.008212,...,0.001605,-0.000460,-0.001029,-0.000238,0.002616,0.006758,0.009952,-0.005709,5,1
1,-0.007095,-0.003690,0.018640,-0.009081,-0.010651,0.009704,0.035148,-0.017942,0.001335,-0.014070,...,-0.001788,-0.005908,-0.010234,-0.011284,-0.009106,-0.001114,-0.001296,-0.002068,5,1
2,0.008302,-0.003804,0.005953,0.027194,0.018181,-0.007453,0.010778,0.000902,0.018179,0.004500,...,0.000421,0.004503,0.002473,0.001083,0.007256,0.021702,0.007641,0.009008,5,1
3,-0.041830,0.009893,-0.003540,0.005166,0.036491,-0.028931,-0.006925,-0.014687,0.017140,0.012921,...,0.005918,-0.003793,-0.002657,0.001718,-0.007073,-0.012203,-0.009020,-0.008270,5,1
4,0.007449,0.007046,-0.036597,0.018159,-0.010364,-0.002097,-0.027893,-0.018713,0.039114,-0.007774,...,0.000288,-0.001961,0.001915,0.002949,0.007039,-0.004036,-0.003410,-0.004388,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,-2.211944,-2.633802,-0.641079,0.656763,1.651677,3.576755,6.128206,3.106753,-0.808398,-1.783122,...,0.117826,-0.216337,-0.498443,-0.221196,0.040862,0.122537,0.130663,0.001037,3,24
4269,-0.753641,1.198333,1.284460,0.363897,1.833448,-2.075769,-3.225848,-1.695445,-1.321961,-1.924049,...,-0.128649,0.097912,0.551742,0.356796,-0.128970,-0.070716,0.436967,-0.035641,2,24
4270,-0.384508,-1.402681,-2.295513,-0.735959,-2.003355,-2.186823,-1.462221,-0.421771,0.552248,1.368636,...,-0.147423,-0.101344,-0.070548,0.196283,0.036365,-0.383925,-0.499610,-0.496780,2,24
4271,-0.560462,-1.839220,-3.059907,-1.861840,-0.015292,0.033238,1.584308,1.870023,1.941111,3.764954,...,-0.345504,-0.396574,-0.585145,-0.923041,-0.865991,-0.622471,-0.736700,-0.851594,2,24


In [11]:
# new_data[new_data['activity code'].isin([4])]
# labels = new_data['activity code'].unique()
# users = new_data['user'].unique()

# minimo = 1000
# maximo = 0
# bad_user = []
# statistic_info = {f'user-{user}': {f'activity-{activity}': None for activity in labels} for user in users}
# for user in users:
#     df_user = new_data[new_data['user'].isin([user])]
#     for activity in labels:
#         df_activity = df_user[df_user['activity code'].isin([activity])]
#         n_samples = df_activity.shape[0]
#         statistic_info[f'user-{user}'][f'activity-{activity}'] = n_samples
#         minimo = n_samples if n_samples < minimo else minimo
#         maximo = n_samples if n_samples > maximo else maximo
#         if n_samples == 0:
#             bad_user.append(user)
        
# statistic_info

In [12]:
activities = new_data['activity code'].unique()
users = new_data['user'].unique()
print(users)
new_data = balance_dataset_to_minimum(new_data)
train, val, test = train_test_split(new_data, users, activities)

[ 1  3  5  6  7  8 11 14 15 16 17 19 21 22 23 25 26 27 28 29 30  2  4  9
 10 12 13 18 20 24]


In [13]:
new_data.shape

(2840, 362)

In [14]:
train['user'].unique(), val['user'].unique(), test['user'].unique()

(array([23,  8, 16, 29,  6, 17, 28, 24,  1, 14,  2, 13, 19,  9,  4, 11, 12,
        27,  5, 30,  7]),
 array([25, 20]),
 array([ 3, 26, 18, 21, 10, 15, 22]))

In [15]:
train.shape, val.shape, test.shape

((1952, 362), (217, 362), (671, 362))

In [16]:
train.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_filtered_acc_9.81_train_test-v1/train.csv',
             index=False)
val.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_filtered_acc_9.81_train_test-v1/validation.csv',
           index=False)
test.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_filtered_acc_9.81_train_test-v1/test.csv',
            index=False)

## Preprocess the data with gravity

In [17]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

df = []
for i in range(6):
    mat, yyy, uuuser = reconstrcut_the_signal(data[i], y, user, i, filter_the_signal=False)
    if type(df) == list:
        df = mat
    else:
        df = np.concatenate([df, mat], axis=1)
    print(df.shape)
df = np.column_stack((df, yyy.astype(int), uuuser.astype(int)))

(4273, 60)
(4273, 120)
(4273, 180)
(4273, 240)
(4273, 300)
(4273, 360)


In [18]:
new_data = pd.DataFrame(data=df, columns=colunas)

In [19]:
new_data = new_data.astype({"activity code": int, "user": int})

# Removing the activity laying
new_data = new_data[new_data['activity code'].isin([1, 2, 3, 4, 5])]
new_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,9.982164,10.022004,9.991999,9.989105,9.999790,10.002199,9.998735,10.008979,10.015062,10.012644,...,0.001605,-0.000460,-0.001029,-0.000238,0.002616,0.006758,0.009952,-0.005709,5,1
1,9.994811,9.995747,10.020938,9.993917,9.991015,10.010195,10.041842,9.990567,10.008804,9.992807,...,-0.001788,-0.005908,-0.010234,-0.011284,-0.009106,-0.001114,-0.001296,-0.002068,5,1
2,10.007941,9.994940,10.002836,10.025953,10.020718,9.994851,10.012237,10.003053,10.020745,10.009794,...,0.000421,0.004503,0.002473,0.001083,0.007256,0.021702,0.007641,0.009008,5,1
3,9.970586,10.018501,10.007389,10.014299,10.051722,9.987107,10.005598,9.996123,10.027783,10.027182,...,0.005918,-0.003793,-0.002657,0.001718,-0.007073,-0.012203,-0.009020,-0.008270,5,1
4,10.021855,10.025014,9.978700,10.032412,10.006522,10.013739,9.986978,9.990508,10.051446,10.009320,...,0.000288,-0.001961,0.001915,0.002949,0.007039,-0.004036,-0.003410,-0.004388,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,7.335737,6.345408,7.850638,9.042558,10.107012,12.440331,15.946122,14.037241,10.418468,9.339211,...,0.117826,-0.216337,-0.498443,-0.221196,0.040862,0.122537,0.130663,0.001037,3,24
4269,9.420669,12.350890,12.450553,11.826199,13.463330,9.731320,7.954961,9.075540,9.066422,8.137290,...,-0.128649,0.097912,0.551742,0.356796,-0.128970,-0.070716,0.436967,-0.035641,2,24
4270,10.229895,9.145552,7.952917,9.287520,7.821026,7.212080,7.550534,8.347369,9.251462,10.195953,...,-0.147423,-0.101344,-0.070548,0.196283,0.036365,-0.383925,-0.499610,-0.496780,2,24
4271,9.421137,8.006098,6.293598,6.960425,8.568831,8.545294,10.139792,10.737891,11.097196,13.487932,...,-0.345504,-0.396574,-0.585145,-0.923041,-0.865991,-0.622471,-0.736700,-0.851594,2,24


In [20]:
labels = new_data['activity code'].unique()
users = new_data['user'].unique()
print(users)
new_data = balance_dataset_to_minimum(new_data)
train, val, test = train_test_split(new_data, users, activities)

[ 1  3  5  6  7  8 11 14 15 16 17 19 21 22 23 25 26 27 28 29 30  2  4  9
 10 12 13 18 20 24]


In [21]:
new_data.shape

(2840, 362)

In [22]:
train['user'].unique(), val['user'].unique(), test['user'].unique()

(array([23,  8, 16, 29,  6, 17, 28, 24,  1, 14,  2, 13, 19,  9,  4, 11, 12,
        27,  5, 30,  7]),
 array([25, 20]),
 array([ 3, 26, 18, 21, 10, 15, 22]))

In [23]:
train.shape, val.shape, test.shape

((1952, 362), (217, 362), (671, 362))

In [24]:
train.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_gravity_acc_9.81_train_test-v1/train.csv',
             index=False)
val.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_gravity_acc_9.81_train_test-v1/validation.csv',
           index=False)
test.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_gravity_acc_9.81_train_test-v1/test.csv',
            index=False)

## Preprocess the data without gravity

In [25]:
# Vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/train/"
# dados = [None]*6
sensor = [
    'Inertial Signals/body_acc_x_train.txt', 
    'Inertial Signals/body_acc_y_train.txt', 
    'Inertial Signals/body_acc_z_train.txt', 
    'Inertial Signals/body_gyro_x_train.txt', 
    'Inertial Signals/body_gyro_y_train.txt', 
    'Inertial Signals/body_gyro_z_train.txt'
]
train_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_train = np.loadtxt(pasta+'y_train.txt')
user_train = np.loadtxt(pasta+'subject_train.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_train):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_train==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1226
WALKING_UPSTAIRS: 1073
WALKING_DOWNSTAIRS: 986
SITTING: 1286
STANDING: 1374
LAYING: 1407


In [26]:
# Agora vamos abrir os dados de teste do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/test/"
test_data = [None]*6
sensor = [
    'Inertial Signals/body_acc_x_test.txt', 
    'Inertial Signals/body_acc_y_test.txt', 
    'Inertial Signals/body_acc_z_test.txt', 
    'Inertial Signals/body_gyro_x_test.txt', 
    'Inertial Signals/body_gyro_y_test.txt', 
    'Inertial Signals/body_gyro_z_test.txt'
]
test_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_test = np.loadtxt(pasta+'y_test.txt')
user_test = np.loadtxt(pasta+'subject_test.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING'
          , 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_test):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_test==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 496
WALKING_UPSTAIRS: 471
WALKING_DOWNSTAIRS: 420
SITTING: 491
STANDING: 532
LAYING: 537


In [27]:
# Concatenate all the data

data = np.concatenate([train_data, test_data], axis=1)
y = np.concatenate([y_train, y_test], axis=0)
user = np.concatenate([user_train, user_test], axis=0)

print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1722
WALKING_UPSTAIRS: 1544
WALKING_DOWNSTAIRS: 1406
SITTING: 1777
STANDING: 1906
LAYING: 1944


In [28]:
for i in range(6):
    data[i] = 9.81 * data[i]

In [29]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

df = []
for i in range(6):
    mat, yyy, uuuser = reconstrcut_the_signal(data[i], y, user, i, filter_the_signal=False)
    if type(df) == list:
        df = mat
    else:
        df = np.concatenate([df, mat], axis=1)
    print(df.shape)
df = np.column_stack((df, yyy.astype(int), uuuser.astype(int)))

(4273, 60)
(4273, 120)
(4273, 180)
(4273, 240)
(4273, 300)
(4273, 360)


In [30]:
new_data = pd.DataFrame(data=df, columns=colunas)

In [31]:
new_data = new_data.astype({"activity code": int, "user": int})

# Removing the activity laying
new_data = new_data[new_data['activity code'].isin([1, 2, 3, 4, 5])]
new_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,0.023421,0.092810,0.051748,0.053044,0.058151,0.062064,0.054399,0.065013,0.067435,0.064698,...,0.015743,-0.004514,-0.010091,-0.002338,0.025661,0.066292,0.097634,-0.056003,5,1
1,-0.006615,-0.005355,0.019899,-0.006828,-0.009687,0.009754,0.041427,-0.009636,0.008558,-0.007307,...,-0.017540,-0.057957,-0.100396,-0.110695,-0.089326,-0.010930,-0.012716,-0.020284,5,1
2,-0.006539,-0.019581,-0.011634,0.011580,0.006525,-0.019151,-0.001512,-0.010446,0.007561,-0.003103,...,0.004128,0.044178,0.024261,0.010623,0.071183,0.212898,0.074963,0.088367,5,1
3,-0.037877,0.009880,-0.001231,0.005519,0.042951,-0.021841,-0.003378,-0.013059,0.018568,0.017769,...,0.058059,-0.037207,-0.026062,0.016857,-0.069384,-0.119715,-0.088488,-0.081131,5,1
4,0.013536,0.016908,-0.028985,0.024807,-0.000772,0.006416,-0.020137,-0.016722,0.044381,0.002091,...,0.002824,-0.019241,0.018782,0.028931,0.069056,-0.039598,-0.033452,-0.043050,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,-2.600420,-3.595345,-2.089147,-0.890123,0.188650,2.541224,6.069649,4.180803,0.576008,-0.497670,...,1.155870,-2.122261,-4.889730,-2.169935,0.400858,1.202092,1.281805,0.010177,3,24
4269,-0.178847,2.702237,2.820533,2.186331,3.826064,0.078957,-1.708842,-0.611605,-0.639540,-1.594183,...,-1.262044,0.960520,5.412594,3.500165,-1.265196,-0.693728,4.286642,-0.349635,2,24
4270,0.707076,-0.374569,-1.568337,-0.237794,-1.709518,-2.324260,-1.989979,-1.194680,-0.288309,0.661777,...,-1.446219,-0.994187,-0.692071,1.925532,0.356736,-3.766303,-4.901173,-4.873415,2,24
4271,-0.016146,-1.436414,-3.155442,-2.493840,-0.887867,-0.909666,0.690148,1.297057,1.666583,4.067945,...,-3.389393,-3.890389,-5.740271,-9.055033,-8.495376,-6.106444,-7.227027,-8.354134,2,24


In [32]:
labels = new_data['activity code'].unique()
users = new_data['user'].unique()
print(users)
new_data = balance_dataset_to_minimum(new_data)
train, val, test = train_test_split(new_data, users, activities)

[ 1  3  5  6  7  8 11 14 15 16 17 19 21 22 23 25 26 27 28 29 30  2  4  9
 10 12 13 18 20 24]


In [33]:
new_data.shape

(2840, 362)

In [34]:
train['user'].unique(), val['user'].unique(), test['user'].unique()

(array([23,  8, 16, 29,  6, 17, 28, 24,  1, 14,  2, 13, 19,  9,  4, 11, 12,
        27,  5, 30,  7]),
 array([25, 20]),
 array([ 3, 26, 18, 21, 10, 15, 22]))

In [35]:
train.shape, val.shape, test.shape

((1952, 362), (217, 362), (671, 362))

In [36]:
train.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_without_gravity_acc_9.81_train_test-v1/train.csv',
             index=False)
val.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_without_gravity_acc_9.81_train_test-v1/validation.csv',
           index=False)
test.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_without_gravity_acc_9.81_train_test-v1/test.csv',
            index=False)

# Teste

(array([23,  8, 16, 29,  6, 17, 28, 24,  1, 14,  2, 13, 19,  9,  4, 11, 12,
        27,  5, 30,  7]),
        
 array([25, 20]),
 
 array([ 3, 26, 18, 21, 10, 15, 22]))

In [37]:
1952+217+671

2840

In [38]:
train.shape[0]+val.shape[0]+test.shape[0]

2840

In [39]:
train['activity code'].unique()

array([1, 2, 3, 4, 5])