# Pre processing UCI-HAR

This notebook will pre processing the dataset UCI-HAR (https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones)

1. Load the uci dataset
2. Remove the overlap from samples
3. Remove the gravity appling a ButterWorth filter
4. Resample the data from 50Hz to 20Hz
5. Change the time window to 3s
6. Change the accelerometer mensure from g to m/s²

In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go
import random

## Preprecess the data

In [2]:
from scipy import signal

def filtering(sig):
    h = signal.butter(3, .3, 'hp', fs=50, output='sos')
    zi = signal.sosfilt_zi(h) * sig[:4].mean()
    sample_filtered = signal.sosfiltfilt(h, sig)
    
    return sample_filtered

def resample(sig, time_window):
    return signal.resample(sig, 20*time_window)

def windowing(sigs, yy, uuser):
    
    # Vamos construir as janelas de 3 segundos (lembrando que o UCI-HAR está amostrado a uma taxa de 50 Hz)
    new_time_window = 3
    sample_rate = 20
    points_per_new_window = new_time_window*sample_rate
    
    aux = np.zeros(len(sigs))
    for i in range(len(sigs)):
        aux[i] = len(sigs[i])
    Na = np.sum(aux//points_per_new_window, dtype=np.int64)
    mat = np.zeros((Na,points_per_new_window))
    yyy = np.zeros(Na)
    uuuser = np.zeros(Na)
    k = 0
    for i in range(len(sigs)):
        for j in range(len(sigs[i])//points_per_new_window):
            mat[k,:] = sigs[i][j*points_per_new_window:(j+1)*points_per_new_window]
            yyy[k] = yy[i]
            uuuser[k] = uuser[i]
            k+=1
            
    return mat, yyy, uuuser;

def reconstrcut_the_signal(X, y, user, sensor, filter_the_signal=True, resample_the_signal=True):
    
    # Vamos ver quando ocorre uma nova amostra (ou seja, quando não ocorre sobreposição)
    aux = np.zeros(len(y)-1)
    for i in range(len(y)-1):
        aux[i] = np.all(X[i,64:]!=X[i+1,:64])
    ind = np.nonzero(aux)[0]+1
    ind = np.append([0],ind)
    ind = np.append(ind,[len(y)])
    
    # Agora vamos construir as capturas originais (sem sobreposição)
    sigs = []
    yy = []
    uuser = []
    for i in range(len(ind)-1):
        sig = X[ind[i]:ind[i+1],64:].reshape((ind[i+1]-ind[i])*64)
        sig = np.append(X[ind[i],:64],sig)
        
        new_sig = sig
        if sensor < 3:
            if filter_the_signal:
                new_sig = sig - filtering(sig) # Removing the gravity

        time_window = len(new_sig) // 50 # The time window is the time of the total window
        sig_resampled = resample(new_sig, time_window) # Resampling the signal

        sigs.append(sig_resampled)
        yy.append(y[ind[i]])
        uuser.append(user[ind[i]])
        
    return windowing(sigs, yy, uuser)

def train_test_split(
    df: pd.DataFrame,
    users,
    activities,
    train_size=.70,
    validation_size=.10,
    test_size=.20,
    retries: int = 10,
    ensure_distinct_users_per_dataset: bool = True,
    seed: int = 0,
):
    n_users = len(users)

    for i in range(retries):
        # [start ---> train_size)
        random.shuffle(users)
        train_users = users[0:int(n_users * train_size)]
        # [train_size --> train_size+validation_size)
        validation_users = users[
            int(n_users * train_size):
            int(n_users * (train_size + validation_size))
        ]
        # [train_size+validation_size --> end]
        test_users = users[int(n_users * (train_size + validation_size)):]
        # iterate over user's lists, filter df for users in the respective list
        all_sets = [
            df[df["user"].isin(u)]
            for u in [train_users, validation_users, test_users]
        ]

        if not ensure_distinct_users_per_dataset:
            return all_sets

        # We must guarantee that all sets contains at least 1 sample from each activities listed
        oks = [set(s["activity code"]) == set(activities) for s in all_sets]
        if all(oks):
            # If all sets contains at least 1 sample for each activity, return train, val, test sets!
            return all_sets

    raise DatasetSplitError(
        "Does not found a 3 sets that contain the respective activities!"
    )

def balance_dataset_to_minimum(dataframe: pd.DataFrame, column: str = "activity code"
) -> pd.DataFrame:
    df_list = []
    histogram = dataframe.groupby(dataframe[column], as_index=False).size()
    for c in histogram[column]:
        temp = dataframe.loc[dataframe[column] == c]
        temp = temp.sample(n=histogram["size"].min())
        df_list.append(temp)
    return pd.concat(df_list)

## Preprocess the data with filter

In [3]:
# Vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/train/"
# dados = [None]*6
sensor = [
    'Inertial Signals/total_acc_x_train.txt', 
    'Inertial Signals/total_acc_y_train.txt', 
    'Inertial Signals/total_acc_z_train.txt', 
    'Inertial Signals/body_gyro_x_train.txt', 
    'Inertial Signals/body_gyro_y_train.txt', 
    'Inertial Signals/body_gyro_z_train.txt'
]
train_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_train = np.loadtxt(pasta+'y_train.txt')
user_train = np.loadtxt(pasta+'subject_train.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_train):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_train==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1226
WALKING_UPSTAIRS: 1073
WALKING_DOWNSTAIRS: 986
SITTING: 1286
STANDING: 1374
LAYING: 1407


In [4]:
# Agora vamos abrir os dados de teste do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/test/"
test_data = [None]*6
sensor = [
    'Inertial Signals/total_acc_x_test.txt', 
    'Inertial Signals/total_acc_y_test.txt', 
    'Inertial Signals/total_acc_z_test.txt', 
    'Inertial Signals/body_gyro_x_test.txt', 
    'Inertial Signals/body_gyro_y_test.txt', 
    'Inertial Signals/body_gyro_z_test.txt'
]
test_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_test = np.loadtxt(pasta+'y_test.txt')
user_test = np.loadtxt(pasta+'subject_test.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING'
          , 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_test):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_test==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 496
WALKING_UPSTAIRS: 471
WALKING_DOWNSTAIRS: 420
SITTING: 491
STANDING: 532
LAYING: 537


In [5]:
# Concatenate all the data

data = np.concatenate([train_data, test_data], axis=1)
y = np.concatenate([y_train, y_test], axis=0)
user = np.concatenate([user_train, user_test], axis=0)

print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1722
WALKING_UPSTAIRS: 1544
WALKING_DOWNSTAIRS: 1406
SITTING: 1777
STANDING: 1906
LAYING: 1944


In [6]:
for i in range(6):
    data[i] = 9.81 * data[i]

In [7]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

df = []
for i in range(6):
    mat, yyy, uuuser = reconstrcut_the_signal(data[i], y, user, i, filter_the_signal=True)
    if type(df) == list:
        df = mat
    else:
        df = np.concatenate([df, mat], axis=1)
    print(df.shape)
df = np.column_stack((df, yyy.astype(int), uuuser.astype(int)))

(4273, 60)
(4273, 120)
(4273, 180)
(4273, 240)
(4273, 300)
(4273, 360)


In [8]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

colunas = ['accel-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['accel-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-x-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-y-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['gyro-z-' + str(x) for x in np.arange(points_per_new_window).tolist()]
colunas += ['activity code']
colunas += ['user']

In [9]:
new_data = pd.DataFrame(data=df, columns=colunas)

In [10]:
new_data = new_data.astype({"activity code": int, "user": int})

# Removing the activity laying
new_data = new_data[new_data['activity code'].isin([1, 2, 3, 4, 5])]
new_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,9.957551,9.930966,9.944786,9.943322,9.951478,9.952429,9.958903,9.960621,9.966154,9.968123,...,0.015743,-0.004514,-0.010091,-0.002338,0.025661,0.066292,0.097634,-0.056003,5,1
1,10.000102,9.999999,10.000186,10.000155,10.000398,10.000438,10.000735,10.000842,10.001191,10.001363,...,-0.017540,-0.057957,-0.100396,-0.110695,-0.089326,-0.010930,-0.012716,-0.020284,5,1
2,10.009287,10.009075,10.008854,10.008665,10.008462,10.008298,10.008116,10.007978,10.007818,10.007708,...,0.004128,0.044178,0.024261,0.010623,0.071183,0.212898,0.074963,0.088367,5,1
3,10.013070,10.013380,10.013529,10.013822,10.013946,10.014216,10.014309,10.014548,10.014603,10.014804,...,0.058059,-0.037207,-0.026062,0.016857,-0.069384,-0.119715,-0.088488,-0.081131,5,1
4,10.013306,10.013941,10.014301,10.014941,10.015295,10.015927,10.016262,10.016872,10.017174,10.017751,...,0.002824,-0.019241,0.018782,0.028931,0.069056,-0.039598,-0.033452,-0.043050,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,9.713478,9.705598,9.686378,9.679353,9.660829,9.655041,9.637606,9.633525,9.617703,9.615962,...,1.155870,-2.122261,-4.889730,-2.169935,0.400858,1.202092,1.281805,0.010177,3,24
4269,9.171248,8.487361,8.794568,8.720293,8.887723,8.874682,9.001128,9.009000,9.113583,9.129999,...,-1.262044,0.960520,5.412594,3.500165,-1.265196,-0.693728,4.286642,-0.349635,2,24
4270,9.555420,9.550196,9.549734,9.544955,9.544445,9.540000,9.539337,9.535083,9.534126,9.529892,...,-1.446219,-0.994187,-0.692071,1.925532,0.356736,-3.766303,-4.901173,-4.873415,2,24
4271,9.467395,9.471047,9.471082,9.474852,9.474780,9.478589,9.478308,9.482037,9.481432,9.484980,...,-3.389393,-3.890389,-5.740271,-9.055033,-8.495376,-6.106444,-7.227027,-8.354134,2,24


In [11]:
# new_data[new_data['activity code'].isin([4])]
# labels = new_data['activity code'].unique()
# users = new_data['user'].unique()

# minimo = 1000
# maximo = 0
# bad_user = []
# statistic_info = {f'user-{user}': {f'activity-{activity}': None for activity in labels} for user in users}
# for user in users:
#     df_user = new_data[new_data['user'].isin([user])]
#     for activity in labels:
#         df_activity = df_user[df_user['activity code'].isin([activity])]
#         n_samples = df_activity.shape[0]
#         statistic_info[f'user-{user}'][f'activity-{activity}'] = n_samples
#         minimo = n_samples if n_samples < minimo else minimo
#         maximo = n_samples if n_samples > maximo else maximo
#         if n_samples == 0:
#             bad_user.append(user)
        
# statistic_info

In [12]:
activities = new_data['activity code'].unique()
users = new_data['user'].unique()

new_data = balance_dataset_to_minimum(new_data)
train, val, test = train_test_split(new_data, users, activities)

In [13]:
new_data.shape

(2840, 362)

In [14]:
train['user'].unique(), val['user'].unique(), test['user'].unique()

(array([12,  1, 11, 22, 30,  2, 28,  7,  5, 26, 17,  4,  3, 27, 23,  8, 13,
        10, 24, 20, 14]),
 array([21, 19]),
 array([29, 25,  6, 15,  9, 18, 16]))

In [15]:
train.shape, val.shape, test.shape

((1967, 362), (204, 362), (669, 362))

In [16]:
train.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_filtered_acc_9.81_train_test-v1/train.csv',
             index=False)
val.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_filtered_acc_9.81_train_test-v1/validation.csv',
           index=False)
test.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_filtered_acc_9.81_train_test-v1/test.csv',
            index=False)

## Preprocess the data with gravity

In [17]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

df = []
for i in range(6):
    mat, yyy, uuuser = reconstrcut_the_signal(data[i], y, user, i, filter_the_signal=False)
    if type(df) == list:
        df = mat
    else:
        df = np.concatenate([df, mat], axis=1)
    print(df.shape)
df = np.column_stack((df, yyy.astype(int), uuuser.astype(int)))

(4273, 60)
(4273, 120)
(4273, 180)
(4273, 240)
(4273, 300)
(4273, 360)


In [18]:
new_data = pd.DataFrame(data=df, columns=colunas)

In [19]:
new_data = new_data.astype({"activity code": int, "user": int})

# Removing the activity laying
new_data = new_data[new_data['activity code'].isin([1, 2, 3, 4, 5])]
new_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,9.982164,10.022004,9.991999,9.989105,9.999790,10.002199,9.998735,10.008979,10.015062,10.012644,...,0.015743,-0.004514,-0.010091,-0.002338,0.025661,0.066292,0.097634,-0.056003,5,1
1,9.994811,9.995747,10.020938,9.993917,9.991015,10.010195,10.041842,9.990567,10.008804,9.992807,...,-0.017540,-0.057957,-0.100396,-0.110695,-0.089326,-0.010930,-0.012716,-0.020284,5,1
2,10.007941,9.994940,10.002836,10.025953,10.020718,9.994851,10.012237,10.003053,10.020745,10.009794,...,0.004128,0.044178,0.024261,0.010623,0.071183,0.212898,0.074963,0.088367,5,1
3,9.970586,10.018501,10.007389,10.014299,10.051722,9.987107,10.005598,9.996123,10.027783,10.027182,...,0.058059,-0.037207,-0.026062,0.016857,-0.069384,-0.119715,-0.088488,-0.081131,5,1
4,10.021855,10.025014,9.978700,10.032412,10.006522,10.013739,9.986978,9.990508,10.051446,10.009320,...,0.002824,-0.019241,0.018782,0.028931,0.069056,-0.039598,-0.033452,-0.043050,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,7.335737,6.345408,7.850638,9.042558,10.107012,12.440331,15.946122,14.037241,10.418468,9.339211,...,1.155870,-2.122261,-4.889730,-2.169935,0.400858,1.202092,1.281805,0.010177,3,24
4269,9.420669,12.350890,12.450553,11.826199,13.463330,9.731320,7.954961,9.075540,9.066422,8.137290,...,-1.262044,0.960520,5.412594,3.500165,-1.265196,-0.693728,4.286642,-0.349635,2,24
4270,10.229895,9.145552,7.952917,9.287520,7.821026,7.212080,7.550534,8.347369,9.251462,10.195953,...,-1.446219,-0.994187,-0.692071,1.925532,0.356736,-3.766303,-4.901173,-4.873415,2,24
4271,9.421137,8.006098,6.293598,6.960425,8.568831,8.545294,10.139792,10.737891,11.097196,13.487932,...,-3.389393,-3.890389,-5.740271,-9.055033,-8.495376,-6.106444,-7.227027,-8.354134,2,24


In [20]:
labels = new_data['activity code'].unique()
users = new_data['user'].unique()

new_data = balance_dataset_to_minimum(new_data)
train, val, test = train_test_split(new_data, users, activities)

In [21]:
new_data.shape

(2840, 362)

In [22]:
train['user'].unique(), val['user'].unique(), test['user'].unique()

(array([23, 15, 16, 11,  8,  3,  4, 26,  6,  1, 30, 24, 28, 21,  5, 22, 12,
         2,  7, 20, 13]),
 array([29, 18]),
 array([19, 14,  9, 25, 17, 27, 10]))

In [23]:
train.shape, val.shape, test.shape

((1982, 362), (201, 362), (657, 362))

In [24]:
train.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_gravity_acc_9.81_train_test-v1/train.csv',
             index=False)
val.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_gravity_acc_9.81_train_test-v1/validation.csv',
           index=False)
test.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_gravity_acc_9.81_train_test-v1/test.csv',
            index=False)

## Preprocess the data without gravity

In [25]:
# Vamos abrir os dados de treino do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/train/"
# dados = [None]*6
sensor = [
    'Inertial Signals/body_acc_x_train.txt', 
    'Inertial Signals/body_acc_y_train.txt', 
    'Inertial Signals/body_acc_z_train.txt', 
    'Inertial Signals/body_gyro_x_train.txt', 
    'Inertial Signals/body_gyro_y_train.txt', 
    'Inertial Signals/body_gyro_z_train.txt'
]
train_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_train = np.loadtxt(pasta+'y_train.txt')
user_train = np.loadtxt(pasta+'subject_train.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING', 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_train):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_train==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 1226
WALKING_UPSTAIRS: 1073
WALKING_DOWNSTAIRS: 986
SITTING: 1286
STANDING: 1374
LAYING: 1407


In [26]:
# Agora vamos abrir os dados de teste do UCI-HAR
pasta = "../../../../../Downloads/UCI HAR Dataset/UCI HAR Dataset/test/"
test_data = [None]*6
sensor = [
    'Inertial Signals/body_acc_x_test.txt', 
    'Inertial Signals/body_acc_y_test.txt', 
    'Inertial Signals/body_acc_z_test.txt', 
    'Inertial Signals/body_gyro_x_test.txt', 
    'Inertial Signals/body_gyro_y_test.txt', 
    'Inertial Signals/body_gyro_z_test.txt'
]
test_data = [np.loadtxt(pasta+sensor[i]) for i in range(6)]
                    
y_test = np.loadtxt(pasta+'y_test.txt')
user_test = np.loadtxt(pasta+'subject_test.txt')
labels = ['WALKING', 'WALKING_UPSTAIRS', 'WALKING_DOWNSTAIRS', 'SITTING', 'STANDING'
          , 'LAYING']
print('Número de amostras por classe \npara janelas de 2,56 segundos e sobreposição de 50%')
for i in np.unique(y_test):
    print(labels[i.astype(int)-1]+': '+str(np.sum(y_test==i)))

Número de amostras por classe 
para janelas de 2,56 segundos e sobreposição de 50%
WALKING: 496
WALKING_UPSTAIRS: 471
WALKING_DOWNSTAIRS: 420
SITTING: 491
STANDING: 532
LAYING: 537


In [27]:
for i in range(6):
    data[i] = 9.81 * data[i]

In [28]:
new_time_window = 3
sample_rate = 20
points_per_new_window = new_time_window*sample_rate

df = []
for i in range(6):
    mat, yyy, uuuser = reconstrcut_the_signal(data[i], y, user, i, filter_the_signal=False)
    if type(df) == list:
        df = mat
    else:
        df = np.concatenate([df, mat], axis=1)
    print(df.shape)
df = np.column_stack((df, yyy.astype(int), uuuser.astype(int)))

(4273, 60)
(4273, 120)
(4273, 180)
(4273, 240)
(4273, 300)
(4273, 360)


In [29]:
new_data = pd.DataFrame(data=df, columns=colunas)

In [30]:
new_data = new_data.astype({"activity code": int, "user": int})

# Removing the activity laying
new_data = new_data[new_data['activity code'].isin([1, 2, 3, 4, 5])]
new_data

Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,accel-x-9,...,gyro-z-52,gyro-z-53,gyro-z-54,gyro-z-55,gyro-z-56,gyro-z-57,gyro-z-58,gyro-z-59,activity code,user
0,97.925030,98.315863,98.021510,97.993123,98.097941,98.121575,98.087589,98.188081,98.247758,98.224035,...,0.154441,-0.044280,-0.098990,-0.022935,0.251730,0.650328,0.957785,-0.549387,5,1
1,98.049099,98.058281,98.305398,98.040321,98.011853,98.200010,98.510469,98.007457,98.186365,98.029433,...,-0.172072,-0.568556,-0.984889,-1.085920,-0.876289,-0.107223,-0.124748,-0.198981,5,1
2,98.177897,98.050362,98.127824,98.354600,98.303243,98.049486,98.220043,98.129948,98.303508,98.196075,...,0.040500,0.433390,0.238000,0.104210,0.698307,2.088529,0.735385,0.866876,5,1
3,97.811449,98.281500,98.172487,98.240272,98.607392,97.973516,98.154920,98.061971,98.372555,98.366658,...,0.569562,-0.365003,-0.255668,0.165370,-0.680656,-1.174403,-0.868068,-0.795894,5,1
4,98.314399,98.345385,97.891048,98.417963,98.163978,98.234775,97.972253,98.006879,98.604686,98.191428,...,0.027700,-0.188750,0.184254,0.283812,0.677438,-0.388454,-0.328162,-0.422325,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,71.963581,62.248451,77.014762,88.707493,99.149784,122.039645,156.431456,137.705333,102.205169,91.617656,...,11.339082,-20.819384,-47.968247,-21.287063,3.932418,11.792520,12.574504,0.099835,3,24
4269,92.416764,121.162230,122.139924,116.015015,132.075268,95.464252,78.038170,89.031043,88.941598,79.826815,...,-12.380654,9.422702,53.097544,34.336616,-12.411571,-6.805474,42.051956,-3.429920,2,24
4270,100.355267,89.717870,78.018116,91.110574,76.724266,70.750507,74.070739,81.887688,90.756845,100.022302,...,-14.187409,-9.752971,-6.789220,18.889465,3.499581,-36.947431,-48.080504,-47.808201,2,24
4271,92.421349,78.539822,61.740196,68.281773,84.060235,83.829335,99.471358,105.338709,108.863496,132.316611,...,-33.249940,-38.164717,-56.312061,-88.829877,-83.339636,-59.904214,-70.897131,-81.954054,2,24


In [31]:
labels = new_data['activity code'].unique()
users = new_data['user'].unique()

new_data = balance_dataset_to_minimum(new_data)
train, val, test = train_test_split(new_data, users, activities)

In [32]:
new_data.shape

(2840, 362)

In [33]:
train['user'].unique(), val['user'].unique(), test['user'].unique()

(array([13, 15, 27,  6, 11, 26, 25,  9, 22,  1,  2,  3, 16, 14, 24, 30,  4,
        10, 29, 21, 19]),
 array([12, 18]),
 array([20,  5, 28, 17,  8,  7, 23]))

In [34]:
train.shape, val.shape, test.shape

((2000, 362), (189, 362), (651, 362))

In [35]:
train.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_without_gravity_acc_9.81_train_test-v1/train.csv',
             index=False)
val.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_without_gravity_acc_9.81_train_test-v1/validation.csv',
           index=False)
test.to_csv('../../data_2/raw_data/UCI-HAR/balanced_view_without_gravity_acc_9.81_train_test-v1/test.csv',
            index=False)