<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# Required imports
import datetime
import os
import numpy as np
import pandas as pd
import gzip
import glob
import pickle
import copy
import math
from io import StringIO
import importlib.machinery

In [2]:
# Directory that holds the per-user files, and the user whose file is loaded first as a sample.
prefix = "dataset/Extrasensory_uuid_fl_uTAR/"
user_sample = "3600D531-0C55-44A7-AE95-A7A38519464E.features_labels"

In [3]:
# Parsers for the header and the body of the per-user CSV files
def parse_header_of_csv(csv_str):
    """Extract the feature and label column names from the CSV header line.

    The header is expected to be laid out as
    ``timestamp, <feature columns...>, label:<name>..., label_source``.

    Args:
        csv_str: The CSV text. Only its first line is inspected, so a
            header-only string without a trailing newline is accepted too.

    Returns:
        A tuple ``(feature_names, label_names)`` of two lists of strings.
        Label names are returned without their ``'label:'`` prefix.

    Raises:
        AssertionError: If the first column is not ``'timestamp'``, the last
            column is not ``'label_source'``, or a column between the first
            label and ``label_source`` lacks the ``'label:'`` prefix.
        ValueError: If the header contains no ``'label:'`` column at all.
    """
    label_prefix = 'label:'

    # Isolate the headline columns (tolerating a missing newline or a CRLF ending):
    headline = csv_str.partition('\n')[0].rstrip('\r')
    columns = headline.split(',')

    # The first column should be timestamp:
    assert columns[0] == 'timestamp'
    # The last column should be label_source:
    assert columns[-1] == 'label_source'

    # Search for the column of the first label. Without the explicit check a
    # label-less header would surface as an unrelated UnboundLocalError below.
    first_label_ind = next(
        (ci for (ci, col) in enumerate(columns) if col.startswith(label_prefix)),
        None,
    )
    if first_label_ind is None:
        raise ValueError("CSV header contains no 'label:' columns")

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]

    # Then come the labels, till the one-before-last column:
    label_names = []
    for label in columns[first_label_ind:-1]:
        assert label.startswith(label_prefix)
        # Strip only the leading prefix; a later 'label:' inside the name is kept.
        label_names.append(label[len(label_prefix):])

    return (feature_names, label_names)

def parse_body_of_csv(csv_str,n_features):
    """Parse the numeric body of the CSV into features, labels and timestamps.

    The first line (the header) is skipped. Each remaining row is expected to
    be ``timestamp, <n_features feature values>, <label values...>, <last column>``;
    the last column is dropped.

    Args:
        csv_str: Full CSV text including its header line.
        n_features: Number of feature columns that follow the timestamp.

    Returns:
        A tuple ``(X, trinary_labels_mat, M, timestamps)``:
        ``X`` is the feature matrix, ``trinary_labels_mat`` holds the label
        values with every missing (NaN) entry replaced by ``-1``, ``M`` is the
        boolean mask of those missing entries, and ``timestamps`` is the first
        column cast to int.

    Side effects:
        Prints the distinct label values together with their counts.
    """
    # Read the entire CSV body into a single numeric matrix.
    # ndmin=2 keeps a file with a single data row two-dimensional; without it
    # loadtxt returns a 1-D array and the column slicing below raises IndexError.
    full_table = np.loadtxt(StringIO(csv_str), delimiter=',', skiprows=1, ndmin=2)

    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:, 0].astype(int)

    # Read the sensor features:
    X = full_table[:, 1:(n_features + 1)]

    # Read the binary label values, and the 'missing label' indicators.
    # At this point the values should be either 0., 1. or NaN:
    trinary_labels_mat = full_table[:, (n_features + 1):-1]
    M = np.isnan(trinary_labels_mat)  # M is the missing label matrix

    # Replace NaNs with -1.0 so that missing labels can be masked out later:
    trinary_labels_mat[M] = -1

    # Report how often each label value occurs:
    unique, counts = np.unique(trinary_labels_mat, return_counts=True)
    print(*zip(unique, counts))

    return (X, trinary_labels_mat, M, timestamps)

def read_user_data(directory):
    """Load one user's gzipped CSV file and split it into arrays and column names.

    Despite its name, ``directory`` is the path of the gzip file itself: it is
    passed straight to ``gzip.open``. The last '/'-separated component of the
    path is printed before the file is read.

    Returns:
        ``(X, Y, M, timestamps, feature_names, label_names)`` where the names
        come from ``parse_header_of_csv`` and the arrays from
        ``parse_body_of_csv``.
    """
    file_name = directory.split("/")[-1]
    print('Reading {}'.format(file_name))

    # Pull the whole file into memory as bytes, then decode it to text:
    with gzip.open(directory, 'rb') as gz_file:
        raw_bytes = gz_file.read()
    csv_str = raw_bytes.decode("utf-8")

    # The header tells how many feature columns the body parser has to slice off:
    feature_names, label_names = parse_header_of_csv(csv_str)
    X, Y, M, timestamps = parse_body_of_csv(csv_str, len(feature_names))

    return (X, Y, M, timestamps, feature_names, label_names)

In [4]:
# Read the file of one sample user.
# os.path.join is used because `prefix` already ends with '/', so formatting it
# into '{}/{}.csv.gz' produced a path with a doubled slash.
sample_loc = os.path.join(prefix, '{}.csv.gz'.format(user_sample))
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user = read_user_data(sample_loc)

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
(-1.0, 148794) (0.0, 97289) (1.0, 19270)


In [5]:
# Collect the per-user files under `prefix`.
# - sorted(): glob returns matches in arbitrary order, which would make the row
#   order of the stacked frames differ between machines and runs.
# - '*.csv.gz': every match is later opened with gzip, so anything else that
#   happens to live in the directory must not be picked up.
globs = sorted(glob.glob(os.path.join(prefix, '*.csv.gz')))

In [6]:
# Load every user file and stack the features and the labels into two frames.
# The per-user frames are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied the whole accumulated frame on every iteration.
feature_frames = []
label_frames = []
for g in globs:
    print(g)
    # NOTE: this overwrites tstamp_user / featurename_user / labelname_user that
    # were set when the sample user was read; after the loop they describe the
    # last file in `globs`.
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user)=read_user_data(g)
    temp_df = pd.DataFrame(x_user_train, columns=featurename_user)
    temp_df_label = pd.DataFrame(y_user_train, columns=labelname_user)
    feature_frames.append(temp_df)
    label_frames.append(temp_df_label)

if feature_frames:
    df = pd.concat(feature_frames, ignore_index=True)
    df_label = pd.concat(label_frames, ignore_index=True)
else:
    # No files matched: fall back to empty frames that still carry the column names.
    df = pd.DataFrame(data=None, columns=featurename_user)
    df_label = pd.DataFrame(data=None, columns=labelname_user)

dataset/Extrasensory_uuid_fl_uTAR/3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
(-1.0, 148794) (0.0, 97289) (1.0, 19270)
dataset/Extrasensory_uuid_fl_uTAR/8023FE1A-D3B0-4E2C-A57A-9321B7FC755F.features_labels.csv.gz
Reading 8023FE1A-D3B0-4E2C-A57A-9321B7FC755F.features_labels.csv.gz
(-1.0, 224199) (0.0, 219348) (1.0, 25092)
dataset/Extrasensory_uuid_fl_uTAR/86A4F379-B305-473D-9D83-FC7D800180EF.features_labels.csv.gz
Reading 86A4F379-B305-473D-9D83-FC7D800180EF.features_labels.csv.gz
(-1.0, 431262) (0.0, 98651) (1.0, 17725)
dataset/Extrasensory_uuid_fl_uTAR/1538C99F-BA1E-4EFB-A949-6C7C47701B20.features_labels.csv.gz
Reading 1538C99F-BA1E-4EFB-A949-6C7C47701B20.features_labels.csv.gz
(-1.0, 179351) (0.0, 131384) (1.0, 23264)
dataset/Extrasensory_uuid_fl_uTAR/11B5EC4D-4133-4289-B475-4E737182A406.features_labels.csv.gz
Reading 11B5EC4D-4133-4289-B475-4E737182A406.features_labels.csv.gz
(-1.0, 272896) (0.0, 151

dataset/Extrasensory_uuid_fl_uTAR/797D145F-3858-4A7F-A7C2-A4EB721E133C.features_labels.csv.gz
Reading 797D145F-3858-4A7F-A7C2-A4EB721E133C.features_labels.csv.gz
(-1.0, 149790) (0.0, 27089) (1.0, 6364)
dataset/Extrasensory_uuid_fl_uTAR/ECECC2AB-D32F-4F90-B74C-E12A1C69BBE2.features_labels.csv.gz
Reading ECECC2AB-D32F-4F90-B74C-E12A1C69BBE2.features_labels.csv.gz
(-1.0, 142883) (0.0, 29344) (1.0, 7803)
dataset/Extrasensory_uuid_fl_uTAR/2C32C23E-E30C-498A-8DD2-0EFB9150A02E.features_labels.csv.gz
Reading 2C32C23E-E30C-498A-8DD2-0EFB9150A02E.features_labels.csv.gz
(-1.0, 218936) (0.0, 186517) (1.0, 28863)
dataset/Extrasensory_uuid_fl_uTAR/CCAF77F0-FABB-4F2F-9E24-D56AD0C5A82F.features_labels.csv.gz
Reading CCAF77F0-FABB-4F2F-9E24-D56AD0C5A82F.features_labels.csv.gz
(-1.0, 215586) (0.0, 177334) (1.0, 39152)
dataset/Extrasensory_uuid_fl_uTAR/5119D0F8-FCA8-4184-A4EB-19421A40DE0D.features_labels.csv.gz
Reading 5119D0F8-FCA8-4184-A4EB-19421A40DE0D.features_labels.csv.gz
(-1.0, 110004) (0.0, 20371

In [7]:
# Summary statistics for every feature column of the stacked frame built from all files in `globs`.
df.describe()

Unnamed: 0,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,raw_acc:magnitude_spectrum:log_energy_band0,...,lf_measurements:screen_brightness,lf_measurements:temperature_ambient,discrete:time_of_day:between0and6,discrete:time_of_day:between3and9,discrete:time_of_day:between6and12,discrete:time_of_day:between9and15,discrete:time_of_day:between12and18,discrete:time_of_day:between15and21,discrete:time_of_day:between18and24,discrete:time_of_day:between21and3
count,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,377056.0,...,220949.0,26450.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0,377346.0
mean,1.002223,0.038832,0.037772,0.072804,0.983165,0.998471,1.016691,2.045331,6.680278,5.039597,...,0.326232,25.690594,0.218865,0.206169,0.229066,0.267147,0.285608,0.285624,0.266461,0.276425
std,0.079623,0.096109,0.113198,0.170481,0.08296,0.076687,0.104874,0.616899,0.021246,0.025809,...,0.29635,3.384491,0.413478,0.404554,0.420232,0.442471,0.451704,0.451712,0.442109,0.44723
min,0.018148,3e-05,-0.493806,3.9e-05,0.015845,0.017998,0.020365,0.009605,5.460637,4.338109,...,0.0,11.455976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.992556,0.001709,-0.000785,0.002328,0.982671,0.991248,0.994859,1.671571,6.684369,5.042891,...,0.043978,23.9416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.001258,0.003265,0.000772,0.005051,0.9953,1.000116,1.003813,2.296398,6.684606,5.04335,...,0.305609,25.545454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.012745,0.021366,0.009864,0.045333,1.003679,1.009232,1.019163,2.523618,6.68461,5.043574,...,0.461145,27.650335,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
max,3.185837,1.936343,2.47275,3.360718,1.942718,2.636697,3.958338,2.971272,6.684612,6.489025,...,1.0,132.231644,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Value counts per label over all loaded users (-1 marks a missing label,
# 0 / 1 are the binary label values).
# Names are taken from df_label's own columns, so the printed name always
# belongs to the column being counted; labelname_user is overwritten for every
# file while loading and is only index-aligned with df_label by coincidence.
for label_name, label_column in df_label.items():
    temp_col = label_column.to_numpy()
    print("LABEL:", label_name)
    unique, counts = np.unique(temp_col, return_counts=True)
    print(*zip(unique, counts))
    print("-"*50)

LABEL: LYING_DOWN
(-1.0, 73623) (0.0, 199513) (1.0, 104210)
--------------------------------------------------
LABEL: SITTING
(-1.0, 70752) (0.0, 170238) (1.0, 136356)
--------------------------------------------------
LABEL: FIX_walking
(-1.0, 70752) (0.0, 284458) (1.0, 22136)
--------------------------------------------------
LABEL: FIX_running
(-1.0, 236476) (0.0, 139780) (1.0, 1090)
--------------------------------------------------
LABEL: BICYCLING
(-1.0, 242163) (0.0, 130163) (1.0, 5020)
--------------------------------------------------
LABEL: SLEEPING
(-1.0, 92078) (0.0, 202213) (1.0, 83055)
--------------------------------------------------
LABEL: LAB_WORK
(-1.0, 327951) (0.0, 45547) (1.0, 3848)
--------------------------------------------------
LABEL: IN_CLASS
(-1.0, 268283) (0.0, 102953) (1.0, 6110)
--------------------------------------------------
LABEL: IN_A_MEETING
(-1.0, 146133) (0.0, 226060) (1.0, 5153)
--------------------------------------------------
LABEL: LOC_main