<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Dataset-parsers-and-cleaning-functions" data-toc-modified-id="Dataset-parsers-and-cleaning-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dataset parsers and cleaning functions</a></span></li><li><span><a href="#User-data-test" data-toc-modified-id="User-data-test-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>User data test</a></span><ul class="toc-item"><li><span><a href="#Finding-out-labels-that-will-not-have-values-attached-(nans)" data-toc-modified-id="Finding-out-labels-that-will-not-have-values-attached-(nans)-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Finding out labels that will not have values attached (nans)</a></span></li></ul></li><li><span><a href="#Training" data-toc-modified-id="Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training</a></span><ul class="toc-item"><li><span><a href="#Importing-data-(no-cross-validation):-Setup-for-single-label-instances" data-toc-modified-id="Importing-data-(no-cross-validation):-Setup-for-single-label-instances-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Importing data (no cross-validation): Setup for single-label instances</a></span></li><li><span><a href="#Creating-a-new-data-structure-for-all-valid-data-and-pickling-it" data-toc-modified-id="Creating-a-new-data-structure-for-all-valid-data-and-pickling-it-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Creating a new data structure for all valid data and pickling it</a></span></li></ul></li><li><span><a href="#Single-Class-Classifier:-Train/Test-Functions" data-toc-modified-id="Single-Class-Classifier:-Train/Test-Functions-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Single Class Classifier: Train/Test Functions</a></span></li><li><span><a href="#Initial-Setup-and-Trials" data-toc-modified-id="Initial-Setup-and-Trials-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Initial Setup and Trials</a></span><ul class="toc-item"><li><span><a href="#Model-Choices" data-toc-modified-id="Model-Choices-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Model Choices</a></span></li><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-6.2"><span class="toc-item-num">6.2&nbsp;&nbsp;</span>Logistic Regression</a></span></li><li><span><a href="#Support-Vector" data-toc-modified-id="Support-Vector-6.3"><span class="toc-item-num">6.3&nbsp;&nbsp;</span>Support-Vector</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-6.4"><span class="toc-item-num">6.4&nbsp;&nbsp;</span>Random Forest</a></span></li><li><span><a href="#ANN" data-toc-modified-id="ANN-6.5"><span class="toc-item-num">6.5&nbsp;&nbsp;</span>ANN</a></span></li></ul></li><li><span><a href="#Single-Class-Classifier:-SOTA-Comparison" data-toc-modified-id="Single-Class-Classifier:-SOTA-Comparison-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Single Class Classifier: SOTA Comparison</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression-Baseline" data-toc-modified-id="Logistic-Regression-Baseline-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Logistic Regression Baseline</a></span></li></ul></li><li><span><a href="#Multi-Class-Classifier:-Train/Test-Functions" data-toc-modified-id="Multi-Class-Classifier:-Train/Test-Functions-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Multi Class Classifier: Train/Test Functions</a></span></li><li><span><a href="#Multi-Class-Classifier:-SOTA-Comparison" data-toc-modified-id="Multi-Class-Classifier:-SOTA-Comparison-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Multi Class Classifier: SOTA Comparison</a></span><ul class="toc-item"><li><span><a href="#Multi-Layer-Perceptron-(0-Hidden-Layers)" data-toc-modified-id="Multi-Layer-Perceptron-(0-Hidden-Layers)-9.1"><span class="toc-item-num">9.1&nbsp;&nbsp;</span>Multi-Layer Perceptron (0 Hidden Layers)</a></span></li><li><span><a href="#Multi-Layer-Perceptron-(1-Hidden-Layer)" data-toc-modified-id="Multi-Layer-Perceptron-(1-Hidden-Layer)-9.2"><span class="toc-item-num">9.2&nbsp;&nbsp;</span>Multi-Layer Perceptron (1 Hidden Layer)</a></span></li></ul></li></ul></div>

# Imports

In [173]:
# Required imports
import os
import numpy as np
import pandas as pd
import gzip
import glob
import pickle
from io import StringIO
import importlib.machinery

from sklearn.model_selection import train_test_split as TT_split
# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.multiclass import OneVsRestClassifier as OvR

from sklearn.metrics import accuracy_score,confusion_matrix,balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support,classification_report
from imblearn.metrics import sensitivity_specificity_support,make_index_balanced_accuracy
#from sklearn.metrics import multilabel_confusion_matrix # Only available in dev .21

# Need Pytorch for multilabel classifications
import torch
from torch.autograd import Variable as V
from torch import nn,optim
import torch.nn.functional as F
import torch.utils.data as utils

random_state=10

In [2]:
# Data location and sample user
prefix='dataset/Extrasensory_uuid_fl_uTAR/'
cross_validation_user_loc='dataset/cv_5_folds/'
user_sample='3600D531-0C55-44A7-AE95-A7A38519464E.features_labels'

# Dataset parsers and cleaning functions

In [3]:
# Dataset parsers for header/ body for CSVs
def parse_header_of_csv(csv_str):
    # Isolate the headline columns:
    headline = csv_str[:csv_str.index('\n')];
    columns = headline.split(',');

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:','');
        pass;
    
    return (feature_names,label_names);

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int);
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)];
    
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    
    #print("M matrix shape:",M.shape)
    #print("Matrix: ",np.argwhere(M))
    
    Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix
    
    return (X,Y,M,timestamps);

def read_user_data(directory):
    print('Reading {}'.format(directory.split("/")[-1]))

    # Read the entire csv file of the user:
    with gzip.open(directory,'rb') as fid:
        csv_str = fid.read();
        csv_str = csv_str.decode("utf-8")
        pass;

    (feature_names,label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);

    return (X,Y,M,timestamps,feature_names,label_names);

In [4]:
# Clean labels
def clean_labels(input_label):
    if label.endswith('_'):
        label=label[:-1]+')'
    label=label.replace('__',' (').replace('_',' ')
    label=label[0]+label[1:].lower()
    label=label.replace('i m','I\'m')
    return label

In [5]:
# Get a summary of the sensor feature
'''
# Summarize features as we are only using phone_acc,phone_gyro,phone_mag,phone_loc,phone_audio,
# phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
# We are ignoring the use of the smartwatch features. There are definitely features that will be used
# much more (e.g. than the phone_callstat) but we'll leave that up to the ML algorithm.
'''
def summarize_features(feature_list):
    summary_feature_list=np.empty_like(feature_list)
    for (ind,feature) in enumerate(feature_list):
        if feature.startswith('raw_acc'):
            summary_feature_list[ind]='phone_acc' 
        if feature.startswith('proc_gyro'):
            summary_feature_list[ind]='phone_gyro'
        if feature.startswith('raw_magnet'):
            summary_feature_list[ind]='phone_mag'
        if feature.startswith('watch_acc'):
            summary_feature_list[ind]='watch_acc'
        if feature.startswith('watch_heading'):
            summary_feature_list[ind]='watch_dir'
        if feature.startswith('location'):
            summary_feature_list[ind]='phone_loc'
        if feature.startswith('audio_naive'):
            summary_feature_list[ind]='phone_audio'
        if feature.startswith('discrete:app_state'):
            summary_feature_list[ind]='phone_app'
        if feature.startswith('discrete:battery'):
            summary_feature_list[ind]='phone_battery'
        if feature.startswith('discrete:on'):
            summary_feature_list[ind]='phone_use'
        if feature.startswith('discrete:ringer'):
            summary_feature_list[ind]='phone_callstat'
        if feature.startswith('discrete:wifi'):
            summary_feature_list[ind]='phone_wifi'
        if feature.startswith('lf'):
            summary_feature_list[ind]='phone_lf'
        if feature.startswith('discrete:time'):
            summary_feature_list[ind]='phone_time'

    return summary_feature_list

In [6]:
# Custom dictionary class with help for duplicate keys
class Customdictionary(dict):
    def __setitem__(self,key,value):
        try:
            self[key]
        except KeyError:
            super(Customdictionary,self).__setitem__(key,[])
        self[key].append(value)

# User data test

In [7]:
# Reading sample data
sample_loc='{}/{}.csv.gz'.format(prefix,user_sample)
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(sample_loc)

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz


In [8]:
# Dataset summaries for this user
print('Data shape input for user (Len minutes/num examples, num sensors): ',x_user.shape) # Timestep examples, number of sensors
print('Label shape for user (Len minutes, num labels): ',y_user.shape,'\n') # Timestep examples, labels

countlabels_user=np.sum(y_user,axis=0) # Column summary
labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

print('Sensor feature names:\n')
feature_names=summarize_features(featurename_user)
    
for i,sensor_feature in enumerate(featurename_user):
    print('{} :: {} ::--> {}\n'.format(i,feature_names[i],sensor_feature))

print('Activities and counts:')
print(labelname_countlabel_user)

Data shape input for user (Len minutes/num examples, num sensors):  (5203, 225)
Label shape for user (Len minutes, num labels):  (5203, 51) 

Sensor feature names:

0 :: phone_acc ::--> raw_acc:magnitude_stats:mean

1 :: phone_acc ::--> raw_acc:magnitude_stats:std

2 :: phone_acc ::--> raw_acc:magnitude_stats:moment3

3 :: phone_acc ::--> raw_acc:magnitude_stats:moment4

4 :: phone_acc ::--> raw_acc:magnitude_stats:percentile25

5 :: phone_acc ::--> raw_acc:magnitude_stats:percentile50

6 :: phone_acc ::--> raw_acc:magnitude_stats:percentile75

7 :: phone_acc ::--> raw_acc:magnitude_stats:value_entropy

8 :: phone_acc ::--> raw_acc:magnitude_stats:time_entropy

9 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band0

10 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band1

11 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band2

12 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band3

13 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_ban

## Finding out labels that will not have values attached (nans)

In [9]:
# Counting label instances across all users
label_counts=Customdictionary()

for u_file in glob.glob('{}/*.csv.gz'.format(prefix)):
    x_user,y_user,missed_label_user,tstamp_user,featurename_user,labelname_user=read_user_data(u_file)
    countlabels_user=np.sum(y_user,axis=0) # Column summary
    labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
    labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

    for index in range(len(labelname_countlabel_user)):
        key=labelname_countlabel_user[index][0]
        value=labelname_countlabel_user[index][1]
        print("\t\t Label {} --> {} minutes".format(key,str(value)))
        
        label_counts[key]=value
    print("\n")

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
		 Label LOC_home --> 3040 minutes
		 Label OR_indoors --> 2487 minutes
		 Label PHONE_ON_TABLE --> 2179 minutes
		 Label SITTING --> 1916 minutes
		 Label WITH_FRIENDS --> 1730 minutes
		 Label LYING_DOWN --> 1336 minutes
		 Label SLEEPING --> 1021 minutes
		 Label WATCHING_TV --> 912 minutes
		 Label EATING --> 762 minutes
		 Label PHONE_IN_POCKET --> 706 minutes
		 Label TALKING --> 638 minutes
		 Label DRIVE_-_I_M_A_PASSENGER --> 409 minutes
		 Label OR_standing --> 384 minutes
		 Label IN_A_CAR --> 342 minutes
		 Label OR_exercise --> 162 minutes
		 Label AT_THE_GYM --> 162 minutes
		 Label SINGING --> 136 minutes
		 Label FIX_walking --> 132 minutes
		 Label OR_outside --> 127 minutes
		 Label SHOPPING --> 111 minutes
		 Label AT_SCHOOL --> 105 minutes
		 Label BATHING_-_SHOWER --> 85 minutes
		 Label DRESSING --> 67 minutes
		 Label DRINKING__ALCOHOL_ --> 66 minutes
		 Label PHONE_IN_HAND --> 64 minutes
		 Label

		 Label LOC_home --> 5757 minutes
		 Label OR_indoors --> 5448 minutes
		 Label PHONE_ON_TABLE --> 3771 minutes
		 Label SITTING --> 2627 minutes
		 Label LYING_DOWN --> 2500 minutes
		 Label SLEEPING --> 2485 minutes
		 Label SURFING_THE_INTERNET --> 1860 minutes
		 Label OR_standing --> 918 minutes
		 Label OR_exercise --> 787 minutes
		 Label BICYCLING --> 692 minutes
		 Label PHONE_IN_POCKET --> 669 minutes
		 Label EATING --> 344 minutes
		 Label CLEANING --> 320 minutes
		 Label WATCHING_TV --> 270 minutes
		 Label IN_A_MEETING --> 243 minutes
		 Label COOKING --> 131 minutes
		 Label FIX_running --> 88 minutes
		 Label IN_A_CAR --> 81 minutes
		 Label TOILET --> 79 minutes
		 Label DRESSING --> 77 minutes
		 Label DRIVE_-_I_M_THE_DRIVER --> 73 minutes
		 Label SHOPPING --> 69 minutes
		 Label GROOMING --> 30 minutes
		 Label STROLLING --> 17 minutes
		 Label FIX_walking --> 15 minutes
		 Label WASHING_DISHES --> 13 minutes
		 Label AT_SCHOOL --> 12 minutes
		 Label OR_outside -

		 Label PHONE_ON_TABLE --> 5235 minutes
		 Label SITTING --> 3044 minutes
		 Label LYING_DOWN --> 2548 minutes
		 Label LOC_home --> 2358 minutes
		 Label OR_indoors --> 2061 minutes
		 Label TALKING --> 1826 minutes
		 Label WITH_FRIENDS --> 1654 minutes
		 Label PHONE_IN_POCKET --> 1304 minutes
		 Label OR_outside --> 1184 minutes
		 Label FIX_walking --> 1181 minutes
		 Label COMPUTER_WORK --> 1080 minutes
		 Label OR_standing --> 621 minutes
		 Label LOC_main_workplace --> 483 minutes
		 Label COOKING --> 444 minutes
		 Label WATCHING_TV --> 275 minutes
		 Label PHONE_IN_HAND --> 240 minutes
		 Label EATING --> 218 minutes
		 Label ON_A_BUS --> 141 minutes
		 Label SHOPPING --> 124 minutes
		 Label TOILET --> 67 minutes
		 Label DRESSING --> 67 minutes
		 Label AT_SCHOOL --> 65 minutes
		 Label IN_A_MEETING --> 27 minutes
		 Label IN_A_CAR --> 20 minutes
		 Label FIX_running --> 8 minutes
		 Label OR_exercise --> 8 minutes
		 Label WASHING_DISHES --> 2 minutes
		 Label BICYCLING -

		 Label SITTING --> 1096 minutes
		 Label PHONE_ON_TABLE --> 984 minutes
		 Label AT_SCHOOL --> 929 minutes
		 Label PHONE_IN_POCKET --> 878 minutes
		 Label LYING_DOWN --> 516 minutes
		 Label OR_standing --> 323 minutes
		 Label COMPUTER_WORK --> 234 minutes
		 Label FIX_walking --> 223 minutes
		 Label DRIVE_-_I_M_THE_DRIVER --> 147 minutes
		 Label EATING --> 127 minutes
		 Label PHONE_IN_HAND --> 37 minutes
		 Label TALKING --> 21 minutes
		 Label WATCHING_TV --> 15 minutes
		 Label FIX_running --> 0 minutes
		 Label BICYCLING --> 0 minutes
		 Label SLEEPING --> 0 minutes
		 Label LAB_WORK --> 0 minutes
		 Label IN_CLASS --> 0 minutes
		 Label IN_A_MEETING --> 0 minutes
		 Label LOC_main_workplace --> 0 minutes
		 Label OR_indoors --> 0 minutes
		 Label OR_outside --> 0 minutes
		 Label IN_A_CAR --> 0 minutes
		 Label ON_A_BUS --> 0 minutes
		 Label DRIVE_-_I_M_A_PASSENGER --> 0 minutes
		 Label LOC_home --> 0 minutes
		 Label FIX_restaurant --> 0 minutes
		 Label OR_exercise -->

		 Label OR_indoors --> 7903 minutes
		 Label LOC_home --> 5106 minutes
		 Label SITTING --> 3880 minutes
		 Label PHONE_ON_TABLE --> 3159 minutes
		 Label LYING_DOWN --> 2747 minutes
		 Label TALKING --> 2711 minutes
		 Label SLEEPING --> 2552 minutes
		 Label PHONE_IN_POCKET --> 2029 minutes
		 Label OR_standing --> 1667 minutes
		 Label AT_SCHOOL --> 1532 minutes
		 Label WITH_FRIENDS --> 1506 minutes
		 Label OR_outside --> 1062 minutes
		 Label WATCHING_TV --> 710 minutes
		 Label OR_exercise --> 676 minutes
		 Label PHONE_IN_HAND --> 557 minutes
		 Label BICYCLING --> 540 minutes
		 Label SURFING_THE_INTERNET --> 512 minutes
		 Label EATING --> 503 minutes
		 Label FIX_walking --> 494 minutes
		 Label PHONE_IN_BAG --> 414 minutes
		 Label IN_CLASS --> 409 minutes
		 Label DRINKING__ALCOHOL_ --> 351 minutes
		 Label AT_A_PARTY --> 291 minutes
		 Label DRESSING --> 254 minutes
		 Label GROOMING --> 226 minutes
		 Label DRIVE_-_I_M_THE_DRIVER --> 218 minutes
		 Label COOKING --> 173

		 Label PHONE_ON_TABLE --> 4300 minutes
		 Label SITTING --> 3467 minutes
		 Label OR_indoors --> 1591 minutes
		 Label COMPUTER_WORK --> 1492 minutes
		 Label SLEEPING --> 1432 minutes
		 Label LYING_DOWN --> 1394 minutes
		 Label LOC_main_workplace --> 1020 minutes
		 Label AT_SCHOOL --> 932 minutes
		 Label TALKING --> 540 minutes
		 Label PHONE_IN_BAG --> 538 minutes
		 Label OR_standing --> 467 minutes
		 Label PHONE_IN_HAND --> 452 minutes
		 Label FIX_walking --> 412 minutes
		 Label EATING --> 373 minutes
		 Label SURFING_THE_INTERNET --> 303 minutes
		 Label IN_A_CAR --> 298 minutes
		 Label DRIVE_-_I_M_A_PASSENGER --> 298 minutes
		 Label OR_outside --> 248 minutes
		 Label PHONE_IN_POCKET --> 215 minutes
		 Label WATCHING_TV --> 199 minutes
		 Label IN_A_MEETING --> 181 minutes
		 Label CLEANING --> 173 minutes
		 Label GROOMING --> 70 minutes
		 Label DRESSING --> 45 minutes
		 Label BATHING_-_SHOWER --> 36 minutes
		 Label COOKING --> 15 minutes
		 Label FIX_restaurant --

		 Label OR_indoors --> 8093 minutes
		 Label LOC_home --> 6798 minutes
		 Label PHONE_ON_TABLE --> 4613 minutes
		 Label LYING_DOWN --> 3696 minutes
		 Label SITTING --> 3668 minutes
		 Label SLEEPING --> 2702 minutes
		 Label WATCHING_TV --> 1327 minutes
		 Label AT_SCHOOL --> 1190 minutes
		 Label SURFING_THE_INTERNET --> 1128 minutes
		 Label TALKING --> 1103 minutes
		 Label OR_standing --> 1053 minutes
		 Label WITH_FRIENDS --> 1040 minutes
		 Label COMPUTER_WORK --> 799 minutes
		 Label PHONE_IN_POCKET --> 784 minutes
		 Label PHONE_IN_HAND --> 781 minutes
		 Label EATING --> 626 minutes
		 Label FIX_walking --> 506 minutes
		 Label OR_outside --> 442 minutes
		 Label GROOMING --> 343 minutes
		 Label IN_CLASS --> 341 minutes
		 Label TOILET --> 290 minutes
		 Label DRESSING --> 273 minutes
		 Label PHONE_IN_BAG --> 269 minutes
		 Label CLEANING --> 234 minutes
		 Label BATHING_-_SHOWER --> 178 minutes
		 Label COOKING --> 175 minutes
		 Label LOC_main_workplace --> 173 minutes


		 Label OR_indoors --> 3954 minutes
		 Label PHONE_ON_TABLE --> 3084 minutes
		 Label LOC_home --> 3040 minutes
		 Label SITTING --> 2972 minutes
		 Label SLEEPING --> 2440 minutes
		 Label TALKING --> 2422 minutes
		 Label LYING_DOWN --> 2319 minutes
		 Label AT_SCHOOL --> 2016 minutes
		 Label SURFING_THE_INTERNET --> 1674 minutes
		 Label PHONE_IN_HAND --> 1480 minutes
		 Label OR_standing --> 576 minutes
		 Label PHONE_IN_BAG --> 467 minutes
		 Label EATING --> 415 minutes
		 Label FIX_walking --> 384 minutes
		 Label LOC_beach --> 236 minutes
		 Label BICYCLING --> 136 minutes
		 Label OR_exercise --> 136 minutes
		 Label ELEVATOR --> 23 minutes
		 Label IN_A_MEETING --> 13 minutes
		 Label WITH_CO-WORKERS --> 13 minutes
		 Label FIX_running --> 0 minutes
		 Label LAB_WORK --> 0 minutes
		 Label IN_CLASS --> 0 minutes
		 Label LOC_main_workplace --> 0 minutes
		 Label OR_outside --> 0 minutes
		 Label IN_A_CAR --> 0 minutes
		 Label ON_A_BUS --> 0 minutes
		 Label DRIVE_-_I_M_THE

		 Label LOC_home --> 6404 minutes
		 Label OR_indoors --> 5994 minutes
		 Label LYING_DOWN --> 3264 minutes
		 Label SITTING --> 2500 minutes
		 Label OR_standing --> 1054 minutes
		 Label SURFING_THE_INTERNET --> 849 minutes
		 Label SLEEPING --> 615 minutes
		 Label WATCHING_TV --> 544 minutes
		 Label COOKING --> 349 minutes
		 Label FIX_walking --> 302 minutes
		 Label EATING --> 259 minutes
		 Label TOILET --> 197 minutes
		 Label TALKING --> 180 minutes
		 Label SHOPPING --> 101 minutes
		 Label ON_A_BUS --> 43 minutes
		 Label PHONE_ON_TABLE --> 34 minutes
		 Label WASHING_DISHES --> 29 minutes
		 Label DRESSING --> 9 minutes
		 Label CLEANING --> 8 minutes
		 Label AT_SCHOOL --> 7 minutes
		 Label DOING_LAUNDRY --> 5 minutes
		 Label FIX_running --> 4 minutes
		 Label IN_A_MEETING --> 4 minutes
		 Label OR_exercise --> 4 minutes
		 Label BICYCLING --> 0 minutes
		 Label LAB_WORK --> 0 minutes
		 Label IN_CLASS --> 0 minutes
		 Label LOC_main_workplace --> 0 minutes
		 Label OR

		 Label LOC_home --> 3009 minutes
		 Label SITTING --> 2936 minutes
		 Label OR_indoors --> 2680 minutes
		 Label PHONE_ON_TABLE --> 2676 minutes
		 Label LYING_DOWN --> 1628 minutes
		 Label SLEEPING --> 1458 minutes
		 Label LOC_main_workplace --> 1298 minutes
		 Label OR_standing --> 973 minutes
		 Label WITH_CO-WORKERS --> 678 minutes
		 Label PHONE_IN_BAG --> 643 minutes
		 Label PHONE_IN_HAND --> 640 minutes
		 Label DRIVE_-_I_M_A_PASSENGER --> 602 minutes
		 Label COMPUTER_WORK --> 601 minutes
		 Label TALKING --> 553 minutes
		 Label FIX_walking --> 437 minutes
		 Label IN_A_CAR --> 388 minutes
		 Label OR_outside --> 357 minutes
		 Label EATING --> 331 minutes
		 Label ON_A_BUS --> 307 minutes
		 Label CLEANING --> 279 minutes
		 Label DRIVE_-_I_M_THE_DRIVER --> 240 minutes
		 Label GROOMING --> 134 minutes
		 Label FIX_restaurant --> 104 minutes
		 Label WATCHING_TV --> 97 minutes
		 Label IN_A_MEETING --> 93 minutes
		 Label DOING_LAUNDRY --> 81 minutes
		 Label SURFING_THE

		 Label SITTING --> 4488 minutes
		 Label OR_indoors --> 4477 minutes
		 Label AT_SCHOOL --> 4137 minutes
		 Label PHONE_ON_TABLE --> 3888 minutes
		 Label LOC_home --> 3832 minutes
		 Label SLEEPING --> 3470 minutes
		 Label LYING_DOWN --> 3432 minutes
		 Label COMPUTER_WORK --> 2313 minutes
		 Label EATING --> 976 minutes
		 Label TALKING --> 870 minutes
		 Label WITH_FRIENDS --> 705 minutes
		 Label FIX_walking --> 658 minutes
		 Label PHONE_IN_HAND --> 550 minutes
		 Label OR_standing --> 312 minutes
		 Label DRIVE_-_I_M_A_PASSENGER --> 183 minutes
		 Label ON_A_BUS --> 170 minutes
		 Label BATHING_-_SHOWER --> 160 minutes
		 Label TOILET --> 89 minutes
		 Label FIX_restaurant --> 34 minutes
		 Label IN_A_MEETING --> 22 minutes
		 Label SURFING_THE_INTERNET --> 17 minutes
		 Label DRESSING --> 10 minutes
		 Label FIX_running --> 0 minutes
		 Label BICYCLING --> 0 minutes
		 Label LAB_WORK --> 0 minutes
		 Label IN_CLASS --> 0 minutes
		 Label LOC_main_workplace --> 0 minutes
		 La

		 Label SITTING --> 1728 minutes
		 Label PHONE_IN_POCKET --> 1363 minutes
		 Label PHONE_ON_TABLE --> 1137 minutes
		 Label LOC_home --> 1084 minutes
		 Label AT_SCHOOL --> 982 minutes
		 Label OR_standing --> 978 minutes
		 Label TALKING --> 958 minutes
		 Label LOC_main_workplace --> 697 minutes
		 Label OR_indoors --> 631 minutes
		 Label COMPUTER_WORK --> 517 minutes
		 Label DRIVE_-_I_M_THE_DRIVER --> 410 minutes
		 Label FIX_walking --> 363 minutes
		 Label PHONE_IN_HAND --> 362 minutes
		 Label EATING --> 324 minutes
		 Label FIX_restaurant --> 202 minutes
		 Label DRINKING__ALCOHOL_ --> 105 minutes
		 Label LYING_DOWN --> 102 minutes
		 Label OR_outside --> 99 minutes
		 Label STAIRS_-_GOING_DOWN --> 88 minutes
		 Label STAIRS_-_GOING_UP --> 86 minutes
		 Label WITH_FRIENDS --> 62 minutes
		 Label BICYCLING --> 59 minutes
		 Label OR_exercise --> 59 minutes
		 Label COOKING --> 58 minutes
		 Label SHOPPING --> 44 minutes
		 Label DRESSING --> 36 minutes
		 Label SINGING --> 3

In [10]:
# Summing up label instances across all counts.
label_sum={}
for k,v in label_counts.items():
    label_sum[k]=np.sum(v)
print("Sorted tuple for values across all users")
sorted(label_sum.items(),key=lambda x:x[1])

Sorted tuple for values across all users


[('ELEVATOR', 200),
 ('AT_A_BAR', 551),
 ('DOING_LAUNDRY', 556),
 ('LOC_beach', 585),
 ('SINGING', 651),
 ('STAIRS_-_GOING_DOWN', 774),
 ('STAIRS_-_GOING_UP', 798),
 ('STROLLING', 806),
 ('FIX_running', 1090),
 ('AT_THE_GYM', 1151),
 ('WASHING_DISHES', 1228),
 ('DRINKING__ALCOHOL_', 1456),
 ('AT_A_PARTY', 1470),
 ('ON_A_BUS', 1794),
 ('SHOPPING', 1841),
 ('BATHING_-_SHOWER', 2087),
 ('FIX_restaurant', 2098),
 ('DRESSING', 2233),
 ('DRIVE_-_I_M_A_PASSENGER', 2526),
 ('TOILET', 2655),
 ('GROOMING', 3064),
 ('CLEANING', 3806),
 ('LAB_WORK', 3848),
 ('COOKING', 4029),
 ('BICYCLING', 5020),
 ('IN_A_MEETING', 5153),
 ('IN_A_CAR', 6083),
 ('IN_CLASS', 6110),
 ('WITH_CO-WORKERS', 6224),
 ('DRIVE_-_I_M_THE_DRIVER', 7975),
 ('OR_exercise', 8081),
 ('PHONE_IN_BAG', 10201),
 ('OR_outside', 12114),
 ('WATCHING_TV', 13311),
 ('PHONE_IN_HAND', 14573),
 ('EATING', 16594),
 ('SURFING_THE_INTERNET', 19416),
 ('FIX_walking', 22136),
 ('PHONE_IN_POCKET', 23401),
 ('WITH_FRIENDS', 24737),
 ('LOC_main_workp

<span style="color:red">
    ISSUE: There are some labels (e.g. Phone location:bag etc.) that some users have not filled out for any timestep and shows up as np.nan. The label sum above was a check to see if the same label wasn't filled out for other users (hence would have a count of zero) and would let the label being completely removed. The lowest count was (Elevator:200) which doesn't help.
    I cannot do blindly remove rows because a particular label wasn't filled out for any timestep for a user. For single label case, this is fine...but for a multi-label case, this will mean that other valid labels are ignored. The only option that I have so far is to naively convert all nans in the labels to zeros. This could mean a loss of accuracy (the user might have been doing the task in the label but have omitted annotating it, and so we are incorrectly training a feature vector....but there is no choice so far.
</span>

# Training

In [11]:
# Choosing sensor labels
'''
Summary sensor choices are: phone_acc,phone_gyro,phone_mag,watch_acc,watch_dir,phone_loc,phone_audio,
phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
In this project, we aren't using watch_acc,watch_dir (no smartwatch)
'''

def choose_sensors(X_train,used_sensors,summarized_feature_names):
    used_sensor_feature_names=np.zeros(len(summarized_feature_names),dtype=bool)
    # Creates a zero boolean vector of all possible feature names
    for s in used_sensors:
        used_sensor_feature_names=np.logical_or(used_sensor_feature_names,(s==summarized_feature_names))
    X_train=X_train[:,used_sensor_feature_names]
    return X_train

In [12]:
# Returns a standardized (0 mean, 1 variance) dataset
def standardize(X_train):
    mean=np.nanmean(X_train,axis=0).reshape((1,-1))# Ignores NaNs while finding the mean across rows
    standard_dev=np.nanstd(X_train,axis=0) # Ignores NaNs while finding the standard deviation across rows
    standard_dev_nonzero=np.where(standard_dev>0,standard_dev,1.).reshape((1,-1)) # Div zero
    
    X=(X_train-mean)/standard_dev_nonzero
    return X,mean,standard_dev_nonzero   

## Importing data (no cross-validation): Setup for single-label instances

In [None]:
# Reading data in the directory (Stacked)
X_train=np.empty((0,225))
Y_train=np.empty((0,51))
X_test=np.empty((0,225))
Y_test=np.empty((0,51))
M_train=np.empty((0,51))
M_test=np.empty((0,51))

for u_file in glob.glob('{}/*.csv.gz'.format(prefix)):
        x_user,y_user,missed_label_user,tstamp_user,featurename_user,labelname_user=read_user_data(u_file)
        # Split each user data into train-test splits .70-.30 as in literature
        x_train_u,x_test_u,y_train_u,y_test_u=TT_split(x_user,y_user,test_size=0.30,random_state=random_state)
        m_train,m_test=TT_split(missed_label_user,test_size=0.30,random_state=random_state)
        
        # Stacking data. Will be changed for K-Fold cross-validation
        X_train=np.vstack((X_train,x_train_u))
        Y_train=np.vstack((Y_train,y_train_u))
        X_test=np.vstack((X_test,x_test_u))
        Y_test=np.vstack((Y_test,y_test_u))
    
        # Missing data matrix
        M_train=np.vstack((M_train,m_train))
        M_test=np.vstack((M_test,m_test))
        
assert len(X_train)==len(Y_train)
assert len(X_test)==len(Y_test)

print('\nTraining: X::{} ,Y::{}'.format(X_train.shape,Y_train.shape))
print('Testing: X::{} ,Y::{}'.format(X_test.shape,Y_test.shape))

## Creating a new data structure for all valid data and pickling it

Remove rows with np.nan labels (missing labels). Zero impute missing feature entries. Standardization done at train time.

In [13]:
# Reading data in the directory (Stacked)
X_train_t=np.empty((0,225))
Y_train_t=np.empty((0,51))
X_test_t=np.empty((0,225))
Y_test_t=np.empty((0,51))
#M_train_t=np.empty((0,51))
#M_test_t=np.empty((0,51))

for u_file in glob.glob('{}/*.csv.gz'.format(prefix)):
        x_user,y_user,missed_label_user,tstamp_user,featurename_user,labelname_user=read_user_data(u_file)
        x_sh=x_user.shape
        y_sh=y_user.shape
        # Removing invalid labels, imputing missing features before splitting
        #missed_label_user=missed_label_user.astype(int) # Convert Boolean to int array
        #missed_label_user=np.sum(missed_label_user,axis=1)# Sum across columns creating a n_row*1 vector
        # If the value for a particular row ==0, no features are missing : Can use that row
        #use_labels=np.logical_not(missed_label_user)
        #x_user=x_user[use_labels,:]
        y_user=np.nan_to_num(y_user) # Blind way to replace NAN labels in y_train/y_test to 0
        # Assuming that if the user hasn't bothered with that label, it means that it wasn't too applicable.
        x_user=np.nan_to_num(x_user)
        #y_user=y_user[use_labels,:]
        
        print('X_shape before removing invalid labels:{}, after:{}'.format(x_sh,x_user.shape))
        print('Y_shape before removing invalid labels:{}, after:{}'.format(y_sh,y_user.shape))
        
        # Split each user data into train-test splits .70-.30 as in literature
        x_train_u,x_test_u,y_train_u,y_test_u=TT_split(x_user,y_user,test_size=0.30,random_state=random_state)
        #m_train,m_test=TT_split(missed_label_user,test_size=0.30,random_state=random_state)
        
        # Stacking data. Will be changed for K-Fold cross-validation
        X_train_t=np.vstack((X_train_t,x_train_u))
        Y_train_t=np.vstack((Y_train_t,y_train_u))
        X_test_t=np.vstack((X_test_t,x_test_u))
        Y_test_t=np.vstack((Y_test_t,y_test_u))
        
        print('\t Per User Training examples:{}, Testing examples:{}'.
              format(y_train_u.shape[0],y_test_u.shape[0]))
assert len(X_train_t)==len(Y_train_t)
assert len(X_test_t)==len(Y_test_t)

print('\nTraining: X::{} ,Y::{}'.format(X_train_t.shape,Y_train_t.shape))
print('Testing: X::{} ,Y::{}'.format(X_test_t.shape,Y_test_t.shape))

print("Pickling data files")
with open('dataset/pickled/x_train.pkl','wb') as f:
    pickle.dump(X_train_t,f)
with open('dataset/pickled/y_train.pkl','wb') as f:
    pickle.dump(Y_train_t,f)
with open('dataset/pickled/x_test.pkl','wb') as f:
    pickle.dump(X_test_t,f)
with open('dataset/pickled/y_test.pkl','wb') as f:
    pickle.dump(Y_test_t,f)
print("Done")

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
X_shape before removing invalid labels:(5203, 225), after:(5203, 225)
Y_shape before removing invalid labels:(5203, 51), after:(5203, 51)
	 Per User Training examples:3642, Testing examples:1561
Reading 8023FE1A-D3B0-4E2C-A57A-9321B7FC755F.features_labels.csv.gz
X_shape before removing invalid labels:(9189, 225), after:(9189, 225)
Y_shape before removing invalid labels:(9189, 51), after:(9189, 51)
	 Per User Training examples:6432, Testing examples:2757
Reading 86A4F379-B305-473D-9D83-FC7D800180EF.features_labels.csv.gz
X_shape before removing invalid labels:(10738, 225), after:(10738, 225)
Y_shape before removing invalid labels:(10738, 51), after:(10738, 51)
	 Per User Training examples:7516, Testing examples:3222
Reading 1538C99F-BA1E-4EFB-A949-6C7C47701B20.features_labels.csv.gz
X_shape before removing invalid labels:(6549, 225), after:(6549, 225)
Y_shape before removing invalid labels:(6549, 51), after:(6549, 51)
	

X_shape before removing invalid labels:(3108, 225), after:(3108, 225)
Y_shape before removing invalid labels:(3108, 51), after:(3108, 51)
	 Per User Training examples:2175, Testing examples:933
Reading F50235E0-DD67-4F2A-B00B-1F31ADA998B9.features_labels.csv.gz
X_shape before removing invalid labels:(2266, 225), after:(2266, 225)
Y_shape before removing invalid labels:(2266, 51), after:(2266, 51)
	 Per User Training examples:1586, Testing examples:680
Reading 1155FF54-63D3-4AB2-9863-8385D0BD0A13.features_labels.csv.gz
X_shape before removing invalid labels:(2685, 225), after:(2685, 225)
Y_shape before removing invalid labels:(2685, 51), after:(2685, 51)
	 Per User Training examples:1879, Testing examples:806
Reading 9759096F-1119-4E19-A0AD-6F16989C7E1C.features_labels.csv.gz
X_shape before removing invalid labels:(9959, 225), after:(9959, 225)
Y_shape before removing invalid labels:(9959, 51), after:(9959, 51)
	 Per User Training examples:6971, Testing examples:2988
Reading 5152A2DF-FA

# Single Class Classifier: Train/Test Functions

In [None]:
# Train model function repeat for every training label
def scc_train(X_train,Y_train,M,all_sensornames,all_labelnames,used_sensors,Y_target,clf,clf_type,root):
    print('-'*50)
    out_model={}
    X_train=choose_sensors(X_train,used_sensors,all_sensornames)
    print('Current X_train shape is {}'.format(X_train.shape))
    print('Using sensors {}'.format(used_sensors))
    
    X_train,mean,standard_dev_nonzero=standardize(X_train) # Standardizing X_train to have zero mean and unit variance
    index_label=all_labelnames.index(Y_target)
    y_train=Y_train[:,index_label] # Choosing a column of data for training consisting of only one target label
    
    any_missingdata_label=M[:,index_label] # Check if there's data missing for that label
    any_presentdata_features=np.logical_not(any_missingdata_label) # Present data indices boolean
    
    x_train=X_train[any_presentdata_features,:] # Training dataset consists of data that's available ignoring all missing rows
    x_train[np.isnan(x_train)]=0 # Zero imputation of NaN values
    y_train=y_train[any_presentdata_features] # Training labels consists of data that's available
    print('Current X_train shape after removing missing data & zero-impute is {}'.format(x_train.shape))
    
    # Fitting classifier
    clf.fit(x_train,y_train)
    
    out_model['train_mean']=mean
    out_model['train_std_dev_nonzero']=standard_dev_nonzero
    out_model['classifier']=clf
    
    filename='scc_model_{}_label_{}.sav'.format(clf_type,Y_target)
    pickle.dump(clf,open(os.path.join(root,filename), 'wb'))
    
    return out_model

In [None]:
# Test model function repeat for every training label
def scc_test(X_test,Y_test,M,all_sensornames,all_labelnames,used_sensors,Y_target,trained_model):
    print('-'*50)
    X_test=choose_sensors(X_test,used_sensors,all_sensornames)
    print('Current X_test shape is {}'.format(X_test.shape))
    # Normalizing test set the same way training set was normalized
    X=(X_test-trained_model['train_mean'])/trained_model['train_std_dev_nonzero']
       
    index_label=all_labelnames.index(Y_target)
    y_test=Y_test[:,index_label] # Choosing a column of data for training consisting of only one target label
    
    any_missingdata_label=M[:,index_label] # Check if there's data missing for that label
    any_presentdata_features=np.logical_not(any_missingdata_label) # Present data indices boolean
    
    x_test=X[any_presentdata_features,:] # Training dataset consists of data that's available ignoring all missing rows
    x_test[np.isnan(x_test)]=0 # Zero imputation of NaN values
    y_test=y_test[any_presentdata_features] # Training labels consists of data that's available
    
    y_test_predicted=trained_model['classifier'].predict(x_test) # Prediction
    
    tn,fp,fn,tp=confusion_matrix(y_test,y_test_predicted).ravel()
    bal_accuracy=balanced_accuracy_score(y_test,y_test_predicted)
    return tn,fp,fn,tp,bal_accuracy

# Initial Setup and Trials

In [None]:
# Sensor Types, Label Possibilities variables
sensor_types=['phone_acc','phone_gyro','phone_mag','phone_loc','phone_audio',
'phone_app','phone_battery','phone_use','phone_callstat','phone_wifi','phone_lf','phone_time']
label_possibilities=['LOC_home','OR_indoors','PHONE_ON_TABLE','SITTING','WITH_FRIENDS',
 'LYING_DOWN','SLEEPING','WATCHING_TV','EATING','PHONE_IN_POCKET',
 'TALKING','DRIVE_-_I_M_A_PASSENGER','OR_standing','IN_A_CAR',
 'OR_exercise','AT_THE_GYM','SINGING','FIX_walking','OR_outside',
 'SHOPPING','AT_SCHOOL','BATHING_-_SHOWER','DRESSING','DRINKING__ALCOHOL_',
 'PHONE_IN_HAND','FIX_restaurant','IN_CLASS','PHONE_IN_BAG','IN_A_MEETING',
 'TOILET','COOKING','ELEVATOR','FIX_running','BICYCLING','LAB_WORK',
 'LOC_main_workplace','ON_A_BUS','DRIVE_-_I_M_THE_DRIVER','STROLLING',
 'CLEANING','DOING_LAUNDRY','WASHING_DISHES','SURFING_THE_INTERNET',
 'AT_A_PARTY','AT_A_BAR','LOC_beach','COMPUTER_WORK','GROOMING','STAIRS_-_GOING_UP',
 'STAIRS_-_GOING_DOWN','WITH_CO-WORKERS']

## Model Choices

In [None]:
# Models
clf1=LogisticRegression(n_jobs=-1,warm_start=True,class_weight='balanced',solver='lbfgs') # Account for class imbalance
clf2=SVC(kernel='linear',cache_size=2000)
clf3=RandomForestClassifier(n_estimators=100,n_jobs=-1,warm_start=False,class_weight='balanced')
clf4=MLPClassifier(hidden_layer_sizes=(100, ),activation='relu',
                   solver='adam',batch_size=300,learning_rate_init=0.01,
                   max_iter=10,random_state=random_state,
                   verbose=True,warm_start=True,early_stopping=False,validation_fraction=0.05,
                   epsilon=1e-08, n_iter_no_change=10)

## Logistic Regression

In [None]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf1,
                            clf_type='logisticregression')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)

## Support-Vector

In [None]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf2,
                            clf_type='svc')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)

## Random Forest

In [None]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf3,
                            clf_type='rf')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)

## ANN

In [None]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf4,
                            clf_type='ann')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)

# Single Class Classifier: SOTA Comparison

## Logistic Regression Baseline

Researchers fitted a new LR for each label- so no warm start

In [None]:
clf=LogisticRegression(n_jobs=-1,warm_start=False,class_weight='balanced',solver='lbfgs') # Account for class imbalance
root='sota_comparison/lr/'
if not os.path.exists(root):
        os.mkdir(root)

#filename='scc_model_{}_label_{}.sav'.format(clf_type,Y_target)

# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf,
                            clf_type='lr_baseline',
                            root=root)

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)

# Multi Class Classifier: Train/Test Functions

 Using the saved pickle files for this

In [14]:
# Defining sizes for neural networks and other hyperparameters
input_size=X_train_t.shape[-1]
output_size=Y_train_t.shape[-1]
n_epoch=40
bs=300
lr_init=0.1
momentum=0.5
#torch.set_default_tensor_type('torch.cuda.FloatTensor')

In [15]:
# Simple function to run using GPU when available
def C(structure):
    if torch.cuda.is_available():
        device=torch.device("cuda")
        return structure.to(device)

In [16]:
# Load pickle file datasets and normalize (and normalize the test set using same values)
with open('dataset/pickled/x_train.pkl','rb') as f:
    X_train=pickle.load(f)
    X_train,mean,standard_dev_nonzero=standardize(X_train) # Standardizing X_train
    X_train=C(torch.from_numpy(X_train).double())
    
with open('dataset/pickled/y_train.pkl','rb') as f:
    Y_train=pickle.load(f)
    Y_train=C(torch.from_numpy(Y_train).double())
    
with open('dataset/pickled/x_test.pkl','rb') as f:
    X_test=pickle.load(f)
    X_test=(X_test-mean)/standard_dev_nonzero
    X_test=C(torch.from_numpy(X_test).double())
    
with open('dataset/pickled/y_test.pkl','rb') as f:
    Y_test=pickle.load(f)
    Y_test=C(torch.from_numpy(Y_test).double())

In [17]:
# Dataloader creation
train_dataset=utils.TensorDataset(X_train,Y_train)
train_loader=utils.DataLoader(dataset=train_dataset,batch_size=bs,shuffle=True,drop_last=False)

test_dataset=utils.TensorDataset(X_test,Y_test)
test_loader=utils.DataLoader(dataset=test_dataset,batch_size=bs,shuffle=True,drop_last=False)

In [18]:
# Linear decreasing LR scheduler
def linear_lr_scheduler(optimizer, epoch):
    """
    LR_init=0.1, LR_final=0.01, n_epochs=40
    Sets the learning rate to the initial LR decayed by 1.04 every epoch"""
    for param_group in optimizer.param_groups:
        lr=param_group['lr']
    lr=lr*(0.94**(epoch//1))
    for param_group in optimizer.param_groups:
        param_group['lr']=lr

# Multi Class Classifier: SOTA Comparison

## Multi-Layer Perceptron (0 Hidden Layers)

In [210]:
class LinearMLP(nn.Module):
    def __init__(self):
        super(LinearMLP,self).__init__()
        self.fc1=nn.Linear(input_size,output_size)
    def forward(self,x):
#         if np.isnan(np.sum(x.cpu().numpy())):
#             print("some values from input are nan")
#             exit(0)
        x = self.fc1(x)
        return x
    
model=LinearMLP()
C(model) # Train model with CUDA

optimizer=optim.SGD(model.parameters(),lr=lr_init,momentum=momentum)
criterion=C(nn.BCEWithLogitsLoss()) # Or MultiLabelSoftMarginLoss (same thing in this case)

for epoch in range(n_epoch):
    sum_total=0.
    done=1
    for i,data in enumerate(train_loader,0):
        inputs,labels=data
        inputs=V(C(inputs)).float()
        labels=V(C(labels),requires_grad=True).type(torch.cuda.FloatTensor)
        
        optimizer.zero_grad() # Zero gradients
        if done:
            linear_lr_scheduler(optimizer,epoch) # Reduce LR once every epoch
            done=0
        
        output=model(inputs) # Log probabilities
        sigmoid_output=torch.sigmoid(output) # Squash log probabilities to between 0 -1 (linear scale)
        sigmoid_output=(sigmoid_output>=0.50).type(torch.cuda.FloatTensor)# Binarize outputs using a threshold
        sigmoid_output=V(sigmoid_output,requires_grad=True)
        
        loss=criterion(sigmoid_output,labels)
        loss.backward()
        optimizer.step()
        
        sum_total+=loss.item()
        #print("Batch Loss: ",loss.item())
        for param_group in optimizer.param_groups:
            epoch_lr=param_group['lr']
        if i%300==0: # Every minibatch
            print("Epoch {}::Minibatch {}::LR {} --> Loss {}".format(epoch+1,i+1,epoch_lr,sum_total/bs))
            sum_total=0.
    done=1
print('\n Finished training')

Epoch 1::Minibatch 1::LR 0.1 --> Loss 0.0032183112700780235
Epoch 1::Minibatch 301::LR 0.1 --> Loss 0.9650590946276982
Epoch 1::Minibatch 601::LR 0.1 --> Loss 0.9651288859049479
Epoch 2::Minibatch 1::LR 0.094 --> Loss 0.003210269808769226
Epoch 2::Minibatch 301::LR 0.094 --> Loss 0.9650534854332606
Epoch 2::Minibatch 601::LR 0.094 --> Loss 0.9652992163101832
Epoch 3::Minibatch 1::LR 0.08305839999999999 --> Loss 0.0032250189781188963
Epoch 3::Minibatch 301::LR 0.08305839999999999 --> Loss 0.9650082747141521
Epoch 3::Minibatch 601::LR 0.08305839999999999 --> Loss 0.9652988994121552
Epoch 4::Minibatch 1::LR 0.06898697810559998 --> Loss 0.003210179607073466
Epoch 4::Minibatch 301::LR 0.06898697810559998 --> Loss 0.9650662424166997
Epoch 4::Minibatch 601::LR 0.06898697810559998 --> Loss 0.9654047530889511
Epoch 5::Minibatch 1::LR 0.05386151140948994 --> Loss 0.0032131701707839968
Epoch 5::Minibatch 301::LR 0.05386151140948994 --> Loss 0.9650795473655065
Epoch 5::Minibatch 601::LR 0.05386151

Epoch 36::Minibatch 601::LR 1.1763803669959275e-18 --> Loss 0.965437204639117
Epoch 37::Minibatch 1::LR 1.2680912252701432e-19 --> Loss 0.0032036340236663817
Epoch 37::Minibatch 301::LR 1.2680912252701432e-19 --> Loss 0.9649083656072617
Epoch 37::Minibatch 601::LR 1.2680912252701432e-19 --> Loss 0.9653339302539825
Epoch 38::Minibatch 1::LR 1.2849347682763036e-20 --> Loss 0.003221744696299235
Epoch 38::Minibatch 301::LR 1.2849347682763036e-20 --> Loss 0.965207592844963
Epoch 38::Minibatch 601::LR 1.2849347682763036e-20 --> Loss 0.9651787579059601
Epoch 39::Minibatch 1::LR 1.223881915018486e-21 --> Loss 0.003232840100924174
Epoch 39::Minibatch 301::LR 1.223881915018486e-21 --> Loss 0.9651710498332977
Epoch 39::Minibatch 601::LR 1.223881915018486e-21 --> Loss 0.965292199254036
Epoch 40::Minibatch 1::LR 1.095786152073354e-22 --> Loss 0.0032187873125076294
Epoch 40::Minibatch 301::LR 1.095786152073354e-22 --> Loss 0.9651271146535874
Epoch 40::Minibatch 601::LR 1.095786152073354e-22 --> Loss

<span style="color:red">
    ISSUE: Doesn't seem to train well.
</span>

In [203]:
# Saving trained models
root='saved_models/multilabel_classifier/'
model_path=root+'mlp_0hidden'
checkpoint_path=root+'mlp_0hidden_checkpoint'

torch.save(model,model_path) # Saving the whole model

# Saving checkpoint model
torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss':loss.item(),
            'sumloss':sum_total/bs},checkpoint_path)

  "type " + obj.__name__ + ". It won't be checked "


In [204]:
# Test dataset model performance

concat_predictions=torch.zeros(0,output_size)
concat_truelabels=torch.zeros(0,output_size)

for data in test_loader:
    inputs,labels=data
    inputs=V(C(inputs)).float()
    labels=V(C(labels)).type(torch.cuda.FloatTensor)
    
    outputs=model.forward(inputs).cpu() # Perform test time on CPU instead of GPU
    
    # Concat test set into one tensor
    concat_predictions=torch.cat((concat_predictions,outputs),0)
    concat_truelabels=torch.cat((concat_truelabels,labels.cpu()),0)

concat_predictions=torch.sigmoid(concat_predictions) # Squash log probabilities to between 0 -1 (linear scale)
concat_predictions=concat_predictions>=0.50 # Binarize outputs using a threshold

# Convert tensor to numpy float array
concat_predictions=concat_predictions.numpy().astype(np.float)
concat_truelabels=concat_truelabels.numpy().astype(np.float)

# Precision, Recall, F-1, support
mlp_0H_clfreport=classification_report(y_true=concat_truelabels,y_pred=concat_predictions,
                                       target_names=labelname_user,output_dict=True)
print('Test Set')
for i in range(output_size):
    true_perlabel=concat_truelabels[:,i]
    prediction_perlabel=concat_predictions[:,i]
    bal_acc=balanced_accuracy_score(y_true=true_perlabel,y_pred=prediction_perlabel)
    
    print('Label {} :::-> Balanced Accuracy {}'.format(labelname_user[i],round(bal_acc,5)))
    
for key,value in enumerate(mlp_0H_clfreport.items()):
    print(key,"\n")
    print("\t",value)

  'recall', 'true', average, warn_for)


Test Set
Label LYING_DOWN :::-> Balanced Accuracy 0.49367
Label SITTING :::-> Balanced Accuracy 0.49039
Label FIX_walking :::-> Balanced Accuracy 0.53409
Label FIX_running :::-> Balanced Accuracy 0.54461
Label BICYCLING :::-> Balanced Accuracy 0.60208
Label SLEEPING :::-> Balanced Accuracy 0.37799
Label LAB_WORK :::-> Balanced Accuracy 0.38557
Label IN_CLASS :::-> Balanced Accuracy 0.46811
Label IN_A_MEETING :::-> Balanced Accuracy 0.51059
Label LOC_main_workplace :::-> Balanced Accuracy 0.53529
Label OR_indoors :::-> Balanced Accuracy 0.53955
Label OR_outside :::-> Balanced Accuracy 0.46179
Label IN_A_CAR :::-> Balanced Accuracy 0.63833
Label ON_A_BUS :::-> Balanced Accuracy 0.54325
Label DRIVE_-_I_M_THE_DRIVER :::-> Balanced Accuracy 0.49334
Label DRIVE_-_I_M_A_PASSENGER :::-> Balanced Accuracy 0.48506
Label LOC_home :::-> Balanced Accuracy 0.54941
Label FIX_restaurant :::-> Balanced Accuracy 0.51578
Label PHONE_IN_POCKET :::-> Balanced Accuracy 0.56543
Label OR_exercise :::-> Balanc

In [205]:
# Train dataset model performance

concat_predictions=torch.zeros(0,output_size)
concat_truelabels=torch.zeros(0,output_size)

for data in train_loader:
    inputs,labels=data
    inputs=V(C(inputs)).float()
    labels=V(C(labels)).type(torch.cuda.FloatTensor)
    
    outputs=model.forward(inputs).cpu() # Perform test time on CPU instead of GPU
    
    # Concat test set into one tensor
    concat_predictions=torch.cat((concat_predictions,outputs),0)
    concat_truelabels=torch.cat((concat_truelabels,labels.cpu()),0)

concat_predictions=torch.sigmoid(concat_predictions) # Squash log probabilities to between 0 -1 (linear scale)
concat_predictions=concat_predictions>=0.50 # Binarize outputs using a threshold

# Convert tensor to numpy float array
concat_predictions=concat_predictions.numpy().astype(np.float)
concat_truelabels=concat_truelabels.numpy().astype(np.float)

mlp_0H_clfreport_train=classification_report(y_true=concat_truelabels,y_pred=concat_predictions,
                                       target_names=labelname_user,output_dict=True)
print('Train Set')
for i in range(output_size):
    true_perlabel=concat_truelabels[:,i]
    prediction_perlabel=concat_predictions[:,i]
    bal_acc=balanced_accuracy_score(y_true=true_perlabel,y_pred=prediction_perlabel)
    
    print('Label {} :::-> Balanced Accuracy {}'.format(labelname_user[i],round(bal_acc,5)))
    
for key,value in enumerate(mlp_0H_clfreport_train.items()):
    print(key,"\n")
    print("\t",value)

Train Set
Label LYING_DOWN :::-> Balanced Accuracy 0.49183
Label SITTING :::-> Balanced Accuracy 0.48878
Label FIX_walking :::-> Balanced Accuracy 0.53798
Label FIX_running :::-> Balanced Accuracy 0.52991
Label BICYCLING :::-> Balanced Accuracy 0.61741
Label SLEEPING :::-> Balanced Accuracy 0.37668
Label LAB_WORK :::-> Balanced Accuracy 0.38688
Label IN_CLASS :::-> Balanced Accuracy 0.45732
Label IN_A_MEETING :::-> Balanced Accuracy 0.50898
Label LOC_main_workplace :::-> Balanced Accuracy 0.53326
Label OR_indoors :::-> Balanced Accuracy 0.53908
Label OR_outside :::-> Balanced Accuracy 0.45888
Label IN_A_CAR :::-> Balanced Accuracy 0.64389
Label ON_A_BUS :::-> Balanced Accuracy 0.53288
Label DRIVE_-_I_M_THE_DRIVER :::-> Balanced Accuracy 0.49158
Label DRIVE_-_I_M_A_PASSENGER :::-> Balanced Accuracy 0.47796
Label LOC_home :::-> Balanced Accuracy 0.55037
Label FIX_restaurant :::-> Balanced Accuracy 0.53036
Label PHONE_IN_POCKET :::-> Balanced Accuracy 0.5636
Label OR_exercise :::-> Balanc

## Multi-Layer Perceptron (1 Hidden Layer)

In [None]:
# Linear Model
# Training was done using gradient descent with back-propagation:
#n_epoch=40,bs=300 examples,lr[0.1-0.01] linearly epoch decreases,momentum weight 0.5.
clf=MLPClassifier(hidden_layer_sizes=(0,),activation='relu',
                   solver='adam',batch_size=300,learning_rate_init=0.1,
                   learning_rate='adaptive',max_iter=40,random_state=random_state,tol=0.0001,
                   verbose=True,warm_start=True,early_stopping=False,validation_fraction=0.05,
                   momentum=0.5,epsilon=1e-08, n_iter_no_change=10)


root='sota_comparison/mlp/'
if not os.path.exists(root):
        os.mkdir(root)

# Loop through label possibilities and train/test logistic regression model
trained_model=mcc_train(X_train=X_train,
                        Y_train=Y_train,
                        M=M_train,
                        all_sensornames=feature_names,
                        used_sensors=sensor_types,
                        clf=clf,
                        clf_type='mlp_linear_0hidden',
                        root=root)

#mcc_test(X_test=X_test,Y_test=Y_test,all_sensornames=feature_names,used_sensors=sensor_types,
        #trained_model=trained_model)