<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Dataset-parsers-and-cleaning-functions" data-toc-modified-id="Dataset-parsers-and-cleaning-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dataset parsers and cleaning functions</a></span></li><li><span><a href="#User-data-test" data-toc-modified-id="User-data-test-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>User data test</a></span></li><li><span><a href="#Training" data-toc-modified-id="Training-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Training</a></span><ul class="toc-item"><li><span><a href="#Importing-data-(no-cross-validation)" data-toc-modified-id="Importing-data-(no-cross-validation)-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Importing data (no cross-validation)</a></span></li></ul></li><li><span><a href="#Single-Class-Classifier:-Train/Test-Functions" data-toc-modified-id="Single-Class-Classifier:-Train/Test-Functions-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Single Class Classifier: Train/Test Functions</a></span></li><li><span><a href="#Multi-Class-Classifier:-Train/Test-Functions" data-toc-modified-id="Multi-Class-Classifier:-Train/Test-Functions-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Multi Class Classifier: Train/Test Functions</a></span></li><li><span><a href="#Initial-Setup-and-Trials" data-toc-modified-id="Initial-Setup-and-Trials-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Initial Setup and Trials</a></span><ul class="toc-item"><li><span><a href="#Model-Choices" data-toc-modified-id="Model-Choices-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Model Choices</a></span></li><li><span><a href="#Logistic-Regression" data-toc-modified-id="Logistic-Regression-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Logistic Regression</a></span></li><li><span><a href="#Support-Vector" data-toc-modified-id="Support-Vector-7.3"><span class="toc-item-num">7.3&nbsp;&nbsp;</span>Support-Vector</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-7.4"><span class="toc-item-num">7.4&nbsp;&nbsp;</span>Random Forest</a></span></li><li><span><a href="#ANN" data-toc-modified-id="ANN-7.5"><span class="toc-item-num">7.5&nbsp;&nbsp;</span>ANN</a></span></li></ul></li><li><span><a href="#Single-Class-Classifier:-SOTA-Comparison" data-toc-modified-id="Single-Class-Classifier:-SOTA-Comparison-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Single Class Classifier: SOTA Comparison</a></span><ul class="toc-item"><li><span><a href="#Logistic-Regression-Baseline" data-toc-modified-id="Logistic-Regression-Baseline-8.1"><span class="toc-item-num">8.1&nbsp;&nbsp;</span>Logistic Regression Baseline</a></span></li></ul></li><li><span><a href="#Multi-Class-Classifier:-SOTA-Comparison" data-toc-modified-id="Multi-Class-Classifier:-SOTA-Comparison-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Multi Class Classifier: SOTA Comparison</a></span><ul class="toc-item"><li><span><a href="#Multi-Layer-Perceptron-(0-Hidden-Layers)" data-toc-modified-id="Multi-Layer-Perceptron-(0-Hidden-Layers)-9.1"><span class="toc-item-num">9.1&nbsp;&nbsp;</span>Multi-Layer Perceptron (0 Hidden Layers)</a></span></li></ul></li><li><span><a href="#Custom-Cross-Validation" data-toc-modified-id="Custom-Cross-Validation-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Custom Cross-Validation</a></span></li><li><span><a href="#Given-Cross-Validation-splits" data-toc-modified-id="Given-Cross-Validation-splits-11"><span class="toc-item-num">11&nbsp;&nbsp;</span>Given Cross-Validation splits</a></span></li></ul></div>

# Imports

In [1]:
# Required imports
import os
import numpy as np
import pandas as pd
import gzip
import glob
import pickle
from io import StringIO
import importlib.machinery

from sklearn.model_selection import train_test_split as TT_split
# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,balanced_accuracy_score,multilabel_confusion_matrix
#from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, KFold, StratifiedKFold
#from sklearn.preprocessing import StandardScaler

random_state=10

In [2]:
# Data location and sample user
prefix='dataset/Extrasensory_uuid_fl_uTAR/'
cross_validation_user_loc='dataset/cv_5_folds/'
user_sample='3600D531-0C55-44A7-AE95-A7A38519464E.features_labels'

# Dataset parsers and cleaning functions

In [3]:
# Dataset parsers for header/ body for CSVs

def parse_header_of_csv(csv_str):
    # Isolate the headline columns:
    headline = csv_str[:csv_str.index('\n')];
    columns = headline.split(',');

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:','');
        pass;
    
    return (feature_names,label_names);

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int);
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)];
    
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix
    
    return (X,Y,M,timestamps);

def read_user_data(directory):
    print('Reading {}'.format(directory.split("/")[-1]))

    # Read the entire csv file of the user:
    with gzip.open(directory,'rb') as fid:
        csv_str = fid.read();
        csv_str = csv_str.decode("utf-8")
        pass;

    (feature_names,label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);

    return (X,Y,M,timestamps,feature_names,label_names);

In [4]:
# Clean labels
def clean_labels(input_label):
    if label.endswith('_'):
        label=label[:-1]+')'
    label=label.replace('__',' (').replace('_',' ')
    label=label[0]+label[1:].lower()
    label=label.replace('i m','I\'m')
    return label

In [5]:
# Get a summary of the sensor feature
'''
# Summarize features as we are only using phone_acc,phone_gyro,phone_mag,phone_loc,phone_audio,
# phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
# We are ignoring the use of the smartwatch features. There are definitely features that will be used
# much more (e.g. than the phone_callstat) but we'll leave that up to the ML algorithm.
'''
def summarize_features(feature_list):
    summary_feature_list=np.empty_like(feature_list)
    for (ind,feature) in enumerate(feature_list):
        if feature.startswith('raw_acc'):
            summary_feature_list[ind]='phone_acc' 
        if feature.startswith('proc_gyro'):
            summary_feature_list[ind]='phone_gyro'
        if feature.startswith('raw_magnet'):
            summary_feature_list[ind]='phone_mag'
        if feature.startswith('watch_acc'):
            summary_feature_list[ind]='watch_acc'
        if feature.startswith('watch_heading'):
            summary_feature_list[ind]='watch_dir'
        if feature.startswith('location'):
            summary_feature_list[ind]='phone_loc'
        if feature.startswith('audio_naive'):
            summary_feature_list[ind]='phone_audio'
        if feature.startswith('discrete:app_state'):
            summary_feature_list[ind]='phone_app'
        if feature.startswith('discrete:battery'):
            summary_feature_list[ind]='phone_battery'
        if feature.startswith('discrete:on'):
            summary_feature_list[ind]='phone_use'
        if feature.startswith('discrete:ringer'):
            summary_feature_list[ind]='phone_callstat'
        if feature.startswith('discrete:wifi'):
            summary_feature_list[ind]='phone_wifi'
        if feature.startswith('lf'):
            summary_feature_list[ind]='phone_lf'
        if feature.startswith('discrete:time'):
            summary_feature_list[ind]='phone_time'

    return summary_feature_list

# User data test

In [6]:
# Reading sample data
sample_loc='{}/{}.csv.gz'.format(prefix,user_sample)
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(sample_loc)

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz


In [7]:
# Dataset summaries for this user
print('Data shape input for user (Len minutes/num examples, num sensors): ',x_user.shape) # Timestep examples, number of sensors
print('Label shape for user (Len minutes, num labels): ',y_user.shape,'\n') # Timestep examples, labels

countlabels_user=np.sum(y_user,axis=0) # Column summary
labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

print('Sensor feature names:\n')
feature_names=summarize_features(featurename_user)

for i,sensor_feature in enumerate(featurename_user):
    print('{} :: {} ::--> {}\n'.format(i,feature_names[i],sensor_feature))

print('Activities and counts:')
print(labelname_countlabel_user)

Data shape input for user (Len minutes/num examples, num sensors):  (5203, 225)
Label shape for user (Len minutes, num labels):  (5203, 51) 

Sensor feature names:

0 :: phone_acc ::--> raw_acc:magnitude_stats:mean

1 :: phone_acc ::--> raw_acc:magnitude_stats:std

2 :: phone_acc ::--> raw_acc:magnitude_stats:moment3

3 :: phone_acc ::--> raw_acc:magnitude_stats:moment4

4 :: phone_acc ::--> raw_acc:magnitude_stats:percentile25

5 :: phone_acc ::--> raw_acc:magnitude_stats:percentile50

6 :: phone_acc ::--> raw_acc:magnitude_stats:percentile75

7 :: phone_acc ::--> raw_acc:magnitude_stats:value_entropy

8 :: phone_acc ::--> raw_acc:magnitude_stats:time_entropy

9 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band0

10 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band1

11 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band2

12 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_band3

13 :: phone_acc ::--> raw_acc:magnitude_spectrum:log_energy_ban

# Training

In [8]:
# Choosing sensor labels
'''
Summary sensor choices are: phone_acc,phone_gyro,phone_mag,watch_acc,watch_dir,phone_loc,phone_audio,
phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
In this project, we aren't using watch_acc,watch_dir (no smartwatch)
'''

def choose_sensors(X_train,used_sensors,summarized_feature_names):
    used_sensor_feature_names=np.zeros(len(summarized_feature_names),dtype=bool)
    # Creates a zero boolean vector of all possible feature names
    for s in used_sensors:
        used_sensor_feature_names=np.logical_or(used_sensor_feature_names,(s==summarized_feature_names))
    X_train=X_train[:,used_sensor_feature_names]
    return X_train

In [9]:
# Returns a standardized (0 mean, 1 variance) dataset
def standardize(X_train):
    mean=np.nanmean(X_train,axis=0).reshape((1,-1))# Ignores NaNs while finding the mean across rows
    standard_dev=np.nanstd(X_train,axis=0) # Ignores NaNs while finding the standard deviation across rows
    standard_dev_nonzero=np.where(standard_dev>0,standard_dev,1.).reshape((1,-1)) # Div zero
    
    X=(X_train-mean)/standard_dev_nonzero
    return X,mean,standard_dev_nonzero   

## Importing data (no cross-validation)

In [10]:
# Reading data in the directory (Stacked)
X_train=np.empty((0,225))
Y_train=np.empty((0,51))
X_test=np.empty((0,225))
Y_test=np.empty((0,51))
M_train=np.empty((0,51))
M_test=np.empty((0,51))

for u_file in glob.glob('{}/*.csv.gz'.format(prefix)):
        x_user,y_user,missed_label_user,tstamp_user,featurename_user,labelname_user=read_user_data(u_file)
        # Split each user data into train-test splits .70-.30 as in literature
        x_train_u,x_test_u,y_train_u,y_test_u=TT_split(x_user,y_user,test_size=0.30,random_state=random_state)
        m_train,m_test=TT_split(missed_label_user,test_size=0.30,random_state=random_state)
        
        # Stacking data. Will be changed for K-Fold cross-validation
        X_train=np.vstack((X_train,x_train_u))
        Y_train=np.vstack((Y_train,y_train_u))
        X_test=np.vstack((X_test,x_test_u))
        Y_test=np.vstack((Y_test,y_test_u))
    
        # Missing data matrix
        M_train=np.vstack((M_train,m_train))
        M_test=np.vstack((M_test,m_test))
        
assert len(X_train)==len(Y_train)
assert len(X_test)==len(Y_test)

print('\nTraining: X::{} ,Y::{}'.format(X_train.shape,Y_train.shape))
print('Testing: X::{} ,Y::{}'.format(X_test.shape,Y_test.shape))

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
Reading 8023FE1A-D3B0-4E2C-A57A-9321B7FC755F.features_labels.csv.gz
Reading 86A4F379-B305-473D-9D83-FC7D800180EF.features_labels.csv.gz
Reading 1538C99F-BA1E-4EFB-A949-6C7C47701B20.features_labels.csv.gz
Reading 11B5EC4D-4133-4289-B475-4E737182A406.features_labels.csv.gz
Reading 74B86067-5D4B-43CF-82CF-341B76BEA0F4.features_labels.csv.gz
Reading 4FC32141-E888-4BFF-8804-12559A491D8C.features_labels.csv.gz
Reading B9724848-C7E2-45F4-9B3F-A1F38D864495.features_labels.csv.gz
Reading A76A5AF5-5A93-4CF2-A16E-62353BB70E8A.features_labels.csv.gz
Reading 96A358A0-FFF2-4239-B93E-C7425B901B47.features_labels.csv.gz
Reading 665514DE-49DC-421F-8DCB-145D0B2609AD.features_labels.csv.gz
Reading BE3CA5A6-A561-4BBD-B7C9-5DF6805400FC.features_labels.csv.gz
Reading A5A30F76-581E-4757-97A2-957553A2C6AA.features_labels.csv.gz
Reading 27E04243-B138-4F40-A164-F40B60165CF3.features_labels.csv.gz
Reading 0E6184E1-90C0-48EE-B25A-F1ECB7B9714E.fea

# Single Class Classifier: Train/Test Functions

In [11]:
# Train model function repeat for every training label
def scc_train(X_train,Y_train,M,all_sensornames,all_labelnames,used_sensors,Y_target,clf,clf_type,root):
    print('-'*50)
    out_model={}
    X_train=choose_sensors(X_train,used_sensors,all_sensornames)
    print('Current X_train shape is {}'.format(X_train.shape))
    print('Using sensors {}'.format(used_sensors))
    
    X_train,mean,standard_dev_nonzero=standardize(X_train) # Standardizing X_train to have zero mean and unit variance
    index_label=all_labelnames.index(Y_target)
    y_train=Y_train[:,index_label] # Choosing a column of data for training consisting of only one target label
    
    any_missingdata_label=M[:,index_label] # Check if there's data missing for that label
    any_presentdata_features=np.logical_not(any_missingdata_label) # Present data indices boolean
    
    x_train=X_train[any_presentdata_features,:] # Training dataset consists of data that's available ignoring all missing rows
    x_train[np.isnan(x_train)]=0 # Zero imputation of NaN values
    y_train=y_train[any_presentdata_features] # Training labels consists of data that's available
    print('Current X_train shape after removing missing data & zero-impute is {}'.format(x_train.shape))
    
    # Fitting classifier
    clf.fit(x_train,y_train)
    
    out_model['train_mean']=mean
    out_model['train_std_dev_nonzero']=standard_dev_nonzero
    out_model['classifier']=clf
    
    filename='scc_model_{}_label_{}.sav'.format(clf_type,Y_target)
    pickle.dump(clf,open(os.path.join(root,filename), 'wb'))
    
    return out_model

In [12]:
# Test model function repeat for every training label
def scc_test(X_test,Y_test,M,all_sensornames,all_labelnames,used_sensors,Y_target,trained_model):
    print('-'*50)
    X_test=choose_sensors(X_test,used_sensors,all_sensornames)
    print('Current X_test shape is {}'.format(X_test.shape))
    # Normalizing test set the same way training set was normalized
    X=(X_test-trained_model['train_mean'])/trained_model['train_std_dev_nonzero']
       
    index_label=all_labelnames.index(Y_target)
    y_test=Y_test[:,index_label] # Choosing a column of data for training consisting of only one target label
    
    any_missingdata_label=M[:,index_label] # Check if there's data missing for that label
    any_presentdata_features=np.logical_not(any_missingdata_label) # Present data indices boolean
    
    x_test=X[any_presentdata_features,:] # Training dataset consists of data that's available ignoring all missing rows
    x_test[np.isnan(x_test)]=0 # Zero imputation of NaN values
    y_test=y_test[any_presentdata_features] # Training labels consists of data that's available
    
    y_test_predicted=trained_model['classifier'].predict(x_test) # Prediction
    
    tn,fp,fn,tp=confusion_matrix(y_test,y_test_predicted).ravel()
    bal_accuracy=balanced_accuracy_score(y_test,y_test_predicted)
    return tn,fp,fn,tp,bal_accuracy

# Multi Class Classifier: Train/Test Functions

In [13]:
# Train model function multi-label
def mcc_train(X_train,Y_train,M,all_sensornames,used_sensors,clf,clf_type,root):
    print('-'*50)
    out_model={}
    X_train=choose_sensors(X_train,used_sensors,all_sensornames)
    print('Current X_train shape is {}'.format(X_train.shape))
    print('Using sensors {}'.format(used_sensors))
    
    X_train,mean,standard_dev_nonzero=standardize(X_train) # Standardizing X_train to have zero mean and unit variance
    #index_label=all_labelnames.index(Y_target)
    #y_train=Y_train[:,index_label] # Choosing a column of data for training consisting of only one target label
    
    #any_missingdata_label=M[:,index_label] # Check if there's data missing for that label
    any_presentdata_features=np.logical_not(M) # Present data indices boolean
    
    x_train=X_train[any_presentdata_features,:] # Training dataset consists of data that's available ignoring all missing rows
    x_train[np.isnan(x_train)]=0 # Zero imputation of NaN values
    y_train=Y_train[any_presentdata_features] # Training labels consists of data that's available
    print('Current X_train shape after removing missing data & zero-impute is {}'.format(x_train.shape))
    
    # Fitting classifier
    clf.fit(x_train,y_train)
    
    out_model['train_mean']=mean
    out_model['train_std_dev_nonzero']=standard_dev_nonzero
    out_model['classifier']=clf
    
    filename='mcc_model_{}.sav'.format(clf_type)
    pickle.dump(clf,open(os.path.join(root,filename), 'wb'))
    
    return out_model

In [18]:
# Test model function multi-label
def mcc_test(X_test,Y_test,M,all_sensornames,used_sensors,trained_model):
    print('-'*50)
    X_test=choose_sensors(X_test,used_sensors,all_sensornames)
    print('Current X_test shape is {}'.format(X_test.shape))
    # Normalizing test set the same way training set was normalized
    X=(X_test-trained_model['train_mean'])/trained_model['train_std_dev_nonzero']
       
    #index_label=all_labelnames.index(Y_target)
    #y_test=Y_test[:,index_label] # Choosing a column of data for training consisting of only one target label
    
    #any_missingdata_label=M[:,index_label] # Check if there's data missing for that label
    any_presentdata_features=np.logical_not(M) # Present data indices boolean
    
    x_test=X[any_presentdata_features,:] # Training dataset consists of data that's available ignoring all missing rows
    x_test[np.isnan(x_test)]=0 # Zero imputation of NaN values
    y_test=y_test[any_presentdata_features] # Training labels consists of data that's available
    
    y_test_predicted=trained_model['classifier'].predict(x_test) # Prediction
    
    cm=multilabel_confusion_matrix(y_test,y_test_predicted).ravel()
    
    for i in range(len(cm)):
        print("\n")
        print("Label {}::".format(label_possibilities[i]))
        tn,fp,fn,tp=cm[i].ravel()
        
        tpr=tp/(tp+fn) # Sensitivity/True Positive Rate
        tnr=tn/(fp+tn) # Specificity/True Negative Rate
        bal_accuracy=0.5*(tpr+tnr)
        print("\t TN:{}, FP:{}, FN:{}, TP:{}, Balanced accuracy:{}".format(tn,fp,fn,tp,bal_accuracy))
        #bal_accuracy=balanced_accuracy_score(y_test,y_test_predicted)
        print('-'*50)
        
    return

# Initial Setup and Trials

In [16]:
# Sensor Types, Label Possibilities variables
sensor_types=['phone_acc','phone_gyro','phone_mag','phone_loc','phone_audio',
'phone_app','phone_battery','phone_use','phone_callstat','phone_wifi','phone_lf','phone_time']
label_possibilities=['LOC_home','OR_indoors','PHONE_ON_TABLE','SITTING','WITH_FRIENDS',
 'LYING_DOWN','SLEEPING','WATCHING_TV','EATING','PHONE_IN_POCKET',
 'TALKING','DRIVE_-_I_M_A_PASSENGER','OR_standing','IN_A_CAR',
 'OR_exercise','AT_THE_GYM','SINGING','FIX_walking','OR_outside',
 'SHOPPING','AT_SCHOOL','BATHING_-_SHOWER','DRESSING','DRINKING__ALCOHOL_',
 'PHONE_IN_HAND','FIX_restaurant','IN_CLASS','PHONE_IN_BAG','IN_A_MEETING',
 'TOILET','COOKING','ELEVATOR','FIX_running','BICYCLING','LAB_WORK',
 'LOC_main_workplace','ON_A_BUS','DRIVE_-_I_M_THE_DRIVER','STROLLING',
 'CLEANING','DOING_LAUNDRY','WASHING_DISHES','SURFING_THE_INTERNET',
 'AT_A_PARTY','AT_A_BAR','LOC_beach','COMPUTER_WORK','GROOMING','STAIRS_-_GOING_UP',
 'STAIRS_-_GOING_DOWN','WITH_CO-WORKERS']

## Model Choices

In [18]:
# Models
clf1=LogisticRegression(n_jobs=-1,warm_start=True,class_weight='balanced',solver='lbfgs') # Account for class imbalance
clf2=SVC(kernel='linear',cache_size=2000)
clf3=RandomForestClassifier(n_estimators=100,n_jobs=-1,warm_start=False,class_weight='balanced')
clf4=MLPClassifier(hidden_layer_sizes=(100, ),activation='relu',
                   solver='adam',batch_size=300,learning_rate_init=0.01,
                   max_iter=10,random_state=random_state,
                   verbose=True,warm_start=True,early_stopping=False,validation_fraction=0.05,
                   epsilon=1e-08, n_iter_no_change=10)

## Logistic Regression

In [15]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf1,
                            clf_type='logisticregression')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)



**************************************************
Predicting LOC_home label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (284244, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:31907, FP:8847, FN:5817, TP:24519, Balanced Accuracy:0.7955823204315426

**************************************************


**************************************************
Predicting OR_indoors label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wif

Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (245191, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:34061, FP:19863, FN:1932, TP:5547, Balanced Accuracy:0.6866624702121954

**************************************************


**************************************************
Predicting IN_A_CAR label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (139665, 168)
---------------------

Current X_train shape after removing missing data & zero-impute is (113415, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:19769, FP:5590, FN:696, TP:2233, Balanced Accuracy:0.770970838950688

**************************************************


**************************************************
Predicting FIX_restaurant label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (126026, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:27161, FP:3973, FN:50, TP:372, Balanced Accuracy:0.876953450259496

**************************************************


***

-------------------------
TP:29359, FP:4067, FN:42, TP:335, Balanced Accuracy:0.8834612059641306

**************************************************


**************************************************
Predicting DRIVE_-_I_M_THE_DRIVER label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (129769, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:27593, FP:3268, FN:197, TP:1437, Balanced Accuracy:0.8867713969341031

**************************************************


**************************************************
Predicting STROLLING label
--------------------------------------------------
Current X_train shape is (301853,

Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (75536, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:16247, FP:2472, FN:19, TP:141, Balanced Accuracy:0.8745958317752016

**************************************************


**************************************************
Predicting WITH_CO-WORKERS label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (89679, 168)
--------------------

## Support-Vector

In [None]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf2,
                            clf_type='svc')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)



**************************************************
Predicting LOC_home label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (284244, 168)


## Random Forest

In [19]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf3,
                            clf_type='rf')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)



**************************************************
Predicting LOC_home label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (284244, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:39313, FP:1441, FN:1397, TP:28939, Balanced Accuracy:0.9592953054788026

**************************************************


**************************************************
Predicting OR_indoors label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wif

Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (245191, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:53730, FP:194, FN:4184, TP:3295, Balanced Accuracy:0.7184846323755424

**************************************************


**************************************************
Predicting IN_A_CAR label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (139665, 168)
-----------------------

Current X_train shape after removing missing data & zero-impute is (113415, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:25278, FP:81, FN:1327, TP:1602, Balanced Accuracy:0.7718751086733193

**************************************************


**************************************************
Predicting FIX_restaurant label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (126026, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:31130, FP:4, FN:219, TP:203, Balanced Accuracy:0.74045708856108

**************************************************


******

-------------------------
TP:33417, FP:9, FN:288, TP:89, Balanced Accuracy:0.617902509538073

**************************************************


**************************************************
Predicting DRIVE_-_I_M_THE_DRIVER label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (129769, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:30759, FP:102, FN:636, TP:998, Balanced Accuracy:0.8037329857091677

**************************************************


**************************************************
Predicting STROLLING label
--------------------------------------------------
Current X_train shape is (301853, 168)


Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (75536, 168)
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:18716, FP:3, FN:118, TP:42, Balanced Accuracy:0.6311698675142903

**************************************************


**************************************************
Predicting WITH_CO-WORKERS label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (89679, 168)
-----------------------

## ANN

In [20]:
# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf4,
                            clf_type='ann')

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)



**************************************************
Predicting LOC_home label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (284244, 168)
Iteration 1, loss = 0.30106044
Iteration 2, loss = 0.23351982
Iteration 3, loss = 0.21276231
Iteration 4, loss = 0.19832531
Iteration 5, loss = 0.18961893
Iteration 6, loss = 0.18236083
Iteration 7, loss = 0.17708023
Iteration 8, loss = 0.17377306
Iteration 9, loss = 0.16866136
Iteration 10, loss = 0.16516570
--------------------------------------------------




Current X_test shape is (75493, 168)
-------------------------
TP:38459, FP:2295, FN:2467, TP:27869, Balanced Accuracy:0.9311819941013271

**************************************************


**************************************************
Predicting OR_indoors label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (157494, 168)
Iteration 11, loss = 0.10014632
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:1658, FP:754, FN:234, TP:36666, Balanced Accuracy:0.8405274440804109

**************************************************


**************************************************
Predicting PHONE_ON_TABLE label
-------------------

Current X_train shape after removing missing data & zero-impute is (245191, 168)
Iteration 22, loss = 0.28398040
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:52676, FP:1248, FN:4490, TP:2989, Balanced Accuracy:0.6882543381190896

**************************************************


**************************************************
Predicting IN_A_CAR label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (139665, 168)
Iteration 23, loss = 0.07744269
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:33501, FP:279, FN:492, TP:759, Balanced Accuracy:0.7992276516264

Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (113415, 168)
Iteration 34, loss = 0.22098289
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:24587, FP:772, FN:1265, TP:1664, Balanced Accuracy:0.7688345714030644

**************************************************


**************************************************
Predicting FIX_restaurant label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is

Current X_train shape after removing missing data & zero-impute is (161367, 168)
Iteration 45, loss = 0.17408567
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:32792, FP:794, FN:1330, TP:5444, Balanced Accuracy:0.8900101271335816

**************************************************


**************************************************
Predicting ON_A_BUS label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (135229, 168)
Iteration 46, loss = 0.04397577
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:33367, FP:59, FN:263, TP:114, Balanced Accuracy:0.650311087431582

Iteration 56, loss = 0.25975675
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
--------------------------------------------------
Current X_test shape is (75493, 168)
-------------------------
TP:38667, FP:1195, FN:2780, TP:4940, Balanced Accuracy:0.8049589737443923

**************************************************


**************************************************
Predicting GROOMING label
--------------------------------------------------
Current X_train shape is (301853, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (159027, 168)
Iteration 57, loss = 0.06956691
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
--------------------------------------------------
Current X_test shape is (75493,

# Single Class Classifier: SOTA Comparison

## Logistic Regression Baseline

Researchers fitted a new LR for each label- so no warm start

In [17]:
clf=LogisticRegression(n_jobs=-1,warm_start=False,class_weight='balanced',solver='lbfgs') # Account for class imbalance
root='sota_comparison/lr/'
if not os.path.exists(root):
        os.mkdir(root)

#filename='scc_model_{}_label_{}.sav'.format(clf_type,Y_target)

# Loop through label possibilities and train/test logistic regression model
for label in label_possibilities:
    print("\n")
    print('*'*50)
    print('Predicting {} label'.format(label))
    trained_model=scc_train(X_train=X_train,
                            Y_train=Y_train,
                            M=M_train,
                            all_sensornames=feature_names,
                            all_labelnames=labelname_user,
                            used_sensors=sensor_types,
                            Y_target=label,
                            clf=clf,
                            clf_type='lr_baseline',
                            root=root)

    tn,fp,fn,tp,bal_accuracy=scc_test(X_test=X_test,
                                           Y_test=Y_test,
                                           M=M_test,
                                           all_sensornames=feature_names,
                                           all_labelnames=labelname_user,
                                           used_sensors=sensor_types,
                                           Y_target=label,
                                           trained_model=trained_model)
    print('-'*25)
    print('TP:{}, FP:{}, FN:{}, TP:{}, Balanced Accuracy:{}\n'.format(tn,fp,fn,tp,bal_accuracy))
    print('*'*50)



**************************************************
Predicting LOC_home label
--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (248709, 168)
--------------------------------------------------
Current X_test shape is (113229, 168)
-------------------------
TP:47733, FP:13233, FN:8819, TP:36840, Balanced Accuracy:0.7948976886180114

**************************************************


**************************************************
Predicting OR_indoors label
--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_w

Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (214550, 168)
--------------------------------------------------
Current X_test shape is (113229, 168)
-------------------------
TP:51185, FP:29577, FN:2949, TP:8333, Balanced Accuracy:0.68619298057128

**************************************************


**************************************************
Predicting IN_A_CAR label
--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (122174, 168)
----------------------

Current X_train shape after removing missing data & zero-impute is (99261, 168)
--------------------------------------------------
Current X_test shape is (113229, 168)
-------------------------
TP:29646, FP:8426, FN:1004, TP:3366, Balanced Accuracy:0.7744671062849482

**************************************************


**************************************************
Predicting FIX_restaurant label
--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (110262, 168)
--------------------------------------------------
Current X_test shape is (113229, 168)
-------------------------
TP:40786, FP:5887, FN:76, TP:571, Balanced Accuracy:0.8782009469613492

**************************************************



Current X_test shape is (113229, 168)
-------------------------
TP:44209, FP:5981, FN:65, TP:469, Balanced Accuracy:0.8795549943920966

**************************************************


**************************************************
Predicting DRIVE_-_I_M_THE_DRIVER label
--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (113549, 168)
--------------------------------------------------
Current X_test shape is (113229, 168)
-------------------------
TP:41401, FP:4861, FN:276, TP:2177, Balanced Accuracy:0.8912046363554764

**************************************************


**************************************************
Predicting STROLLING label
---------------------------------------------

Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (66110, 168)
--------------------------------------------------
Current X_test shape is (113229, 168)
-------------------------
TP:24488, FP:3583, FN:39, TP:195, Balanced Accuracy:0.8528463538883546

**************************************************


**************************************************
Predicting WITH_CO-WORKERS label
--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']
Current X_train shape after removing missing data & zero-impute is (78464, 168)
-------------------

# Multi Class Classifier: SOTA Comparison

## Multi-Layer Perceptron (0 Hidden Layers)

In [19]:
# Linear Model
# Training was done using gradient descent with back-propagation:
#n_epoch=40,bs=300 examples,lr[0.1-0.01] linearly epoch decreases,momentum weight 0.5.

clf=MLPClassifier(hidden_layer_sizes=(0,),activation='relu',
                   solver='adam',batch_size=300,learning_rate_init=0.1,
                   learning_rate='adaptive',max_iter=40,random_state=random_state,tol=0.0001,
                   verbose=True,warm_start=True,early_stopping=False,validation_fraction=0.05,
                   momentum=0.5,epsilon=1e-08, n_iter_no_change=10)

root='sota_comparison/mlp/'
if not os.path.exists(root):
        os.mkdir(root)

# Loop through label possibilities and train/test logistic regression model
trained_model=mcc_train(X_train=X_train,
                        Y_train=Y_train,
                        M=M_train,
                        all_sensornames=feature_names,
                        used_sensors=sensor_types,
                        clf=clf,
                        clf_type='mlp_linear_0hidden',
                        root=root)

mcc_test(X_test=X_test,Y_test=Y_test,all_sensornames=feature_names,used_sensors=sensor_types,
        trained_model=trained_model)

--------------------------------------------------
Current X_train shape is (264117, 168)
Using sensors ['phone_acc', 'phone_gyro', 'phone_mag', 'phone_loc', 'phone_audio', 'phone_app', 'phone_battery', 'phone_use', 'phone_callstat', 'phone_wifi', 'phone_lf', 'phone_time']


IndexError: too many indices for array

# Custom Cross-Validation

In [None]:
# Load cross-validation user IDs
def cross_validation_id(location):

# Given Cross-Validation splits