<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Dataset-parsers-and-cleaning-functions" data-toc-modified-id="Dataset-parsers-and-cleaning-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dataset parsers and cleaning functions</a></span><ul class="toc-item"><li><span><a href="#Test-data" data-toc-modified-id="Test-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Test data</a></span></li></ul></li><li><span><a href="#Training-Functions" data-toc-modified-id="Training-Functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Training Functions</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Test-data-on-particular-sensors" data-toc-modified-id="Test-data-on-particular-sensors-3.0.1"><span class="toc-item-num">3.0.1&nbsp;&nbsp;</span>Test data on particular sensors</a></span></li></ul></li><li><span><a href="#Creating-a-new-data-structure-for-all-valid-data-and-pickling-it" data-toc-modified-id="Creating-a-new-data-structure-for-all-valid-data-and-pickling-it-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Creating a new data structure for all valid data and pickling it</a></span></li><li><span><a href="#Creating-and-pickling-instance-weight-matrix" data-toc-modified-id="Creating-and-pickling-instance-weight-matrix-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Creating and pickling instance weight matrix</a></span></li><li><span><a href="#Miscellaneous-train/test-functions" data-toc-modified-id="Miscellaneous-train/test-functions-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Miscellaneous train/test functions</a></span><ul class="toc-item"><li><span><a href="#Cuda-enable" data-toc-modified-id="Cuda-enable-3.3.1"><span class="toc-item-num">3.3.1&nbsp;&nbsp;</span>Cuda-enable</a></span></li><li><span><a href="#Tackling-missing-labels-using-a-mask" data-toc-modified-id="Tackling-missing-labels-using-a-mask-3.3.2"><span class="toc-item-num">3.3.2&nbsp;&nbsp;</span>Tackling missing labels using a mask</a></span></li><li><span><a href="#Linear-Learning-Rate-scheduler" data-toc-modified-id="Linear-Learning-Rate-scheduler-3.3.3"><span class="toc-item-num">3.3.3&nbsp;&nbsp;</span>Linear Learning-Rate scheduler</a></span></li><li><span><a href="#Euclidean-Norm-for-weight-matrices" data-toc-modified-id="Euclidean-Norm-for-weight-matrices-3.3.4"><span class="toc-item-num">3.3.4&nbsp;&nbsp;</span>Euclidean Norm for weight matrices</a></span></li><li><span><a href="#Accuracy-(Precision,-Recall,-F1,-Support,-Balanced-Accuracy)-metrics" data-toc-modified-id="Accuracy-(Precision,-Recall,-F1,-Support,-Balanced-Accuracy)-metrics-3.3.5"><span class="toc-item-num">3.3.5&nbsp;&nbsp;</span>Accuracy (Precision, Recall, F1, Support, Balanced Accuracy) metrics</a></span></li></ul></li></ul></li></ul></div>

# Imports

In [1]:
# Required imports
import datetime
import os
import numpy as np
import pandas as pd
import gzip
import glob
import pickle
import copy
import math
from io import StringIO
import importlib.machinery

from sklearn.metrics import accuracy_score,confusion_matrix,balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support,classification_report
#from sklearn.metrics import multilabel_confusion_matrix # Only available in dev .21

# Need Pytorch for multilabel classifications
import torch
from torch.autograd import Variable as V
from torch import nn,optim
import torch.nn.functional as F
import torch.utils.data as utils
#import skorch [Scikit-learn wrapper around Pytorch so allowing for K-fold cross-validation]
random_state=10
np.random.seed(random_state)

from utils import cm,remove_outliers

In [2]:
# Data location and sample user
prefix='dataset/Extrasensory_uuid_fl_uTAR/'
cross_validation_user_loc='dataset/cv_5_folds/'
user_sample='3600D531-0C55-44A7-AE95-A7A38519464E.features_labels'
done=1 # Pickled files are created

# Dataset parsers and cleaning functions

In [3]:
# Dataset parsers for header/ body for CSVs
def parse_header_of_csv(csv_str):
    # Isolate the headline columns:
    headline = csv_str[:csv_str.index('\n')];
    columns = headline.split(',');

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:','');
        pass;
    
    return (feature_names,label_names);

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int);
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)];
    
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    
    #print("M matrix shape:",M.shape)
    #print("Matrix: ",np.argwhere(M))
    trinary_labels_mat[M]=-1 # Replace NaNs with -1.0 for which we then apply a mask
    unique,counts=np.unique(trinary_labels_mat,return_counts=True)
    print(*zip(unique,counts)) 
    
#     Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix
    
    return (X,trinary_labels_mat,M,timestamps);

def read_user_data(directory):
    print('Reading {}'.format(directory.split("/")[-1]))

    # Read the entire csv file of the user:
    with gzip.open(directory,'rb') as fid:
        csv_str = fid.read();
        csv_str = csv_str.decode("utf-8")
        pass;

    (feature_names,label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);

    return (X,Y,M,timestamps,feature_names,label_names);

In [4]:
# Clean labels
def clean_labels(input_label):
    if label.endswith('_'):
        label=label[:-1]+')'
    label=label.replace('__',' (').replace('_',' ')
    label=label[0]+label[1:].lower()
    label=label.replace('i m','I\'m')
    return label

In [5]:
# Get a summary of the sensor feature
'''
# Summarize features as we are only using phone_acc,phone_gyro,phone_mag,phone_loc,phone_audio,
# phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
# We are ignoring the use of the smartwatch features. There are definitely features that will be used
# much more (e.g. than the phone_callstat) but we'll leave that up to the ML algorithm.
'''
def summarize_features(feature_list):
    summary_feature_list=np.empty_like(feature_list)
    for (ind,feature) in enumerate(feature_list):
        if feature.startswith('raw_acc'):
            summary_feature_list[ind]='phone_acc' 
        if feature.startswith('proc_gyro'):
            summary_feature_list[ind]='phone_gyro'
        if feature.startswith('raw_magnet'):
            summary_feature_list[ind]='phone_mag'
        if feature.startswith('watch_acc'):
            summary_feature_list[ind]='watch_acc'
        if feature.startswith('watch_heading'):
            summary_feature_list[ind]='watch_dir'
        if feature.startswith('location'):
            summary_feature_list[ind]='phone_loc'
        if feature.startswith('audio'):
            summary_feature_list[ind]='phone_audio'
        if feature.startswith('discrete:app_state'):
            summary_feature_list[ind]='phone_app'
        if feature.startswith('discrete:battery'):
            summary_feature_list[ind]='phone_battery'
        if feature.startswith('discrete:on'):
            summary_feature_list[ind]='phone_use'
        if feature.startswith('discrete:ringer'):
            summary_feature_list[ind]='phone_callstat'
        if feature.startswith('discrete:wifi'):
            summary_feature_list[ind]='phone_wifi'
        if feature.startswith('lf'):
            summary_feature_list[ind]='phone_lf'
        if feature.startswith('discrete:time'):
            summary_feature_list[ind]='phone_time'

    return summary_feature_list


# Get a summary of the sensor feature along with the original label that was used
def summarize_features_worig(feature_list):
    summary_feature_list=np.empty((len(feature_list),2),dtype=object)
    
    for (ind,feature) in enumerate(feature_list):
        if feature.startswith('raw_acc'):
            summary_feature_list[ind,0]='phone_acc'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('proc_gyro'):
            summary_feature_list[ind,0]='phone_gyro'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('raw_magnet'):
            summary_feature_list[ind,0]='phone_mag'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('watch_acc'):
            summary_feature_list[ind,0]='watch_acc'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('watch_heading'):
            summary_feature_list[ind,0]='watch_dir'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('location'):
            summary_feature_list[ind,0]='phone_loc'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('audio'):
            summary_feature_list[ind,0]='phone_audio'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:app_state'):
            summary_feature_list[ind,0]='phone_app'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:battery'):
            summary_feature_list[ind,0]='phone_battery'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:on'):
            summary_feature_list[ind,0]='phone_use'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:ringer'):
            summary_feature_list[ind,0]='phone_callstat'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:wifi'):
            summary_feature_list[ind,0]='phone_wifi'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('lf'):
            summary_feature_list[ind,0]='phone_lf'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:time'):
            summary_feature_list[ind,0]='phone_time'
            summary_feature_list[ind,1]=feature

    return summary_feature_list

In [6]:
# Custom dictionary class with help for duplicate keys
class Customdictionary(dict):
    def __setitem__(self,key,value):
        try:
            self[key]
        except KeyError:
            super(Customdictionary,self).__setitem__(key,[])
        self[key].append(value)

## Test data

In [7]:
# Reading sample data
sample_loc='{}/{}.csv.gz'.format(prefix,user_sample)
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(sample_loc)

# Dataset summaries for this user
print('Data shape input for user (Len minutes/num examples, num sensors): ',x_user.shape) # Timestep examples, number of sensors
print('Label shape for user (Len minutes, num labels): ',y_user.shape,'\n') # Timestep examples, labels

countlabels_user=np.sum(y_user,axis=0) # Column summary
labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

print('Sensor feature names:\n')
feature_names=summarize_features(featurename_user)
    
# for i,sensor_feature in enumerate(featurename_user):
#     print('{} :: {} ::--> {}\n'.format(i,feature_names[i],sensor_feature))

print('Activities and counts:')
print(labelname_countlabel_user)

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
(-1.0, 148794) (0.0, 97289) (1.0, 19270)
Data shape input for user (Len minutes/num examples, num sensors):  (5203, 225)
Label shape for user (Len minutes, num labels):  (5203, 51) 

Sensor feature names:

Activities and counts:
[('LOC_home', 3040.0), ('SITTING', 481.0), ('WITH_FRIENDS', 295.0), ('PHONE_ON_TABLE', -42.0), ('LYING_DOWN', -99.0), ('OR_indoors', -102.0), ('SLEEPING', -414.0), ('WATCHING_TV', -523.0), ('EATING', -673.0), ('TALKING', -797.0), ('DRIVE_-_I_M_A_PASSENGER', -1026.0), ('OR_standing', -1051.0), ('IN_A_CAR', -1093.0), ('OR_exercise', -1273.0), ('AT_THE_GYM', -1273.0), ('SINGING', -1299.0), ('FIX_walking', -1303.0), ('SHOPPING', -1324.0), ('AT_SCHOOL', -1330.0), ('BATHING_-_SHOWER', -1350.0), ('DRESSING', -1368.0), ('DRINKING__ALCOHOL_', -1369.0), ('FIX_restaurant', -1376.0), ('IN_CLASS', -1381.0), ('IN_A_MEETING', -1408.0), ('TOILET', -1423.0), ('COOKING', -1430.0), ('ELEVATOR', -1434.0), ('PHONE_

# Training Functions

In [8]:
# Choosing sensor labels
'''
Summary sensor choices are: phone_acc,phone_gyro,phone_mag,watch_acc,watch_dir,phone_loc,phone_audio,
phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
In this project, we aren't using watch_acc,watch_dir (no smartwatch)
'''

def choose_sensors(X_train,used_sensors,summarized_feature_names):
    used_sensor_feature_names=np.zeros(len(summarized_feature_names),dtype=bool)
    # Creates a zero boolean vector of all possible feature names
    for s in used_sensors:
        used_sensor_feature_names=np.logical_or(used_sensor_feature_names,(s==summarized_feature_names))
    X_train=X_train[:,used_sensor_feature_names]
    return X_train

def choose_sensors_dropout(X_train,used_sensors,summarized_feature_names):
    used_sensor_feature_names=np.zeros(len(summarized_feature_names),dtype=bool)
    data_length=len(X_train)
    
    # Creates a zero boolean vector of all possible feature names
    for s in used_sensors:
        used_sensor_feature_names=np.logical_or(used_sensor_feature_names,(s==summarized_feature_names))
    mask=np.tile(used_sensor_feature_names,(data_length,1))
    
    X_train=np.multiply(X_train,mask) # Element-wise matrix multiply
    return X_train

def choose_sensors_longnames(X_train,used_sensors,long_featurenames):
    
    used_sensor_feature_names=np.zeros(len(long_featurenames),dtype=bool)
    used_feature_actualnames=np.zeros(len(long_featurenames),dtype=bool)
    # Creates a zero boolean vector of all possible feature names
    summary_features=long_featurenames[:,0]
    all_complete_features=long_featurenames[:,-1]
    
    for s in used_sensors:
        similar=(s==summary_features)
        
        #used_complete_features=(all_complete_features[similar.astype(int)])
       
        used_sensor_feature_names=np.logical_or(used_sensor_feature_names,similar)
        used_feature_actualnames=np.logical_or(used_feature_actualnames,similar)
    
    X_train=X_train[:,used_sensor_feature_names]
    long_names=all_complete_features[used_feature_actualnames]
    return X_train,long_names



In [9]:
# Returns a standardized (0 mean, 1 variance) dataset
def standardize(X_train):
    mean=np.nanmean(X_train,axis=0).reshape((1,-1))# Ignores NaNs while finding the mean across rows
    standard_dev=np.nanstd(X_train,axis=0) # Ignores NaNs while finding the standard deviation across rows
    standard_dev_nonzero=np.where(standard_dev>0,standard_dev,1.).reshape((1,-1)) # Div zero
    
    X=(X_train-mean)/standard_dev_nonzero
    return X,mean,standard_dev_nonzero   

In [10]:
# Sensor Types, Label Possibilities variables
sensor_types=['phone_acc','phone_gyro','phone_mag','phone_loc','phone_audio',
'phone_app','phone_battery','phone_use','phone_callstat','phone_wifi','phone_lf',
'phone_time']
label_possibilities=['LOC_home','OR_indoors','PHONE_ON_TABLE','SITTING','WITH_FRIENDS',
 'LYING_DOWN','SLEEPING','WATCHING_TV','EATING','PHONE_IN_POCKET',
 'TALKING','DRIVE_-_I_M_A_PASSENGER','OR_standing','IN_A_CAR',
 'OR_exercise','AT_THE_GYM','SINGING','FIX_walking','OR_outside',
 'SHOPPING','AT_SCHOOL','BATHING_-_SHOWER','DRESSING','DRINKING__ALCOHOL_',
 'PHONE_IN_HAND','FIX_restaurant','IN_CLASS','PHONE_IN_BAG','IN_A_MEETING',
 'TOILET','COOKING','ELEVATOR','FIX_running','BICYCLING','LAB_WORK',
 'LOC_main_workplace','ON_A_BUS','DRIVE_-_I_M_THE_DRIVER','STROLLING',
 'CLEANING','DOING_LAUNDRY','WASHING_DISHES','SURFING_THE_INTERNET',
 'AT_A_PARTY','AT_A_BAR','LOC_beach','COMPUTER_WORK','GROOMING','STAIRS_-_GOING_UP',
 'STAIRS_-_GOING_DOWN','WITH_CO-WORKERS']

### Test data on particular sensors

In [11]:
# Reading sample data
sample_loc='{}/{}.csv.gz'.format(prefix,user_sample)
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(sample_loc)

# Dataset summaries for this user
print('Data shape input for user (Len minutes/num examples, num sensors): ',x_user.shape) # Timestep examples, number of sensors
print('Label shape for user (Len minutes, num labels): ',y_user.shape,'\n') # Timestep examples, labels

countlabels_user=np.sum(y_user,axis=0) # Column summary
labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

print('Sensor feature names:\n')
feature_names_woriginallabels=summarize_features_worig(featurename_user)
    
print('Activities and counts:')
print(labelname_countlabel_user)

x_train_chosen,feature_long_names=choose_sensors_longnames(x_user,sensor_types,feature_names_woriginallabels)
# feature_long_names is original long feature name from the chosen sensor list

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
(-1.0, 148794) (0.0, 97289) (1.0, 19270)
Data shape input for user (Len minutes/num examples, num sensors):  (5203, 225)
Label shape for user (Len minutes, num labels):  (5203, 51) 

Sensor feature names:

Activities and counts:
[('LOC_home', 3040.0), ('SITTING', 481.0), ('WITH_FRIENDS', 295.0), ('PHONE_ON_TABLE', -42.0), ('LYING_DOWN', -99.0), ('OR_indoors', -102.0), ('SLEEPING', -414.0), ('WATCHING_TV', -523.0), ('EATING', -673.0), ('TALKING', -797.0), ('DRIVE_-_I_M_A_PASSENGER', -1026.0), ('OR_standing', -1051.0), ('IN_A_CAR', -1093.0), ('OR_exercise', -1273.0), ('AT_THE_GYM', -1273.0), ('SINGING', -1299.0), ('FIX_walking', -1303.0), ('SHOPPING', -1324.0), ('AT_SCHOOL', -1330.0), ('BATHING_-_SHOWER', -1350.0), ('DRESSING', -1368.0), ('DRINKING__ALCOHOL_', -1369.0), ('FIX_restaurant', -1376.0), ('IN_CLASS', -1381.0), ('IN_A_MEETING', -1408.0), ('TOILET', -1423.0), ('COOKING', -1430.0), ('ELEVATOR', -1434.0), ('PHONE_

## Creating a new data structure for all valid data and pickling it

Remove rows with np.nan labels (missing labels). Zero impute missing feature entries. Standardization done at train time.

In [12]:
done=0

In [26]:
# kalman_loc='dataset/impute_raw/kalmanimpute_raw/'
# arimakalman_loc='dataset/impute_raw/arimakalmanimpute_raw/'
ssl_user_loc='dataset/semisupervised_users_2Hdrop/'

In [27]:
# Skipping cell if the data files were already created previously 
if done!=1:
    # Reading data in the directory (Stacked)
    
    #M_train_t=np.empty((0,51))
    #M_test_t=np.empty((0,51))
    for fold_n in [0,1,2,3,4]:
        #X_train_t=np.empty((0,170))
        Y_train_t=np.empty((0,51))
        #X_test_t=np.empty((0,170))
        Y_test_t=np.empty((0,51))
        train = glob.glob(cross_validation_user_loc+'fold_%d_train_*_uuids.txt'%fold_n)
        test = glob.glob(cross_validation_user_loc+'fold_%d_test_*_uuids.txt'%fold_n)
        for tr in train:
            with open(tr,'r') as file:
                for line in file:
                    line = line.replace("\n","")
                    print("Working on Train: ",line)

                    with open(ssl_user_loc+line+'_ssl_2Hdrop.pkl','rb') as f:
                        y_ssl=pickle.load(f)
                    f.close()
                    
                    y_ssl[y_ssl < 1e-1] = 0
                    unique,counts=np.unique(y_ssl,return_counts=True)
                    if(len(unique)>3):
                        print(*zip(unique,counts),"\n")
                    #print(y_ssl.shape)
                    Y_train_t=np.vstack((Y_train_t,y_ssl))
        
        for te in test:
            with open(te,'r') as file:
                for line in file:
                    line = line.replace("\n","")
                    print("Working on Test: ",line)
                    
                    with open(ssl_user_loc+line+'_ssl_2Hdrop.pkl','rb') as f:
                        y_ssl_t=pickle.load(f)
                    f.close()
                    y_ssl_t[y_ssl_t < 1e-1] = 0
                    unique,counts=np.unique(y_ssl_t,return_counts=True)
                    if(len(unique)>3):
                        print(*zip(unique,counts),"\n")
                    
                    Y_test_t=np.vstack((Y_test_t,y_ssl_t))
        with open('dataset/pickled/semisupervised_2Hdrop/ytrain_{}.pkl'.format(fold_n),'wb') as f:
            pickle.dump(Y_train_t,f)
#         with open('dataset/pickled/orig_logicimputelabel/xtest_{}.pkl'.format(fold_n),'wb') as f:
#             pickle.dump(X_test_t,f)
        with open('dataset/pickled/semisupervised_2Hdrop/ytest_{}.pkl'.format(fold_n),'wb') as f:
            pickle.dump(Y_test_t,f)
        print("Done for fold {}".format(fold_n))
    print ("DONE") 
else:
    print("Skipping step")

Working on Train:  2C32C23E-E30C-498A-8DD2-0EFB9150A02E
Working on Train:  3600D531-0C55-44A7-AE95-A7A38519464E
Working on Train:  4FC32141-E888-4BFF-8804-12559A491D8C
Working on Train:  5119D0F8-FCA8-4184-A4EB-19421A40DE0D
Working on Train:  5152A2DF-FAF3-4BA8-9CA9-E66B32671A53
Working on Train:  5EF64122-B513-46AE-BCF1-E62AAC285D2C
Working on Train:  61976C24-1C50-4355-9C49-AAE44A7D09F6
Working on Train:  78A91A4E-4A51-4065-BDA7-94755F0BB3BB
Working on Train:  7CE37510-56D0-4120-A1CF-0E23351428D2
Working on Train:  7D9BB102-A612-4E2A-8E22-3159752F55D8
Working on Train:  8023FE1A-D3B0-4E2C-A57A-9321B7FC755F
Working on Train:  83CF687B-7CEC-434B-9FE8-00C3D5799BE6
Working on Train:  9759096F-1119-4E19-A0AD-6F16989C7E1C
Working on Train:  9DC38D04-E82E-4F29-AB52-B476535226F2
Working on Train:  A5CDF89D-02A2-4EC1-89F8-F534FDABDD96
Working on Train:  A76A5AF5-5A93-4CF2-A16E-62353BB70E8A
Working on Train:  B09E373F-8A54-44C8-895B-0039390B859F
Working on Train:  B9724848-C7E2-45F4-9B3F-A1F38

Working on Train:  24E40C4C-A349-4F9F-93AB-01D00FB994AF
Working on Train:  2C32C23E-E30C-498A-8DD2-0EFB9150A02E
Working on Train:  3600D531-0C55-44A7-AE95-A7A38519464E
Working on Train:  4FC32141-E888-4BFF-8804-12559A491D8C
Working on Train:  5119D0F8-FCA8-4184-A4EB-19421A40DE0D
Working on Train:  5152A2DF-FAF3-4BA8-9CA9-E66B32671A53
Working on Train:  5EF64122-B513-46AE-BCF1-E62AAC285D2C
Working on Train:  61976C24-1C50-4355-9C49-AAE44A7D09F6
Working on Train:  A5CDF89D-02A2-4EC1-89F8-F534FDABDD96
Working on Train:  A76A5AF5-5A93-4CF2-A16E-62353BB70E8A
Working on Train:  B09E373F-8A54-44C8-895B-0039390B859F
Working on Train:  B9724848-C7E2-45F4-9B3F-A1F38D864495
Working on Train:  BE3CA5A6-A561-4BBD-B7C9-5DF6805400FC
Working on Train:  C48CE857-A0DD-4DDB-BEA5-3A25449B2153
Working on Train:  CA820D43-E5E2-42EF-9798-BE56F776370B
Working on Train:  CDA3BBF7-6631-45E8-85BA-EEB416B32A3C
Working on Train:  CF722AA9-2533-4E51-9FEB-9EAC84EE9AAC
Working on Train:  D7D20E2E-FC78-405D-B346-DBD3F

Done for fold 4
DONE


In [None]:
# Skipping cell if the data files were already created previously 
if done!=1:
    # Reading data in the directory (Stacked)
    
    #M_train_t=np.empty((0,51))
    #M_test_t=np.empty((0,51))
    for fold_n in [0,1,2,3,4]:
        X_train_t=np.empty((0,170))
        Y_train_t=np.empty((0,51))
        X_test_t=np.empty((0,170))
        Y_test_t=np.empty((0,51))
        train = glob.glob(cross_validation_user_loc+'fold_%d_train_*_uuids.txt'%fold_n)
        test = glob.glob(cross_validation_user_loc+'fold_%d_test_*_uuids.txt'%fold_n)
        for tr in train:
            with open(tr,'r') as file:
                for line in file:
                    line = line.replace("\n","")
                    print("Working on Train: ",line)
                    
                    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(prefix+line+'.features_labels.csv.gz')
#                     x_sh=x_user_train.shape
#                     y_sh=y_user_train.shape
                    
                    x_user_train=np.nan_to_num(x_user_train)
                    
                    
                    # New method of imputation
#                     tstamp_user_df=[pd.Timestamp(datetime.datetime.fromtimestamp(x).strftime("%x %X")) for x in tstamp_user]
#                     x_user_train_df=pd.DataFrame(x_user_train,columns=featurename_user)
#                     x_user_train_df.insert(0,"Datetime",tstamp_user_df,True) 
#                     x_user_train_df=x_user_train_df.set_index('Datetime')
#                     x_user_train_df.interpolate(limit_direction='forward',method='time')
#                     x_user_train_df.interpolate()
#                     x_user_train_df.fillna(0)
                    # End new method of imputation
    
#                     x_user_train=x_user_train_df.values # Date-time column is an index (doesn't have to be ignored)
#                     x_user_train=np.nan_to_num(x_user_train)
                    #x_user_train=choose_sensors(x_user_train,used_sensors=sensor_types,summarized_feature_names=feature_names)
                    #x_user_train=IterativeImputer().fit_transform(x_user_train)
#                     x_user_train=KNN(k=3).fit_transform(x_user_train)
#                     x_user_train=SoftImpute().fit_transform(x_user_train)

#                     with open(arimakalman_loc+line+'_x.pkl','rb') as f:
#                         x_user_train=pickle.load(f).astype(np.float)
#                     with open(arimakalman_loc+line+'_y.pkl','rb') as f:
#                         y_user_train=pickle.load(f).astype(np.float)
                    #y_df=pd.read_csv(yimpute_loc+line+'.csv',header=0,index_col=0)
                    #y_user_train=y_df.values
                    
                    
                
                    #x_user_train=np.nan_to_num(x_user_train)
                    # Remove outliers
#                     df_x_user=pd.DataFrame(x_user_train)
#                     df_y_user=pd.DataFrame(y_user_train,columns=labelname_user)
#                     df_x_trim,df_y_trim=remove_outliers(df_x=df_x_user,df_y=df_y_user,method='iqrwhiskers')
                    # End remove outlier

                    #X_train_t=np.vstack((X_train_t,x_user_train)) # Removing the first index columns
                    y_ssl=s3vm(x_user_train,y_user_train)
                    Y_train_t=np.vstack((Y_train_t,y_ssl))
        
#         X_train_t,mean,dev=standardize(X_train_t)
#         assert len(X_train_t)==len(Y_train_t)
#         print('\nTraining: Fold::{} X::{} ,Y::{}'.format(fold_n,X_train_t.shape,Y_train_t.shape))

        for te in test:
            with open(te,'r') as file:
                for line in file:
                    line = line.replace("\n","")
                    print("Working on Test: ",line)
                    (x_user_test,y_user_test,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(prefix+line+'.features_labels.csv.gz')
#                     x_sh=x_user_test.shape
#                     y_sh=y_user_test.shape
                    
                    x_user_test=np.nan_to_num(x_user_test)
                    

                    # New method of imputation    
#                     tstamp_user_df=[pd.Timestamp(datetime.datetime.fromtimestamp(x).strftime("%x %X")) for x in tstamp_user]
#                     x_user_test_df=pd.DataFrame(x_user_test,columns=featurename_user)
#                     x_user_test_df.insert(0,"Datetime",tstamp_user_df,True) 
#                     x_user_test_df=x_user_test_df.set_index('Datetime')
#                     x_user_test_df.interpolate(limit_direction='forward',method='time')
#                     x_user_test_df.interpolate()
#                     x_user_test_df.fillna(0)
                    # End new method of imputation
                    
#                     x_user_test=x_user_test_df.values
                    
                    x_user_test=choose_sensors(x_user_test,used_sensors=sensor_types,summarized_feature_names=feature_names)
#                     x_user_test=IterativeImputer().transform(x_user_test)
#                     x_user_test=KNN(k=3).fit_transform(x_user_test)
#                     x_user_test=SoftImpute().fit_transform(x_user_test)
#                     with open(arimakalman_loc+line+'_x.pkl','rb') as f:
#                         x_user_test=pickle.load(f).astype(np.float)
#                     with open(arimakalman_loc+line+'_y.pkl','rb') as f:
#                         y_user_test=pickle.load(f).astype(np.float)
#                     y_df=pd.read_csv(yimpute_loc+line+'.csv',header=0,index_col=0)
#                     y_user_test=y_df.values
#                     x_user_test=np.nan_to_num(x_user_test)
                    # Remove outliers
#                     df_x_user=pd.DataFrame(x_user_test)
#                     df_y_user=pd.DataFrame(y_user_test,columns=labelname_user)
#                     df_x_trim,df_y_trim=remove_outliers(df_x=df_x_user,df_y=df_y_user,method='iqrwhiskers')
                    # End remove outlier
                    y_ssl=s3vm(x_user_test,y_user_test)
                    Y_test_t=np.vstack((Y_test_t,y_ssl))
#                     X_test_t=np.vstack((X_test_t,x_user_test)) # Removing the first index columns
#                     Y_test_t=np.vstack((Y_test_t,y_user_test))
            
#         X_test_t=(X_test_t-mean)/dev

#         assert len(X_test_t)==len(Y_test_t)
#         print('\nTesting: Fold::{} X::{} ,Y::{}'.format(fold_n,X_test_t.shape,Y_test_t.shape))
        
#         print("Pickling data files")
        # Split datasets
#         with open('dataset/pickled/orig_logicimputelabel/xtrain_{}.pkl'.format(fold_n),'wb') as f:
#             pickle.dump(X_train_t,f)
        with open('dataset/pickled/semisupervised_labels/ytrain_{}.pkl'.format(fold_n),'wb') as f:
            pickle.dump(Y_train_t,f)
#         with open('dataset/pickled/orig_logicimputelabel/xtest_{}.pkl'.format(fold_n),'wb') as f:
#             pickle.dump(X_test_t,f)
        with open('dataset/pickled/semisupervised_labels/ytest_{}.pkl'.format(fold_n),'wb') as f:
            pickle.dump(Y_test_t,f)
        print("Done for fold {}".format(fold_n))
    print ("DONE") 
else:
    print("Skipping step")

## Creating and pickling instance weight matrix

In [18]:
# Creating an instance weight matrix for the training labels
def instance_weight_matrix(y_train):
    instance_weights=np.zeros_like(y_train)
    for l in range(len(labelname_user)):
        temp_column=y_train[:,l]
        count_neg=0
        count_0=0
        count_1=0
        for i in range(len(temp_column)): # n^2 bincount doesn't work with arrays consisting of negative numbers
            if (temp_column[i]==-1):
                count_neg+=1
            elif (temp_column[i]==0):
                count_0+=1
            elif (temp_column[i]==1):
                count_1+=1
            else:
                print(temp_column[i])
                raise ValueError("Bad Loop")
#         print(l,count_0,count_1)
        if(count_0!=0):
            weight_0=float((count_0+count_1)/count_0)
        else:
            weight_0=0.
        if(count_1!=0):
            weight_1=float((count_0+count_1)/count_1)
        else:
            weight_1=0.
        if(weight_0+weight_1==0.):
            weight_0=1.
            weight_1=1.
        else:
            weight_0=weight_0/(weight_0+weight_1)
            weight_1=weight_1/(weight_0+weight_1)

        for i in range(len(temp_column)):
            if (temp_column[i]==-1):
                instance_weights[i,l]=0.
            elif (temp_column[i]==0):
                instance_weights[i,l]=weight_0
            elif (temp_column[i]==1):
                instance_weights[i,l]=weight_1
    return instance_weights

In [28]:
# Skipping cell if the data files were already created previously 
if done!=1:
    weights = dict()
    for fold_n in [0,1,2,3,4]:
        with open('dataset/pickled/semisupervised_2Hdrop/ytrain_{}.pkl'.format(fold_n),'rb') as f:
            y_train=pickle.load(f)
            #y_train=unclassified_labels(y_train)

        weights[fold_n] = instance_weight_matrix(y_train)

    for fold_n in [0,1,2,3,4]:
        with open('dataset/pickled/semisupervised_2Hdrop/weights_{}.pkl'.format(fold_n),'wb') as f:
            pickle.dump(weights[fold_n],f)
else:
    print("Skipping step")

## Miscellaneous train/test functions

### Cuda-enable

In [None]:
# Simple function to run using GPU when available
def C(structure):
    if torch.cuda.is_available():
        device=torch.device("cuda")
        return structure.to(device)

### Tackling missing labels using a mask

In [None]:
# Create a mask to hide -1 nans before training and then input to a train criterion
def mask(criterion,y_true,y_pred,mask_value=-1.):
    mask=torch.ne(y_true,mask_value).type(torch.cuda.FloatTensor)
    # Cast the ByteTensor from elementwise comparison to a FloatTensor
    return criterion(torch.mul(y_pred,mask),torch.mul(y_true,mask))

### Linear Learning-Rate scheduler

In [None]:
# Linear decreasing LR scheduler
def linear_lr_scheduler(optimizer,epoch):
    """
    LR_init=0.1, LR_final=0.01, n_epochs=40
    Sets the learning rate to the initial LR decayed by 1.04 every epoch"""
    for param_group in optimizer.param_groups:
        lr=param_group['lr']
    m=-3/1300
    c=0.1
    lr=(epoch*m)+c # Linear LR decay based on a set number of epochs
    for param_group in optimizer.param_groups:
        param_group['lr']=lr

In [None]:
# Linear decreasing LR scheduler
def linear_lr_scheduler_2(optimizer,epoch):
    """
    LR_init=0.1, LR_final=0.01, n_epochs=40
    Sets the learning rate to the initial LR decayed by 1.04 every epoch"""
    for param_group in optimizer.param_groups:
        lr=param_group['lr']
    m=-33/1300000
    c=1333/1300000
    lr=(epoch*m)+c # Linear LR decay based on a set number of epochs
    for param_group in optimizer.param_groups:
        param_group['lr']=lr

### Euclidean Norm for weight matrices

In [None]:
# Adds euclidean regularization to weight matrices
def frobenius_norm(model,loss):
    regularizer_loss=0
    
    for m in model.modules():
        if isinstance(m,nn.Linear): # Linear layer
            frobenius_norm=torch.norm(m.weight,p='fro')
            regularizer_loss+=frobenius_norm # Regularization over the weight matrices for linear layers
    return loss+0.001*regularizer_loss

### Accuracy (Precision, Recall, F1, Support, Balanced Accuracy) metrics

In [None]:
# Function for the required accuracy metrics per fold
def accuracy(fold,target_labels,y_true,y_pred):
    y_true=y_true.detach().numpy()
    y_pred=y_pred.detach().numpy()
    balanced_accuracy_dict={}
    print('*'*20)
    print('For fold {}'.format(fold))
    
    # Balanced accuracy
    for i in range(len(target_labels)):
        true_perlabel=y_true[:,i]
        pred_perlabel=y_pred[:,i]
        initial_shape=true_perlabel.shape
        
        invalid_mask=np.where(true_perlabel==-1.)
        valid_mask=np.where(true_perlabel!=-1.) # Create a mask
        true_perlabel=true_perlabel[valid_mask]
        pred_perlabel=pred_perlabel[valid_mask]
        
        bal_acc=balanced_accuracy_score(y_true=true_perlabel,y_pred=pred_perlabel)
        print('\t Label {}:::-> Balanced Accuracy {}'.format(target_labels[i],round(bal_acc,7)))
        print('\t\t Initial length {}, Missing mask length {}, Valid mask length {}, Final length {}'
              .format(initial_shape[0],len(invalid_mask[0]),len(valid_mask[0]),len(true_perlabel)))
        balanced_accuracy_dict[target_labels[i]]=round(bal_acc,5)
    return balanced_accuracy_dict