<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Dataset-parsers-and-cleaning-functions" data-toc-modified-id="Dataset-parsers-and-cleaning-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dataset parsers and cleaning functions</a></span><ul class="toc-item"><li><span><a href="#Test-data" data-toc-modified-id="Test-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Test data</a></span></li><li><span><a href="#Actual-data" data-toc-modified-id="Actual-data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Actual data</a></span></li></ul></li><li><span><a href="#Sequence-Prediction-LSTM" data-toc-modified-id="Sequence-Prediction-LSTM-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sequence Prediction LSTM</a></span><ul class="toc-item"><li><span><a href="#Embedding" data-toc-modified-id="Embedding-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Embedding</a></span></li></ul></li></ul></div>

# Imports

In [1]:
# Required imports
import os
import numpy as np
import pandas as pd
import seaborn as sns
import gzip
import glob
import pickle
import copy
import math
import itertools
from io import StringIO
import importlib.machinery

from sklearn.metrics import accuracy_score,confusion_matrix,balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support,classification_report
#from sklearn.metrics import multilabel_confusion_matrix # Only available in dev .21

# Need Pytorch for multilabel classifications
import torch
from torch.autograd import Variable as V
from torch import nn,optim
import torch.nn.functional as F
import torch.utils.data as utils
#import skorch [Scikit-learn wrapper around Pytorch so allowing for K-fold cross-validation]
from lr_finder import *
random_state=10
np.random.seed(random_state)



In [2]:
# Data location and sample user
prefix='dataset/Extrasensory_uuid_fl_uTAR/'
cross_validation_user_loc='dataset/cv_5_folds/'
user_sample='3600D531-0C55-44A7-AE95-A7A38519464E.features_labels'
done=1 # Pickled files are not created [0]

# Dataset parsers and cleaning functions

In [3]:
# Dataset parsers for header/ body for CSVs
def parse_header_of_csv(csv_str):
    # Isolate the headline columns:
    headline = csv_str[:csv_str.index('\n')];
    columns = headline.split(',');

    # The first column should be timestamp:
    assert columns[0] == 'timestamp';
    # The last column should be label_source:
    assert columns[-1] == 'label_source';
    
    # Search for the column of the first label:
    for (ci,col) in enumerate(columns):
        if col.startswith('label:'):
            first_label_ind = ci;
            break;
        pass;

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind];
    # Then come the labels, till the one-before-last column:
    label_names = columns[first_label_ind:-1];
    for (li,label) in enumerate(label_names):
        # In the CSV the label names appear with prefix 'label:', but we don't need it after reading the data:
        assert label.startswith('label:');
        label_names[li] = label.replace('label:','');
        pass;
    
    return (feature_names,label_names);

def parse_body_of_csv(csv_str,n_features):
    # Read the entire CSV body into a single numeric matrix:
    full_table = np.loadtxt(StringIO(csv_str),delimiter=',',skiprows=1);
    
    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:,0].astype(int);
    
    # Read the sensor features:
    X = full_table[:,1:(n_features+1)];
    
    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:,(n_features+1):-1]; # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat); # M is the missing label matrix
    
    #print("M matrix shape:",M.shape)
    #print("Matrix: ",np.argwhere(M))
    
    Y = np.where(M,0,trinary_labels_mat) > 0.; # Y is the label matrix
    
    return (X,Y,M,timestamps);

def read_user_data(directory):
    print('Reading {}'.format(directory.split("/")[-1]))

    # Read the entire csv file of the user:
    with gzip.open(directory,'rb') as fid:
        csv_str = fid.read();
        csv_str = csv_str.decode("utf-8")
        pass;

    (feature_names,label_names) = parse_header_of_csv(csv_str);
    n_features = len(feature_names);
    (X,Y,M,timestamps) = parse_body_of_csv(csv_str,n_features);

    return (X,Y,M,timestamps,feature_names,label_names);

In [4]:
# Clean labels
def clean_labels(input_label):
    if label.endswith('_'):
        label=label[:-1]+')'
    label=label.replace('__',' (').replace('_',' ')
    label=label[0]+label[1:].lower()
    label=label.replace('i m','I\'m')
    return label

In [5]:
# Get a summary of the sensor feature
'''
# Summarize features as we are only using phone_acc,phone_gyro,phone_mag,phone_loc,phone_audio,
# phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
# We are ignoring the use of the smartwatch features. There are definitely features that will be used
# much more (e.g. than the phone_callstat) but we'll leave that up to the ML algorithm.
'''
def summarize_features(feature_list):
    summary_feature_list=np.empty_like(feature_list)
    for (ind,feature) in enumerate(feature_list):
        if feature.startswith('raw_acc'):
            summary_feature_list[ind]='phone_acc' 
        if feature.startswith('proc_gyro'):
            summary_feature_list[ind]='phone_gyro'
        if feature.startswith('raw_magnet'):
            summary_feature_list[ind]='phone_mag'
        if feature.startswith('watch_acc'):
            summary_feature_list[ind]='watch_acc'
        if feature.startswith('watch_heading'):
            summary_feature_list[ind]='watch_dir'
        if feature.startswith('location'):
            summary_feature_list[ind]='phone_loc'
        if feature.startswith('audio'):
            summary_feature_list[ind]='phone_audio'
        if feature.startswith('discrete:app_state'):
            summary_feature_list[ind]='phone_app'
        if feature.startswith('discrete:battery'):
            summary_feature_list[ind]='phone_battery'
        if feature.startswith('discrete:on'):
            summary_feature_list[ind]='phone_use'
        if feature.startswith('discrete:ringer'):
            summary_feature_list[ind]='phone_callstat'
        if feature.startswith('discrete:wifi'):
            summary_feature_list[ind]='phone_wifi'
        if feature.startswith('lf'):
            summary_feature_list[ind]='phone_lf'
        if feature.startswith('discrete:time'):
            summary_feature_list[ind]='phone_time'

    return summary_feature_list


# Get a summary of the sensor feature along with the original label that was used
def summarize_features_wordy(feature_list):
    summary_feature_list=np.empty((len(feature_list),2),dtype=object)
    
    for (ind,feature) in enumerate(feature_list):
        if feature.startswith('raw_acc'):
            summary_feature_list[ind,0]='phone_acc'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('proc_gyro'):
            summary_feature_list[ind,0]='phone_gyro'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('raw_magnet'):
            summary_feature_list[ind,0]='phone_mag'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('watch_acc'):
            summary_feature_list[ind,0]='watch_acc'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('watch_heading'):
            summary_feature_list[ind,0]='watch_dir'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('location'):
            summary_feature_list[ind,0]='phone_loc'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('audio'):
            summary_feature_list[ind,0]='phone_audio'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:app_state'):
            summary_feature_list[ind,0]='phone_app'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:battery'):
            summary_feature_list[ind,0]='phone_battery'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:on'):
            summary_feature_list[ind,0]='phone_use'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:ringer'):
            summary_feature_list[ind,0]='phone_callstat'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:wifi'):
            summary_feature_list[ind,0]='phone_wifi'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('lf'):
            summary_feature_list[ind,0]='phone_lf'
            summary_feature_list[ind,1]=feature
            
        if feature.startswith('discrete:time'):
            summary_feature_list[ind,0]='phone_time'
            summary_feature_list[ind,1]=feature

    return summary_feature_list

In [6]:
# Custom dictionary class with help for duplicate keys
class Customdictionary(dict):
    def __setitem__(self,key,value):
        try:
            self[key]
        except KeyError:
            super(Customdictionary,self).__setitem__(key,[])
        self[key].append(value)

## Test data

In [7]:
# Reading sample data
sample_loc='{}/{}.csv.gz'.format(prefix,user_sample)
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(sample_loc)

# Dataset summaries for this user
# print('Data shape input for user (Len minutes/num examples, num sensors): ',x_user.shape) # Timestep examples, number of sensors
# print('Label shape for user (Len minutes, num labels): ',y_user.shape,'\n') # Timestep examples, labels

countlabels_user=np.sum(y_user,axis=0) # Column summary
labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

# print('Sensor feature names:\n')
feature_names=summarize_features(featurename_user)
    
# for i,sensor_feature in enumerate(featurename_user):
#     print('{} :: {} ::--> {}\n'.format(i,feature_names[i],sensor_feature))

# print('Activities and counts:')
# print(labelname_countlabel_user)

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz


In [8]:
int2word=dict((integer,word) for integer,word in enumerate(labelname_user))
word2int=dict((word,integer) for integer,word in enumerate(labelname_user))

In [9]:
df=pd.DataFrame(y_user,columns=labelname_user)
df.groupby(labelname_user).size().reset_index().rename(columns={0:'count'}) # Unique sets of labels

Unnamed: 0,LYING_DOWN,SITTING,FIX_walking,FIX_running,BICYCLING,SLEEPING,LAB_WORK,IN_CLASS,IN_A_MEETING,LOC_main_workplace,...,STAIRS_-_GOING_DOWN,ELEVATOR,OR_standing,AT_SCHOOL,PHONE_IN_HAND,PHONE_IN_BAG,PHONE_ON_TABLE,WITH_CO-WORKERS,WITH_FRIENDS,count
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,106
1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,3
2,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,30
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1329
4,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,59
5,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,7
6,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,15
7,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,41
8,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,121
9,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,1


In [10]:
df_test=df.astype(int)

In [11]:
list_arr=[]
for i in range(0, len(df_test)):
    temp_arr=df.iloc[i].values.astype(int)
    temp_arr=np.multiply(temp_arr,np.arange(51)) #Column position value wherever the label was 1
    temp_arr=temp_arr[temp_arr>0]
    list_arr.append(temp_arr)

In [12]:
list_arr_orig=copy.deepcopy(list_arr)
list_arr_mod=[]
for i in range(0,len(list_arr)):
    check=[list_arr[i-2].tolist(),list_arr[i-1].tolist()]
    if i>=2:
        arr_value=list_arr[i].tolist()
        if ((arr_value!=check[-1]) or (arr_value!=check[0])):
            list_arr_mod.append(arr_value)
    else:
        list_arr_mod.append(list_arr[i])

In [13]:
level_dict={}
for i in range(len(list_arr_mod)):
    level_dict[i]=list_arr_mod[i]

In [14]:
string_sequence=''
for k,v in level_dict.items():
    for v1 in v:
        string_sequence=string_sequence+'_'+str(v1)

In [15]:
def permute(lists,prefix=''):
#     global out
    if not lists:
        print(prefix)
        return
        
    first=lists[0]
    rest=lists[1:]
    for letter in first:
        permute(rest,prefix+str(letter)+'_')

In [16]:
string_sequence

'_1_8_34_45_48_1_8_34_45_48_2_2_2_10_16_10_16_44_1_10_16_34_48_50_1_10_16_34_48_50_10_16_43_44_46_2_10_16_18_2_10_16_18_2_18_2_10_16_18_2_10_16_18_2_18_2_18_10_19_40_44_10_19_40_44_18_44_18_44_16_16_16_16_10_16_28_34_36_46_50_10_16_28_34_36_46_50_1_10_37_46_1_10_37_46_1_10_16_37_46_1_10_16_37_46_1_10_37_46_1_10_16_37_46_1_10_16_37_46_10_16_24_44_47_10_16_24_44_47_10_24_44_47_10_24_44_47_10_16_24_44_47_10_24_44_47_10_24_44_47_10_16_34_44_48_10_16_34_44_48_16_10_16_20_44_48_10_16_20_44_48_1_10_16_18_28_36_1_10_16_18_28_36_5_10_16_48_1_28_36_45_1_12_18_33_34_50_1_28_36_48_50_1_28_36_48_50_10_16_28_36_10_16_28_36_1_10_48_50_1_10_48_50_1_12_15_1_12_15_10_21_44_50_10_21_44_50_1_12_15_50_1_12_15_50_1_23_36_1_23_36_1_10_16_23_36_1_10_16_23_36_1_10_16_18_23_28_34_36_50_1_10_16_18_23_28_34_36_50_1_18_23_28_34_36_50_1_10_16_18_23_28_34_36_50_1_18_23_28_34_36_50_1_10_16_18_23_28_34_36_50_1_18_23_28_34_36_50_1_10_16_18_23_28_34_36_50_1_10_16_18_23_28_34_36_50_5_10_16_48_50_5_10_16_48_50_5_10_48_50_

## Actual data

In [17]:
globs=glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*')
for g in globs:
    fname=g.split('/')[-1].split('.')[0]
    fname='dataset/sequence_strings/{}.txt'.format(fname)
    x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(g)
    
        # Dataset summaries for this user
    print('Data shape input for user (Len minutes/num examples, num sensors): ',x_user.shape) # Timestep examples, number of sensors
    print('Label shape for user (Len minutes, num labels): ',y_user.shape,'\n') # Timestep examples, labels

    countlabels_user=np.sum(y_user,axis=0) # Column summary
    labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
    labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

    print('Sensor feature names:\n')
    feature_names=summarize_features(featurename_user)

    # for i,sensor_feature in enumerate(featurename_user):
    #     print('{} :: {} ::--> {}\n'.format(i,feature_names[i],sensor_feature))

    print('Activities and counts:')
    print(labelname_countlabel_user)
    
    df=pd.DataFrame(y_user,columns=labelname_user)
    df_test=df.astype(int)
    
    list_arr=[]
    for i in range(0, len(df_test)):
        temp_arr=df.iloc[i].values.astype(int)
        temp_arr=np.multiply(temp_arr,np.arange(51)) #Column position value wherever the label was 1
        temp_arr=temp_arr[temp_arr>0]
        list_arr.append(temp_arr)
        
    list_arr_orig=copy.deepcopy(list_arr)
    list_arr_mod=[]
    for i in range(0,len(list_arr)):
        check=[list_arr[i-2].tolist(),list_arr[i-1].tolist()]
        if i>=2:
            arr_value=list_arr[i].tolist()
            if ((arr_value!=check[-1]) or (arr_value!=check[0])):
                list_arr_mod.append(arr_value)
        else:
            list_arr_mod.append(list_arr[i])
    
    level_dict={}
    for i in range(len(list_arr_mod)):
        level_dict[i]=list_arr_mod[i]
        
    string_sequence=''
    for k,v in level_dict.items():
        for v1 in v:
            string_sequence=string_sequence+'_'+str(v1)
            
    f=open(fname,"w")
    f.write(string_sequence)
    f.close()

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (5203, 225)
Label shape for user (Len minutes, num labels):  (5203, 51) 

Sensor feature names:

Activities and counts:
[('LOC_home', 3040), ('OR_indoors', 2487), ('PHONE_ON_TABLE', 2179), ('SITTING', 1916), ('WITH_FRIENDS', 1730), ('LYING_DOWN', 1336), ('SLEEPING', 1021), ('WATCHING_TV', 912), ('EATING', 762), ('PHONE_IN_POCKET', 706), ('TALKING', 638), ('DRIVE_-_I_M_A_PASSENGER', 409), ('OR_standing', 384), ('IN_A_CAR', 342), ('OR_exercise', 162), ('AT_THE_GYM', 162), ('SINGING', 136), ('FIX_walking', 132), ('OR_outside', 127), ('SHOPPING', 111), ('AT_SCHOOL', 105), ('BATHING_-_SHOWER', 85), ('DRESSING', 67), ('DRINKING__ALCOHOL_', 66), ('PHONE_IN_HAND', 64), ('FIX_restaurant', 59), ('IN_CLASS', 54), ('PHONE_IN_BAG', 33), ('IN_A_MEETING', 27), ('TOILET', 12), ('COOKING', 5), ('ELEVATOR', 1), ('FIX_running', 0), ('BICYCLING', 0), ('LAB_WORK', 0), ('LO

Reading B9724848-C7E2-45F4-9B3F-A1F38D864495.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (7626, 225)
Label shape for user (Len minutes, num labels):  (7626, 51) 

Sensor feature names:

Activities and counts:
[('OR_indoors', 4893), ('LOC_home', 3066), ('SITTING', 2961), ('LYING_DOWN', 2554), ('WITH_FRIENDS', 2455), ('SLEEPING', 2422), ('TALKING', 1414), ('PHONE_IN_BAG', 1225), ('COMPUTER_WORK', 1035), ('OR_standing', 1010), ('AT_SCHOOL', 770), ('FIX_walking', 714), ('OR_outside', 584), ('EATING', 457), ('LOC_main_workplace', 369), ('PHONE_IN_POCKET', 360), ('PHONE_ON_TABLE', 335), ('CLEANING', 330), ('IN_CLASS', 286), ('PHONE_IN_HAND', 220), ('DRIVE_-_I_M_A_PASSENGER', 217), ('GROOMING', 187), ('WITH_CO-WORKERS', 143), ('DOING_LAUNDRY', 96), ('BATHING_-_SHOWER', 87), ('SURFING_THE_INTERNET', 69), ('ON_A_BUS', 65), ('TOILET', 56), ('WATCHING_TV', 49), ('SHOPPING', 44), ('FIX_running', 13), ('OR_exercise', 13), ('WASHING_DISHES', 5), ('ELEVA

Reading 0E6184E1-90C0-48EE-B25A-F1ECB7B9714E.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (7521, 225)
Label shape for user (Len minutes, num labels):  (7521, 51) 

Sensor feature names:

Activities and counts:
[('OR_indoors', 4549), ('SITTING', 3089), ('LOC_home', 2494), ('TALKING', 2438), ('WITH_FRIENDS', 2418), ('LYING_DOWN', 1989), ('FIX_walking', 1532), ('SLEEPING', 1312), ('AT_SCHOOL', 1069), ('OR_standing', 866), ('WITH_CO-WORKERS', 840), ('CLEANING', 828), ('IN_CLASS', 647), ('OR_outside', 546), ('LOC_main_workplace', 517), ('EATING', 376), ('COOKING', 331), ('SURFING_THE_INTERNET', 231), ('FIX_restaurant', 137), ('DRESSING', 118), ('GROOMING', 116), ('PHONE_ON_TABLE', 107), ('TOILET', 96), ('BATHING_-_SHOWER', 88), ('STAIRS_-_GOING_DOWN', 87), ('STAIRS_-_GOING_UP', 78), ('IN_A_MEETING', 63), ('WATCHING_TV', 62), ('ON_A_BUS', 54), ('SHOPPING', 48), ('IN_A_CAR', 42), ('FIX_running', 0), ('BICYCLING', 0), ('LAB_WORK', 0), ('DRIVE_-_I_M

Reading 81536B0A-8DBF-4D8A-AC24-9543E2E4C8E0.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (6407, 225)
Label shape for user (Len minutes, num labels):  (6407, 51) 

Sensor feature names:

Activities and counts:
[('OR_indoors', 2580), ('LYING_DOWN', 2297), ('SLEEPING', 2269), ('SITTING', 2143), ('LOC_home', 1782), ('COMPUTER_WORK', 1377), ('SURFING_THE_INTERNET', 1293), ('WITH_FRIENDS', 959), ('EATING', 746), ('TALKING', 687), ('WATCHING_TV', 607), ('BICYCLING', 579), ('OR_exercise', 579), ('DRIVE_-_I_M_THE_DRIVER', 557), ('OR_standing', 395), ('DRINKING__ALCOHOL_', 301), ('SHOPPING', 226), ('OR_outside', 219), ('STROLLING', 219), ('FIX_walking', 78), ('COOKING', 77), ('DRIVE_-_I_M_A_PASSENGER', 25), ('BATHING_-_SHOWER', 20), ('AT_A_BAR', 14), ('CLEANING', 11), ('GROOMING', 11), ('PHONE_IN_HAND', 7), ('FIX_running', 0), ('LAB_WORK', 0), ('IN_CLASS', 0), ('IN_A_MEETING', 0), ('LOC_main_workplace', 0), ('IN_A_CAR', 0), ('ON_A_BUS', 0), ('FIX_re

Reading 61359772-D8D8-480D-B623-7C636EAD0C81.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (6079, 225)
Label shape for user (Len minutes, num labels):  (6079, 51) 

Sensor feature names:

Activities and counts:
[('SITTING', 2731), ('OR_indoors', 2225), ('PHONE_ON_TABLE', 1892), ('LYING_DOWN', 1522), ('OR_standing', 1201), ('SLEEPING', 1093), ('WITH_FRIENDS', 919), ('PHONE_IN_POCKET', 699), ('TALKING', 631), ('LOC_main_workplace', 564), ('LOC_home', 508), ('IN_CLASS', 482), ('AT_SCHOOL', 317), ('EATING', 250), ('FIX_walking', 234), ('BICYCLING', 232), ('OR_exercise', 232), ('BATHING_-_SHOWER', 142), ('GROOMING', 78), ('DRESSING', 78), ('ON_A_BUS', 60), ('AT_A_PARTY', 45), ('OR_outside', 32), ('IN_A_MEETING', 22), ('TOILET', 13), ('FIX_running', 0), ('LAB_WORK', 0), ('IN_A_CAR', 0), ('DRIVE_-_I_M_THE_DRIVER', 0), ('DRIVE_-_I_M_A_PASSENGER', 0), ('FIX_restaurant', 0), ('COOKING', 0), ('SHOPPING', 0), ('STROLLING', 0), ('DRINKING__ALCOHOL_', 0),

Reading 5152A2DF-FAF3-4BA8-9CA9-E66B32671A53.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (6617, 225)
Label shape for user (Len minutes, num labels):  (6617, 51) 

Sensor feature names:

Activities and counts:
[('OR_indoors', 3954), ('PHONE_ON_TABLE', 3084), ('LOC_home', 3040), ('SITTING', 2972), ('SLEEPING', 2440), ('TALKING', 2422), ('LYING_DOWN', 2319), ('AT_SCHOOL', 2016), ('SURFING_THE_INTERNET', 1674), ('PHONE_IN_HAND', 1480), ('OR_standing', 576), ('PHONE_IN_BAG', 467), ('EATING', 415), ('FIX_walking', 384), ('LOC_beach', 236), ('BICYCLING', 136), ('OR_exercise', 136), ('ELEVATOR', 23), ('IN_A_MEETING', 13), ('WITH_CO-WORKERS', 13), ('FIX_running', 0), ('LAB_WORK', 0), ('IN_CLASS', 0), ('LOC_main_workplace', 0), ('OR_outside', 0), ('IN_A_CAR', 0), ('ON_A_BUS', 0), ('DRIVE_-_I_M_THE_DRIVER', 0), ('DRIVE_-_I_M_A_PASSENGER', 0), ('FIX_restaurant', 0), ('PHONE_IN_POCKET', 0), ('COOKING', 0), ('SHOPPING', 0), ('STROLLING', 0), ('DRINKING_

Reading ECECC2AB-D32F-4F90-B74C-E12A1C69BBE2.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (3530, 225)
Label shape for user (Len minutes, num labels):  (3530, 51) 

Sensor feature names:

Activities and counts:
[('LOC_main_workplace', 1076), ('SITTING', 956), ('AT_SCHOOL', 940), ('PHONE_ON_TABLE', 802), ('WITH_FRIENDS', 777), ('LOC_home', 772), ('TALKING', 491), ('IN_A_MEETING', 404), ('PHONE_IN_BAG', 314), ('OR_indoors', 237), ('FIX_walking', 156), ('EATING', 125), ('OR_outside', 120), ('OR_standing', 101), ('WITH_CO-WORKERS', 100), ('COOKING', 79), ('WASHING_DISHES', 79), ('IN_CLASS', 35), ('IN_A_CAR', 35), ('DRIVE_-_I_M_THE_DRIVER', 35), ('SINGING', 35), ('FIX_restaurant', 29), ('LYING_DOWN', 25), ('SLEEPING', 25), ('ON_A_BUS', 17), ('SHOPPING', 15), ('TOILET', 14), ('PHONE_IN_POCKET', 9), ('FIX_running', 0), ('BICYCLING', 0), ('LAB_WORK', 0), ('DRIVE_-_I_M_A_PASSENGER', 0), ('OR_exercise', 0), ('STROLLING', 0), ('DRINKING__ALCOHOL_', 0),

Reading 99B204C0-DD5C-4BB7-83E8-A37281B8D769.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (6038, 225)
Label shape for user (Len minutes, num labels):  (6038, 51) 

Sensor feature names:

Activities and counts:
[('PHONE_ON_TABLE', 2935), ('OR_indoors', 2731), ('LOC_home', 2729), ('SITTING', 2215), ('SLEEPING', 1913), ('LYING_DOWN', 1874), ('COMPUTER_WORK', 1336), ('LOC_main_workplace', 1097), ('OR_standing', 801), ('TALKING', 742), ('WITH_CO-WORKERS', 717), ('IN_A_MEETING', 507), ('FIX_walking', 468), ('EATING', 441), ('OR_exercise', 244), ('WITH_FRIENDS', 200), ('DRIVE_-_I_M_THE_DRIVER', 189), ('BICYCLING', 126), ('COOKING', 116), ('WATCHING_TV', 101), ('FIX_restaurant', 72), ('PHONE_IN_BAG', 62), ('CLEANING', 61), ('OR_outside', 52), ('SURFING_THE_INTERNET', 47), ('ON_A_BUS', 42), ('TOILET', 17), ('STAIRS_-_GOING_UP', 4), ('STAIRS_-_GOING_DOWN', 4), ('FIX_running', 0), ('LAB_WORK', 0), ('IN_CLASS', 0), ('IN_A_CAR', 0), ('DRIVE_-_I_M_A_PASS

Reading 1DBB0F6F-1F81-4A50-9DF4-CD62ACFA4842.features_labels.csv.gz
Data shape input for user (Len minutes/num examples, num sensors):  (7375, 225)
Label shape for user (Len minutes, num labels):  (7375, 51) 

Sensor feature names:

Activities and counts:
[('OR_indoors', 5655), ('LOC_home', 4842), ('PHONE_ON_TABLE', 4082), ('SITTING', 3441), ('LYING_DOWN', 1799), ('COMPUTER_WORK', 1657), ('SURFING_THE_INTERNET', 1340), ('SLEEPING', 1035), ('OR_standing', 980), ('TALKING', 971), ('FIX_walking', 925), ('PHONE_IN_POCKET', 871), ('OR_outside', 848), ('AT_SCHOOL', 841), ('PHONE_IN_HAND', 587), ('EATING', 461), ('IN_CLASS', 361), ('GROOMING', 253), ('DRESSING', 220), ('TOILET', 181), ('BATHING_-_SHOWER', 160), ('STROLLING', 124), ('FIX_running', 47), ('OR_exercise', 47), ('IN_A_CAR', 36), ('ON_A_BUS', 32), ('BICYCLING', 0), ('LAB_WORK', 0), ('IN_A_MEETING', 0), ('LOC_main_workplace', 0), ('DRIVE_-_I_M_THE_DRIVER', 0), ('DRIVE_-_I_M_A_PASSENGER', 0), ('FIX_restaurant', 0), ('COOKING', 0), ('S

# Sequence Prediction LSTM

## Embedding

In [18]:
def tokenize_corpus(corpus):
    corpus=corpus.split('_')
    tokens = [x for x in corpus]
    return tokens

def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

vocabulary_size=51
embedding_dims=10

In [19]:
all_tokens=[]
globs=glob.glob('dataset/sequence_strings/*')
for g in globs:
    text=open(g,'r').read()

    tokens=tokenize_corpus(text)
    tokens.pop(0) # Semantics
    all_tokens.append(tokens)

In [23]:
window_size=3
idx_pairs = []
# for each sentence
for sentence in all_tokens:
    for center_word_pos in range(len(sentence)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos:
                continue
            context_word_idx = sentence[context_word_pos]
            idx_pairs.append((sentence[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs).astype(int) # it will be useful to have this as numpy array

In [None]:
W1 = V(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = V(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = V(get_input_layer(data)).float()
        y_true = V(torch.from_numpy(np.array([target])).long())

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        
        loss_val += loss.data
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 2.8326637744903564


In [None]:
W1_np=W1.detach().numpy()
W2_np=W2.detach().numpy().T
embed_np=(W1_np+W2_np)/2.

from sklearn.manifold import TSNE
X_embedded=TSNE(n_components=2,verbose=2,learning_rate=500,n_iter=3000,random_state=10).fit_transform(embed_np)

In [None]:
sns.set_context('notebook',font_scale=1.1)
sns.set_style('ticks')
sns.lmplot(x='_DIM_1_',
           y='_DIM_2_'.
           data)

In [None]:
plt.scatter(X_embedded[:, 0], X_embedded[:, 1])