<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Dataset-summary" data-toc-modified-id="Dataset-summary-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Dataset summary</a></span></li><li><span><a href="#SMiLE-(Semi-supervised-multi-label-classification-using-incomplete-label-information)" data-toc-modified-id="SMiLE-(Semi-supervised-multi-label-classification-using-incomplete-label-information)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>SMiLE (Semi-supervised multi-label classification using incomplete label information)</a></span></li><li><span><a href="#Raw-SVM" data-toc-modified-id="Raw-SVM-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Raw SVM</a></span></li><li><span><a href="#1-NearestNeighbor" data-toc-modified-id="1-NearestNeighbor-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>1-NearestNeighbor</a></span></li><li><span><a href="#Multi-MLP" data-toc-modified-id="Multi-MLP-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Multi-MLP</a></span><ul class="toc-item"><li><span><a href="#0H" data-toc-modified-id="0H-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>0H</a></span></li><li><span><a href="#1H" data-toc-modified-id="1H-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>1H</a></span></li><li><span><a href="#2H" data-toc-modified-id="2H-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>2H</a></span></li><li><span><a href="#2HDrop" data-toc-modified-id="2HDrop-5.4"><span class="toc-item-num">5.4&nbsp;&nbsp;</span>2HDrop</a></span></li></ul></li></ul></div>

# Dataset summary

In [2]:
# Required imports
import datetime
import os
import numpy as np
import pandas as pd
import gzip
import glob
import pickle
import copy
import math
from io import StringIO
import importlib.machinery
from scipy import stats, optimize
import multiprocessing
from functools import partial
import time
import pandas as pd

In [3]:
# Folder holding the per-user archives, and the file stem of the sample user that is
# loaded further down (the ".csv.gz" extension is appended where the path is built).
prefix='dataset/Extrasensory_uuid_fl_uTAR/'
user_sample='3600D531-0C55-44A7-AE95-A7A38519464E.features_labels'

In [4]:
# Dataset parsers for header/ body for CSVs
def parse_header_of_csv(csv_str):
    """Split the header line of the CSV text into feature names and label names.

    The header is expected to be laid out as
    ``timestamp,<feature columns>,label:<name>...,label_source``.

    Args:
        csv_str: CSV text; only its first line is inspected. A header-only
            string without a trailing newline and Windows line endings are
            both accepted.

    Returns:
        ``(feature_names, label_names)``: two lists of column names. The
        leading ``label:`` prefix is removed from every label name.

    Raises:
        AssertionError: if the first column is not ``timestamp``, the last is
            not ``label_source``, or a column inside the label range lacks the
            ``label:`` prefix.
        ValueError: if no column starts with ``label:``.
    """
    label_prefix = 'label:'

    # Isolate the headline columns (first line only, tolerate "\r\n"):
    headline = csv_str.split('\n', 1)[0].rstrip('\r')
    columns = headline.split(',')

    # The first column should be timestamp, the last one label_source:
    assert columns[0] == 'timestamp'
    assert columns[-1] == 'label_source'

    # Search for the column of the first label:
    first_label_ind = next(
        (ci for ci, col in enumerate(columns) if col.startswith(label_prefix)),
        None)
    if first_label_ind is None:
        raise ValueError("CSV header has no column starting with 'label:'")

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]

    # Then come the labels, till the one-before-last column. Only the leading
    # prefix is stripped, so a label that itself contains 'label:' stays intact.
    label_names = []
    for label in columns[first_label_ind:-1]:
        assert label.startswith(label_prefix)
        label_names.append(label[len(label_prefix):])

    return (feature_names, label_names)

def parse_body_of_csv(csv_str,n_features):
    """Parse the numeric body of the CSV text into features, labels and timestamps.

    Args:
        csv_str: Full CSV text; the first (header) line is skipped.
        n_features: Number of feature columns that follow the timestamp column.

    Returns:
        ``(X, trinary_labels_mat, M, timestamps)`` where
        ``X`` holds the feature columns, ``trinary_labels_mat`` holds the label
        columns with every NaN replaced by -1, ``M`` is the boolean matrix that
        is True where a label was NaN, and ``timestamps`` is the first column
        cast to int. The last column of the table is dropped.

    Side effects:
        Prints the (value, count) pairs of the label matrix.
    """
    # Read the entire CSV body into a single numeric matrix. ndmin=2 keeps the
    # table two-dimensional even when the file has a single data row.
    full_table = np.loadtxt(StringIO(csv_str), delimiter=',', skiprows=1, ndmin=2)

    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:, 0].astype(int)

    # Read the sensor features:
    X = full_table[:, 1:(n_features + 1)]

    # Label values between the features and the last column: 0., 1. or NaN.
    trinary_labels_mat = full_table[:, (n_features + 1):-1]
    M = np.isnan(trinary_labels_mat)  # M is the missing label matrix

    # Replace NaNs with -1.0, the value the rest of the notebook masks on.
    trinary_labels_mat[M] = -1
    unique, counts = np.unique(trinary_labels_mat, return_counts=True)
    print(*zip(unique, counts))

    return (X, trinary_labels_mat, M, timestamps)

def read_user_data(directory):
    """Load one gzipped CSV file and parse it into arrays and column names.

    Args:
        directory: Path of the ``.csv.gz`` file (despite the parameter name).

    Returns:
        ``(X, Y, M, timestamps, feature_names, label_names)`` as produced by
        parse_header_of_csv and parse_body_of_csv.
    """
    file_name = directory.split("/")[-1]
    print('Reading {}'.format(file_name))

    # Decompress the whole file in one go and decode it to text.
    with gzip.open(directory,'rb') as gz_file:
        csv_text = gz_file.read().decode("utf-8")

    # The header gives the column layout that the body parser needs.
    feature_names, label_names = parse_header_of_csv(csv_text)
    X, Y, M, timestamps = parse_body_of_csv(csv_text, len(feature_names))

    return (X,Y,M,timestamps,feature_names,label_names)

In [5]:
# Get a summary of the sensor feature
'''
# Summarize features as we are only using phone_acc,phone_gyro,phone_mag,phone_loc,phone_audio,
# phone_app,phone_battery,phone_use,phone_callstat,phone_wifi,phone_lf,phone_time
# We are ignoring the use of the smartwatch features. There are definitely features that will be used
# much more (e.g. than the phone_callstat) but we'll leave that up to the ML algorithm.
'''
# (feature-name prefix, summary sensor name) pairs shared by both summarizers.
_SENSOR_PREFIXES = (
    ('raw_acc', 'phone_acc'),
    ('proc_gyro', 'phone_gyro'),
    ('raw_magnet', 'phone_mag'),
    ('watch_acc', 'watch_acc'),
    ('watch_heading', 'watch_dir'),
    ('location', 'phone_loc'),
    ('audio', 'phone_audio'),
    ('discrete:app_state', 'phone_app'),
    ('discrete:battery', 'phone_battery'),
    ('discrete:on', 'phone_use'),
    ('discrete:ringer', 'phone_callstat'),
    ('discrete:wifi', 'phone_wifi'),
    ('lf', 'phone_lf'),
    ('discrete:time', 'phone_time'),
)


def _sensor_for_feature(feature):
    """Return the summary sensor name for one feature, or None if no prefix matches."""
    matched = None
    for prefix, sensor in _SENSOR_PREFIXES:
        if feature.startswith(prefix):
            # Keep scanning so a later entry wins, as in a chain of independent `if`s.
            matched = sensor
    return matched


def summarize_features(feature_list):
    """Map every feature name to the summary name of the sensor it comes from.

    Args:
        feature_list: 1-D sequence of feature-name strings.

    Returns:
        1-D object array of the same length holding the summary sensor name of
        each feature, or None where no known prefix matches. An object array is
        used so that summary names are never truncated to the width of the
        input strings and unmatched slots have a defined value.
    """
    summary_feature_list = np.empty(len(feature_list), dtype=object)
    for (ind, feature) in enumerate(feature_list):
        summary_feature_list[ind] = _sensor_for_feature(feature)
    return summary_feature_list


def summarize_features_worig(feature_list):
    """Like summarize_features, but also keep the original feature name.

    Args:
        feature_list: 1-D sequence of feature-name strings.

    Returns:
        Object array of shape ``(len(feature_list), 2)``: column 0 is the
        summary sensor name and column 1 the original feature name. Rows whose
        feature matches no known prefix are left as ``(None, None)``.
    """
    summary_feature_list = np.empty((len(feature_list), 2), dtype=object)
    for (ind, feature) in enumerate(feature_list):
        sensor = _sensor_for_feature(feature)
        if sensor is not None:
            summary_feature_list[ind, 0] = sensor
            summary_feature_list[ind, 1] = feature
    return summary_feature_list

In [6]:
def choose_sensors(X_train,used_sensors,summarized_feature_names):
    """Keep only the columns of X_train whose summary sensor name is in used_sensors.

    Args:
        X_train: 2-D array, one column per entry of summarized_feature_names.
        used_sensors: iterable of summary sensor names to keep.
        summarized_feature_names: array of per-column summary sensor names.

    Returns:
        The column subset of X_train, in the original column order.
    """
    # Start with nothing selected, then switch on the columns of each sensor.
    keep_column = np.zeros(len(summarized_feature_names), dtype=bool)
    for sensor in used_sensors:
        belongs_to_sensor = (sensor == summarized_feature_names)
        keep_column = np.logical_or(keep_column, belongs_to_sensor)
    return X_train[:, keep_column]

def choose_sensors_dropout(X_train,used_sensors,summarized_feature_names):
    """Zero out the columns of X_train that do not belong to any sensor in used_sensors.

    Unlike choose_sensors, the shape of X_train is kept: unselected columns are
    multiplied by False (0), selected ones by True (1).

    Args:
        X_train: 2-D array, one column per entry of summarized_feature_names.
        used_sensors: iterable of summary sensor names to keep.
        summarized_feature_names: array of per-column summary sensor names.

    Returns:
        Array with the same shape as X_train.
    """
    n_rows = len(X_train)

    keep_column = np.zeros(len(summarized_feature_names), dtype=bool)
    for sensor in used_sensors:
        keep_column = np.logical_or(keep_column, (sensor == summarized_feature_names))

    # Repeat the per-column flag on every row (read-only view instead of a tiled copy).
    row_mask = np.broadcast_to(keep_column, (n_rows, keep_column.shape[0]))
    return np.multiply(X_train, row_mask)  # element-wise

def choose_sensors_longnames(X_train,used_sensors,long_featurenames):
    """Select the columns of the chosen sensors and return their original feature names.

    Args:
        X_train: 2-D array, one column per row of long_featurenames.
        used_sensors: iterable of summary sensor names to keep.
        long_featurenames: 2-D array whose first column is the summary sensor
            name and whose last column is the original feature name.

    Returns:
        ``(X_subset, long_names)``: the selected columns of X_train and the
        original names of exactly those columns.
    """
    sensor_of_column = long_featurenames[:, 0]
    original_names = long_featurenames[:, -1]

    # One boolean flag per column; it selects both the data and the names.
    selected = np.zeros(len(long_featurenames), dtype=bool)
    for sensor in used_sensors:
        selected = np.logical_or(selected, sensor == sensor_of_column)

    return X_train[:, selected], original_names[selected]


In [7]:
# Sensor Types, Label Possibilities variables
# Summary sensor names (as produced by summarize_features*) whose columns are kept when
# choose_sensors_longnames is called below; the watch_* groups are deliberately absent.
sensor_types=['phone_acc','phone_gyro','phone_mag','phone_loc','phone_audio',
'phone_app','phone_battery','phone_use','phone_callstat','phone_wifi','phone_lf',
'phone_time']
# Candidate label names. NOTE(review): not referenced in the visible part of the notebook;
# presumably these mirror the label columns of the CSV header - confirm against labelname_user.
label_possibilities=['LOC_home','OR_indoors','PHONE_ON_TABLE','SITTING','WITH_FRIENDS',
 'LYING_DOWN','SLEEPING','WATCHING_TV','EATING','PHONE_IN_POCKET',
 'TALKING','DRIVE_-_I_M_A_PASSENGER','OR_standing','IN_A_CAR',
 'OR_exercise','AT_THE_GYM','SINGING','FIX_walking','OR_outside',
 'SHOPPING','AT_SCHOOL','BATHING_-_SHOWER','DRESSING','DRINKING__ALCOHOL_',
 'PHONE_IN_HAND','FIX_restaurant','IN_CLASS','PHONE_IN_BAG','IN_A_MEETING',
 'TOILET','COOKING','ELEVATOR','FIX_running','BICYCLING','LAB_WORK',
 'LOC_main_workplace','ON_A_BUS','DRIVE_-_I_M_THE_DRIVER','STROLLING',
 'CLEANING','DOING_LAUNDRY','WASHING_DISHES','SURFING_THE_INTERNET',
 'AT_A_PARTY','AT_A_BAR','LOC_beach','COMPUTER_WORK','GROOMING','STAIRS_-_GOING_UP',
 'STAIRS_-_GOING_DOWN','WITH_CO-WORKERS']

In [8]:
# Reading sample data
# `prefix` already ends with a separator; os.path.join avoids the doubled "//" that
# plain string formatting produced.
sample_loc=os.path.join(prefix,'{}.csv.gz'.format(user_sample))
x_user,y_user,missedlabel_user,tstamp_user,featurename_user,labelname_user=read_user_data(sample_loc)
# Pair every feature with its summary sensor name, then keep only the phone sensors.
feature_names=summarize_features_worig(featurename_user)
x_user,feature_long_names=choose_sensors_longnames(x_user,sensor_types,feature_names)

Reading 3600D531-0C55-44A7-AE95-A7A38519464E.features_labels.csv.gz
(-1.0, 148794) (0.0, 97289) (1.0, 19270)


# SMiLE (Semi-supervised multi-label classification using incomplete label information)

In [None]:
# from smile_master.SMILE.smile_ import SMiLE
from pomegranate import NaiveBayes, NormalDistribution


In [None]:
# smile = SMiLE(s=0.5, alpha=0.35, k=5)

In [None]:
# np.nan_to_num replaces NaN sensor readings with 0 (and infinities with large finite
# values) before fitting.
x_user=np.nan_to_num(x_user)
# smile.fit(np.transpose(x_user),np.transpose(y_user))
# predictions=smile.predict(np.transpose(x_user))
# probabilities=smile.predict_proba(np.transpose(x_user))
# NOTE(review): y_user is the full (samples x labels) matrix in which -1 marks a missing
# label; confirm that NaiveBayes.from_samples accepts a 2-D, multi-label target like this.
model = NaiveBayes.from_samples(NormalDistribution, x_user, y_user, verbose=True)
# print(predictions)
# print(probabilities)

In [None]:
# NOTE(review): `predictions` is only assigned in commented-out SMiLE lines, so this cell
# raises NameError on a fresh kernel unless one of those lines is restored.
unique,counts=np.unique(predictions,return_counts=True)
print(*zip(unique,counts))

In [None]:
from semisup_learn.frameworks.SelfLearning import *

In [None]:
# from sklearn import *

# For every label column: skip it when it is entirely -1 or holds -1 plus a single other
# value, otherwise fit a self-learning model on that column.
for ind in range(y_user.shape[-1]):
    unique_col,counts_col=np.unique(y_user[:,ind],return_counts=True)
    
    skip_col=0
    for i in range(len(unique_col)):
        if(unique_col[i]==-1):
            if counts_col[i]==len(y_user[:,ind]):
                skip_col=1
                print("Skipping column {}".format(ind))
    if(len(unique_col)==2): # If there are only 2 unique labels, and one of them is -1...don't have enough classes for SVM
        if -1 in unique_col:
            skip_col=1
            print("Skipping column {} because 1-class".format(ind))
    if(skip_col!=1):
    

        print(ind," of ",y_user.shape[-1]," :: ")
        x_user=np.nan_to_num(x_user)
        # Column vector of shape (n_samples, 1) for the current label.
        y_col=y_user[:,ind].reshape(len(y_user[:,ind]),1)
#         smile = SMiLE(s=0.5, alpha=0.35, k=51)
#         smile.fit(np.transpose(x_user), np.transpose(y_col))
#         predictions=smile.predict(np.transpose(x_user))
#         probabilities=smile.predict_proba(np.transpose(x_user))

        # NOTE(review): the name `sklearn` is not bound by any visible import at this point
        # (a star import would not bind it either), and `SelfLearning` has to be exported by
        # the star import from semisup_learn - confirm both before running.
        ssmodel = SelfLearning.CPLELearningModel(sklearn.svm.SVC(kernel="rbf", probability=True), predict_from_probabilities=True) # RBF SVM
        ssmodel.fit(x_user,y_col)
        # NOTE(review): `X` and `ys` are not defined in the visible cells; presumably x_user
        # was intended - confirm the predict() signature.
        ssmodel.predict(X, ys)
#         print(predictions)
#         print(probabilities)
#         unique,counts=np.unique(predictions,return_counts=True)
#         print(*zip(unique,counts))

In [None]:
# NOTE(review): `basemodel` is not defined in the visible cells, and CPLELearningModel would
# have to come from the star import of semisup_learn; looks like a leftover experiment.
ssmodel = CPLELearningModel(basemodel)

In [None]:
# y_user[:,ind]

# (value, count) pairs for one label column. NOTE(review): `ind` is whatever value an
# earlier loop left behind, so the output depends on hidden kernel state.
unique,counts=np.unique(y_user[:,ind],return_counts=True)
print(*zip(unique,counts)) 

# Raw SVM

In [None]:
import numpy.ma as ma # Masked array
from sklearn import *

In [None]:
def s3vm(x,y):
    """Fill in missing labels (-1) column by column with an SVM fitted on the labelled rows.

    For each label column, an ``svm.SVC`` is trained on the rows whose label is
    not -1 and used to predict the rows whose label is -1. A column is left
    untouched when it has no labelled row, when its labelled rows hold fewer
    than two distinct classes (an SVC cannot be fitted), or when nothing is
    missing (there is nothing to impute).

    Args:
        x: 2-D feature array, one row per sample.
        y: 2-D label array with -1 marking a missing label. Not modified.

    Returns:
        A copy of ``y`` in which the imputable -1 entries are replaced by the
        SVM predictions.

    Side effects:
        Prints per-column value counts and progress, and the overall value
        counts before and after imputation.
    """
    y_out = y.copy()
    n_labels = y.shape[-1]  # taken from the argument, not from notebook globals

    for ind in range(n_labels):
        print("Col ",ind)
        y_col = y[:, ind]
        unique_col, counts_col = np.unique(y_col, return_counts=True)
        print(*zip(unique_col, counts_col))

        labelled = y_col != -1
        missing_rows = np.flatnonzero(~labelled)
        observed_classes = unique_col[unique_col != -1]

        if observed_classes.size == 0:
            print("Skipping column {}".format(ind))
            continue
        if observed_classes.size < 2:
            # A single observed class is not enough to fit an SVM.
            print("Skipping column {} because 1-class".format(ind))
            continue
        if missing_rows.size == 0:
            # Fully labelled column: training a classifier would change nothing.
            continue

        print(ind," of ",n_labels," :: ")
        svm_classifier = svm.SVC(probability=True, class_weight='balanced')
        print(x[labelled].shape, y_col[labelled].shape)
        svm_classifier.fit(x[labelled], y_col[labelled])
        # One batched call instead of one predict() per missing row.
        y_out[missing_rows, ind] = svm_classifier.predict(x[missing_rows])

    unique, counts = np.unique(y, return_counts=True)
    unique_after, counts_after = np.unique(y_out, return_counts=True)
    print(*zip(unique, counts), ">>>>>", *zip(unique_after, counts_after))
    return y_out

In [None]:
# Impute the missing labels of every user file with the per-column SVM and pickle the result.
# Create the output folder up front so the pickle step does not fail on a fresh checkout.
os.makedirs('dataset/semisupervised_users',exist_ok=True)
for g in glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*'):
    print(g)
    fname=g.split('/')[-1].split('.')[0] # file name up to its first dot
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(g)
    x_user_train=np.nan_to_num(x_user_train)
    feature_names=summarize_features_worig(featurename_user)
    x_user,feature_long_names=choose_sensors_longnames(x_user_train,sensor_types,feature_names)
    y_out=s3vm(x_user,y_user_train)

    with open('dataset/semisupervised_users/{}_ssl_label.pkl'.format(fname),'wb') as f:
        pickle.dump(y_out,f)

# 1-NearestNeighbor

In [None]:
import numpy.ma as ma # Masked array
from sklearn.neighbors import KNeighborsClassifier as KNN

In [None]:
# Returns a standardized (0 mean, 1 variance) dataset
def standardize(X_train):
    """Standardize every column to zero mean and unit variance, ignoring NaNs.

    Args:
        X_train: 2-D array, one column per feature; NaNs are skipped when the
            statistics are computed and stay NaN in the output.

    Returns:
        ``(X, mean, standard_dev_nonzero)``: the standardized data and the
        ``(1, n_features)`` row vectors of column means and of the divisors
        that were used.
    """
    # NaN-aware column statistics.
    column_mean = np.nanmean(X_train, axis=0).reshape((1, -1))
    column_std = np.nanstd(X_train, axis=0)

    # Wherever the deviation is not strictly positive, divide by 1 instead.
    divisor = np.where(column_std > 0, column_std, 1.).reshape((1, -1))

    return (X_train - column_mean) / divisor, column_mean, divisor

In [None]:
def nearestneighbor(x,y):
    """Fill in missing labels (-1) column by column with a self-propagating 1-NN classifier.

    For each label column a 1-nearest-neighbour classifier is fitted on the
    labelled rows. Missing rows are then predicted one at a time, and after
    every prediction the classifier is refitted so that the newly imputed row
    can act as a neighbour for the remaining ones.

    Args:
        x: 2-D feature array, one row per sample.
        y: 2-D label array with -1 marking a missing label. Not modified.

    Returns:
        A copy of ``y`` with the imputed labels. Columns without any labelled
        row are returned unchanged (all -1).

    Side effects:
        Prints per-column value counts, per-row progress and the final value counts.
    """
    # Work on a copy: the caller's labels stay intact and skipped columns keep
    # their original values instead of uninitialised memory.
    imputed_y = y.copy()

    for ind in range(y.shape[-1]):
        print("Current column: ",ind)
        y_col = imputed_y[:, ind]  # view into the output copy

        unique_col, counts_col = np.unique(y_col, return_counts=True)
        print(*zip(unique_col, counts_col))

        labelled = y_col != -1
        if not labelled.any():
            print("Skipping column {}".format(ind))
            continue

        missing_indices = np.flatnonzero(~labelled)

        knn_classifier = KNN(n_neighbors=1,weights='distance',p=2,metric='minkowski',n_jobs=-1)
        knn_classifier.fit(x[labelled], y_col[labelled])

        for i, index in enumerate(missing_indices):
            print("\t\tWorking on index {} of {}".format(i,len(missing_indices)))
            predict = knn_classifier.predict(x[index,:].reshape(1, -1))
            print(predict)
            # predict() returns a 1-element array; store its scalar value.
            y_col[index] = predict[0]
            labelled[index] = True

            # Refit so the freshly labelled row is available as a neighbour.
            knn_classifier.fit(x[labelled], y_col[labelled])
            print("\t\tMissing length is now :",int((~labelled).sum()))

    unique, counts = np.unique(imputed_y, return_counts=True)
    print(*zip(unique, counts))

    return imputed_y

In [None]:
# Impute the missing labels of every user file with the 1-NN scheme and pickle the result.
# Create the output folder up front so the pickle step does not fail on a fresh checkout.
os.makedirs('dataset/semisupervised_users_1NN',exist_ok=True)
for g in glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*'):
    print(g)
    fname=g.split('/')[-1].split('.')[0] # file name up to its first dot
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(g)
    # Standardize first (NaN-aware), then replace the remaining NaNs.
    x_user_train,_,_=standardize(x_user_train)
    x_user_train=np.nan_to_num(x_user_train)

    feature_names=summarize_features_worig(featurename_user)
    x_user,feature_long_names=choose_sensors_longnames(x_user_train,sensor_types,feature_names)

    y_out=nearestneighbor(x_user,y_user_train)

    with open('dataset/semisupervised_users_1NN/{}_ssl_label.pkl'.format(fname),'wb') as f:
        pickle.dump(y_out,f)

# Multi-MLP

In [9]:
from sklearn.metrics import accuracy_score,confusion_matrix,balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support,classification_report
#from sklearn.metrics import multilabel_confusion_matrix # Only available in dev .21

# Need Pytorch for multilabel classifications
import torch
from torch.autograd import Variable as V
from torch import nn,optim
import torch.nn.functional as F
import torch.utils.data as utils
#import skorch [Scikit-learn wrapper around Pytorch so allowing for K-fold cross-validation]
random_state=10
np.random.seed(random_state)
# PyTorch keeps its own random generator, which np.random.seed does not touch.
torch.manual_seed(random_state)

from utils import cm,remove_outliers

In [10]:
# Simple function to run using GPU when available
def C(structure):
    """Move a tensor or module to the GPU when one is available, else keep it on the CPU.

    Args:
        structure: any object with a ``.to(device)`` method (tensor, nn.Module, loss).

    Returns:
        The result of ``structure.to(device)``; never None, so callers can
        always write ``model = C(model)``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return structure.to(device)

In [11]:
# Create a mask to hide -1 nans before training and then input to a train criterion
def mask(criterion,y_true,y_pred,mask_value=-1.):
    """Apply ``criterion`` after zeroing the entries whose target equals ``mask_value``.

    Both the predictions and the targets are multiplied element-wise by a 0/1
    mask that is 0 wherever ``y_true == mask_value``.

    Args:
        criterion: callable invoked as ``criterion(masked_pred, masked_true)``.
        y_true: target tensor; ``mask_value`` marks entries to hide.
        y_pred: prediction tensor of the same shape.
        mask_value: the target value that marks a hidden entry.

    Returns:
        Whatever ``criterion`` returns for the masked tensors.
    """
    # Cast the boolean comparison to the prediction dtype; the result stays on
    # the device of y_true, so this works on CPU and GPU alike.
    keep = torch.ne(y_true, mask_value).to(y_pred.dtype)
    return criterion(torch.mul(y_pred, keep), torch.mul(y_true, keep))

In [12]:
# Creating an instance weight matrix for the training labels
def instance_weight_matrix(y_train):
    """Build per-entry loss weights that balance the 0/1 classes of every label column.

    For a column with ``count_0`` zeros and ``count_1`` ones, the raw weights
    are the inverse class frequencies among the labelled entries; they are then
    normalised so that ``weight_0 + weight_1 == 1``. A class that does not
    occur gets weight 0. Entries equal to -1 (missing label) get weight 0.

    Args:
        y_train: 2-D array of labels with values in {-1, 0, 1}.

    Returns:
        Float array with the same shape as ``y_train`` holding the weight of every entry.

    Raises:
        ValueError: if ``y_train`` contains a value other than -1, 0 or 1.
    """
    y_train = np.asarray(y_train)
    if not np.isin(y_train, (-1, 0, 1)).all():
        raise ValueError("Bad Loop")

    # Always a float result, so integer label matrices do not truncate the weights.
    weight_dtype = y_train.dtype if np.issubdtype(y_train.dtype, np.floating) else float
    instance_weights = np.zeros(y_train.shape, dtype=weight_dtype)

    for l in range(y_train.shape[1]):
        is_0 = y_train[:, l] == 0
        is_1 = y_train[:, l] == 1
        count_0 = int(is_0.sum())
        count_1 = int(is_1.sum())
        labelled = count_0 + count_1

        weight_0 = labelled / count_0 if count_0 else 0.
        weight_1 = labelled / count_1 if count_1 else 0.

        # Normalise with the sum taken once, before either weight is rescaled.
        total = weight_0 + weight_1
        if total == 0.:
            weight_0 = weight_1 = 1.
        else:
            weight_0, weight_1 = weight_0 / total, weight_1 / total

        instance_weights[is_0, l] = weight_0
        instance_weights[is_1, l] = weight_1
        # Entries equal to -1 keep the initial weight of 0.
    return instance_weights

In [13]:
# Returns a standardized (0 mean, 1 variance) dataset
def standardize(X_train):
    """Standardize every column to zero mean and unit variance, ignoring NaNs.

    Args:
        X_train: 2-D array, one column per feature; NaNs are skipped when the
            statistics are computed and stay NaN in the output.

    Returns:
        ``(X, mean, standard_dev_nonzero)``: the standardized data and the
        ``(1, n_features)`` row vectors of column means and of the divisors
        that were used.
    """
    # NaN-aware column statistics.
    column_mean = np.nanmean(X_train, axis=0).reshape((1, -1))
    column_std = np.nanstd(X_train, axis=0)

    # Wherever the deviation is not strictly positive, divide by 1 instead.
    divisor = np.where(column_std > 0, column_std, 1.).reshape((1, -1))

    return (X_train - column_mean) / divisor, column_mean, divisor

In [14]:
# Linear decreasing LR scheduler
def linear_lr_scheduler(optimizer,epoch,lr_init=0.1,lr_final=0.01,n_epochs=80):
    """Set the learning rate of every param group by linear interpolation.

    The rate moves on a straight line from ``lr_init`` at epoch 0 to
    ``lr_final`` at epoch ``n_epochs - 1`` (defaults: 0.1 -> 0.01 over 80
    epochs) and is held at ``lr_final`` afterwards, so it can never become
    negative when training runs longer than ``n_epochs``.

    Args:
        optimizer: object exposing ``param_groups``, a list of dicts with an 'lr' key.
        epoch: zero-based index of the epoch that is about to run.
        lr_init: learning rate at epoch 0.
        lr_final: learning rate from epoch ``n_epochs - 1`` onwards.
        n_epochs: length of the decay schedule in epochs.
    """
    if n_epochs > 1:
        # Fraction of the schedule completed, clamped to [0, 1].
        progress = min(max(epoch, 0), n_epochs - 1) / (n_epochs - 1)
    else:
        progress = 1.
    lr = lr_init + (lr_final - lr_init) * progress

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [15]:
# Adds euclidean regularization to weight matrices
def frobenius_norm(model,loss):
    """Add a Frobenius-norm penalty on the weights of all linear layers to ``loss``.

    Args:
        model: torch module; every ``nn.Linear`` found via ``model.modules()`` contributes.
        loss: the loss value to regularize.

    Returns:
        ``loss`` plus 0.001 times the sum of the Frobenius norms of the
        linear-layer weight matrices (biases are not penalized).
    """
    linear_layers = (layer for layer in model.modules() if isinstance(layer, nn.Linear))

    penalty = 0
    for layer in linear_layers:
        penalty = penalty + layer.weight.norm(p='fro')

    return loss + 0.001 * penalty

In [16]:
# Function for the required accuracy metrics per fold
def accuracy(fold,target_labels,y_true,y_pred):
    """Compute the balanced accuracy of every label, ignoring entries with a missing target.

    Args:
        fold: fold identifier; only used in the printed banner.
        target_labels: sequence of label names, one per column.
        y_true: CPU tensor of targets; -1 marks a missing label.
        y_pred: CPU tensor of predictions with the same shape.

    Returns:
        Dict mapping each label name to its balanced accuracy rounded to 5
        decimals. A label without any labelled row maps to NaN: there is
        nothing to score, so the scorer is not called with empty arrays.
    """
    y_true = y_true.detach().numpy()
    y_pred = y_pred.detach().numpy()
    balanced_accuracy_dict = {}
    print('*'*20)
    print('For fold {}'.format(fold))

    for i, label in enumerate(target_labels):
        # Keep only the rows whose target is known for this label.
        labelled = y_true[:, i] != -1.
        if not labelled.any():
            balanced_accuracy_dict[label] = float('nan')
            continue

        bal_acc = balanced_accuracy_score(y_true=y_true[labelled, i],
                                          y_pred=y_pred[labelled, i])
        balanced_accuracy_dict[label] = round(bal_acc, 5)
    return balanced_accuracy_dict

In [17]:
# CPU/detach inference
def I(tensor):
    """Return the values of ``tensor`` as a NumPy array (detached from the graph, on the CPU)."""
    detached = tensor.detach()  # drop the autograd history first
    return detached.cpu().numpy()

In [18]:
# Train function w/BCE loss, linear LR scheduler, instance weights
def train(model,X,Y,X_test,Y_test,weights,n_epoch,batch_size,lr_init,momentum,fold,label_names=None):
    """Train ``model`` with SGD on the masked, instance-weighted BCE loss and score both splits.

    Tensors and the model are placed on the GPU when one is available,
    otherwise on the CPU.

    Args:
        model: torch module whose outputs are treated as one logit per label.
        X, Y: training features and labels (array-likes); -1 in Y marks a missing label.
        X_test, Y_test: held-out features and labels, same conventions.
        weights: per-entry loss weights with the same shape as Y.
        n_epoch: number of passes over the training data.
        batch_size: minibatch size used by the DataLoader.
        lr_init: learning rate the optimizer is created with. linear_lr_scheduler
            overwrites the rate at the start of every epoch.
        momentum: SGD momentum.
        fold: fold identifier forwarded to accuracy().
        label_names: label names forwarded to accuracy(); defaults to the
            notebook-level ``labelname_user``.

    Returns:
        ``(fold_train_dict, fold_test_dict, y_pred, Y_test_pred)``: the
        per-label balanced accuracies of both splits and the thresholded
        (>= 0.5) predictions as NumPy arrays.
    """
    if label_names is None:
        label_names = labelname_user

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def to_tensor(array):
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    X, Y = to_tensor(X), to_tensor(Y)
    X_test, Y_test = to_tensor(X_test), to_tensor(Y_test)
    weights = to_tensor(weights)

    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr_init, momentum=momentum)

    # The weights travel with the data so each minibatch gets its own slice.
    train_dataset = utils.TensorDataset(X, Y, weights)
    train_loader = utils.DataLoader(dataset=train_dataset, batch_size=batch_size,
                                    shuffle=False, drop_last=False)

    for epoch in range(n_epoch):
        linear_lr_scheduler(optimizer, epoch)
        for i, (inputs, labels, batch_weights) in enumerate(train_loader, 0):
            criterion = nn.BCEWithLogitsLoss(weight=batch_weights)
            optimizer.zero_grad()

            outputs = model(inputs)

            # Masked loss + weight regularization, backward pass, weight update.
            loss = mask(criterion=criterion, y_true=labels, y_pred=outputs, mask_value=-1)
            loss = frobenius_norm(model, loss)
            loss.backward()
            optimizer.step()

            epoch_lr = optimizer.param_groups[-1]['lr']
            print("Epoch {}::Minibatch {}::LR {} --> Loss {}".format(
                epoch+1, i+1, epoch_lr, loss.item()/batch_size))

    print("Training finished, Prediction")

    model.eval() # Evaluation mode
    # No autograd graph is needed for inference over the full data sets.
    with torch.no_grad():
        y_pred = torch.sigmoid(model(X)) >= 0.5
        Y_test_pred = torch.sigmoid(model(X_test)) >= 0.5

    fold_train_dict = accuracy(fold, label_names, Y.cpu(), y_pred.cpu())
    fold_test_dict = accuracy(fold, label_names, Y_test.cpu(), Y_test_pred.cpu())

    model.train() # Back to train mode

    return fold_train_dict, fold_test_dict, I(y_pred), I(Y_test_pred)

In [19]:
# Train function w/BCE loss, Adam optimizer (library-default settings, no LR scheduler), instance weights
def train_custom(model,X,Y,X_test,Y_test,weights,n_epoch,batch_size,lr_init,momentum,fold,label_names=None):
    """Train ``model`` with Adam on the masked, instance-weighted BCE loss and score both splits.

    Same pipeline as ``train`` but with ``optim.Adam`` at its default settings
    and no learning-rate schedule. Tensors and the model are placed on the GPU
    when one is available, otherwise on the CPU.

    Args:
        model: torch module whose outputs are treated as one logit per label.
        X, Y: training features and labels (array-likes); -1 in Y marks a missing label.
        X_test, Y_test: held-out features and labels, same conventions.
        weights: per-entry loss weights with the same shape as Y.
        n_epoch: number of passes over the training data.
        batch_size: minibatch size used by the DataLoader.
        lr_init: unused; kept so the signature matches ``train``.
        momentum: unused; kept so the signature matches ``train``.
        fold: fold identifier forwarded to accuracy().
        label_names: label names forwarded to accuracy(); defaults to the
            notebook-level ``labelname_user``.

    Returns:
        ``(fold_train_dict, fold_test_dict, y_pred, Y_test_pred)``: the
        per-label balanced accuracies of both splits and the thresholded
        (>= 0.5) predictions as NumPy arrays.
    """
    if label_names is None:
        label_names = labelname_user

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def to_tensor(array):
        return torch.as_tensor(array, dtype=torch.float32, device=device)

    X, Y = to_tensor(X), to_tensor(Y)
    X_test, Y_test = to_tensor(X_test), to_tensor(Y_test)
    weights = to_tensor(weights)

    model = model.to(device)
    optimizer = optim.Adam(model.parameters())

    # The weights travel with the data so each minibatch gets its own slice.
    train_dataset = utils.TensorDataset(X, Y, weights)
    train_loader = utils.DataLoader(dataset=train_dataset, batch_size=batch_size,
                                    shuffle=False, drop_last=False)

    for epoch in range(n_epoch):
        for i, (inputs, labels, batch_weights) in enumerate(train_loader, 0):
            criterion = nn.BCEWithLogitsLoss(weight=batch_weights)
            optimizer.zero_grad()

            outputs = model(inputs)

            # Masked loss + weight regularization, backward pass, weight update.
            loss = mask(criterion=criterion, y_true=labels, y_pred=outputs, mask_value=-1)
            loss = frobenius_norm(model, loss)
            loss.backward()
            optimizer.step()

            print("Epoch {}::Minibatch {}--> Loss {}".format(
                epoch+1, i+1, loss.item()/batch_size))

    print("Training finished, Prediction")

    model.eval() # Evaluation mode
    # No autograd graph is needed for inference over the full data sets.
    with torch.no_grad():
        y_pred = torch.sigmoid(model(X)) >= 0.5
        Y_test_pred = torch.sigmoid(model(X_test)) >= 0.5

    fold_train_dict = accuracy(fold, label_names, Y.cpu(), y_pred.cpu())
    fold_test_dict = accuracy(fold, label_names, Y_test.cpu(), Y_test_pred.cpu())

    model.train() # Back to train mode

    return fold_train_dict, fold_test_dict, I(y_pred), I(Y_test_pred)

In [20]:
# Randomly choose sensors (features) and replace those by 0 (dropout-ish)
def random_choice(sensor_list,feature_names,x_dataset):
    '''
    Randomly keep 80% of the sensors (rounded down) and zero out the columns of the rest.

    Sensor list is of the form:
    sensor_list=['phone_acc','phone_gyro','phone_mag','phone_loc','phone_audio',
'phone_app','phone_battery','phone_use','phone_callstat','phone_wifi','phone_lf',
'phone_time']

    x_dataset is the 2-D data whose columns line up with the feature names; the
    result has the same shape. The draw uses NumPy's global random state.

    NOTE(review): the feature_names argument is not read - the column-to-sensor
    mapping comes from the notebook-level feature_long_names. Confirm which of
    the two is intended before relying on the parameter.
    '''
    n_kept = math.floor(len(sensor_list)*0.8)
    chosen_sensors = np.random.choice(sensor_list, n_kept, replace=False)

    ignored_sensors = list(set(sensor_list)-set(chosen_sensors))
    print("\t\t\tIgnoring {}".format(ignored_sensors))

    # Map every column to its sensor, then blank the columns of the ignored sensors.
    sensor_of_column = summarize_features(feature_long_names)
    return choose_sensors_dropout(x_dataset, chosen_sensors, sensor_of_column)

In [21]:
# Train function w/BCE loss, linear LR scheduler, instance weights
def train_sensordropout(model,X,Y,X_test,Y_test,weights,sensor_list,feature_names,n_epoch,batch_size,lr_init,momentum,fold):
    '''
    Train `model` with SGD, applying random sensor dropout to every minibatch,
    then predict on the training and test sets.

    Parameters
    ----------
    model        : nn.Module producing one raw score per label (sigmoid is applied here).
    X, Y         : training features / labels (converted to CUDA float tensors).
    X_test,Y_test: evaluation features / labels (converted to CUDA float tensors).
    weights      : per-instance weights, batched alongside X and Y and passed as
                   `weight=` to BCEWithLogitsLoss.
    sensor_list  : sensor names forwarded to random_choice for every minibatch.
    feature_names: forwarded to random_choice.
    n_epoch      : number of passes over the training data.
    batch_size   : minibatch size used by the DataLoader and for the printed loss.
    lr_init      : initial SGD learning rate (linear_lr_scheduler is called once per epoch).
    momentum     : SGD momentum.
    fold         : identifier passed through to accuracy().

    Returns
    -------
    (fold_train_dict, fold_test_dict, I(y_pred), I(Y_test_pred)) where the dicts
    come from accuracy() and the predictions are thresholded at 0.5.

    Notes
    -----
    * Requires a CUDA device (torch.cuda.FloatTensor is used throughout).
    * Reads the notebook-level `labelname_user` when calling accuracy().
    * The loss goes through mask(..., mask_value=-1) and frobenius_norm(model, loss);
      presumably these skip labels equal to -1 and add a weight-norm penalty --
      confirm in those helpers.
    '''
    optimizer=optim.SGD(model.parameters(),lr=lr_init,momentum=momentum)

    X=V(torch.cuda.FloatTensor(X),requires_grad=True)
    Y=V(torch.cuda.FloatTensor(Y),requires_grad=False)
    X_test=V(torch.cuda.FloatTensor(X_test),requires_grad=False)
    Y_test=V(torch.cuda.FloatTensor(Y_test),requires_grad=False)
    weights=V(torch.cuda.FloatTensor(weights),requires_grad=False)

    # Cuda-Compatible Model
    model = C(model)

    # Bundle the instance weights with data and labels so that each minibatch
    # carries its own slice of the weight tensor.
    train_dataset=utils.TensorDataset(X,Y,weights)
    # Use the batch_size argument (previously the notebook global `bs` was read here)
    train_loader=utils.DataLoader(dataset=train_dataset,batch_size=batch_size
                                  ,shuffle=False,drop_last=False)

    for epoch in range(n_epoch):
        linear_lr_scheduler(optimizer,epoch)
        # `batch_weights` keeps the full-dataset `weights` argument from being shadowed
        for i,(inputs,labels,batch_weights) in enumerate(train_loader,0):

            # Sensor dropout is done in numpy, so round-trip through the CPU
            inputs_detached=inputs.cpu().detach().numpy()
            inputs=random_choice(sensor_list,feature_names,inputs_detached) # Sensor Dropout

            inputs=V(torch.cuda.FloatTensor(inputs),requires_grad=True)
            labels=V(torch.cuda.FloatTensor(labels),requires_grad=False)
            batch_weights=V(torch.cuda.FloatTensor(batch_weights),requires_grad=False)

            # The criterion is rebuilt per minibatch because its weights are per-instance
            criterion=C(nn.BCEWithLogitsLoss(weight=batch_weights))
            optimizer.zero_grad()

            outputs=model(inputs)

            # Forward loss, backward pass, weight update
            loss=mask(criterion=criterion,y_true=labels,y_pred=outputs,mask_value=-1)
            loss=frobenius_norm(model,loss)
            loss.backward()
            optimizer.step()

            # Learning rate of the last parameter group, for logging only
            epoch_lr=optimizer.param_groups[-1]['lr']

            print("Epoch {}::Minibatch {}::LR {} --> Loss {}".format(epoch+1,i+1,epoch_lr,loss.item()/batch_size))


    print("Training finished, Prediction")

    model.eval() # Evaluation mode

    # No gradients are needed for prediction; this avoids building an autograd
    # graph over the full dataset (X was created with requires_grad=True).
    with torch.no_grad():
        y_pred=torch.sigmoid(model(X))>=0.5
        Y_test_pred=torch.sigmoid(model(X_test))>=0.5

    fold_train_dict=accuracy(fold,labelname_user,Y.cpu(),y_pred.cpu())
    fold_test_dict=accuracy(fold,labelname_user,Y_test.cpu(),Y_test_pred.cpu())

    model.train() # Back to train mode

    return fold_train_dict,fold_test_dict,I(y_pred),I(Y_test_pred)

## 0H

In [22]:
# Notebook-level sizes and optimiser settings for the 0H (no hidden layer) run.
# Sizes are fixed by hand rather than read from the data.
input_size, hidden_size, output_size = 170, 16, 51   # features in, hidden width, labels out
n_epoch, bs = 80, 300                                # training epochs, minibatch size
lr_init, momentum = 0.1, 0.5                         # SGD initial learning rate and momentum
print('Input Size {}, Output Size {}'.format(input_size, output_size))

Input Size 170, Output Size 51


In [23]:
# Linear MLP no hidden layer
class LinearMLP(nn.Module):
    '''
    Single linear layer mapping the input features directly to one raw score
    per output (no hidden layer, no activation).

    in_features / out_features default to the notebook-level `input_size` and
    `output_size`, so `LinearMLP()` builds exactly the same network as before.
    '''
    def __init__(self,in_features=None,out_features=None):
        super(LinearMLP,self).__init__()
        # Fall back to the notebook globals only when a size is not given explicitly
        if in_features is None:
            in_features=input_size
        if out_features is None:
            out_features=output_size
        self.fc1=nn.Linear(in_features,out_features)
    def forward(self,x):
        '''Return the raw (pre-sigmoid) outputs of the linear layer for x.'''
        return self.fc1(x)
    
# Build one instance only to display the architecture below
model = LinearMLP()
print(model)

LinearMLP(
  (fc1): Linear(in_features=170, out_features=51, bias=True)
)


In [None]:
# Per-user label imputation with the 0H model: fit on one user's data, predict on
# the same data, and write the predictions back over that user's non-zero labels.

# Create the output directory up front so a missing folder fails fast instead of
# after a full training run.
os.makedirs('dataset/semisupervised_users_0H',exist_ok=True)

# sorted(): glob order is unspecified; sorting makes the user order reproducible
for g in sorted(glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*')):
    print("\n",g)
    fname=os.path.basename(g).split('.')[0] # file name up to the first dot
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(g)
    x_user_train,_,_=standardize(x_user_train)
    x_user_train=np.nan_to_num(x_user_train)

    feature_names=summarize_features_worig(featurename_user)
    x_user,feature_long_names=choose_sensors_longnames(x_user_train,sensor_types,feature_names)

    weights=instance_weight_matrix(y_user_train)
    print("\t Loaded datasets")

    model=LinearMLP() # Fresh, untrained model for every user

    # Train and "test" sets are deliberately the same user data: the test
    # predictions are what gets imputed below.
    mlp0H_train_dict,mlp0H_test_dict,y_train_pred,y_test_pred=train(model,
                                                           X=x_user,
                                                           Y=y_user_train,
                                                           X_test=x_user,
                                                           Y_test=y_user_train,
                                                           weights=weights,
                                                           n_epoch=n_epoch,
                                                           batch_size=bs,
                                                           lr_init=lr_init,
                                                           momentum=momentum,
                                                           fold=-1)
    print(mlp0H_test_dict)
    print("Y_train_pred==Y_test_pred:",np.array_equal(y_train_pred,y_test_pred))

    # Overwrite every non-zero label entry with the model's prediction.
    # NOTE(review): np.where(y) selects ALL non-zero entries. If missing labels are
    # encoded as -1 (the mask_value used during training -- TODO confirm), known
    # positive labels are replaced as well as missing ones.
    y_user_impute=copy.deepcopy(y_user_train)
    ind_x,ind_y=np.where(y_user_train)
    y_user_impute[ind_x,ind_y]=np.asarray(y_test_pred)[ind_x,ind_y]

    impute_unique,impute_counts=np.unique(y_user_impute,return_counts=True)
    print(*zip(impute_unique,impute_counts))

    with open('dataset/semisupervised_users_0H/{}_ssl_0H.pkl'.format(fname),'wb') as f:
        pickle.dump(y_user_impute,f) #train_pred==test_pred

    print("*"*50)

## 1H

In [25]:
# Notebook-level sizes and optimiser settings for the 1H (one hidden layer) run.
# Sizes are fixed by hand rather than read from the data.
input_size, hidden_size, output_size = 170, 16, 51   # features in, hidden width, labels out
n_epoch, bs = 80, 300                                # training epochs, minibatch size
lr_init, momentum = 0.1, 0.5                         # SGD initial learning rate and momentum
print('Input Size {}, Output Size {}'.format(input_size, output_size))

Input Size 170, Output Size 51


In [26]:
# Linear MLP 1 hidden layer
class MLP_1H(nn.Module):
    '''
    MLP with one hidden layer: Linear -> LeakyReLU(0.1) -> Linear.
    The output is one raw (pre-sigmoid) score per label.

    in_features / hidden_features / out_features default to the notebook-level
    `input_size`, `hidden_size` and `output_size`, so `MLP_1H()` builds exactly
    the same network as before.
    '''
    def __init__(self,in_features=None,hidden_features=None,out_features=None):
        super(MLP_1H,self).__init__()
        # Fall back to the notebook globals only when a size is not given explicitly
        if in_features is None:
            in_features=input_size
        if hidden_features is None:
            hidden_features=hidden_size
        if out_features is None:
            out_features=output_size
        self.hidden0=nn.Sequential(
            nn.Linear(in_features,hidden_features),
            nn.LeakyReLU(negative_slope=0.1)
        )
        self.out=nn.Sequential(
            nn.Linear(hidden_features,out_features)
        )

    def forward(self,x):
        '''Run x through the hidden block and return the output layer's raw scores.'''
        x = self.hidden0(x)
        return self.out(x)
    
# Build one MLP_1H instance only to display its architecture below
model = MLP_1H()
print(model)

MLP_1H(
  (hidden0): Sequential(
    (0): Linear(in_features=170, out_features=16, bias=True)
    (1): LeakyReLU(negative_slope=0.1)
  )
  (out): Sequential(
    (0): Linear(in_features=16, out_features=51, bias=True)
  )
)


In [None]:
# Per-user label imputation with the 1H model: fit on one user's data, predict on
# the same data, and write the predictions back over that user's non-zero labels.

# Create the output directory up front so a missing folder fails fast instead of
# after a full training run.
os.makedirs('dataset/semisupervised_users_1H',exist_ok=True)

# sorted(): glob order is unspecified; sorting makes the user order reproducible
for g in sorted(glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*')):
    print("\n",g)
    fname=os.path.basename(g).split('.')[0] # file name up to the first dot
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(g)
    x_user_train,_,_=standardize(x_user_train)
    x_user_train=np.nan_to_num(x_user_train)

    feature_names=summarize_features_worig(featurename_user)
    x_user,feature_long_names=choose_sensors_longnames(x_user_train,sensor_types,feature_names)

    weights=instance_weight_matrix(y_user_train)
    print("\t Loaded datasets")

    model=MLP_1H() # Fresh, untrained model for every user

    # Train and "test" sets are deliberately the same user data: the test
    # predictions are what gets imputed below.
    mlp1H_train_dict,mlp1H_test_dict,y_train_pred,y_test_pred=train(model,
                                                           X=x_user,
                                                           Y=y_user_train,
                                                           X_test=x_user,
                                                           Y_test=y_user_train,
                                                           weights=weights,
                                                           n_epoch=n_epoch,
                                                           batch_size=bs,
                                                           lr_init=lr_init,
                                                           momentum=momentum,
                                                           fold=-1)
    countlabels_user=np.sum(y_user_train,axis=0) # Column summary
    labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
    labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

    print('Activities and counts:')
    print(labelname_countlabel_user)

    print(mlp1H_test_dict)

    print("Y_train_pred==Y_test_pred:",np.array_equal(y_train_pred,y_test_pred))

    # Overwrite every non-zero label entry with the model's prediction.
    # NOTE(review): np.where(y) selects ALL non-zero entries. If missing labels are
    # encoded as -1 (the mask_value used during training -- TODO confirm), known
    # positive labels are replaced as well as missing ones.
    y_user_impute=copy.deepcopy(y_user_train)
    ind_x,ind_y=np.where(y_user_train)
    y_user_impute[ind_x,ind_y]=np.asarray(y_test_pred)[ind_x,ind_y]

    impute_unique,impute_counts=np.unique(y_user_impute,return_counts=True)
    print(*zip(impute_unique,impute_counts))

    with open('dataset/semisupervised_users_1H/{}_ssl_1H.pkl'.format(fname),'wb') as f:
        pickle.dump(y_user_impute,f) #train_pred==test_pred

    print("*"*50)

## 2H

In [28]:
# Notebook-level sizes and optimiser settings for the 2H (two hidden layers) run.
# Sizes are fixed by hand rather than read from the data.
input_size, hidden_size, output_size = 170, 16, 51   # features in, hidden width, labels out
n_epoch, bs = 80, 300                                # training epochs, minibatch size
lr_init, momentum = 0.1, 0.5                         # SGD initial learning rate and momentum
print('Input Size {}, Output Size {}'.format(input_size, output_size))

Input Size 170, Output Size 51


In [29]:
class MLP_2H(nn.Module):
    '''
    MLP with two hidden layers of equal width:
    Linear -> LeakyReLU(0.1) -> Linear -> LeakyReLU(0.1) -> Linear.
    The output is one raw (pre-sigmoid) score per label.

    in_features / hidden_features / out_features default to the notebook-level
    `input_size`, `hidden_size` and `output_size`, so `MLP_2H()` builds exactly
    the same network as before.
    '''
    def __init__(self,in_features=None,hidden_features=None,out_features=None):
        super(MLP_2H,self).__init__()
        # Fall back to the notebook globals only when a size is not given explicitly
        if in_features is None:
            in_features=input_size
        if hidden_features is None:
            hidden_features=hidden_size
        if out_features is None:
            out_features=output_size
        self.hidden0=nn.Sequential(
            nn.Linear(in_features,hidden_features),
            nn.LeakyReLU(negative_slope=0.1)
        )
        self.hidden1=nn.Sequential(
            nn.Linear(hidden_features,hidden_features),
            nn.LeakyReLU(negative_slope=0.1)
        )
        self.out=nn.Sequential(
            nn.Linear(hidden_features,out_features)
        )

    def forward(self,x):
        '''Run x through both hidden blocks and return the output layer's raw scores.'''
        x = self.hidden0(x)
        x = self.hidden1(x)
        return self.out(x)
    
# Build one MLP_2H instance only to display its architecture below
model = MLP_2H()
print(model)

MLP_2H(
  (hidden0): Sequential(
    (0): Linear(in_features=170, out_features=16, bias=True)
    (1): LeakyReLU(negative_slope=0.1)
  )
  (hidden1): Sequential(
    (0): Linear(in_features=16, out_features=16, bias=True)
    (1): LeakyReLU(negative_slope=0.1)
  )
  (out): Sequential(
    (0): Linear(in_features=16, out_features=51, bias=True)
  )
)


In [None]:
# Per-user label imputation with the 2H model: fit on one user's data, predict on
# the same data, and write the predictions back over that user's non-zero labels.

# Create the output directory up front so a missing folder fails fast instead of
# after a full training run.
os.makedirs('dataset/semisupervised_users_2H',exist_ok=True)

# sorted(): glob order is unspecified; sorting makes the user order reproducible
for g in sorted(glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*')):
    print("\n",g)
    fname=os.path.basename(g).split('.')[0] # file name up to the first dot
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(g)
    x_user_train,_,_=standardize(x_user_train)
    x_user_train=np.nan_to_num(x_user_train)

    feature_names=summarize_features_worig(featurename_user)
    x_user,feature_long_names=choose_sensors_longnames(x_user_train,sensor_types,feature_names)

    weights=instance_weight_matrix(y_user_train)
    print("\t Loaded datasets")

    model=MLP_2H() # Fresh, untrained model for every user

    # Train and "test" sets are deliberately the same user data: the test
    # predictions are what gets imputed below.
    mlp2H_train_dict,mlp2H_test_dict,y_train_pred,y_test_pred=train(model,
                                                           X=x_user,
                                                           Y=y_user_train,
                                                           X_test=x_user,
                                                           Y_test=y_user_train,
                                                           weights=weights,
                                                           n_epoch=n_epoch,
                                                           batch_size=bs,
                                                           lr_init=lr_init,
                                                           momentum=momentum,
                                                           fold=-1)
    countlabels_user=np.sum(y_user_train,axis=0) # Column summary
    labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
    labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

    print('Activities and counts:')
    print(labelname_countlabel_user)

    print(mlp2H_test_dict)

    print("Y_train_pred==Y_test_pred:",np.array_equal(y_train_pred,y_test_pred))

    # Overwrite every non-zero label entry with the model's prediction.
    # NOTE(review): np.where(y) selects ALL non-zero entries. If missing labels are
    # encoded as -1 (the mask_value used during training -- TODO confirm), known
    # positive labels are replaced as well as missing ones.
    y_user_impute=copy.deepcopy(y_user_train)
    ind_x,ind_y=np.where(y_user_train)
    y_user_impute[ind_x,ind_y]=np.asarray(y_test_pred)[ind_x,ind_y]

    impute_unique,impute_counts=np.unique(y_user_impute,return_counts=True)
    print(*zip(impute_unique,impute_counts))

    with open('dataset/semisupervised_users_2H/{}_ssl_2H.pkl'.format(fname),'wb') as f:
        pickle.dump(y_user_impute,f) #train_pred==test_pred

    print("*"*50)

## 2HDrop

In [31]:
# Notebook-level sizes and optimiser settings for the 2HDrop (two hidden layers + dropout) run.
# Sizes are fixed by hand rather than read from the data.
input_size, hidden_size, output_size = 170, 16, 51   # features in, hidden width, labels out
n_epoch, bs = 80, 300                                # training epochs, minibatch size
lr_init, momentum = 0.1, 0.5                         # SGD initial learning rate and momentum
print('Input Size {}, Output Size {}'.format(input_size, output_size))

Input Size 170, Output Size 51


In [32]:
class MLP_2HDrop(nn.Module):
    '''
    MLP with two hidden layers of equal width, each followed by dropout:
    Linear -> LeakyReLU(0.1) -> Dropout -> Linear -> LeakyReLU(0.1) -> Dropout -> Linear.
    The output is one raw (pre-sigmoid) score per label.

    in_features / hidden_features / out_features default to the notebook-level
    `input_size`, `hidden_size` and `output_size`, and dropout_p defaults to
    0.20, so `MLP_2HDrop()` builds exactly the same network as before.
    '''
    def __init__(self,in_features=None,hidden_features=None,out_features=None,dropout_p=0.20):
        super(MLP_2HDrop,self).__init__()
        # Fall back to the notebook globals only when a size is not given explicitly
        if in_features is None:
            in_features=input_size
        if hidden_features is None:
            hidden_features=hidden_size
        if out_features is None:
            out_features=output_size
        self.hidden0=nn.Sequential(
            nn.Linear(in_features,hidden_features),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Dropout(dropout_p)
        )
        self.hidden1=nn.Sequential(
            nn.Linear(hidden_features,hidden_features),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Dropout(dropout_p)
        )
        self.out=nn.Sequential(
            nn.Linear(hidden_features,out_features)
        )

    def forward(self,x):
        '''Run x through both hidden blocks and return the output layer's raw scores.'''
        x = self.hidden0(x)
        x = self.hidden1(x)
        return self.out(x)
    
# Build one MLP_2HDrop instance only to display its architecture below
model = MLP_2HDrop()
print(model)

MLP_2HDrop(
  (hidden0): Sequential(
    (0): Linear(in_features=170, out_features=16, bias=True)
    (1): LeakyReLU(negative_slope=0.1)
    (2): Dropout(p=0.2)
  )
  (hidden1): Sequential(
    (0): Linear(in_features=16, out_features=16, bias=True)
    (1): LeakyReLU(negative_slope=0.1)
    (2): Dropout(p=0.2)
  )
  (out): Sequential(
    (0): Linear(in_features=16, out_features=51, bias=True)
  )
)


In [None]:
# Per-user label imputation with the 2HDrop model and sensor dropout: fit on one
# user's data, predict on the same data, and write the predictions back over that
# user's non-zero labels.

# Create the output directory up front so a missing folder fails fast instead of
# after a full training run.
os.makedirs('dataset/semisupervised_users_2Hdrop',exist_ok=True)

# sorted(): glob order is unspecified; sorting makes the user order reproducible
for g in sorted(glob.glob('dataset/Extrasensory_uuid_fl_uTAR/*')):
    print("\n",g)
    fname=os.path.basename(g).split('.')[0] # file name up to the first dot
    (x_user_train,y_user_train,missed_label_user,tstamp_user,featurename_user,labelname_user) = read_user_data(g)
    x_user_train,_,_=standardize(x_user_train)
    x_user_train=np.nan_to_num(x_user_train)

    feature_names=summarize_features_worig(featurename_user)
    # feature_long_names is also read as a global by random_choice during training
    x_user,feature_long_names=choose_sensors_longnames(x_user_train,sensor_types,feature_names)

    weights=instance_weight_matrix(y_user_train)
    print("\t Loaded datasets")

    model=MLP_2HDrop() # Fresh, untrained model for every user

    # Train and "test" sets are deliberately the same user data: the test
    # predictions are what gets imputed below.
    mlp2Hdrop_train_dict,mlp2Hdrop_test_dict,y_train_pred,y_test_pred=train_sensordropout(model,
                                                             X=x_user,
                                                             Y=y_user_train,
                                                             X_test=x_user,
                                                             Y_test=y_user_train,
                                                             weights=weights,
                                                             sensor_list=sensor_types,
                                                             feature_names=feature_names,
                                                             n_epoch=n_epoch,
                                                             batch_size=bs,
                                                             lr_init=lr_init,
                                                             momentum=momentum,
                                                             fold=-1)

    countlabels_user=np.sum(y_user_train,axis=0) # Column summary
    labelname_countlabel_user=zip(labelname_user,countlabels_user) # Zip together names, counts
    labelname_countlabel_user=sorted(labelname_countlabel_user,key=lambda row:row[-1],reverse=True)

    print('Activities and counts:')
    print(labelname_countlabel_user)

    print(mlp2Hdrop_test_dict)

    print("Y_train_pred==Y_test_pred:",np.array_equal(y_train_pred,y_test_pred))

    # Overwrite every non-zero label entry with the model's prediction.
    # NOTE(review): np.where(y) selects ALL non-zero entries. If missing labels are
    # encoded as -1 (the mask_value used during training -- TODO confirm), known
    # positive labels are replaced as well as missing ones.
    y_user_impute=copy.deepcopy(y_user_train)
    ind_x,ind_y=np.where(y_user_train)
    y_user_impute[ind_x,ind_y]=np.asarray(y_test_pred)[ind_x,ind_y]

    impute_unique,impute_counts=np.unique(y_user_impute,return_counts=True)
    print(*zip(impute_unique,impute_counts))

    with open('dataset/semisupervised_users_2Hdrop/{}_ssl_2Hdrop.pkl'.format(fname),'wb') as f:
        pickle.dump(y_user_impute,f) #train_pred==test_pred

    print("*"*50)