In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import numpy.linalg as la
from numpy import log
from scipy.special import digamma
from sklearn.neighbors import BallTree, KDTree
import sklearn

import sys
import os
import matplotlib.pyplot as plt
from group_lasso import LogisticGroupLasso
import pandas as pd
import sklearn

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
def one_hot_encoder(X_static):
    X_static = pd.concat(
        [
            X_static[["Age", "Gender", "SAPSIIIScore", "MonthOfAdmission", "YearOfAdmission"]], 
            pd.get_dummies(X_static.Origin, prefix='Origin'), 
            pd.get_dummies(X_static.ReasonAdmission, prefix='ReasonAdmission'), 
            pd.get_dummies(X_static.PatientCategory, prefix='PatientCategory')
        ],
        axis=1
    )
    
    return X_static

# Load data

In [3]:
i = 0
n = 4

X_train = pd.read_csv("../splits_14_days/notbalanced/split_" + str(i) + "/X_train_static_" + str(n)+ ".csv")

y_train = pd.read_csv("../splits_14_days/notbalanced/split_" + str(i) + "/y_train_" + str(n)+ ".csv", index_col=0)

X_val = pd.read_csv("../splits_14_days/notbalanced/split_" + str(i) + "/X_val_static_" + str(n)+ ".csv")

y_val = pd.read_csv("../splits_14_days/notbalanced/split_" + str(i) + "/y_val_" + str(n)+ ".csv", index_col=0)


X = pd.concat([X_train, X_val], axis=0)
y = y_train.append(y_val)

In [4]:
X.head()

Unnamed: 0.1,Unnamed: 0,Age,Gender,SAPSIIIScore,MonthOfAdmission,YearOfAdmission,Origin,ReasonAdmission,PatientCategory
0,0,-0.089553,0.804829,-0.129789,-0.10561,-1.891012,2.0,2.0,2.0
1,1,-0.680868,0.804829,-0.129789,0.171988,-1.891012,1.0,5.0,2.0
2,2,-0.943675,0.804829,-0.129789,0.171988,-1.891012,3.0,4.0,1.0
3,3,1.027375,-1.2425,-0.129789,0.171988,-1.891012,4.0,7.0,1.0
4,4,-1.403586,0.804829,-0.129789,0.171988,-1.891012,2.0,8.0,1.0


In [5]:
X = one_hot_encoder(X)
X.head()

Unnamed: 0,Age,Gender,SAPSIIIScore,MonthOfAdmission,YearOfAdmission,Origin_1.0,Origin_2.0,Origin_3.0,Origin_4.0,Origin_5.0,...,ReasonAdmission_16.0,ReasonAdmission_17.0,ReasonAdmission_18.0,ReasonAdmission_19.0,ReasonAdmission_20.0,ReasonAdmission_21.0,ReasonAdmission_22.0,PatientCategory_1.0,PatientCategory_2.0,PatientCategory_3.0
0,-0.089553,0.804829,-0.129789,-0.10561,-1.891012,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,-0.680868,0.804829,-0.129789,0.171988,-1.891012,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,-0.943675,0.804829,-0.129789,0.171988,-1.891012,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1.027375,-1.2425,-0.129789,0.171988,-1.891012,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,-1.403586,0.804829,-0.129789,0.171988,-1.891012,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Code - Glasso Static

In [6]:
NFeatures = 48
NTimesteps = 1

groups = np.zeros(NFeatures * NTimesteps)
for i in range(NFeatures):
    
    groups[i*NTimesteps:i*NTimesteps + NTimesteps] = i
groups = groups + 1 
print(groups)

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36.
 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48.]


In [7]:
group_reg = [0.0025, 0.005, 0.006, 0.0075, 0.01]

v_sparsity_mask= []
best_accuracy = 0
for i in range(len(group_reg)):
    gl = LogisticGroupLasso(groups=groups, 
                            group_reg=group_reg[i], l1_reg=0,
                            n_iter= 15000, tol = 0, supress_warning=True)

    gl.fit(X, y)

    # Extract info from estimator
    pred_y = gl.predict(X)
    sparsity_mask = gl.sparsity_mask_
    w_hat = gl.coef_

    # Compute performance metrics
    accuracy = (pred_y == y.individualMRGerm.values).mean()
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_group_reg = group_reg
        best_sparsity_mask = sparsity_mask  
        best_w_hat = gl.coef_     
        
    v_sparsity_mask.append(sparsity_mask)
    
print(f"Best group_reg value: {best_group_reg}")
print(f"Best accuracy: {best_accuracy}")
print(f"Best number of chosen time variables: {best_sparsity_mask.sum()}")
    


Best group_reg value: [0.0025, 0.005, 0.006, 0.0075, 0.01]
Best accuracy: 0.8044338875692795
Best number of chosen time variables: 2


In [8]:
pd.DataFrame(best_sparsity_mask, X.keys()).to_csv("results_GLASSO_statics.csv")