In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import numpy.linalg as la
from numpy import log
from scipy.special import digamma
from sklearn.neighbors import BallTree, KDTree
import sklearn

import sys
import os
import matplotlib.pyplot as plt
from group_lasso import LogisticGroupLasso
import pandas as pd
import sklearn

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from group_lasso import LogisticGroupLasso

In [2]:
def one_hot_encoder(X_static):
    X_static = pd.concat(
        [
            X_static[["Age", "Gender", "SAPSIIIScore", "MonthOfAdmission", "YearOfAdmission"]], 
            pd.get_dummies(X_static.Origin, prefix='Origin'), 
            pd.get_dummies(X_static.ReasonAdmission, prefix='ReasonAdmission'), 
            pd.get_dummies(X_static.PatientCategory, prefix='PatientCategory')
        ],
        axis=1
    )
    
    return X_static

# GLASSO - Dynamic

In [3]:
i = 0
n = 4

X_train = np.load("../../../ORIGINAL_DATA/MDR/splits_14_days/notbalanced/split_" + str(i) + "/X_train_tensor_" + str(n)+ ".npy")

y_train = pd.read_csv("../../../ORIGINAL_DATA/MDR/splits_14_days/notbalanced/split_" + str(i) + "/y_train_" + str(n)+ ".csv", index_col=0)

X_val = np.load("../../../ORIGINAL_DATA/MDR/splits_14_days/notbalanced/split_" + str(i) + "/X_val_tensor_" + str(n)+ ".npy")

y_val = pd.read_csv("../../../ORIGINAL_DATA/MDR/splits_14_days/notbalanced/split_" + str(i) + "/y_val_" + str(n)+ ".csv", index_col=0)


X = np.concatenate([X_train, X_val], axis=0)
y = y_train.append(y_val)

In [4]:
NTimesteps = 14
NFeatures = 56
groups = np.zeros(NFeatures * NTimesteps)
for i in range(NFeatures):
    groups[i*NTimesteps:i*NTimesteps + NTimesteps] = i
groups = groups + 1 
print(groups)

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  2.  2.  2.  2.
  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  4.  4.  4.  4.  4.  4.  4.  4.  4.  4.  4.  4.
  4.  4.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  6.  6.
  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.
  8.  8.  8.  8.  9.  9.  9.  9.  9.  9.  9.  9.  9.  9.  9.  9.  9.  9.
 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 11. 11. 11. 11.
 11. 11. 11. 11. 11. 11. 11. 11. 11. 11. 12. 12. 12. 12. 12. 12. 12. 12.
 12. 12. 12. 12. 12. 12. 13. 13. 13. 13. 13. 13. 13. 13. 13. 13. 13. 13.
 13. 13. 14. 14. 14. 14. 14. 14. 14. 14. 14. 14. 14. 14. 14. 14. 15. 15.
 15. 15. 15. 15. 15. 15. 15. 15. 15. 15. 15. 15. 16. 16. 16. 16. 16. 16.
 16. 16. 16. 16. 16. 16. 16. 16. 17. 17. 17. 17. 17. 17. 17. 17. 17. 17.
 17. 17. 17. 17. 18. 18. 18. 18. 18. 18. 18. 18. 18

In [5]:
v_sparsity_mask = []
best_accuracy = 0
P, T, F = X.shape
X_reshaped = X.reshape(P, T * F)


group_reg_values = [0.0025, 0.005, 0.006, 0.0075, 0.01]
best_group_reg = None 

for group_reg in group_reg_values:
    gl = LogisticGroupLasso(groups=groups, 
                            group_reg=group_reg, l1_reg=0,
                            n_iter=15000, tol=0, supress_warning=True, random_state=42)

    # Fit the model
    gl.fit(X_reshaped, y)

    # Extract predictions and sparsity mask
    pred_y = gl.predict(X_reshaped)
    sparsity_mask = gl.sparsity_mask_
    
    # Compute performance metrics (accuracy in this case)
    accuracy = (pred_y == y.individualMRGerm.values).mean()  # Assuming y is a binary classification label
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_group_reg = group_reg
        best_sparsity_mask = sparsity_mask  
        best_w_hat = gl.coef_              
    
print(f"Best group_reg value: {best_group_reg}")
print(f"Best accuracy: {best_accuracy}")
print(f"Best number of chosen time variables: {best_sparsity_mask.sum()}")
print(f"Best number of chosen variables: {best_sparsity_mask.sum() / NTimesteps}")


Best group_reg value: 0.01
Best accuracy: 0.8068091844813935
Best number of chosen time variables: 219
Best number of chosen variables: 15.642857142857142


In [6]:
temporaryKeys = [
    'AMG', 'ATF', 'CAR', 'CF1', 'CF2', 'CF3', 'CF4', 'Falta', 'GCC', 'GLI', 'LIN', 'LIP', 'MAC', 'MON', 'NTI', 
    'OTR', 'OXA', 'PAP', 'PEN', 'POL','QUI', 'SUL', 'TTC', 
    'pc_acinet', 'pc_enterob', 'pc_enteroc', 'pc_pseud', 'pc_staph', 'pc_stenot', 'pc_no_germ',
    'isVM',
    'numberOfPatients', 'numberOfPatientsMR',
    'neighbor_AMG', 'neighbor_ATF', 'neighbor_CAR', 
    'neighbor_CF1', 'neighbor_CF2', 'neighbor_CF3', 'neighbor_CF4', 'neighbor_Falta',
    'neighbor_GCC', 'neighbor_GLI', 'neighbor_LIN',  'neighbor_LIP',
    'neighbor_MAC', 'neighbor_MON', 'neighbor_NTI', 'neighbor_OTR', 'neighbor_OXA',
    'neighbor_PAP', 'neighbor_PEN', 'neighbor_POL', 'neighbor_QUI', 'neighbor_SUL', 
    'neighbor_TTC',
]

keys_2 = []
for i in range(len(temporaryKeys)):
    keys_2.extend([temporaryKeys[i] + str(s) for s in np.arange(0, 14, 1).tolist()])

In [7]:
pd.DataFrame(best_sparsity_mask, keys_2).to_csv("glasso_results_notbalanced.csv")