In [5]:
import argparse
import os
import numpy as np
import pandas as pd
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data
from sklearn.linear_model import Lasso
from sklearn.metrics import f1_score, precision_score,recall_score,roc_auc_score,confusion_matrix
from scipy import sparse

# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# LOGGING
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
config = configuration.get_config()
logging = logger.init_logger(config['lasso_log'])
logging.debug('Starting LASSO experiments ...')

# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# MANAGING ARGUMENTS
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
numerical_features = True
categorical_features = True 
diagnosis_features = True 
intervention_features = True 

fix_missings = True
normalize = False
fix_skew = False
use_idf = False
# class_balanced = True
remove_outliers = False

params = {'fix_skew': fix_skew,
            'normalize': normalize,
            'fix_missing_in_testing': fix_missings,
            'numerical_features': numerical_features,
            'categorical_features': categorical_features,
            'diagnosis_features': diagnosis_features,
            'intervention_features':intervention_features,
            'use_idf':use_idf,
            # 'class_balanced':class_balanced,
            'remove_outliers': remove_outliers,
        }

for key, value in params.items():
    logging.debug(f'{key:30}={value}')

# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# RETRIEVING TRAIN AND TEST
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
training ,testing = health_data.Admission.get_training_testing_data()
if params['fix_missing_in_testing']:
    for admission in testing:
        admission.fix_missings(training)
        
logging.debug(f'Training size={len(training):,}')
logging.debug(f'Testing  size={len(testing):,}')

# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# TRAINING MATRIX
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
features = []
if params['numerical_features']:
    numerical_df = health_data.Admission.numerical_features(training, 
                                                            fix_skew=params['fix_skew'], 
                                                            normalize=params['normalize'])
    if params['remove_outliers']:
        stds = np.std(numerical_df)
        mean = np.mean(numerical_df, axis=0)
        is_outlier=np.sum(numerical_df.values > (mean*4*stds).values, axis=1)>0

    features.append(sparse.csr_matrix(numerical_df.values))

if params['categorical_features']:
    categorical_df, main_pt_services_list = health_data.Admission.categorical_features(training)
    features.append(sparse.csr_matrix(categorical_df.values))

if params['diagnosis_features']:
    vocab_diagnosis, diagnosis_matrix = health_data.Admission.diagnosis_codes_features(training, 
                                                                                    use_idf=params['use_idf'])
    features.append(diagnosis_matrix)

if params['intervention_features']:
    vocab_interventions, intervention_matrix = health_data.Admission.intervention_codes_features(training, 
                                                                                                use_idf=params['use_idf'])
    features.append(intervention_matrix)

if params['remove_outliers']:
    mask=~is_outlier
else:
    mask = np.ones(shape=(len(training)))==1

X_train = sparse.hstack([matrix[mask,:] for matrix in features])
y_train = health_data.Admission.get_y(training)[mask]

# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# TESTING MATRIX
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
features = []
if params['numerical_features']:
    numerical_df = health_data.Admission.numerical_features(testing, 
                                                            fix_skew=params['fix_skew'], 
                                                            normalize=params['normalize'])
    features.append(sparse.csr_matrix(numerical_df.values))

if params['categorical_features']:
    categorical_df,_ = health_data.Admission.categorical_features(testing, main_pt_services_list=main_pt_services_list)
    features.append(sparse.csr_matrix(categorical_df.values))

if params['diagnosis_features']:
    vocab_diagnosis, diagnosis_matrix = health_data.Admission.diagnosis_codes_features(testing, 
                                                                                    vocabulary=vocab_diagnosis, 
                                                                                    use_idf=params['use_idf'])
    features.append(diagnosis_matrix)

if params['intervention_features']:
    vocab_interventions, intervention_matrix = health_data.Admission.intervention_codes_features(testing, 
                                                                                                vocabulary=vocab_interventions, 
                                                                                                use_idf=params['use_idf']
                                                                                                )
    features.append(intervention_matrix)

X_test = sparse.hstack(features)
y_test = health_data.Admission.get_y(testing)


logging.debug(f'X_train.shape = ({X_train.shape[0]:,} x {X_train.shape[1]:,})')
logging.debug(f'y_train.shape = ({y_train.shape[0]:,} x )')
# print()
logging.debug(f'X_test.shape =  ({X_test.shape[0]:,} x {X_test.shape[1]:,})')
logging.debug(f'y_test.shape =  ({y_test.shape[0]:,} x )')


# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# LOGISTIC REGRESSION MODEL 
# ---------- ---------- ---------- ---------- ---------- ---------- ---------- ---------- 
# class_weight = 'balanced' if params['class_balanced'] else None
clf = Lasso(max_iter=7000,).fit(X_train, y_train,)

y_true = y_train
y_pred = clf.predict(X_train) > 0.5
y_score= clf.predict(X_train)

model_name = str(clf)
columns = ['Model','params','split','TN','FP','FN','TP','Precision','Recall','F1-Score','AUC']
str_ = ';'.join(columns)
logging.debug(str_)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
str_ = f'{model_name};{str(params)};TRAIN;{tn};{fp};{fn};{tp};{precision_score(y_true, y_pred,)};{recall_score(y_true, y_pred,)};'\
    f'{f1_score(y_true, y_pred,)};{roc_auc_score(y_true=y_true, y_score=y_pred)}'
logging.debug(str_)


vec1 = [model_name,
        str(params),
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

y_true = y_test 
y_pred = clf.predict(X_test) > 0.5
y_score= clf.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
str_ = f'{model_name};{str(params)};TEST;{tn};{fp};{fn};{tp};{precision_score(y_true, y_pred,)};{recall_score(y_true, y_pred,)};'\
    f'{f1_score(y_true, y_pred,):};{roc_auc_score(y_true=y_true, y_score=y_pred)}'
logging.debug(str_)                         

vec2 = [model_name,
        str(params),
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

m = np.vstack([vec1, vec2])
df = pd.DataFrame(m, columns=columns)

if os.path.isfile(config['logreg_results']):
    old_df = pd.read_csv(config['logreg_results'], sep=';')
    df = pd.concat([old_df,df])

# df.to_csv(config['logreg_results'], index=False, sep=';')
logging.debug('Finishing Logistic Regression execution')


2023-11-22 09:25:11,353 - root - DEBUG - Starting LASSO experiments ...
2023-11-22 09:25:11,363 - root - DEBUG - fix_skew                      =False
2023-11-22 09:25:11,367 - root - DEBUG - normalize                     =False
2023-11-22 09:25:11,369 - root - DEBUG - fix_missing_in_testing        =True
2023-11-22 09:25:11,372 - root - DEBUG - numerical_features            =True
2023-11-22 09:25:11,376 - root - DEBUG - categorical_features          =True
2023-11-22 09:25:11,377 - root - DEBUG - diagnosis_features            =True
2023-11-22 09:25:11,377 - root - DEBUG - intervention_features         =True
2023-11-22 09:25:11,378 - root - DEBUG - use_idf                       =False
2023-11-22 09:25:11,378 - root - DEBUG - remove_outliers               =False


Training instances before filtering: 419988
Training instances after filtering:  419139
Testomg instances before filtering:  104998
Testomg instances after filtering:   104884
2023-11-22 09:25:46,723 - root - DEBUG - Training size=419,139
2023-11-22 09:25:46,724 - root - DEBUG - Testing  size=104,884
2023-11-22 09:26:06,935 - root - DEBUG - X_train.shape = (419,139 x 17,129)
2023-11-22 09:26:06,936 - root - DEBUG - y_train.shape = (419,139 x )
2023-11-22 09:26:06,937 - root - DEBUG - X_test.shape =  (104,884 x 17,129)
2023-11-22 09:26:06,937 - root - DEBUG - y_test.shape =  (104,884 x )
2023-11-22 09:26:30,946 - root - DEBUG - Model;params;split;TN;FP;FN;TP;Precision;Recall;F1-Score;AUC


  _warn_prf(average, modifier, msg_start, len(result))


2023-11-22 09:26:31,361 - root - DEBUG - Lasso(max_iter=7000);{'fix_skew': False, 'normalize': False, 'fix_missing_in_testing': True, 'numerical_features': True, 'categorical_features': True, 'diagnosis_features': True, 'intervention_features': True, 'use_idf': False, 'remove_outliers': False};TRAIN;401737;0;17402;0;0.0;0.0;0.0;0.5


  _warn_prf(average, modifier, msg_start, len(result))


2023-11-22 09:26:31,834 - root - DEBUG - Lasso(max_iter=7000);{'fix_skew': False, 'normalize': False, 'fix_missing_in_testing': True, 'numerical_features': True, 'categorical_features': True, 'diagnosis_features': True, 'intervention_features': True, 'use_idf': False, 'remove_outliers': False};TEST;100351;0;4533;0;0.0;0.0;0.0;0.5
2023-11-22 09:26:31,927 - root - DEBUG - Finishing Logistic Regression execution


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
clf.predict(X_train).shape

(419139,)