2023-12-07 18:36:15,497 - root - DEBUG - Starting Neural Networks experiments ...


In [5]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')
from utilities import configuration
from utilities import logger
from utilities import health_data
import json
import argparse
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import os
import ast

config = configuration.get_config()
logging = logger.init_logger(config['nn_log'])
logging.debug('Starting Neural Networks experiments ...')

model_configurations = json.load(open(config['models_config'], encoding='utf-8'))
experiment_configurations = json.load(open(config['experiments_config'], encoding='utf-8'))


for configuration_id, configuration_dict in experiment_configurations.items():
        params = configuration_dict
        X_train, y_train, X_test, y_test = health_data.Admission.get_train_test_matrices(fix_missing_in_testing=params['fix_missing_in_testing'],
                                                                                        normalize=params['normalize'],
                                                                                        fix_skew=params['fix_skew'],
                                                                                        numerical_features=params['numerical_features'],
                                                                                        categorical_features=params['categorical_features'],
                                                                                        diagnosis_features=params['diagnosis_features'],
                                                                                        intervention_features=params['intervention_features'],
                                                                                        use_idf=params['use_idf'],
                                                                                        remove_outliers=params['remove_outliers'],
                                                                                        )
        X_train = X_train[:2000,:]
        y_train = y_train[:2000]
        X_test = X_test[:2000,:]
        y_test = y_test[:2000]
        for model_id, model_dict in list(model_configurations.items())[8:]:
                print(f"Working with model: {model_dict['model_name']}")
                if os.path.isfile(config['experiment_results']):
                        auxdf = pd.read_csv(config['experiment_results'], sep=';')
                        model_ids = set([model_id for model_id in auxdf['model_id']])
                        configuration_ids = set([model_id for model_id in auxdf['config_id']])
                        if model_id in model_ids and configuration_id in configuration_ids:
                                print('SKIPPING ...')
                                continue
                        else:
                                print('NOT SKIPPING')

                model = configuration.model_from_configuration(model_dict)

                model.fit(X_train, y_train)
                y_true = y_train
                y_pred = model.predict(X_train)
                y_score= model.predict_proba(X_train)

                model_name = model_dict['model_name']
                columns = ['Model','split','TN','FP','FN','TP','Precision','Recall','F1-Score','AUC', 'experiment_params', 'model_params', 'config_id', 'model_id']

                tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

                vec1 = [model_name,
                        'TRAIN',
                        tn,
                        fp,
                        fn,
                        tp,
                        precision_score(y_true, y_pred,),
                        recall_score(y_true, y_pred,),
                        f1_score(y_true, y_pred,),
                        roc_auc_score(y_true=y_true, y_score=y_pred),
                        str(configuration_dict),
                        str(model_dict),
                        configuration_id,
                        model_id
                        ]

                y_true = y_test
                y_pred = model.predict(X_test)
                y_score= model.predict_proba(X_test)

                tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()                       

                vec2 = [model_name,
                        'TEST',
                        tn,
                        fp,
                        fn,
                        tp,
                        precision_score(y_true, y_pred,),
                        recall_score(y_true, y_pred,),
                        f1_score(y_true, y_pred,),
                        roc_auc_score(y_true=y_true, y_score=y_pred),
                        str(configuration_dict),
                        str(model_dict),
                        configuration_id,
                        model_id
                        ]
                m = np.vstack([vec1, vec2])
                new_df = pd.DataFrame(m, columns=columns)

                if os.path.isfile(config['experiment_results']):
                        old_df = pd.read_csv(config['experiment_results'], sep=';')
                        new_df = pd.concat([old_df,new_df])

                new_df.to_csv(config['experiment_results'], index=False, sep=';')
        

Working with model: LogisticRegression(class_weight=balanced)
SKIPPING ...
Working with model: SVC(class_weight=balanced)
SKIPPING ...


In [3]:
import numpy as np

model_configurations = json.load(open(config['models_config'], encoding='utf-8'))

X = np.random.rand(1000,5)
y = np.random.randint(low=0, high=2, size=1000)
for model_no, model_dict in model_configurations.items():
    print(f"Working with model: {model_dict['model_name']}")
    model = configuration.model_from_configuration(model_dict)
    model.fit(X,y)
    X2 = np.random.rand(5000,5)

    yhat = model.predict(X2)
    y_score = model.predict_proba(X2)

    print(f'yhat (yhat.shape={yhat.shape})       = {yhat[:10]}')
    print(f'yhat (y_score.shape={y_score.shape}) = {y_score[:10]}')
    print()

Working with model: SVC
yhat (yhat.shape=(5000,))       = [1 1 1 0 1 1 0 1 0 0]
yhat (y_score.shape=(5000, 2)) = [[0.49147918 0.50852082]
 [0.48192603 0.51807397]
 [0.49061261 0.50938739]
 [0.50623495 0.49376505]
 [0.47869663 0.52130337]
 [0.48404542 0.51595458]
 [0.50646858 0.49353142]
 [0.48848093 0.51151907]
 [0.4917061  0.5082939 ]
 [0.5        0.5       ]]

Working with model: DecisionTreeClassifier
yhat (yhat.shape=(5000,))       = [1 1 1 0 1 1 1 1 1 1]
yhat (y_score.shape=(5000, 2)) = [[0.48737374 0.51262626]
 [0.48737374 0.51262626]
 [0.48737374 0.51262626]
 [0.62962963 0.37037037]
 [0.48737374 0.51262626]
 [0.48737374 0.51262626]
 [0.48737374 0.51262626]
 [0.48737374 0.51262626]
 [0.48737374 0.51262626]
 [0.48737374 0.51262626]]

Working with model: LogisticRegression
yhat (yhat.shape=(5000,))       = [1 1 0 0 0 1 0 1 0 0]
yhat (y_score.shape=(5000, 2)) = [[0.47994365 0.52005635]
 [0.48952994 0.51047006]
 [0.51832845 0.48167155]
 [0.50766356 0.49233644]
 [0.50514037 0.49485963



In [15]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X, y)

X_2 = np.random.rand(w)
clf.predict_proba()

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [8]:

# parser = argparse.ArgumentParser()
# parser.add_argument('--remove-outliers', action=argparse.BooleanOptionalAction)
# args = parser.parse_args()

params = {'fix_skew': False,
            'normalize': False,
            'fix_missing_in_testing': True,
            'numerical_features': True,
            'categorical_features': True,
            'diagnosis_features': True,
            'intervention_features':True,
            'use_idf':False,
            'class_balanced':False,
            'remove_outliers': True,
        }

# for key, value in params.items():
#     logging.debug(f'{key:30}={value}')



In [14]:
X_train, y_train, X_test, y_test = health_data.Admission.get_train_test_matrices(fix_missing_in_testing=params['fix_missing_in_testing'],
                                                                                 normalize=params['normalize'],
                                                                                 fix_skew=params['fix_skew'],
                                                                                 numerical_features=params['numerical_features'],
                                                                                 categorical_features=params['categorical_features'],
                                                                                 diagnosis_features=params['diagnosis_features'],
                                                                                 intervention_features=params['intervention_features'],
                                                                                 use_idf=params['use_idf'],
                                                                                 remove_outliers=params['remove_outliers'],
                                                                                 )

In [36]:
balanced_weight

{0: 0.5217392936794599, 1: 11.999913644214162}

In [40]:
11.999913644214162*(X_train[y_train==0]).shape[1]

205630.5202072539

In [46]:
(X_train[y_train==0]).shape[0]*0.25

99876.75

In [39]:
print((X_train[y_train==0]).shape)
print((X_train[y_train==1]).shape)

(399507, 17136)
(17370, 17136)


In [5]:
custom_weight = {0: 0.25,
                   1: 12,
                   }
balanced_weight = {0: X_train.shape[0] / (2 * np.bincount(y_train))[0],
                   1: X_train.shape[0] / (2 * np.bincount(y_train))[1],
                   }
default_weight = {0: 0.5,
                   1: 1,
                   }
# clf = LogisticRegression(class_weight=custom_weight, max_iter=7000 )
# _ = clf.fit(X_train, y_train)
balanced_weight

{0: 0.5217392936794599, 1: 11.999913644214162}

In [19]:
y_true = y_train
y_pred = clf.predict(X_train)
y_score= clf.predict_proba(X_train)

model_name = str(clf)    
columns = ['Model','split','TN','FP','FN','TP','Precision','Recall','F1-Score','AUC']

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()


vec1 = [model_name,
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

y_true = y_test
y_pred = clf.predict(X_test)
y_score= clf.predict_proba(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()                       

vec2 = [model_name,
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]
# vec2 = vec2[0:1] + [params[param_name] for param_name in param_names] + vec2[1:]
m = np.vstack([vec1, vec2])
df = pd.DataFrame(m, columns=columns)

df

Unnamed: 0,Model,split,TN,FP,FN,TP,Precision,Recall,F1-Score,AUC
0,"LogisticRegression(class_weight='balanced', ma...",TRAIN,262182,137325,4676,12694,0.0846159486465047,0.7308002302820956,0.1516706593623236,0.6935320377356455
1,"LogisticRegression(class_weight='balanced', ma...",TEST,65977,34374,1572,2961,0.0793089594214544,0.6532097948378557,0.1414445399828031,0.6553360510696139


In [35]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
y_true = y_train
y_pred = clf.predict(X_train)
y_score= clf.predict_proba(X_train)

model_name = str(clf)    
columns = ['Model','split','TN','FP','FN','TP','Precision','Recall','F1-Score','AUC']

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()


vec1 = [model_name,
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

y_true = y_test
y_pred = clf.predict(X_test)
y_score= clf.predict_proba(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()                       

vec2 = [model_name,
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]
# vec2 = vec2[0:1] + [params[param_name] for param_name in param_names] + vec2[1:]
m = np.vstack([vec1, vec2])
df = pd.DataFrame(m, columns=columns)

df

Unnamed: 0,Model,split,TN,FP,FN,TP,Precision,Recall,F1-Score,AUC
0,"LogisticRegression(class_weight={0: 1, 1: 1}, ...",TRAIN,399502,5,17369,1,0.1666666666666666,5.757052389176742e-05,0.0001151012891344,0.5000225275493151
1,"LogisticRegression(class_weight={0: 1, 1: 1}, ...",TEST,100348,3,4532,1,0.25,0.00022060445621,0.0004408199250606,0.50009535469395


In [48]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
y_true = y_train
y_pred = clf.predict(X_train)
y_score= clf.predict_proba(X_train)

model_name = str(clf)    
columns = ['Model','split','TN','FP','FN','TP','Precision','Recall','F1-Score','AUC']

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()


vec1 = [model_name,
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

y_true = y_test
y_pred = clf.predict(X_test)
y_score= clf.predict_proba(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()                       

vec2 = [model_name,
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]
# vec2 = vec2[0:1] + [params[param_name] for param_name in param_names] + vec2[1:]
m = np.vstack([vec1, vec2])
df = pd.DataFrame(m, columns=columns)

df

Unnamed: 0,Model,split,TN,FP,FN,TP,Precision,Recall,F1-Score,AUC
0,"LogisticRegression(class_weight={0: 0.25, 1: 1...",TRAIN,155052,244455,1340,16030,0.0615390521527151,0.9228554979850316,0.1153839232693311,0.6554819207592183
1,"LogisticRegression(class_weight={0: 0.25, 1: 1...",TEST,39158,61193,603,3930,0.0603473427206977,0.8669755129053607,0.1128402434822556,0.6285929372680186


In [50]:
from sklearn.neural_network import MLPClassifier
_ = MLPClassifier(hidden_layer_sizes=0)


In [7]:
params = {'experiment_1': {'model': {'name': 'MLPClassifier',
                                    'C':1.0,
                                    'kernel': 'rbf',
                                    'degree':3,
                                    'gamma': 'scale',
                                    'coef0': 0.0,
                                    'shrinking': True,
                                    'probability': True,
                                    'tol': 0.001,
                                    'cache_size': 200,
                                    'class_weight': None,
                                    'verbose': False,
                                    'max_iter':-1,
                                    'decision_function_shape': 'ovr',
                                    'break_ties': False,
                                    'random_state': None,}
                           },
          'experiment_2': {'model':{'name': 'DecisionTreeClassifier',
                                    'criterion': 'gini',
                                    'splitter': 'best',
                                    'max_depth': None,
                                    'min_samples_split': 2,
                                    'min_samples_leaf': 1,
                                    'min_weight_fraction_leaf': 0.0,
                                    'max_features': None,
                                    'random_state': None,
                                    'max_leaf_nodes': None,
                                    'min_impurity_decrease':0.0,
                                    'class_weight': None,
                                    'ccp_alpha': 0.0,
                                    }
                           },
          'experiment_3': {'model':{'name': 'LogisticRegression',
                                    'penalty': 'l2',
                                    'dual': False,
                                    'tol': 0.0001,
                                    'C': 1.0,
                                    'fit_intercept': True,
                                    'intercept_scaling': 1,
                                    'class_weight': None,
                                    'random_state': None,
                                    'solver': 'lbfgs',
                                    'max_iter': 100,
                                    'multi_class': 'auto',
                                    'verbose': 0,
                                    'warm_start': False,
                                    'n_jobs': None,
                                    'l1_ratio': None,
                                   }
                          },
          'experiment_4': {'model':{'name': 'RandomForestClassifier',
                                    'n_estimators': 100,
                                    'criterion': 'gini',
                                    'max_depth': None,
                                    'min_samples_split': 2,
                                    'min_samples_leaf':1,
                                    'min_weight_fraction_leaf':0.0,
                                    'max_features': 'sqrt',
                                    'max_leaf_nodes': None,
                                    'min_impurity_decrease': 0.0,
                                    'bootstrap': True,
                                    'oob_score': False,
                                    'n_jobs': None,
                                    'random_state': None,
                                    'verbose': 0,
                                    'warm_start': False,
                                    'class_weight': None,
                                    'ccp_alpha': 0.0,
                                    'max_samples': None
                                   }
                          },
          'experiment_5': {'model':{'name': 'MLPClassifier',
                                    'hidden_layer_sizes': (100,),
                                    'activation': 'relu',
                                    'solver': 'adam',
                                    'alpha': 0.0001,
                                    'batch_size': 'auto',
                                    'learning_rate': 'constant',
                                    'learning_rate_init': 0.001,
                                    'power_t': 0.5,
                                    'max_iter': 200,
                                    'shuffle': True,
                                    'random_state': None,
                                    'tol': 0.0001,
                                    'verbose': False,
                                    'warm_start': False,
                                    'momentum': 0.9,
                                    'nesterovs_momentum': True,
                                    'early_stopping': False,
                                    'validation_fraction': 0.1,
                                    'beta_1': 0.9,
                                    'beta_2': 0.999,
                                    'epsilon': 1e-08,
                                    'n_iter_no_change': 10,
                                    'max_fun': 15000,
                                   }
                          },
          'experiment_6': {'model':{'name': 'GaussianNB',
                                    'priors': None,
                                    'var_smoothing': 1e-09,
                                   }
                          },
          'experiment_7': {'model':{'name': 'ComplementNB',
                                    'alpha': 1.0,
                                    'force_alpha': 'warn',
                                    'fit_prior': True,
                                    'class_prior': None,
                                    'norm': False,
                                   }
                          },
          'experiment_8': {'model':{'name': 'BernoulliNB',
                                    'alpha': 1.0,
                                    'force_alpha': 'warn',
                                    'binarize': 0.0,
                                    'fit_prior': True,
                                    'class_prior': None,
                                   }
                          },
         }
# configuration.model_from_configuration(params['experiment_8'])

with open(config['experiments_config'], "w", encoding='utf-8') as outfile:
    json.dump(params, outfile,indent=4)

In [53]:
params['experiment_1']['Model']

'MLPClassifier'

In [54]:
params['experiment_1']()

TypeError: 'str' object is not callable