In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import os
import sys
sys.path.append('..')

from utilities import configuration
from utilities import logger
from utilities import health_data

from imblearn.over_sampling import SMOTE

In [3]:
configuration_dict = {
    "fix_skew": False,
    "normalize": False,
    "fix_missing_in_testing": True,
    "numerical_features": True,
    "categorical_features": True,
    "diagnosis_features": True,
    "intervention_features": True,
    "use_idf": False,
    "class_balanced": False,
    "remove_outliers": True
}
model_dict = {
        "model_name": "MLPClassifier",
        "hidden_layer_sizes": [
            100
        ],
        "activation": "relu",
        "solver": "adam",
        "alpha": 0.0001,
        "batch_size": "auto",
        "learning_rate": "constant",
        "learning_rate_init": 0.001,
        "power_t": 0.5,
        "max_iter": 1000,
        "shuffle": True,
        "random_state": None,
        "tol": 0.0001,
        "verbose": False,
        "warm_start": False,
        "momentum": 0.9,
        "nesterovs_momentum": True,
        "early_stopping": False,
        "validation_fraction": 0.1,
        "beta_1": 0.9,
        "beta_2": 0.999,
        "epsilon": 1e-08,
        "n_iter_no_change": 10,
        "max_fun": 15000
    }

log_reg = {
        "model_name": "LogisticRegression",
        "penalty": "l2",
        "dual": False,
        "tol": 0.0001,
        "C": 1.0,
        "fit_intercept": True,
        "intercept_scaling": 1,
        "class_weight": None,
        "random_state": None,
        "solver": "lbfgs",
        "max_iter": 7000,
        "multi_class": "auto",
        "verbose": 0,
        "warm_start": False,
        "n_jobs": None,
        "l1_ratio": None
    }
model_seed = 1270833263
model_random_state=np.random.RandomState(model_seed)
logreg = configuration.model_from_configuration(log_reg,model_random_state )
logreg

In [4]:
config = configuration.get_config()


# Retrieving model and experiment configurations 
model_configurations = json.load(open(config['models_config'], encoding='utf-8'))
experiment_configurations = json.load(open(config['experiments_config'], encoding='utf-8'))

params = configuration_dict

# Computing training and testing matrices.
X_train, y_train, X_test, y_test, columns = health_data.Admission.get_train_test_matrices(
        fix_missing_in_testing=params['fix_missing_in_testing'],
        normalize=params['normalize'],
        fix_skew=params['fix_skew'],
        numerical_features=params['numerical_features'],
        categorical_features=params['categorical_features'],
        diagnosis_features=params['diagnosis_features'],
        intervention_features=params['intervention_features'],
        use_idf=params['use_idf'],
        remove_outliers=params['remove_outliers'],
        )


In [10]:
print(X_train.shape)
print(y_train.shape)


print(f'{np.sum(y_train==0):7,}')
print(f'{np.sum(y_train==1):7,}')


(416877, 17136)
(416877,)
399,507
 17,370


In [90]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

print(f'Majority: {np.sum(y_train==0):7,}')
print(f'Minority: {np.sum(y_train==1):7,}')


over = SMOTE(sampling_strategy=0.07)
under = RandomUnderSampler(sampling_strategy=1)

steps = [('o', over), ('u', under)]
# steps = [('u', under)]
# steps = [('o', over)]

pipeline = Pipeline(steps=steps)

x, y = pipeline.fit_resample(X_train, y_train)
print(x.shape)

print(f'Majority: {np.sum(y==0):7,} ({np.sum(y==0)/np.sum(y==1):.1f}X the minority)')
print(f'Minority: {np.sum(y==1):7,}')

Majority: 399,507
Minority:  17,370
(55930, 17136)
Majority:  27,965 (1.0X the minority)
Minority:  27,965


In [91]:
logreg.fit(x, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [92]:
y_true = y_train
model = logreg
y_pred = model.predict(X_train)
y_score= model.predict_proba(X_train)
model_name = 'LogReg'
columns = ['Model',
            'split',
            'TN',
            'FP',
            'FN',
            'TP',
            'Precision',
            'Recall',
            'F1-Score',
            'AUC',
            ]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
vec1 = [model_name,
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]

# Evaluating metrics on TESTING
y_true = y_test
y_pred = model.predict(X_test)
y_score= model.predict_proba(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()    
vec2 = [model_name,
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred),
        ]
m = np.vstack([vec1, vec2])
new_df = pd.DataFrame(m, columns=columns)

print(sampling_strategy)
new_df


0.5


Unnamed: 0,Model,split,TN,FP,FN,TP,Precision,Recall,F1-Score,AUC
0,LogReg,TRAIN,252348,147159,4671,12699,0.0794392523364486,0.7310880829015544,0.1433069266707292,0.6813682948430831
1,LogReg,TEST,63648,36703,1465,3068,0.077141635865329,0.6768144716523273,0.1384976525821596,0.655534120461095


In [72]:
x.shape

(799014, 17136)

In [7]:
print(X.shape)
print(X_train.shape)
# model.fit(X,y)

ValueError: The specified ratio required to generate new sample in the majority class while trying to remove samples. Please increase the ratio.

In [22]:
X.shape

(119850, 17136)

### Under Sampling


In [None]:
number_of_positives_in_training = np.sum(y_train)
number_of_positives_in_training

positive_x = X_train[y_train==1,:]
negative_x = X_train[y_train==0,:]

print(positive_x.shape)
print(negative_x.shape)

under_sampling_ix = np.random.choice(range(negative_x.shape[0]), size=number_of_positives_in_training, replace=False)

under_sampled_negative_x = negative_x[under_sampling_ix,:]

print(positive_x.shape)
print(under_sampled_negative_x.shape)

undersampled_x = np.vstack([positive_x,under_sampled_negative_x])

print(undersampled_x.shape)

undersampled_y = np.array([1]*number_of_positives_in_training + [0]*number_of_positives_in_training)

print(undersampled_y.shape)
# X_train_under_sampled = X_train[under_sampling_ix,:]
# X_train_under_sampled.shape
