In [1]:
import numpy as np
import sys
sys.path.append('..')

from utilities import configuration


In [2]:
model_dict = {'model_name':'BalancedRandomForestClassifier',
              "n_estimators": 100,
              "criterion": "gini",
              "max_depth": None,
              "min_samples_split": 2,
              "min_samples_leaf": 1000,
              "min_weight_fraction_leaf":0.0,
              "max_features": "sqrt",
              "max_leaf_nodes": None,
              "min_impurity_decrease":0.0,
              "bootstrap": True,
              "oob_score": False,
              "sampling_strategy": "all",
              "replacement": True,
              "n_jobs": None,
              "verbose": 0,
              "warm_start": False,
              "class_weight": None,
              "ccp_alpha": 0.0,
              "max_samples": None,
              }

model_seed = 1270833263
model_random_state=np.random.RandomState(model_seed)
model = configuration.model_from_configuration(model_dict, random_state=model_random_state)
model

In [3]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import os
import sys
sys.path.append('..')

from utilities import configuration
from utilities import logger
from utilities import health_data

from sklearn.linear_model import LogisticRegression

In [4]:
config = configuration.get_config()

experiment_configurations = json.load(open(config['experiments_config'], encoding='utf-8'))


params = experiment_configurations['configuration_0']


X_train, y_train, X_test, y_test, feature_names = health_data.Admission.get_train_test_matrices(
    fix_missing_in_testing=params['fix_missing_in_testing'],
    normalize=params['normalize'],
    fix_skew=params['fix_skew'],
    numerical_features=params['numerical_features'],
    categorical_features=params['categorical_features'],
    diagnosis_features=params['diagnosis_features'],
    intervention_features=params['intervention_features'],
    use_idf=params['use_idf'],
    remove_outliers=params['remove_outliers'],
    )

In [5]:
model.fit(X_train, y_train)

In [6]:
# Evaluating metrics on TRAINING
y_true = y_train
y_pred = model.predict(X_train)
y_score= model.predict_proba(X_train)

model_name = str(model)
columns = ['Model',
        'split',
        'TN',
        'FP',
        'FN',
        'TP',
        'Precision',
        'Recall',
        'F1-Score',
        'AUC', 
        ]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
vec1 = [model_name,
        'TRAIN',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred)
        ]

# Evaluating metrics on TESTING
y_true = y_test
y_pred = model.predict(X_test)
y_score= model.predict_proba(X_test)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()    
vec2 = [model_name,
        'TEST',
        tn,
        fp,
        fn,
        tp,
        precision_score(y_true, y_pred,),
        recall_score(y_true, y_pred,),
        f1_score(y_true, y_pred,),
        roc_auc_score(y_true=y_true, y_score=y_pred)
        ]

m = np.vstack([vec1, vec2])
new_df = pd.DataFrame(m, columns=columns)
new_df

Unnamed: 0,Model,split,TN,FP,FN,TP,Precision,Recall,F1-Score,AUC
0,BalancedRandomForestClassifier(min_samples_lea...,TRAIN,213319,186188,5654,11716,0.0592004204058533,0.674496257915947,0.1088473294499103,0.6042259290966444
1,BalancedRandomForestClassifier(min_samples_lea...,TEST,53332,47019,1497,3036,0.0606532813904704,0.6697551290536069,0.111233238074302,0.6006048617136774


In [18]:
X_train.shape

(416877, 17136)