In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import os
import sys
sys.path.append('..')

from utilities import configuration
from utilities import logger
from utilities import health_data

from sklearn.linear_model import LogisticRegression

In [4]:
config = configuration.get_config()

# Retrieving model and experiment configurations.
# The files are opened through context managers so their handles are closed
# as soon as the JSON has been parsed, instead of staying open for the
# lifetime of the kernel.
with open(config['models_config'], encoding='utf-8') as models_file:
    model_configurations = json.load(models_file)
with open(config['experiments_config'], encoding='utf-8') as experiments_file:
    experiment_configurations = json.load(experiments_file)


# Experiment settings used for every step below.
params = experiment_configurations['configuration_0']

# Computing training and testing matrices.
# Every preprocessing switch is forwarded unchanged from the experiment
# configuration selected above.
X_train, y_train, X_test, y_test, feature_names = health_data.Admission.get_train_test_matrices(
    fix_missing_in_testing=params['fix_missing_in_testing'],
    normalize=params['normalize'],
    fix_skew=params['fix_skew'],
    numerical_features=params['numerical_features'],
    categorical_features=params['categorical_features'],
    diagnosis_features=params['diagnosis_features'],
    intervention_features=params['intervention_features'],
    use_idf=params['use_idf'],
    remove_outliers=params['remove_outliers'],
    )


# Build a class-balanced logistic regression and fit it on the training matrix.
# NOTE(review): the output recorded for this cell reports that the iteration
# limit was reached, so the fit may not have converged with max_iter=1000 --
# confirm before relying on the coefficients.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

def _evaluate_split(fitted_model, model_label, split, X, y):
    """Return one row of classification metrics for a single data split.

    Args:
        fitted_model: Fitted binary classifier exposing ``predict`` and
            ``predict_proba``.
        model_label: Text stored in the 'Model' column.
        split: Text stored in the 'split' column (e.g. 'TRAIN' or 'TEST').
        X: Feature matrix of the split.
        y: True binary labels of the split.

    Returns:
        list: ``[model_label, split, TN, FP, FN, TP, precision, recall,
        F1, AUC]``, in the same order as ``columns`` below.
    """
    predictions = fitted_model.predict(X)
    # AUC needs a continuous ranking score. Feeding it the hard 0/1
    # predictions collapses the ROC curve to a single operating point, so the
    # probability of the positive class (second column of predict_proba) is
    # used instead.
    positive_scores = fitted_model.predict_proba(X)[:, 1]

    # Unpacking into four values assumes a binary problem (2x2 matrix).
    tn, fp, fn, tp = confusion_matrix(y, predictions).ravel()
    return [model_label,
            split,
            tn,
            fp,
            fn,
            tp,
            precision_score(y, predictions),
            recall_score(y, predictions),
            f1_score(y, predictions),
            roc_auc_score(y_true=y, y_score=positive_scores),
            ]


model_name = str(model)
columns = ['Model',
        'split',
        'TN',
        'FP',
        'FN',
        'TP',
        'Precision',
        'Recall',
        'F1-Score',
        'AUC',
        ]

# Evaluating metrics on TRAINING
vec1 = _evaluate_split(model, model_name, 'TRAIN', X_train, y_train)

# Evaluating metrics on TESTING
vec2 = _evaluate_split(model, model_name, 'TEST', X_test, y_test)

# Build the frame straight from the row lists: stacking them with numpy first
# would coerce every count and score to a string because each row also holds
# text.
new_df = pd.DataFrame([vec1, vec2], columns=columns)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Lookup tables used to describe diagnosis and intervention codes; the cells
# below query them with upper-cased feature names.
diagnosis_mapping, intervention_mapping = (
    health_data.Admission.get_diagnoses_mapping(),
    health_data.Admission.get_intervention_mapping(),
)



In [34]:
# Pair each feature name with its coefficient and rank the pairs by weight,
# smallest first.
features = sorted(zip(feature_names, model.coef_[0, :]), key=lambda pair: pair[1])

# Walk the 20 largest weights from strongest to weakest. Only names found in
# a code mapping are printed, so fewer than 20 lines may appear.
for word, score in reversed(features[-20:]):
    code = word.upper()
    # Diagnosis table first, then intervention table; a code known to both is
    # printed once per table.
    for mapping in (diagnosis_mapping, intervention_mapping):
        if code in mapping:
            print(f'{word:10} - {score:+4.3f} - {mapping[code]}')


d334       - +1.243 - {'Benign neoplasm of spinal cord'}
n9952      - +1.076 - {'Other malfunction of external stoma of urinary tract, NEC'}
s250       - +1.001 - {'Injury of thoracic aorta'}
t0121      - +0.944 - {'Open wounds involving multiple regions of upper limb(s), complicated'}
1vx87la    - +0.935 - {'Excision partial, soft tissue of leg using simple apposition technique [e.g. suture, staple] (for cl'}
b008       - +0.912 - {'Other forms of herpesviral infection'}
s02700     - +0.839 - {'Multiple fractures involving skull and facial bones, closed'}
s998       - +0.814 - {'Other specified injuries of ankle and foot'}
1nf86mexxe - +0.795 - {'Closure of fistula, stomach using local flap [e.g. omental transposition flap] for fistula terminati'}
b084       - +0.773 - {'Enteroviral vesicular stomatitis with exanthem'}
n998       - +0.765 - {'Other postprocedural disorders of genitourinary system'}
o90002     - +0.762 - {'Disruption of caesarean section wound, delivered, with mention 

In [33]:
# Build (name, coefficient) pairs and order them in place from the most
# negative weight to the most positive one.
features = [(name, importance) for importance, name in zip(model.coef_[0, :], feature_names)]
features.sort(key=lambda entry: entry[1])

# Report the 20 most negative weights. Names missing from both code mappings
# produce no output line.
for word, score in features[:20]:
    lookup_key = word.upper()
    # Collect descriptions in diagnosis-then-intervention order so a code
    # present in both tables yields two lines.
    matches = [table[lookup_key]
               for table in (diagnosis_mapping, intervention_mapping)
               if lookup_key in table]
    for description in matches:
        print(f'{word:10} - {score:+4.3f} - {description}')


t904       - -1.173 - {'Sequelae of injury of eye and orbit'}
c022       - -1.062 - {'Malignant neoplasm ventral surface of tongue'}
a085       - -0.975 - {'Other specified intestinal infections'}
1vx35hak0  - -0.856 - {'Pharmacotherapy (local), soft tissue of leg using percutaneous approach of antiinfective NEC'}
1zz35hap9  - -0.801 - {'Pharmacotherapy, total body nervous system agents percutaneous approach [intramuscular, intravenous,'}
c021       - -0.800 - {'Malignant neoplasm of border of tongue'}
1gk87la    - -0.769 - {'Excision partial, thymus using open approach'}
v953       - -0.766 - {'Accident to commercial fixed-wing aircraft, injuring occupant'}
b023       - -0.755 - {'Zoster ocular disease'}
r944       - -0.755 - {'Abnormal results of kidney function studies'}
s855       - -0.748 - {'Injury of popliteal vein'}
1na13dae3  - -0.701 - {'Control of bleeding, esophagus using endoscopic [VATS] approach and cardiac stimulant (e.g. epinephr'}
m351       - -0.646 - {'Other overlap

In [12]:
# Shape of the fitted coefficient array; row 0 is the row the feature-ranking
# cells read via model.coef_[0, :].
model.coef_.shape

(1, 17136)