In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
from collections import defaultdict
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier

from scipy import sparse


In [2]:
config = configuration.get_config()
for key in config:
    print(f'{key:40} (type: {type(config[key])})')


logging = logger.init_logger(config['system_log'])
logging.debug('Logger has started ont notebook 09 Random sample of instances.ipynb ...')

system_log                               (type: <class 'str'>)
json_file                                (type: <class 'str'>)
train_val_json                           (type: <class 'str'>)
heldout_json                             (type: <class 'str'>)
unused_after_heldout_json                (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)
2023-11-15 09:42:36,088 - root - DEBUG - Logger has started ont notebook 09 Random sample of instances.ipynb ...


In [3]:
f = open(config['train_val_json'])
train_val_data = json.load(f)

all_admissions = []
for ix in train_val_data:
    all_admissions.append(
        health_data.Admission.from_dict_data(admit_id=int(ix), admission=train_val_data[ix])
        )
len(all_admissions)

# Dictionary organizing data by patient
patient2admissions = defaultdict(list)
for admission in all_admissions:
    code = admission.code
    patient2admissions[code].append(admission)

# Ordering patient list by discharge date (from back )
for patient_code in patient2admissions:
    admissions_list = patient2admissions[patient_code]
    admissions_list = sorted(admissions_list, key=lambda admission: admission.discharge_date, reverse=False)
    assert all([admissions_list[i].discharge_date <= admissions_list[i+1].discharge_date for i in range(len(admissions_list)-1)])
    patient2admissions[patient_code] = admissions_list

patient_count=0
valid_readmission_count=0
for patient_code in patient2admissions:
    patient_admissions = patient2admissions[patient_code]
    ix = 0 
    while ix < len(patient_admissions):
        readmission_code = patient_admissions[ix].readmission_code
        if health_data.ReadmissionCode.is_readmit(readmission_code):
            # Either is not the first admission (ix>0) or 
            # we don't have the patient previous admition (readmission close to begining of dataset) (admit-(2015-01-01))<28 days
            # assert ix>0 or (patient_admissions[ix].admit_date - datetime.datetime.fromisoformat('2015-01-01')).days<365
            if ix>0 and  patient_admissions[ix-1].is_valid_readmission(patient_admissions[ix]):
                patient_admissions[ix-1].add_readmission(patient_admissions[ix])
                valid_readmission_count+=1
        ix+=1
    patient_count+=1


print(f'Size of train_val data:   {len(all_admissions):,}')
print(f"Begining heldout:         {min([admission.discharge_date for admission in all_admissions])}")
print(f"End heldout:              {max([admission.discharge_date for admission in all_admissions])}")
print(f'Number of instances with readmissions: {valid_readmission_count} ({valid_readmission_count/len(all_admissions):.2%})')



Size of train_val data:   524,986
Begining heldout:         2015-01-01 00:00:00
End heldout:              2021-10-01 00:00:00
Number of instances with readmissions: 50533 (9.63%)


In [4]:
rng = np.random.default_rng(seed=5348363479653547918)

train_indexes = rng.choice(range(len(all_admissions)),size=int(0.8*len(all_admissions)), replace=False)

# Checking that every time I am getting the same training instances ( and validation instances)
assert all(train_indexes[:3] ==np.array([478898, 46409, 322969]))
assert all(train_indexes[-3:] ==np.array([415014, 330673, 338415]))
assert hash(tuple(train_indexes))==2028319680436964623

train_indexes = set(train_indexes)

train = [admission for ix, admission in enumerate(all_admissions) if ix in train_indexes ]
validation = [admission for ix, admission in enumerate(all_admissions) if not ix in train_indexes ]

print(f'Size of training (before filter)=     {len(train):,}')
print(f'Size of validation (before filter)=   {len(validation):,}')

# Filtering missing values
train = list(filter(lambda admission: not admission.has_missing, train))
validation = list(filter(lambda admission: not admission.has_missing, validation))


# Filtering STILL_BORNS admissions
train = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.STILLBORN, train))
validation = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.STILLBORN, validation))

# Filtering CADAVER admissions
train = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.CADAVER, train))
validation = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.CADAVER, validation))


print(f'Size of training (after filter)=      {len(train):,}')
print(f'Size of validation (after filter)=    {len(validation):,}')

Size of training (before filter)=     419,988
Size of validation (before filter)=   104,998
Size of training (after filter)=      419,345
Size of validation (after filter)=    104,861


In [6]:
vocab_diag, diagnosis_matrix = health_data.Admission.create_codes_matrix(train)
vocab_inter, intervention_matrix = health_data.Admission.create_intervention_matrix(train)
continuos_matrix = sparse.csr_matrix((health_data.Admission.continuos_matrix(train)))
categorical_matrix = sparse.csr_matrix(health_data.Admission.categorical_matrix(train))

print(f'diagnosis:          {diagnosis_matrix.shape}')
print(f'intervention:       {intervention_matrix.shape}')
print(f'continuos_matrix:   {continuos_matrix.shape}')
print(f'categorical_matrix: {categorical_matrix.shape}')

diagnosis:          (419345, 9546)
intervention:       (419345, 7519)
continuos_matrix:   (419345, 5)
categorical_matrix: (419345, 13)


In [7]:
scaler = StandardScaler()

vocab_diag, diagnosis_matrix = health_data.Admission.create_codes_matrix(train)
vocab_inter, intervention_matrix = health_data.Admission.create_intervention_matrix(train)
continuos_matrix = sparse.csr_matrix(scaler.fit_transform(health_data.Admission.continuos_matrix(train)))
categorical_matrix = sparse.csr_matrix(health_data.Admission.categorical_matrix(train))

X_train = sparse.hstack([continuos_matrix,
                         categorical_matrix,
                         diagnosis_matrix.toarray(),
                         intervention_matrix.toarray(),
                         ])
y_train = health_data.Admission.get_y(train)

_, diagnosis_matrix_val = health_data.Admission.create_codes_matrix(validation, vocabulary=vocab_diag)
_, intervention_matrix_val = health_data.Admission.create_intervention_matrix(validation, vocabulary=vocab_inter)
continuos_matrix_val = sparse.csr_matrix(scaler.fit_transform(health_data.Admission.continuos_matrix(validation)))
categorical_matrix_val = sparse.csr_matrix(health_data.Admission.categorical_matrix(validation))

X_val = sparse.hstack([continuos_matrix_val,
                         categorical_matrix_val,
                         diagnosis_matrix_val.toarray(),
                         intervention_matrix_val.toarray(),
                         ])
y_val = health_data.Admission.get_y(validation)



print(f'X_train.shape = {X_train.shape}')
print(f'y_train.shape = {y_train.shape}')
print()
print(f'X_val.shape = {X_val.shape}')
print(f'y_val.shape = {y_val.shape}')


X_train.shape = (419345, 17083)
y_train.shape = (419345,)

X_val.shape = (104861, 17083)
y_val.shape = (104861,)


### Undersampling

In [29]:
ratio = 4 
print(f'Positive instance for each {ratio} negative instances')

number_of_positives_in_training = np.sum(y_train)
print(f'Number of positive examples:                       {number_of_positives_in_training} '\
      f'({number_of_positives_in_training/len(y_train):.2%})')

positive_x = X_train.tocsr()[y_train==1,:]
negative_x = X_train.tocsr()[y_train==0,:]

print(f'Positive instances matrix shape =                  {positive_x.shape}')
print(f'Negative instances matrix shape =                  {negative_x.shape}')


under_sampling_ix = np.random.choice(range(negative_x.shape[0]), size=number_of_positives_in_training*ratio, replace=False)
under_sampled_negative_x = negative_x[under_sampling_ix,:]

print(f'Undersampled Negative instances matrix shape =     {under_sampled_negative_x.shape}')

undersampled_x = sparse.vstack([positive_x,under_sampled_negative_x])

print(f'Combined positives + undersampled neg, new shape = {undersampled_x.shape}')

assert undersampled_x.shape[0]==number_of_positives_in_training*(ratio+1) and under_sampled_negative_x.shape[1]==X_train.shape[1]

undersampled_y = np.array([1]*number_of_positives_in_training + [0]*number_of_positives_in_training*ratio)


print(f'Undersampled target shape =                        {undersampled_y.shape}')
# X_train_under_sampled = X_train[under_sampling_ix,:]
# X_train_under_sampled.shape


Positive instance for each 4 negative instances
Number of positive examples:                       17412 (4.15%)
Positive instances matrix shape =                  (17412, 17083)
Negative instances matrix shape =                  (401933, 17083)
Undersampled Negative instances matrix shape =     (69648, 17083)
Combined positives + undersampled neg, new shape = (87060, 17083)
Undersampled target shape =                        (87060,)


### Logistic Regression (Balanced)

In [17]:
print(' ** ** Logistic Regression ** **')
clf = LogisticRegression(random_state=0, max_iter=1000,class_weight='balanced').fit(X_train, y_train,)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')

 ** ** Logistic Regression ** **


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


** Results on Training **
Precision= 0.090
Recall=    0.765
F1-score=  0.162

** Results on Training **
Precision= 0.077
Recall=    0.627
F1-score=  0.137


### Undersampled NN

In [30]:
print(' ** **   MLP   ** **')
clf = MLPClassifier(random_state=0, 
                    # early_stopping=True, 
                    # n_iter_no_change=200,
                    # validation_fraction=0.2, 
                    max_iter=1000, 
                    hidden_layer_sizes=(100,),
                    verbose=True)
clf.fit(undersampled_x, undersampled_y,)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Validation **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')


# RATIO = 1
# Iteration 241, loss = 0.06847385
# Iteration 242, loss = 0.06996890
# Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
# ** Results on Training **
# Precision= 0.097
# Recall=    0.976
# F1-score=  0.176

# ** Results on Validation **
# Precision= 0.063
# Recall=    0.648
# F1-score=  0.115

 ** **   MLP   ** **
Iteration 1, loss = 0.47744679
Iteration 2, loss = 0.44525451
Iteration 3, loss = 0.43259685
Iteration 4, loss = 0.42014210
Iteration 5, loss = 0.40697968
Iteration 6, loss = 0.39202644
Iteration 7, loss = 0.37610618
Iteration 8, loss = 0.35897297
Iteration 9, loss = 0.34160117
Iteration 10, loss = 0.32423760
Iteration 11, loss = 0.30850399
Iteration 12, loss = 0.29369930
Iteration 13, loss = 0.28001763
Iteration 14, loss = 0.26699416
Iteration 15, loss = 0.25534652
Iteration 16, loss = 0.24543357
Iteration 17, loss = 0.23443974
Iteration 18, loss = 0.22518577
Iteration 19, loss = 0.21678208
Iteration 20, loss = 0.20912120
Iteration 21, loss = 0.20209801
Iteration 22, loss = 0.19625250
Iteration 23, loss = 0.18890269
Iteration 24, loss = 0.18293732
Iteration 25, loss = 0.17846523
Iteration 26, loss = 0.17266426
Iteration 27, loss = 0.16756364
Iteration 28, loss = 0.16343038
Iteration 29, loss = 0.15979185
Iteration 30, loss = 0.15522989
Iteration 31, loss = 0.15170

In [22]:
print(' ** **   SVM (RBF)   ** **')
clf = SVC(random_state=0, kernel='rbf', max_iter=10000,class_weight='balanced').fit(X_train, y_train.ravel(),)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')


 ** **   SVM (RBF)   ** **




** Results on Training **
Precision= 0.042
Recall=    0.990
F1-score=  0.081

** Results on Training **
Precision= 0.044
Recall=    0.976
F1-score=  0.084


In [24]:
y_va.shape

(419345, 17083)

In [25]:
print(' ** **   MLP   ** **')
clf = MLPClassifier(random_state=0, 
                    # early_stopping=True, 
                    # n_iter_no_change=200,
                    # validation_fraction=0.2, 
                    max_iter=1000, 
                    hidden_layer_sizes=(100,),
                    verbose=True)
clf.fit(X_train, y_train)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Validation **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')

# Iteration 192, loss = 0.05301263
# Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
# ** Results on Training **
# Precision= 0.967
# Recall=    0.752
# F1-score=  0.846

# ** Results on Validation **
# Precision= 0.091
# Recall=    0.054
# F1-score=  0.068


 ** **   MLP   ** **
Iteration 1, loss = 0.17170415
Iteration 2, loss = 0.15946111
Iteration 3, loss = 0.15785508
Iteration 4, loss = 0.15661386
Iteration 5, loss = 0.15512396
Iteration 6, loss = 0.15331318
Iteration 7, loss = 0.15141291
Iteration 8, loss = 0.14938711
Iteration 9, loss = 0.14733772
Iteration 10, loss = 0.14531048
Iteration 11, loss = 0.14331910
Iteration 12, loss = 0.14131980
Iteration 13, loss = 0.13926195
Iteration 14, loss = 0.13696274
Iteration 15, loss = 0.13462112
Iteration 16, loss = 0.13207356
Iteration 17, loss = 0.12930444
Iteration 18, loss = 0.12655796
Iteration 19, loss = 0.12379298
Iteration 20, loss = 0.12108977
Iteration 21, loss = 0.11863595
Iteration 22, loss = 0.11600631
Iteration 23, loss = 0.11351117
Iteration 24, loss = 0.11114954
Iteration 25, loss = 0.10870817
Iteration 26, loss = 0.10658731
Iteration 27, loss = 0.10429627
Iteration 28, loss = 0.10270786
Iteration 29, loss = 0.10085632
Iteration 30, loss = 0.09902174
Iteration 31, loss = 0.09726