In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
from collections import defaultdict
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier


In [2]:
config = configuration.get_config()
for key in config:
    print(f'{key:40} (type: {type(config[key])})')


logging = logger.init_logger(config['system_log'])
logging.debug('Logger has started ont notebook 09 Random sample of instances.ipynb ...')

system_log                               (type: <class 'str'>)
json_file                                (type: <class 'str'>)
train_val_json                           (type: <class 'str'>)
heldout_json                             (type: <class 'str'>)
unused_after_heldout_json                (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)
2023-11-08 10:40:50,610 - root - DEBUG - Logger has started ont notebook 09 Random sample of instances.ipynb ...


In [4]:
f = open(config['train_val_json'])
train_val_data = json.load(f)
print(len(train_val_data))

all_admissions = []
for ix in train_val_data:
    all_admissions.append(
        health_data.Admission.from_dict_data(admit_id=int(ix), admission=train_val_data[ix])
        )
len(all_admissions)

# Dictionary organizing data by patient
patient2admissions = defaultdict(list)
for admission in all_admissions:
    code = admission.code
    patient2admissions[code].append(admission)

# Ordering patient list by discharge date (from back )
for patient_code in patient2admissions:
    admissions_list = patient2admissions[patient_code]
    admissions_list = sorted(admissions_list, key=lambda admission: admission.discharge_date, reverse=False)
    assert all([admissions_list[i].discharge_date <= admissions_list[i+1].discharge_date for i in range(len(admissions_list)-1)])
    patient2admissions[patient_code] = admissions_list
print(len(patient2admissions))

patient_count=0
valid_readmission_count=0
for patient_code in patient2admissions:
    patient_admissions = patient2admissions[patient_code]
    ix = 0 
    while ix < len(patient_admissions):
        readmission_code = patient_admissions[ix].readmission_code
        if health_data.ReadmissionCode.is_readmit(readmission_code):
            # Either is not the first admission (ix>0) or 
            # we don't have the patient previous admition (readmission close to begining of dataset) (admit-(2015-01-01))<28 days
            # assert ix>0 or (patient_admissions[ix].admit_date - datetime.datetime.fromisoformat('2015-01-01')).days<365
            if ix>0 and  patient_admissions[ix-1].is_valid_readmission(patient_admissions[ix]):
                patient_admissions[ix-1].add_readmission(patient_admissions[ix])
                valid_readmission_count+=1
        ix+=1
    patient_count+=1
valid_readmission_count

524986
263719


50533

In [5]:
print(min([admission.discharge_date for admission in all_admissions]))
print(max([admission.discharge_date for admission in all_admissions]))

2015-01-01 00:00:00
2021-10-01 00:00:00


In [6]:
f = open(config['heldout_json'])
heldout_data = json.load(f)
print(f'{len(heldout_data):,}')
print(min([heldout_data[ix]['Discharge Date'] for ix in heldout_data]))
print(max([heldout_data[ix]['Discharge Date'] for ix in heldout_data]))
del(heldout_data)

74,350
2021-10-02 00:00:00
2022-10-02 00:00:00


In [7]:
print(len(all_admissions))
count=0
for admission in all_admissions:
    if  admission.has_readmission :
        print(admission)
        count+=1 
        if count==2:
            break



524986
<Admission Patient_code='1159480' admit='2014-12-24' discharged='2015-01-01' Age='67' gender='Gender.MALE' ALC_days='0' acute_days='8' readmited(2015-01-01,2015-02-03,ReadmissionCode.PLANNED_READMIT)>
<Admission Patient_code='2232961' admit='2014-12-30' discharged='2015-01-01' Age='38' gender='Gender.FEMALE' ALC_days='0' acute_days='2' readmited(2015-01-03,2015-01-04,ReadmissionCode.UNPLANNED_READMIT_0_7)>


In [None]:
# np.random.randint(2**63)
# output 5348363479653547918

5348363479653547918

In [8]:
rng = np.random.default_rng(seed=5348363479653547918)

train_indexes = rng.choice(range(len(all_admissions)),size=int(0.8*len(all_admissions)), replace=False)

# Checking that every time I am getting the same training instances ( and validation instances)
assert all(train_indexes[:3] ==np.array([478898, 46409, 322969]))
assert all(train_indexes[-3:] ==np.array([415014, 330673, 338415]))
assert hash(tuple(train_indexes))==2028319680436964623

train_indexes = set(train_indexes)

train = [admission for ix, admission in enumerate(all_admissions) if ix in train_indexes ]
validation = [admission for ix, admission in enumerate(all_admissions) if not ix in train_indexes ]

print(f'Size of training (before filter)=     {len(train):,}')
print(f'Size of validation (before filter)=   {len(validation):,}')

# Filtering missing values
train = list(filter(lambda admission: not admission.has_missing, train))
validation = list(filter(lambda admission: not admission.has_missing, validation))


# Filtering STILL_BORNS admissions
train = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.STILLBORN, train))
validation = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.STILLBORN, validation))

# Filtering CADAVER admissions
train = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.CADAVER, train))
validation = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.CADAVER, validation))


print(f'Size of training (after filter)=      {len(train):,}')
print(f'Size of validation (after filter)=    {len(validation):,}')

Size of training (before filter)=     419,988
Size of validation (before filter)=   104,998
Size of training (after filter)=      419,345
Size of validation (after filter)=    104,861


In [8]:

# def get_x(admission: health_data.Admission) -> np.ndarray:
#     return np.array([admission.length_of_stay, 
#                      admission.case_weight, 
#                      admission.cmg, 
#                      admission.age, 
#                      admission.alc_days,
#                      1 if admission.gender==health_data.Gender.MALE else 0 ,
#                      1 if admission.gender==health_data.Gender.FEMALE else 0 ,
#                      1 if admission.is_central_zone else 0 ,
#                      1 if admission.transfusion_given.received_transfusion else 0,
#                      ])

# def get_y(admission: health_data.Admission) ->np.ndarray:
#     return np.array([1 if admission.has_readmission and \
#                      admission.readmission.readmission_code!=health_data.ReadmissionCode.PLANNED_READMIT else 0])

# get_x(train[0])

In [9]:
scaler = StandardScaler()
X_train = np.hstack([scaler.fit_transform(health_data.Admission.continuos_matrix(train)), 
                     health_data.Admission.categorical_matrix(train)])
y_train = health_data.Admission.get_y(train)

X_val = np.hstack([scaler.fit_transform(health_data.Admission.continuos_matrix(validation)), 
                   health_data.Admission.categorical_matrix(validation)])
y_val = health_data.Admission.get_y(validation)

print(f'X_train= {X_train.shape}')
print(f'y_train= {y_train.shape}')
print(f'X_val=   {X_val.shape}')
print(f'y_val=   {y_val.shape}')

X_train= (419345, 18)
y_train= (419345,)
X_val=   (104861, 18)
y_val=   (104861,)


### Under sampling majority class

In [10]:
number_of_positives_in_training = np.sum(y_train)
number_of_positives_in_training

positive_x = X_train[y_train==1,:]
negative_x = X_train[y_train==0,:]

print(positive_x.shape)
print(negative_x.shape)

under_sampling_ix = np.random.choice(range(negative_x.shape[0]), size=number_of_positives_in_training, replace=False)


under_sampled_negative_x = negative_x[under_sampling_ix,:]

print(positive_x.shape)
print(under_sampled_negative_x.shape)

undersampled_x = np.vstack([positive_x,under_sampled_negative_x])

print(undersampled_x.shape)

undersampled_y = np.array([1]*number_of_positives_in_training + [0]*number_of_positives_in_training)

print(undersampled_y.shape)
# X_train_under_sampled = X_train[under_sampling_ix,:]
# X_train_under_sampled.shape


(17412, 18)
(401933, 18)
(17412, 18)
(17412, 18)
(34824, 18)
(34824,)


In [14]:
print(' ** ** Logistic Regression ** **')
clf = LogisticRegression(random_state=0, max_iter=1000,class_weight='balanced').fit(X_train, y_train,)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')

#  ** ** Logistic Regression ** **
# ** Results on Training **
# Precision= 0.049
# Recall=    0.573
# F1-score=  0.091

# ** Results on Training **
# Precision= 0.051
# Recall=    0.570
# F1-score=  0.094

# With all basic atributes (categorical and continuos)
#  ** ** Logistic Regression ** **
# ** Results on Training **
# Precision= 0.054
# Recall=    0.812
# F1-score=  0.101

# ** Results on Training **
# Precision= 0.056
# Recall=    0.804
# F1-score=  0.104

 ** ** Logistic Regression ** **
** Results on Training **
Precision= 0.054
Recall=    0.802
F1-score=  0.101

** Results on Training **
Precision= 0.056
Recall=    0.807
F1-score=  0.104


In [15]:
cols = health_data.Admission.continuos_columns() + health_data.Admission.categorical_columns()
for coef, col in sorted(zip(clf.coef_[0,:],cols),key=lambda value: np.abs(value[0]), reverse=True):
    print(f'{col:20}: {coef:7.3f}')

New Born Admission  :  -0.859
Level 1 Comorbidity :   0.805
Level 2 Comorbidity :   0.797
Level 3 Comorbidity :   0.738
Is ALC              :  -0.688
Level 4 Comorbidity :   0.564
Urgent Admission    :   0.537
No Comorbidity      :   0.473
Transfusion Given   :   0.408
Female              :  -0.360
Elective Admission  :  -0.322
Case Weight         :  -0.296
Male                :  -0.255
Length of Stay      :   0.251
ALC Days            :  -0.248
CMG                 :  -0.050
Is central Zone     :   0.040
Age                 :  -0.010


In [16]:
print(' ** **   SVM   (Linear) ** **')
clf = SVC(random_state=0, max_iter=10000, kernel='linear', class_weight='balanced').fit(X_train, y_train.ravel(),)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')


 ** **   SVM   (Linear) ** **




** Results on Training **
Precision= 0.042
Recall=    0.999
F1-score=  0.080

** Results on Training **
Precision= 0.043
Recall=    0.997
F1-score=  0.083


In [17]:
print(' ** **   SVM (RBF)   ** **')
clf = SVC(random_state=0, kernel='rbf', max_iter=10000,class_weight='balanced').fit(X_train, y_train.ravel(),)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')


 ** **   SVM (RBF)   ** **




** Results on Training **
Precision= 0.051
Recall=    0.634
F1-score=  0.094

** Results on Training **
Precision= 0.053
Recall=    0.626
F1-score=  0.098


In [19]:
print(' ** **   Decision Tree   ** **')
clf = DecisionTreeClassifier(random_state=0).fit(undersampled_x, undersampled_y.ravel(),)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')


 ** **   Decision Tree   ** **
** Results on Training **
Precision= 0.097
Recall=    0.986
F1-score=  0.177

** Results on Training **
Precision= 0.049
Recall=    0.575
F1-score=  0.090


In [36]:
print(' ** **   MLP   ** **')
clf = MLPClassifier(random_state=0, 
                    # early_stopping=True, 
                    # n_iter_no_change=200,
                    # validation_fraction=0.2, 
                    max_iter=1000, 
                    hidden_layer_sizes=(300, 200, 100, 50, 20, 5, 4, 3, 2),
                    verbose=True)
clf.fit(undersampled_x, undersampled_y,)
y_true = y_train
y_pred = clf.predict(X_train)
print('** Results on Training **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')
print()


y_true = y_val
y_pred = clf.predict(X_val)
print('** Results on Validation **')
print(f'Precision= {precision_score(y_true, y_pred,):4.3f}')
print(f'Recall=    {recall_score(y_true, y_pred,):4.3f}')
print(f'F1-score=  {f1_score(y_true, y_pred,):4.3f}')


 ** **   MLP   ** **
Iteration 1, loss = 0.66665176
Iteration 2, loss = 0.65892955
Iteration 3, loss = 0.65631846
Iteration 4, loss = 0.65448327
Iteration 5, loss = 0.65293521
Iteration 6, loss = 0.65135114
Iteration 7, loss = 0.64913607
Iteration 8, loss = 0.64810033
Iteration 9, loss = 0.64718905
Iteration 10, loss = 0.64534718
Iteration 11, loss = 0.64443307
Iteration 12, loss = 0.64322995
Iteration 13, loss = 0.64176864
Iteration 14, loss = 0.64200421
Iteration 15, loss = 0.63983588
Iteration 16, loss = 0.63867002
Iteration 17, loss = 0.63696668
Iteration 18, loss = 0.63659016
Iteration 19, loss = 0.63584601
Iteration 20, loss = 0.63412637
Iteration 21, loss = 0.63256733
Iteration 22, loss = 0.63105161
Iteration 23, loss = 0.63013559
Iteration 24, loss = 0.62872462
Iteration 25, loss = 0.62696446
Iteration 26, loss = 0.62611542
Iteration 27, loss = 0.62501343
Iteration 28, loss = 0.62301208
Iteration 29, loss = 0.62147913
Iteration 30, loss = 0.61996888
Iteration 31, loss = 0.61874

In [30]:
admission.cmg

278.0

In [31]:
admission.comorbidity_level

<ComorbidityLevel.NO_COMORBIDITY: 0>

In [32]:
admission.gender

'Female'

In [33]:
admission.is_central_zone

True