In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
from collections import defaultdict
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier

from scipy import stats
from scipy import sparse
from sklearn.metrics import roc_auc_score, confusion_matrix

In [2]:
config = configuration.get_config()
for key in config:
    print(f'{key:40} (type: {type(config[key])})')


logging = logger.init_logger(config['system_log'])
logging.debug('Logger has started ont notebook 09 Random sample of instances.ipynb ...')

system_log                               (type: <class 'str'>)
logreg_log                               (type: <class 'str'>)
lasso_log                                (type: <class 'str'>)
logreg_results                           (type: <class 'str'>)
json_file                                (type: <class 'str'>)
train_val_json                           (type: <class 'str'>)
heldout_json                             (type: <class 'str'>)
unused_after_heldout_json                (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)
2023-11-24 14:51:33,717 - root - DEBUG - Logger has started ont notebook 09 Random sample of instances.ipynb ...


In [12]:
training ,testing = health_data.Admission.get_training_testing_data(filtering=False)
print(f'BOTH:                      {len(training+testing)}')
print(f'Training before filtering: {len(training):,}')
print(f'testing before filtering:  {len(testing):,}')
print()
training = list(filter(lambda admission: not admission.code is None and \
                       admission.admit_category!=health_data.AdmitCategory.CADAVER and \
                       admission.admit_category!=health_data.AdmitCategory.STILLBORN,
                       training))

testing = list(filter(lambda admission: not admission.code is None and \
                       admission.admit_category!=health_data.AdmitCategory.CADAVER and \
                       admission.admit_category!=health_data.AdmitCategory.STILLBORN,
                       testing))

print(f'BOTH:                      {len(training+testing)}')
print(f'Training after filtering:  {len(training):,}')
print(f'testing after filtering:   {len(testing):,}')
print()

training ,testing = health_data.Admission.get_training_testing_data(filtering=True)

print(f'BOTH:                      {len(training+testing)}')
print(f'Training after filtering:  {len(training):,}')
print(f'testing after filtering:   {len(testing):,}')
print()

# categorical_df, main_pt_services_list = health_data.Admission.categorical_features(training)
numerical_df = health_data.Admission.numerical_features(training, 
                                                        fix_skew=False, 
                                                        normalize=False)
stds = np.std(numerical_df)
mean = np.mean(numerical_df, axis=0)
is_outlier=np.sum(numerical_df.values > (mean+4*stds).values, axis=1)>0
print(f'outlier count = {np.sum(is_outlier)}')
mask=~is_outlier

features = []
# features.append(sparse.csr_matrix(categorical_df.values))
features.append((numerical_df.values))

X_train = sparse.hstack([matrix[mask,:] for matrix in features])
print(X_train.shape)

BOTH:                      524986
Training before filtering: 419,988
testing before filtering:  104,998

BOTH:                      524310
Training after filtering:  419,426
testing after filtering:   104,884

Training instances before filtering: 419988
Training instances after filtering:  419139
Testomg instances before filtering:  104998
Testomg instances after filtering:   104884
BOTH:                      524023
Training after filtering:  419,139
testing after filtering:   104,884

outlier count = 2262


ValueError: blocks must be 2-D

In [18]:
((X_train - np.mean(X_train)) / np.std(X_train))

array([[-0.29954376,  0.9418599 , -0.4925691 , -0.48575431, -0.49609934],
       [-0.1754034 ,  1.15393302, -0.48659691, -0.4805818 , -0.49609934],
       [-0.06678058,  3.69881053, -0.48110319, -0.37195898, -0.49609934],
       ...,
       [-0.11850573,  3.52811752, -0.49354309, -0.47023677, -0.49609934],
       [-0.14436831,  1.50049155, -0.49100028, -0.48575431, -0.49609934],
       [-0.40299407,  3.16604146, -0.49150667, -0.48575431, -0.49609934]])

In [19]:
StandardScaler().fit_transform(X_train)

array([[-0.87596286, -0.54089673, -0.39906024, -0.47837871, -0.12883797],
       [ 0.14236158, -0.37358917,  0.10378096, -0.39489338, -0.12883797],
       [ 1.03339547,  1.63410157,  0.56633738,  1.35829854, -0.12883797],
       ...,
       [ 0.60909362,  1.49943938, -0.48106699, -0.22792272, -0.12883797],
       [ 0.39694269, -0.10018413, -0.2669697 , -0.47837871, -0.12883797],
       [-1.72456657,  1.21379233, -0.30960624, -0.47837871, -0.12883797]])

### Categorical Variables

In [29]:
num_fields = [
          'age',
          'alc_days',
          'acute_days',
          'cmg',
          'case_weight',
]
field_names = ['Age',
               'ALC Days',
               'Acute Days',
               'Case Mix Group',
               'Case Weight',
               ]

data = {'Mean':[],
        'Std':[],
        'Min':[],
        'Q1':[],
        'Median':[],
        'Q3':[],
        'Max':[],
        'Kurtosis':[],
        'Skew':[],
        'Mode':[],
        'Mode Count':[],
        'Missing Count (%)': [],
        }
for field in num_fields:
    numbers = [getattr(admission, field) for admission in training+testing]
    missing_count = len([_ for num in numbers if num is None or np.isnan(num)])
    data['Missing Count (%)'].append(100*(missing_count/len(training+testing)))
    numbers = list(filter(lambda num: not num is None and not np.isnan(num), numbers))
    data['Mean'].append(np.average(numbers))
    data['Std'].append(np.std(numbers))
    data['Min'].append(np.min(numbers))
    data['Max'].append(np.max(numbers))
    data['Median'].append(np.median(numbers))
    data['Mode'].append(stats.mode(numbers)[0])
    data['Mode Count'].append(stats.mode(numbers)[1])
    data['Kurtosis'].append(stats.kurtosis(numbers))
    data['Skew'].append(stats.skew(numbers))
    data['Q1'].append(np.quantile(numbers,0.25))
    data['Q3'].append(np.quantile(numbers,0.75))
df = pd.DataFrame(data, index=field_names)
# print(df)
print(df.to_latex(float_format=f"{{:0.3f}}".format))
df

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & Mean & Std & Min & Q1 & Median & Q3 & Max & Kurtosis & Skew & Mode & Mode Count & Missing Count (%) \\
\midrule
Age & 58.720 & 23.540 & -1.000 & 46.000 & 64.000 & 76.000 & 110.000 & 0.126 & -0.867 & 0.000 & 26204 & 0.000 \\
ALC Days & 1.831 & 29.944 & 0.000 & 0.000 & 0.000 & 0.000 & 16300.000 & 169931.870 & 327.887 & 0.000 & 506458 & 0.000 \\
Acute Days & 8.593 & 68.502 & 0.000 & 2.000 & 4.000 & 8.000 & 17343.000 & 50989.340 & 213.401 & 1.000 & 102193 & 0.000 \\
Case Mix Group & 411.381 & 245.461 & 1.000 & 196.000 & 336.000 & 635.000 & 999.000 & -1.263 & 0.270 & 139.000 & 16214 & 0.020 \\
Case Weight & 1.792 & 5.967 & 0.000 & 0.537 & 0.923 & 1.618 & 2000.000 & 48869.462 & 164.367 & 1.118 & 2184 & 0.020 \\
\bottomrule
\end{tabular}



Unnamed: 0,Mean,Std,Min,Q1,Median,Q3,Max,Kurtosis,Skew,Mode,Mode Count,Missing Count (%)
Age,58.719797,23.539954,-1.0,46.0,64.0,76.0,110.0,0.125957,-0.866721,0.0,26204,0.0
ALC Days,1.831199,29.9443,0.0,0.0,0.0,0.0,16300.0,169931.870146,327.88732,0.0,506458,0.0
Acute Days,8.593136,68.501844,0.0,2.0,4.0,8.0,17343.0,50989.340163,213.401016,1.0,102193,0.0
Case Mix Group,411.38086,245.461446,1.0,196.0,336.0,635.0,999.0,-1.263009,0.270479,139.0,16214,0.019836
Case Weight,1.791877,5.967024,0.0,0.5373,0.9234,1.6184,2000.0,48869.461686,164.366998,1.1182,2184,0.019836


### Nmuerical Variables

In [30]:
fields = ['code',
          'admit_date',
          'discharge_date',
          'gender',
          'postal_code',
          'is_central_zone',
          'institution_number',
          'institution_to',
          'institution_from',
          'institution_type',
          'admit_category',
          'readmission_code',
          'main_pt_service',
          'mrdx',
          'transfusion_given',]

data = {
        'Levels':[],
        'Levels (in training)':[],
        'Mode':[],
        'Mode Count':[],
        'Less Frequent':[],
        'Less Frequent Count':[],
        'Missings (%)':[],
}

for field in fields:
    vector = [getattr(admission,field) for admission in training+testing]
    if field == 'gender':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.Gender.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.Gender.NONE  ,vector))
    elif field == 'admit_category':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.AdmitCategory.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.AdmitCategory.NONE  ,vector))
    elif field == 'readmission_code':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.ReadmissionCode.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.ReadmissionCode.NONE  ,vector))
    elif field == 'transfusion_given':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.TransfusionGiven.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.TransfusionGiven.NONE  ,vector))
    else:
        miss_count = len([_ for elem in vector if elem is None])
        vector = list(filter(lambda elem: not elem is None,vector))
    data['Missings (%)'].append(100*(miss_count/len(training+testing)))
    data['Levels'].append(len(set(vector)))
    data['Mode'].append(pd.Series(vector).mode()[0])
    data['Mode Count'].append(len([elem for elem in vector if elem==data['Mode'][-1]]))
    # Computing Less freq
    freq = defaultdict(int)
    for elem in vector:
        freq[elem]+=1
    data['Less Frequent'].append(sorted(freq.items(), key=lambda key_value: key_value[1])[0][0])
    data['Less Frequent Count'].append(sorted(freq.items(), key=lambda key_value: key_value[1])[0][1])
    # print(f'{field:20} min: {sorted(freq.items(), key=lambda key_value: key_value[1])[0]}')
    # print(f'{field:20} max: {sorted(freq.items(), key=lambda key_value: key_value[1])[-1]}')

    # Count levels in training
    vector = [getattr(admission,field) for admission in training]
    if field == 'gender':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.Gender.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.Gender.NONE  ,vector))
    elif field == 'admit_category':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.AdmitCategory.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.AdmitCategory.NONE  ,vector))
    elif field == 'readmission_code':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.ReadmissionCode.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.ReadmissionCode.NONE  ,vector))
    elif field == 'transfusion_given':
        miss_count = len([_ for elem in vector if elem is None or elem==health_data.TransfusionGiven.NONE])
        vector = list(filter(lambda elem: not elem is None and elem!=health_data.TransfusionGiven.NONE  ,vector))
    else:
        miss_count = len([_ for elem in vector if elem is None])
        vector = list(filter(lambda elem: not elem is None,vector))
    data['Levels (in training)'].append(len(set(vector)))

pd.DataFrame(data, index=fields)

Unnamed: 0,Levels,Levels (in training),Mode,Mode Count,Less Frequent,Less Frequent Count,Missings (%)
code,263717,230774,170806,1023,4584715,1,0.0
admit_date,2746,2720,2015-01-06 00:00:00,314,2014-03-26 00:00:00,1,0.001144
discharge_date,2466,2466,2015-04-02 00:00:00,362,2017-12-25 00:00:00,59,0.0
gender,3,3,Gender.FEMALE,270064,Gender.OTHER,9,0.0
postal_code,32622,30661,B0P1R0,5162,B2N7H9,1,0.0
is_central_zone,2,2,False,306466,True,217844,0.0
institution_number,34,34,85,172939,58,42,0.0
institution_to,249,244,HOME CARE SERVICES-HOME CARE,19335,South Cumberland Hospital,1,70.62463
institution_from,289,287,29087,15832,ST. MARTHA'S REGIONAL HOSPITAL-INPT REHAB,1,68.527207
institution_type,28,28,"OTHER (9002,9003,9088,9089)",20600,NACRS AMBULATORY CARE (9),1,85.162022


In [21]:
freq = defaultdict(int)
vec = [1,2,2,3,3,4,4]
list(map(lambda elem: freq[elem]+1, vec))
freq

defaultdict(int, {1: 0, 2: 0, 3: 0, 4: 0})

#### MISSING VALUES

In [27]:
# training ,testing = health_data.Admission.get_training_testing_data(filtering=False)

field = 'code'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'admit_date'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'discharge_date'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'age'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or np.isnan(getattr(admission,field))])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'alc_days'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or np.isnan(getattr(admission,field))])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'acute_days'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or np.isnan(getattr(admission,field))])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'cmg'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or np.isnan(getattr(admission,field))])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'case_weight'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or np.isnan(getattr(admission,field))])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'gender'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or \
                                                                getattr(admission,field)==health_data.Gender.NONE])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'postal_code'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or \
                                                                getattr(admission,field)=='x9x9x9'])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'is_central_zone'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None ])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'institution_number'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or np.isnan(getattr(admission, field)) ])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'institution_to'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None ])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'institution_from'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None ])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'institution_type'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'admit_category'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or \
                                                                getattr(admission,field)==health_data.AdmitCategory.NONE])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'readmission_code'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or \
                                                                getattr(admission,field)==health_data.ReadmissionCode.NONE])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'main_pt_service'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or getattr(admission, field) ==[]])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'mrdx'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'transfusion_given'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None or \
                                                                getattr(admission,field)==health_data.TransfusionGiven.NONE])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'intervention_code'
missing_count = len([_ for admission in training+testing if getattr(admission, field) is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'diagnosis_types'
missing_count = len([_ for admission in training+testing if getattr(admission, 'diagnosis').types is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'diagnosis_texts'
missing_count = len([_ for admission in training+testing if getattr(admission, 'diagnosis').texts is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

field = 'diagnosis_codes'
missing_count = len([_ for admission in training+testing if getattr(admission, 'diagnosis').codes is None])
print(f'{field:20} = {missing_count/len(training+testing):.3%}')

code                 = 0.111%
admit_date           = 0.001%
discharge_date       = 0.000%
age                  = 0.000%
alc_days             = 0.000%
acute_days           = 0.000%
cmg                  = 0.020%
case_weight          = 0.020%
gender               = 0.001%
postal_code          = 0.000%
is_central_zone      = 0.000%
institution_number   = 0.000%
institution_to       = 70.643%
institution_from     = 68.536%
institution_type     = 85.172%
admit_category       = 0.000%
readmission_code     = 0.087%
main_pt_service      = 0.018%
mrdx                 = 0.000%
transfusion_given    = 0.019%
intervention_code    = 0.000%
diagnosis_types      = 0.000%
diagnosis_texts      = 0.000%
diagnosis_codes      = 0.000%


#### UNIQUE VALUE COUNT

In [40]:

fields = ['code',
          'admit_date',
          'discharge_date',]


for field in fields:
    count_unique = len(set([getattr(admission, field) for admission in training+testing]))
    print(f'{field:20} = {count_unique:>,}')

fields = [
          'age',
          'alc_days',
          'acute_days',
          'cmg',
          'case_weight',
]
for field in fields:
    min_ = (min([getattr(admission, field) for admission in training+testing if not getattr(admission, field) is None and not np.isnan(getattr(admission, field))]))
    max_ = (max([getattr(admission, field) for admission in training+testing if not getattr(admission, field) is None and not np.isnan(getattr(admission, field))]))
    print(f'{field:20} = \t\t\t\t{min_:>,}\t\t\t\t{max_}')

fields = [
          'gender',
          'postal_code',
          'is_central_zone',
          'institution_number',
          'institution_to',
          'institution_from',
          'institution_type',
          'admit_category',
          'readmission_code',
          'main_pt_service',
          'mrdx',
          'transfusion_given',
          ]

for field in fields:
    count_unique = len(set([getattr(admission, field) for admission in training+testing]))
    print(f'{field:20} = {count_unique:>,}')

fields = [
          'intervention_code',
          'diagnosis_types',
          'diagnosis_texts',
          'diagnosis_codes',
]


code                 = 263,719
admit_date           = 2,747
discharge_date       = 2,466
age                  = 				-1				110
alc_days             = 				0				16300
acute_days           = 				0				17343
cmg                  = 				1.0				999.0
case_weight          = 				0.0				2000.0
gender               = 5
postal_code          = 32,769
is_central_zone      = 2
institution_number   = 34
institution_to       = 250
institution_from     = 290
institution_type     = 29
admit_category       = 6
readmission_code     = 7
main_pt_service      = 53
mrdx                 = 6,462
transfusion_given    = 3


In [None]:
for admission in training+testing:
    