## Predicting using Diagnosis codes and creating code2text (diagnosis code to diagnosis text)

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
from collections import defaultdict
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration
from utilities import health_data

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPClassifier


In [22]:
# Load the project configuration and list every entry with the type of its value.
config = configuration.get_config()
for key in config:
    value_type = type(config[key])
    print(f'{key:40} (type: {value_type})')

# Start the project logger; note that the handle is bound to the name `logging`.
# NOTE(review): the message mentions "notebook 09 Random sample of instances" - confirm it names this notebook.
logging = logger.init_logger(config['system_log'])
logging.debug('Logger has started ont notebook 09 Random sample of instances.ipynb ...')

system_log                               (type: <class 'str'>)
json_file                                (type: <class 'str'>)
train_val_json                           (type: <class 'str'>)
heldout_json                             (type: <class 'str'>)
unused_after_heldout_json                (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)
2023-11-10 08:56:56,011 - root - DEBUG - Logger has started ont notebook 09 Random sample of instances.ipynb ...


In [23]:
# Load the train/validation admissions. The context manager guarantees the file
# handle is closed (the previous bare open() leaked it).
with open(config['train_val_json']) as f:
    train_val_data = json.load(f)

# One Admission object per JSON entry; the JSON keys are the admission ids (as strings).
all_admissions = [
    health_data.Admission.from_dict_data(admit_id=int(ix), admission=train_val_data[ix])
    for ix in train_val_data
]

# Dictionary organizing data by patient
patient2admissions = defaultdict(list)
for admission in all_admissions:
    patient2admissions[admission.code].append(admission)

# Order each patient's admissions by discharge date, oldest first.
for admissions_list in patient2admissions.values():
    admissions_list.sort(key=lambda admission: admission.discharge_date)
    assert all(earlier.discharge_date <= later.discharge_date
               for earlier, later in zip(admissions_list, admissions_list[1:]))

# Link every admission flagged as a readmission to the admission that precedes it,
# provided the previous admission accepts it as a valid readmission.
# A readmission flagged on a patient's first visible admission has no predecessor in
# the data (e.g. it happened close to the beginning of the dataset) and is skipped.
valid_readmission_count = 0
for patient_admissions in patient2admissions.values():
    for ix in range(1, len(patient_admissions)):
        previous_admission = patient_admissions[ix-1]
        current_admission = patient_admissions[ix]
        if (health_data.ReadmissionCode.is_readmit(current_admission.readmission_code)
                and previous_admission.is_valid_readmission(current_admission)):
            previous_admission.add_readmission(current_admission)
            valid_readmission_count += 1
patient_count = len(patient2admissions)

# Summary of the loaded split. The labels name the train_val split, which is what
# `config['train_val_json']` contains (they previously said "heldout").
discharge_dates = [admission.discharge_date for admission in all_admissions]
print(f'Size of train_val data:   {len(all_admissions):,}')
print(f"Beginning train_val:      {min(discharge_dates)}")
print(f"End train_val:            {max(discharge_dates)}")
print(f'Number of instances with readmissions: {valid_readmission_count} ({valid_readmission_count/len(all_admissions):.2%})')



Size of train_val data:   524,986
Begining heldout:         2015-01-01 00:00:00
End heldout:              2021-10-01 00:00:00
Number of instances with readmissions: 50533 (9.63%)


### Building the ICD-10 code2text mapping (diagnosis code → diagnosis text)

In [24]:
# Map every diagnosis code to the set of distinct texts it appears with.
# A plain dict is kept on purpose so that looking up an unknown code still raises KeyError.
code2text = {}
for admission in all_admissions:
    diagnosis = admission.diagnosis
    for code, text in zip(diagnosis.codes, diagnosis.texts):
        code2text.setdefault(code, set()).add(text)

### Splitting into train and validation


In [25]:
rng = np.random.default_rng(seed=5348363479653547918)

# Draw 80% of the admission positions (without replacement) for training.
n_admissions = len(all_admissions)
train_indexes = rng.choice(range(n_admissions), size=int(0.8*n_admissions), replace=False)

# Reproducibility guard: the seeded draw must always select the same instances.
assert all(train_indexes[:3] == np.array([478898, 46409, 322969]))
assert all(train_indexes[-3:] == np.array([415014, 330673, 338415]))
assert hash(tuple(train_indexes)) == 2028319680436964623

# A set gives constant-time membership tests while partitioning.
train_indexes = set(train_indexes)

# Single pass over the admissions: selected positions go to train, the rest to validation.
train, validation = [], []
for ix, admission in enumerate(all_admissions):
    if ix in train_indexes:
        train.append(admission)
    else:
        validation.append(admission)

print(f'Size of training (before filter)=     {len(train):,}')
print(f'Size of validation (before filter)=   {len(validation):,}')

# Drop admissions with missing values, then STILLBORN and CADAVER admissions.
# The checks short-circuit in the same order as the original three filters.
train = [
    candidate for candidate in train
    if not candidate.has_missing
    and candidate.admit_category != health_data.AdmitCategory.STILLBORN
    and candidate.admit_category != health_data.AdmitCategory.CADAVER
]
validation = [
    candidate for candidate in validation
    if not candidate.has_missing
    and candidate.admit_category != health_data.AdmitCategory.STILLBORN
    and candidate.admit_category != health_data.AdmitCategory.CADAVER
]

print(f'Size of training (after filter)=      {len(train):,}')
print(f'Size of validation (after filter)=    {len(validation):,}')

Size of training (before filter)=     419,988
Size of validation (before filter)=   104,998
Size of training (after filter)=      419,345
Size of validation (after filter)=    104,861


#### Creating diagnosis codes matrices

In [26]:
# Build the diagnosis-code matrix on the training admissions; the vocabulary it
# returns is then passed in when building the validation matrix.
vocab, X_train = health_data.Admission.create_codes_matrix(train)
y_train = health_data.Admission.get_y(train)

vocab2, X_val = health_data.Admission.create_codes_matrix(validation, vocabulary=vocab)
y_val = health_data.Admission.get_y(validation)

# Both matrices must share the same columns. np.array_equal compares shape and
# content, so a mismatch fails as a plain AssertionError instead of an error
# raised by the element-wise `==` on differently sized vocabularies.
assert np.array_equal(vocab2, vocab)

print('** Training **')
print(f'X_train.shape = {X_train.shape}')
print(f'y_train.shape = {y_train.shape}')
print()
print('** Validation **')
print(f'X_val.shape =   {X_val.shape}')
print(f'y_val.shape =   {y_val.shape}')


** Training **
X_train.shape = (419345, 9546)
y_train.shape = (419345,)

** Validation **
X_val.shape =   (104861, 9546)
y_val.shape =   (104861,)


### Undersampling

In [27]:
# Balance the training set: keep every positive instance and draw the same number
# of negatives at random (without replacement).
number_of_positives_in_training = int(np.sum(y_train))
print(f'Number of positive examples:                       {number_of_positives_in_training} '\
      f'({number_of_positives_in_training/len(y_train):.2%})')

positive_x = X_train[y_train==1,:]
negative_x = X_train[y_train==0,:]

print(f'Positive instances matrix shape =                  {positive_x.shape}')
print(f'Negative instances matrix shape =                  {negative_x.shape}')

# Dedicated seeded generator: the previous np.random.choice call used the unseeded
# global state, so a different set of negatives was drawn on every run.
undersampling_rng = np.random.default_rng(seed=2023)
under_sampling_ix = undersampling_rng.choice(negative_x.shape[0],
                                             size=number_of_positives_in_training,
                                             replace=False)
under_sampled_negative_x = negative_x[under_sampling_ix,:]

print(f'Undersampled Negative instances matrix shape =     {under_sampled_negative_x.shape}')

# Positives first, then the sampled negatives; both blocks are densified before stacking.
undersampled_x = np.vstack([positive_x.toarray(),under_sampled_negative_x.toarray()])

print(f'Combined positives + undersampled neg, new shape = {undersampled_x.shape}')

# The combined matrix must hold 2 * positives rows and keep every feature column.
assert undersampled_x.shape[0]==number_of_positives_in_training*2 and undersampled_x.shape[1]==X_train.shape[1]

# Targets follow the row order used above: all positives, then all negatives.
undersampled_y = np.array([1]*number_of_positives_in_training + [0]*number_of_positives_in_training)

print(f'Undersampled target shape =                        {undersampled_y.shape}')


Number of positive examples:                       17412 (4.15%)
Positive instances matrix shape =                  (17412, 9546)
Negative instances matrix shape =                  (401933, 9546)
Undersampled Negative instances matrix shape =     (17412, 9546)
Combined positives + undersampled neg, new shape = (34824, 9546)
Undersampled target shape =                        (34824,)


In [7]:
# Build the intervention-code matrix for the training admissions and display its shape.
# NOTE(review): the first returned value is bound to `vector`; presumably it is the
# vocabulary of intervention codes - confirm against create_intervention_matrix.
vector, matrix = health_data.Admission.create_intervention_matrix(train)
matrix.shape

(419345, 7519)

### Logistic Regression (balanced, diagnosis codes only)

In [7]:
print(' ** ** Logistic Regression ** **')
# Fit on the full (imbalanced) training matrix; class_weight='balanced' is the only
# imbalance handling applied in this cell.
clf = LogisticRegression(random_state=0, max_iter=1000, class_weight='balanced').fit(X_train, y_train)

# Report the same three metrics on both splits. The validation block was previously
# printed under the header "Results on Training"; each split now gets its own label.
evaluation_splits = (('Training', X_train, y_train), ('Validation', X_val, y_val))
for position, (split_name, X_split, y_true) in enumerate(evaluation_splits):
    if position > 0:
        print()  # blank line between the two reports
    y_pred = clf.predict(X_split)
    print(f'** Results on {split_name} **')
    print(f'Precision= {precision_score(y_true, y_pred):4.3f}')
    print(f'Recall=    {recall_score(y_true, y_pred):4.3f}')
    print(f'F1-score=  {f1_score(y_true, y_pred):4.3f}')

 ** ** Logistic Regression ** **
** Results on Training **
Precision= 0.083
Recall=    0.729
F1-score=  0.149

** Results on Training **
Precision= 0.076
Recall=    0.640
F1-score=  0.136


In [17]:
# Show the 20 diagnosis codes with the largest absolute coefficient, with their texts.
# `vocab` supplies the column labels; entries are upper-cased before the code2text lookup.
cols = vocab
coefficient_per_code = zip(clf.coef_[0,:], cols)
ranked_by_magnitude = sorted(coefficient_per_code, key=lambda pair: np.abs(pair[0]), reverse=True)
for coef, col in ranked_by_magnitude[:20]:
    print(f'{str(code2text[col.upper()]):110}: {coef:7.3f}')

{'Cardiac arrest, unspecified'}                                                                               :  -6.624
{'Assistance in dying'}                                                                                       :  -5.517
{'Antibody deficiency with near-normal immunoglobulins or with hyperimmunoglobulinaemia'}                     :   4.705
{'Complete uterovaginal prolapse'}                                                                            :  -3.966
{'Immunodeficiency associated with other specified major defects'}                                            :   3.951
{'Fracture of shaft of femur, closed'}                                                                        :  -3.938
{'Anoxic brain damage, not elsewhere classified'}                                                             :  -3.927
{'Mechanical complication of hip prosthesis, breakage and dissociation'}                                      :   3.717
{'Infection of obstetric surgical wound,

### Neural network (MLP trained on the undersampled training set)

In [38]:
# Sanity check: shape of the undersampled target vector used to fit the MLP cells.
undersampled_y.shape

(34824,)

In [39]:
print(' ** **   MLP   ** **')
# One hidden layer of 100 units, fitted on the balanced (undersampled) training data.
# early_stopping / n_iter_no_change / validation_fraction are left at their defaults.
mlp_params = dict(random_state=0,
                  max_iter=1000,
                  hidden_layer_sizes=(100,),
                  verbose=True)
clf = MLPClassifier(**mlp_params)
clf.fit(undersampled_x, undersampled_y)

# Evaluate on the full (not undersampled) training matrix, then on validation.
reports = [('** Results on Training **', X_train, y_train),
           ('** Results on Validation **', X_val, y_val)]
for report_number, (header, features, y_true) in enumerate(reports):
    if report_number:
        print()  # blank line separating the two reports
    y_pred = clf.predict(features)
    print(header)
    print(f'Precision= {precision_score(y_true, y_pred):4.3f}')
    print(f'Recall=    {recall_score(y_true, y_pred):4.3f}')
    print(f'F1-score=  {f1_score(y_true, y_pred):4.3f}')


 ** **   MLP   ** **
Iteration 1, loss = 0.65473330
Iteration 2, loss = 0.61147693
Iteration 3, loss = 0.59407000
Iteration 4, loss = 0.58197912
Iteration 5, loss = 0.57257484
Iteration 6, loss = 0.56349270
Iteration 7, loss = 0.55490397
Iteration 8, loss = 0.54670958
Iteration 9, loss = 0.53755466
Iteration 10, loss = 0.52935501
Iteration 11, loss = 0.52044591
Iteration 12, loss = 0.51115665
Iteration 13, loss = 0.50126563
Iteration 14, loss = 0.49202015
Iteration 15, loss = 0.48199186
Iteration 16, loss = 0.47194056
Iteration 17, loss = 0.46153691
Iteration 18, loss = 0.45076903
Iteration 19, loss = 0.43960150
Iteration 20, loss = 0.42978041
Iteration 21, loss = 0.41866083
Iteration 22, loss = 0.40778206
Iteration 23, loss = 0.39726103
Iteration 24, loss = 0.38723688
Iteration 25, loss = 0.37641156
Iteration 26, loss = 0.36709833
Iteration 27, loss = 0.35759093
Iteration 28, loss = 0.34813243
Iteration 29, loss = 0.34023522
Iteration 30, loss = 0.33078273
Iteration 31, loss = 0.32418

In [None]:
print(' ** **   MLP   ** **')
# Deep, narrowing network (nine hidden layers, 300 units down to 2), fitted on the
# balanced (undersampled) training data.
# early_stopping / n_iter_no_change / validation_fraction are left at their defaults.
deep_mlp_params = dict(random_state=0,
                       max_iter=1000,
                       hidden_layer_sizes=(300, 200, 100, 50, 20, 5, 4, 3, 2),
                       verbose=True)
clf = MLPClassifier(**deep_mlp_params)
clf.fit(undersampled_x, undersampled_y)

# Evaluate on the full (not undersampled) training matrix, then on validation.
reports = [('** Results on Training **', X_train, y_train),
           ('** Results on Validation **', X_val, y_val)]
for report_number, (header, features, y_true) in enumerate(reports):
    if report_number:
        print()  # blank line separating the two reports
    y_pred = clf.predict(features)
    print(header)
    print(f'Precision= {precision_score(y_true, y_pred):4.3f}')
    print(f'Recall=    {recall_score(y_true, y_pred):4.3f}')
    print(f'F1-score=  {f1_score(y_true, y_pred):4.3f}')


 ** **   MLP   ** **
Iteration 1, loss = 0.66665176
Iteration 2, loss = 0.65892955
Iteration 3, loss = 0.65631846
Iteration 4, loss = 0.65448327
Iteration 5, loss = 0.65293521
Iteration 6, loss = 0.65135114
Iteration 7, loss = 0.64913607
Iteration 8, loss = 0.64810033
Iteration 9, loss = 0.64718905
Iteration 10, loss = 0.64534718
Iteration 11, loss = 0.64443307
Iteration 12, loss = 0.64322995
Iteration 13, loss = 0.64176864
Iteration 14, loss = 0.64200421
Iteration 15, loss = 0.63983588
Iteration 16, loss = 0.63867002
Iteration 17, loss = 0.63696668
Iteration 18, loss = 0.63659016
Iteration 19, loss = 0.63584601
Iteration 20, loss = 0.63412637
Iteration 21, loss = 0.63256733
Iteration 22, loss = 0.63105161
Iteration 23, loss = 0.63013559
Iteration 24, loss = 0.62872462
Iteration 25, loss = 0.62696446
Iteration 26, loss = 0.62611542
Iteration 27, loss = 0.62501343
Iteration 28, loss = 0.62301208
Iteration 29, loss = 0.62147913
Iteration 30, loss = 0.61996888
Iteration 31, loss = 0.61874

In [17]:
# Keep the intervention vocabulary and matrix under their own names. Binding them to
# `vocab` would overwrite the diagnosis-code vocabulary that other cells use as the
# column labels of the diagnosis matrices and of the fitted coefficients.
intervention_vocab, intervention_matrix = health_data.Admission.create_intervention_matrix(train)
len(intervention_vocab)

7519

In [18]:
# Collect the distinct intervention codes seen in the training admissions.
# update() grows the set in place; rebinding to `set.union(...)` on every iteration
# copied the whole accumulated set once per admission.
intervention_codes = set()
for admission in train:
    intervention_codes.update(admission.intervention_code)
len(intervention_codes)

7524

In [8]:
# Collect the distinct intervention codes seen in the training admissions
# (this cell recomputes the same set as the cell before it).
# update() grows the set in place instead of building a new set per admission.
intervention_codes = set()
for admission in train:
    intervention_codes.update(admission.intervention_code)
len(intervention_codes)

7524

In [15]:
# Peek at the first ten entries of `vocab`.
(list(vocab)[:10])

['a000',
 'a010',
 'a020',
 'a021',
 'a022',
 'a028',
 'a029',
 'a030',
 'a041',
 'a044']

In [14]:
# Peek at ten intervention codes. `intervention_codes` is a set, so which ten are
# shown (and their order) is arbitrary.
list(intervention_codes)[:10]

['1YY54LAFU',
 '1VD80LAXXQ',
 '1TM74LANWA',
 '1PQ50BABD',
 '1NF13GQW0',
 '1ED03JASR',
 '1TM04JH',
 '1IC80LA',
 '1NQ74EJ',
 '1TA35DAD1']

In [12]:
# freq = defaultdict(int)
# for admission in all_admissions:
#     for code in admission.diagnosis.codes:
#         freq[code]+=1
# print(len(freq))

9945


In [30]:

# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer()
# type(vectorizer.fit_transform(['aaa aaab', 'aaa bs']))
# vectorizer.get_feature_names_out()

array(['aaa', 'aaab', 'bs'], dtype=object)

In [42]:
# from scipy.sparse._csr import csr_matrix

# def create_codes_matrix(admissions: list[health_data.Admission], freq_cutoff=0)->pd.DataFrame:
#     freq = defaultdict(int)
#     for admission in admissions:
#         for code in admission.diagnosis.codes:
#             freq[code.lower()]+=1
#     print(f'Found {len(freq)} distincts codes.')
#     codes_vocab = np.array([code for code in freq if freq[code]>freq_cutoff])
#     print(f'After filtering codes that appear {freq_cutoff} times or less we have {len(codes_vocab)} codes.')


#     return codes_vocab, matrix

# codes_vocab, matrix = create_codes_matrix(all_admissions)
# codes_vocab.shape

Found 9929 distincts codes.
After filtering codes that appear 0 times or less we have 9929 codes.


(9929,)

In [41]:
# from scipy.sparse._csr import csr_matrix


#     # freq = defaultdict(int)
#     # for admission in admissions:
#     #     for code in admission.diagnosis.codes:
#     #         freq[code]+=1
#     # print(f'Found {len(freq)} distincts codes.')
#     # codes_vocab = np.array([code for code in freq if freq[code]>freq_cutoff])
#     # print(f'After filtering codes that appear {freq_cutoff} times or less we have {len(codes_vocab)} codes.')
#     # code2index = dict([(code, idx) for idx,code in enumerate(codes_vocab)])
#     # del(freq)

#     # matrix = csr_matrix((len(admissions),len(codes_vocab)), dtype=np.int8)
#     # for ix, admission in enumerate(admissions):
#     #     for code in admission.diagnosis.codes:
#     #         if code in code2index:
#     #             matrix[ix, code2index[code]] += 1

#     return codes_vocab, matrix


# codes_vocab2, matrix = create_codes_matrix(all_admissions)
# codes_vocab2.shape

Number of features=9929


(9929,)

In [44]:
# Check that both vocabularies contain the same codes, ignoring order.
# NOTE(review): in the visible cells `codes_vocab` and `codes_vocab2` are only assigned
# inside commented-out code, so this raises NameError on a fresh kernel unless they
# are defined elsewhere.
set(codes_vocab)==set(codes_vocab2)

True

In [32]:
# Display the repr of the current `matrix`.
# NOTE(review): `matrix` is rebound by several cells, so what is shown depends on
# which of them ran last.
matrix

<524986x8397 sparse matrix of type '<class 'numpy.float64'>'
	with 2314704 stored elements in Compressed Sparse Row format>

In [None]:
# rng = np.random.default_rng(seed=5348363479653547918)

# train_indexes = rng.choice(range(len(all_admissions)),size=int(0.8*len(all_admissions)), replace=False)

# # Checking that every time I am getting the same training instances ( and validation instances)
# assert all(train_indexes[:3] ==np.array([478898, 46409, 322969]))
# assert all(train_indexes[-3:] ==np.array([415014, 330673, 338415]))
# assert hash(tuple(train_indexes))==2028319680436964623

# train_indexes = set(train_indexes)

# train = [admission for ix, admission in enumerate(all_admissions) if ix in train_indexes ]
# validation = [admission for ix, admission in enumerate(all_admissions) if not ix in train_indexes ]

# print(f'Size of training (before filter)=     {len(train):,}')
# print(f'Size of validation (before filter)=   {len(validation):,}')

# # Filtering missing values
# train = list(filter(lambda admission: not admission.has_missing, train))
# validation = list(filter(lambda admission: not admission.has_missing, validation))


# # Filtering STILL_BORNS admissions
# train = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.STILLBORN, train))
# validation = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.STILLBORN, validation))

# # Filtering CADAVER admissions
# train = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.CADAVER, train))
# validation = list(filter(lambda admission: admission.admit_category!=health_data.AdmitCategory.CADAVER, validation))


# print(f'Size of training (after filter)=      {len(train):,}')
# print(f'Size of validation (after filter)=    {len(validation):,}')

In [4]:
from sklearn.model_selection import cross_validate

# Diagnosis-code matrix and targets over ALL admissions (no train/validation split):
# the folds are produced by cross_validate itself.
# Note that this rebinds the globals `vocab` and `matrix`.
vocab, matrix = health_data.Admission.create_codes_matrix(all_admissions)
X, y = matrix, health_data.Admission.get_y(all_admissions)

# Linear-kernel SVM scored with precision, recall and F1 using the default CV scheme.
scoring = ['precision', 'recall', 'f1']
clf = SVC(kernel='linear', C=1, random_state=0)
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring)


Number of features=9929


  _warn_prf(average, modifier, msg_start, len(result))
