### Readme
This notebooks read the etire database from ``confg['unified_merged_file']``, creates a JSON with the same information (cleaner) and stores it in ``config['json_file']``

A total of 618,697 instances are stored. 28 features for each (original 27 + cz status)



In [47]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import json
import pandas as pd
import math
import ast
import os
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration


In [3]:
config = configuration.get_config()
for key in config:
    print(f'{key:40} (type: {type(config[key])})')


logging = logger.init_logger(config['system_log'])
logging.debug('Logger has started ont notebook 07. Create full database JSON')


system_log                               (type: <class 'str'>)
json_file                                (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)
2023-09-29 09:04:52,431 - root - DEBUG - Logger has started ont notebook 05. Analysing features.ipynb ...


In [4]:
df = pd.read_csv(config['unified_merged_file'])


  df = pd.read_csv(config['unified_merged_file'])


In [29]:
data={}
for ix in range(df.shape[0]):
    row = df.iloc[ix,:]
    # --------- #
    # Coded HCN #
    # --------- #

    coded_hcn = row['Coded HCN']
    if  '[' in coded_hcn and ']' in coded_hcn: # Coded HCN is a list.
        if coded_hcn=='[]': # Empty list
            coded_hcn=None
        else: # list with at least one element.
            coded_hcn = re.findall('([0-9\.][0-9\.]*)',coded_hcn)
            assert str(coded_hcn).replace("'",'')==row['Coded HCN'], str(ix)+':'+str(coded_hcn)
            coded_hcn = list(filter(lambda elem: elem!=170805, map(int, map(float,coded_hcn))))
            assert len(coded_hcn)<=1
            coded_hcn = None if len(coded_hcn)==0 else coded_hcn[0]
    else: # Coded HCN is a single number
        coded_hcn = int(float(coded_hcn))
        assert coded_hcn==float(row['Coded HCN'])
    data[ix]={'HCN code':coded_hcn}

    # ------------------ #
    # Institution Number #
    # ------------------ #
    institution_number = int(row['Institution Number:'])
    data[ix]['Institution Number'] = institution_number

    # ------------------------ #
    # Admit and Discharge Date #
    # ------------------------ #
    admit_date = datetime.datetime.fromisoformat(row['Admit Date:'][:10]) if not isinstance(row['Admit Date:'], float) else None
    discharge_date = datetime.datetime.fromisoformat(row['Disch Date:'][:10])
    data[ix]['Admit Date'] = admit_date
    data[ix]['Discharge Date'] = discharge_date

    readmission_code = row['Readmission Code:'] if row['Readmission Code:']!='**' else None
    assert readmission_code is None or (not '[' in readmission_code and not ']' in readmission_code and not '**' in readmission_code, row['Readmission Code:'])
    data[ix]['Readmission Code']=readmission_code

    # ----------- #
    # Patient Age #
    # ----------- #
    patient_age = int(row['Patient Age:'])
    data[ix]['Patient Age'] = patient_age

    # ------ #
    # Gender #
    # ------ #
    gender = row['Gender:'] if row['Gender:']!='**' else None
    data[ix]['Gender'] = gender


    # ---- #
    # MRDx #
    # ---- #
    mrdx = row['MRDx']
    assert not mrdx.startswith('[') and not '**' in mrdx, mrdx
    data[ix]['MRDx'] = mrdx

    postal_code = row['Postal Code:'] if row['Postal Code:']!='**' else None
    assert postal_code is None or len(postal_code)==6,postal_code
    assert postal_code is None or postal_code.isalnum(), postal_code
    data[ix]['Postal Code'] = postal_code

    # --------- #
    # Diagnosis #
    # --------- #
    diagnosis = row['Diagnosis:']
    if '[' in diagnosis:
        if diagnosis=='[]':
            diagnosis=[]
        else:
            diagnosis = ast.literal_eval(diagnosis)
            assert str(diagnosis)==row['Diagnosis:']
    elif diagnosis=='**':
        diagnosis = []
    else:
        diagnosis = [diagnosis]
    assert diagnosis!='**',diagnosis
    data[ix]['Diagnosis Code']= diagnosis

    # ------------------- #
    # Diagnosis Long Text #
    # ------------------- #
    diagnosis_long_text = row['Diagnosis Long Text']
    if '**'== diagnosis_long_text or '[]'==diagnosis_long_text:
        diagnosis_long_text=[]
    elif diagnosis_long_text.startswith('[')  and diagnosis_long_text.endswith(']'):
        diagnosis_long_text = ast.literal_eval(diagnosis_long_text)
        assert str(diagnosis_long_text)==row['Diagnosis Long Text'],str(ix)+':'+str(diagnosis_long_text)
    else:
        diagnosis_long_text = [diagnosis_long_text]
    data[ix]['Diagnosis Long Text']= diagnosis_long_text

    # -------------- #
    # Diagnosis type #
    # -------------- #
    diagnosis_type = row['Diagnosis Type']
    if '**'== diagnosis_type or '[]'==diagnosis_type:
        diagnosis_type=[]
    elif '[' in diagnosis_type and ']' in diagnosis_type:
        diagnosis_type = ast.literal_eval(diagnosis_type)
        assert str(diagnosis_type)==row['Diagnosis Type']
    else:
        diagnosis_type = [diagnosis_type]
    data[ix]['Diagnosis Type']= diagnosis_type
    
    # ----------------- #
    # Intervention Code #
    # ----------------- #
    intervention_code = row['Intervention Code']
    if '**'== intervention_code or '[]'==intervention_code:
        intervention_code=[]
    elif '[' in intervention_code and ']' in intervention_code:
        intervention_code = ast.literal_eval(intervention_code)
        assert str(intervention_code) == row['Intervention Code']
    else:
        intervention_code = [intervention_code]
    data[ix]['Intervention Code']= intervention_code

    # ------------ #
    # Px Long Text #
    # ------------ #
    px_long_text = row['Px Long Text']
    if '**'== px_long_text or '[]'==px_long_text:
        px_long_text=[]
    elif px_long_text.startswith('[') and px_long_text.endswith(']'):
        px_long_text = ast.literal_eval(px_long_text)
        assert str(px_long_text)==row['Px Long Text'], str(ix)+':'+str(px_long_text)
    else:
        px_long_text = [px_long_text]
    data[ix]['Px Long Text']= px_long_text
    
    # ------------- #
    # Admit Ctegory #
    # ------------- #
    admit_category = row['Admit Category:'] if row['Admit Category:']!='**' else None
    assert admit_category!='**' and admit_category!='[]', admit_category
    data[ix]['Admit Category']=admit_category

    # ---------- #
    # Entry Code #
    # ---------- #
    entry_code = row['Entry Code:'] if row['Entry Code:']!='**' else None
    assert entry_code!='**' and entry_code!='[]', entry_code
    data[ix]['Entry Code']=entry_code

    # ----------------- #
    # Transfusion Given #
    # ----------------- #
    transfusion_given = row['Transfusion Given']  if row['Transfusion Given']!='**' else None
    assert transfusion_given!='**' and transfusion_given!='[]', transfusion_given
    data[ix]['Transfusion Given']=transfusion_given

    # ---------------- #
    # Main Pt Service: #
    # ---------------- #
    main_pt_service = row['Main Pt Service:'] if row['Main Pt Service:']!='**' else None
    assert main_pt_service!='**' and main_pt_service!='[]', main_pt_service
    data[ix]['Main Pt Service']=main_pt_service

    # --- #
    # CMG # 
    # --- #
    cmg = float(row['CMG']) if row['CMG']!='**' else None
    assert cmg!='**' and cmg!='[]', cmg
    data[ix]['CMG'] = cmg

    # ----------------- #
    # Comorbidity Level #
    # ----------------- #
    comorbidity_level = row['Comorbidity Level'] if row['Comorbidity Level']!='**' else None
    assert comorbidity_level!='**' and comorbidity_level!='[]', comorbidity_level
    data[ix]['Comorbidity Level']=comorbidity_level

    # ----------- #
    # Case Weight #
    # ----------- #
    case_weight = row['Case Weight']
    if case_weight=='1,946.89':
        case_weight=1946.89
    case_weight = float(case_weight) if case_weight!='**' else None
    
    assert case_weight!='**' and case_weight!='[]', case_weight
    data[ix]['Case Weight']=case_weight

    # ------- #
    # ALCDays #
    # ------- #
    alcdays = int(row['ALCDays']) if row['ALCDays']!='**' else None
    assert alcdays!='**' and alcdays!='[]', alcdays
    data[ix]['ALC Days']=alcdays

    # ---------- #
    # Acute Days #
    # ---------- # 
    acute_days = int(row['Acute Days']) if row['Acute Days']!='**' else None
    assert acute_days!='**' and acute_days!='[]', acute_days
    data[ix]['Acute Days']=acute_days

    # -------------- #
    # Institution To #
    # -------------- # 
    institution_to = (row['Institution To']) if row['Institution To']!='**' else None
    assert institution_to!='**' and institution_to!='[]', institution_to
    data[ix]['Institution To']=institution_to


    # ---------------- #
    # Institution From #
    # ---------------- #
    institution_from = (row['Institution From']) if row['Institution From']!='**' else None
    assert institution_from!='**' and institution_from!='[]', institution_from
    data[ix]['Institution From']=institution_from


    # ---------------- #
    # Institution Type #
    # ---------------- #
    institution_type = (row['Institution Type']) if row['Institution Type']!='**' else None
    assert institution_type!='**' and institution_type!='[]', institution_type
    data[ix]['Institution Type']=institution_type


    # -------------------- #
    # Discharge Nurse Unit #
    # -------------------- #
    discharge_nurse_unit = (row['Discharge Nurse Unit']) if row['Discharge Nurse Unit']!='**' else None
    assert discharge_nurse_unit!='**' and discharge_nurse_unit!='[]', discharge_nurse_unit
    data[ix]['Discharge Nurse Unit']=discharge_nurse_unit

    # --------- #
    # CZ Status #
    # --------- #
    cz_status = row['CZ status']
    assert cz_status!='**' and cz_status!='[]'
    data[ix]['CZ Status']=cz_status

    

**Saving to JSON ...**

In [52]:
with open(config['json_file'], 'w') as f:
    json.dump(data, f, default=str)

**How to load JSON?**

In [53]:
f = open(config['json_file'])
recovered_data = json.load(f)

In [56]:
list(recovered_data)[0]

'0'

### Testing with a Random data entry

In [63]:
for key in recovered_data['0']:
    print(f"{key:20} ({str(type(recovered_data['0'][key])):10}): {recovered_data['0'][key]}")

HCN code             (<class 'int'>): 1159480
Institution Number   (<class 'int'>): 65
Admit Date           (<class 'str'>): 2014-12-24 00:00:00
Discharge Date       (<class 'str'>): 2015-01-01 00:00:00
Readmission Code     (<class 'str'>): 5 New patient to the acute care unit
Patient Age          (<class 'int'>): 67
Gender               (<class 'str'>): Male
MRDx                 (<class 'str'>): K746  Other and unspecified cirrhosis of liver
Postal Code          (<class 'str'>): B3A2K1
Diagnosis Code       (<class 'list'>): ['K746', 'K431', 'R18', 'K760', 'K650', 'I518', 'L988', 'L984', 'K439', 'J90']
Diagnosis Long Text  (<class 'list'>): ['Other and unspecified cirrhosis of liver', 'Incisional hernia with gangrene', 'Ascites', 'Fatty (change of) liver, not elsewhere classified', 'Acute peritonitis', 'Other ill-defined heart diseases', 'Other specified disorders of skin and subcutaneous tissue', 'Chronic ulcer of skin, not elsewhere classified', 'Other and unspecified ventral hernia 

In [55]:
ix = np.random.randint(0,high=df.shape[0])
for key in recovered_data[ix]:
    print(f'{key}: {recovered_data[ix][key]}')


KeyError: 306092

In [42]:
print(df.iloc[ix,:])

Institution Number:                                                        85
Coded HCN                                                          13003494.0
Admit Date:                                                        2017-07-10
Disch Date:                                                        2017-07-12
Readmission Code:                        5 New patient to the acute care unit
Patient Age:                                                               20
Gender:                                                                Female
MRDx                                          K074  Malocclusion, unspecified
Postal Code:                                                           B3L4P9
Diagnosis:                                                               K074
Diagnosis Long Text                                 Malocclusion, unspecified
Diagnosis Type                                                              M
Intervention Code                      ['1EF80LANWK', '1FE57JA',

### Checking large case weight cases

In [46]:
for data_item in [data[ix] for ix in range(len(data)) if not data[ix]['Case Weight'] is None and data[ix]['Case Weight']>1600]:
    print(data_item)
    print()
    print()

{'HCN code': 8244681, 'Institution Number': 77, 'Admit Date': datetime.datetime(1969, 1, 17, 0, 0), 'Discharge Date': datetime.datetime(2017, 7, 26, 0, 0), 'Readmission Code': '5 New patient to the acute care unit', 'Patient Age': 6, 'Gender': 'Male', 'MRDx': 'F718  Moderate mental retardation, other impairments of behaviour', 'Postal Code': 'B2Y3Z9', 'Diagnosis Code': ['F718', 'Z751', 'F639', 'G4090', 'K029', 'N390', 'R310'], 'Diagnosis Long Text': ['Moderate mental retardation, other impairments of behaviour', 'Person awaiting admission to adequate facility elsewhere', 'Habit and impulse disorder, unspecified', 'Epilepsy, unspecified, not stated as intractable', 'Dental caries, unspecified', 'Urinary tract infection, site not specified', 'Gross hematuria'], 'Diagnosis Type': ['M', 'W', '1', '1', '1', '2', '2'], 'Intervention Code': ['1FE29JARG', '1FE29JARG', '1FE94JA', '2PM70BA'], 'Px Long Text': ['Restoration,  tooth filling using amalgam (bonded or unbonded)', 'Restoration,  tooth 