In [1]:
import os 
import pandas as pd
import re
import simple_icd_10_cm as cm

In [8]:
def numerical_sort(filename):
    """Return a natural-order sort key for ``filename``.

    The name is split on runs matched by ``\\d+`` (the capturing group keeps
    those runs in the result). Pieces for which ``str.isdigit()`` is true are
    converted to ``int``; every other piece is kept as a ``str``.

    Parameters
    ----------
    filename : str
        Name or path to build a key for.

    Returns
    -------
    list
        Alternating ``str`` / ``int`` pieces, usable as a ``key=`` for sorting.
    """
    key = []
    for piece in re.split(r'(\d+)', filename):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece)
    return key

def getFileList(folder):
    """List the regular files directly inside ``folder`` in natural order.

    Paths are built with ``os.path.join`` so a trailing separator on
    ``folder`` does not produce a doubled separator. Entries that are not
    regular files (for example a Jupyter ``.ipynb_checkpoints`` directory)
    are skipped, because callers in this notebook pass every returned path
    to ``pd.read_csv``.

    Parameters
    ----------
    folder : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        File paths sorted with ``numerical_sort`` as the key.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``folder`` cannot be listed.
    """
    candidates = (os.path.join(folder, name) for name in os.listdir(folder))
    file_list = [path for path in candidates if os.path.isfile(path)]

    # The folder prefix is identical for every entry, so ordering is decided
    # by the file-name part of each path.
    return sorted(file_list, key=numerical_sort)

In [9]:
def results(file_list):
    """Load every CSV in ``file_list`` and stack them into one DataFrame.

    The first column of each file is dropped with ``iloc[:, 1:]`` before
    stacking. Each frame keeps its own row index, so index labels repeat in
    the combined result.

    Parameters
    ----------
    file_list : iterable of str
        Paths readable by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All rows from all files, sorted by ``test_r2`` in descending order.

    Raises
    ------
    KeyError
        If the combined frame has no ``test_r2`` column; this includes the
        case where ``file_list`` is empty.
    """
    # Collect first and concatenate once: calling pd.concat inside the loop
    # copies the accumulated frame on every iteration.
    frames = [pd.read_csv(file).iloc[:, 1:] for file in file_list]

    # pd.concat rejects an empty list, so fall back to an empty frame and let
    # sort_values report the missing column, as before.
    results_df = pd.concat(frames) if frames else pd.DataFrame()

    return results_df.sort_values(by='test_r2', ascending=False)

In [10]:
def getResultDF(path, r2_thresh = 0.5):
    """Load all result files under ``path`` and keep the rows above a test R2 cut-off.

    Parameters
    ----------
    path : str
        Directory passed to ``getFileList``.
    r2_thresh : float, default 0.5
        Rows are kept only when ``test_r2`` is strictly greater than this
        value.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by ``numDataPoints`` then ``test_r2`` (both descending),
        filtered by ``r2_thresh``, with a fresh 0..n-1 index.
    """
    files = getFileList(path)
    results_df = results(files)
    results_df = results_df.sort_values(by=['numDataPoints','test_r2'], ascending=[False, False])

    # Use the caller-supplied threshold; it was previously ignored in favour
    # of a hard-coded 0.5.
    keep = results_df['test_r2'] > r2_thresh
    results_df = results_df[keep].reset_index(drop=True)

    return results_df


In [11]:
# Map the first letter of an ICD-10-CM code to a major category label.
#
# Limitation of a first-letter lookup: two letters span more than one chapter
# and are labelled here with only one of them:
#   - 'D': D00-D49 belong to Neoplasms, D50-D89 to blood/immune disorders.
#   - 'H': H00-H59 are eye and adnexa, H60-H95 are ear and mastoid process.
icd_categories = {
    'A': 'Infectious and Parasitic Diseases',
    'B': 'Infectious and Parasitic Diseases',
    'C': 'Neoplasms',
    'D': 'Diseases of the Blood and Blood-Forming Organs',
    'E': 'Endocrine, Nutritional, and Metabolic Diseases',
    'F': 'Mental, Behavioral, and Neurodevelopmental Disorders',
    'G': 'Diseases of the Nervous System',
    'H': 'Diseases of the Eye and Adnexa',
    'I': 'Diseases of the Circulatory System',
    'J': 'Diseases of the Respiratory System',
    'K': 'Diseases of the Digestive System',
    'L': 'Diseases of the Skin and Subcutaneous Tissue',
    'M': 'Diseases of the Musculoskeletal System and Connective Tissue',
    'N': 'Diseases of the Genitourinary System',
    'O': 'Pregnancy, Childbirth, and the Puerperium',
    'P': 'Certain Conditions Originating in the Perinatal Period',
    'Q': 'Congenital Malformations, Deformations, and Chromosomal Abnormalities',
    'R': 'Symptoms, Signs, and Abnormal Clinical and Laboratory Findings',
    'S': 'Injury, Poisoning, and Certain Other Consequences of External Causes',
    'T': 'Injury, Poisoning, and Certain Other Consequences of External Causes',
    # U00-U85: special-purpose codes (previously fell through to the
    # caller's 'Unknown Category' default).
    'U': 'Codes for Special Purposes',
    # The external-causes chapter covers V00-Y99, i.e. V, W, X and Y.
    'V': 'External Causes of Morbidity',
    'W': 'External Causes of Morbidity',
    'X': 'External Causes of Morbidity',
    'Y': 'External Causes of Morbidity',
    'Z': 'Factors Influencing Health Status and Contact with Health Services'
}



In [12]:
# Load the filtered pandas-run results, then annotate each ICD code with a
# category (looked up by its first character) and a description from
# simple_icd_10_cm.
pandas_results_df = getResultDF('../Results_nthresh_3_pandas')
pandas_results_df['code_category'] = [
    icd_categories.get(code[0], 'Unknown Category')
    for code in pandas_results_df['ICD']
]
pandas_results_df['code_label'] = [
    # Only ask for a description when the library recognises the code.
    cm.get_description(code) if cm.is_valid_item(code) else "Unknown Code"
    for code in pandas_results_df['ICD']
]
pandas_results_df

Unnamed: 0,ICD,train_r2,test_r2,rmse,numDataPoints,code_category,code_label
0,A419,0.78544,0.629419,0.210329,66982,Infectious and Parasitic Diseases,"Sepsis, unspecified organism"
1,J189,0.820799,0.62475,0.219521,65312,Diseases of the Respiratory System,"Pneumonia, unspecified organism"
2,I5023,0.761252,0.589458,0.251908,46599,Diseases of the Circulatory System,Acute on chronic systolic (congestive) heart f...
3,I2510,0.828652,0.668282,0.227571,45743,Diseases of the Circulatory System,Atherosclerotic heart disease of native corona...
4,O80,0.761145,0.543828,0.202642,44662,"Pregnancy, Childbirth, and the Puerperium",Encounter for full-term uncomplicated delivery
5,I5033,0.759822,0.613229,0.257332,42329,Diseases of the Circulatory System,Acute on chronic diastolic (congestive) heart ...
6,I5043,0.826028,0.651938,0.252471,34210,Diseases of the Circulatory System,Acute on chronic combined systolic (congestive...
7,I5021,0.768544,0.620482,0.260848,30385,Diseases of the Circulatory System,Acute systolic (congestive) heart failure
8,I5031,0.812697,0.642278,0.252676,30088,Diseases of the Circulatory System,Acute diastolic (congestive) heart failure
9,I5030,0.71156,0.519001,0.28929,29620,Diseases of the Circulatory System,Unspecified diastolic (congestive) heart failure


In [13]:
# Write the annotated pandas-run table to CSV, omitting the row index.
pandas_results_df.to_csv(
    '../assets/results_pandas.csv',
    index=False,
)

In [14]:
# Load the filtered polars-run results, then annotate each ICD code with a
# category (looked up by its first character) and a description from
# simple_icd_10_cm.
polars_results_df = getResultDF('../Results_nthresh_3_polars')
polars_results_df['code_category'] = [
    icd_categories.get(code[0], 'Unknown Category')
    for code in polars_results_df['ICD']
]
polars_results_df['code_label'] = [
    # Only ask for a description when the library recognises the code.
    cm.get_description(code) if cm.is_valid_item(code) else "Unknown Code"
    for code in polars_results_df['ICD']
]
polars_results_df

Unnamed: 0,ICD,train_r2,test_r2,rmse,numDataPoints,code_category,code_label
0,A419,0.840612,0.644872,0.20523,66982,Infectious and Parasitic Diseases,"Sepsis, unspecified organism"
1,J189,0.777363,0.601642,0.222802,65312,Diseases of the Respiratory System,"Pneumonia, unspecified organism"
2,I5023,0.742687,0.57729,0.257851,46599,Diseases of the Circulatory System,Acute on chronic systolic (congestive) heart f...
3,I2510,0.8297,0.674955,0.225791,45743,Diseases of the Circulatory System,Atherosclerotic heart disease of native corona...
4,O80,0.71604,0.534437,0.202684,44662,"Pregnancy, Childbirth, and the Puerperium",Encounter for full-term uncomplicated delivery
5,I5033,0.792477,0.627586,0.251831,42329,Diseases of the Circulatory System,Acute on chronic diastolic (congestive) heart ...
6,I5043,0.798715,0.638263,0.258561,34210,Diseases of the Circulatory System,Acute on chronic combined systolic (congestive...
7,I5021,0.796683,0.613465,0.263231,30385,Diseases of the Circulatory System,Acute systolic (congestive) heart failure
8,I5031,0.823676,0.63855,0.258456,30088,Diseases of the Circulatory System,Acute diastolic (congestive) heart failure
9,I5030,0.694553,0.512901,0.294467,29620,Diseases of the Circulatory System,Unspecified diastolic (congestive) heart failure


In [15]:
# Write the annotated polars-run table to CSV, omitting the row index.
polars_results_df.to_csv(
    '../assets/results_polars.csv',
    index=False,
)

---

In [28]:
# Load every result file for the nthresh=3 pandas run and show it ordered by
# sample count, then test R2 (both descending). The sorted view is only
# displayed; t3_pandas_df itself keeps the order returned by results().
t3_pandas = getFileList('../pandas/Results_nthresh_3/')
t3_pandas_df = results(t3_pandas)
t3_pandas_df.sort_values(
    by=['numDataPoints', 'test_r2'],
    ascending=[False, False],
)

Unnamed: 0,ICD,train_r2,test_r2,rmse,numDataPoints
0,I6359,0.836895,0.757321,0.136177,76015
0,Z3801,0.721827,0.447302,0.191407,70445
0,J189,0.73925,0.564649,0.244716,56823
0,I214,0.651465,0.415438,0.228688,44343
0,J9600,0.878951,0.706829,0.159035,43480
1,N179,0.679992,0.382933,0.225465,43185
3,N390,0.682863,0.436578,0.22871,41986
1,O80,0.757403,0.541435,0.210622,36843
0,I4891,0.668409,0.448522,0.226991,30702
1,Z3800,0.794827,0.374077,0.184225,30244


In [34]:
# Load the Results_nthresh_3_polars_10000 files and keep them ordered by
# sample count, then test R2 (both descending).
t3_dense = getFileList('../pandas/Results_nthresh_3_polars_10000/')
t3_dense_df = results(t3_dense).sort_values(
    by=['numDataPoints', 'test_r2'],
    ascending=[False, False],
)
# Display only the rows with test R2 above 0.5; t3_dense_df stays unfiltered.
t3_dense_df.loc[t3_dense_df['test_r2'] > 0.5]

Unnamed: 0,ICD,train_r2,test_r2,rmse,numDataPoints
1,A419,0.817408,0.637543,0.212193,66982
1,I2510,0.821758,0.685638,0.220995,45743
1,O80,0.706916,0.53306,0.206021,44662
1,I5021,0.847237,0.63699,0.252858,30385
0,I5031,0.831802,0.621393,0.260167,30088
1,I5020,0.800174,0.596512,0.264783,28440
0,I5040,0.861982,0.633038,0.245427,28102
0,E860,0.849868,0.585271,0.219916,24355


In [31]:
# Write the unfiltered dense-run table to CSV, omitting the row index.
# NOTE(review): this is the same relative path that the polars_results_df
# export writes to; confirm one export is not meant to overwrite the other.
t3_dense_df.to_csv(
    '../assets/results_polars.csv',
    index=False,
)

In [19]:
# Show codes with more than 10000 data points and test R2 above 0.65,
# ordered by sample count, then test R2 (both descending).
t3_dense_df.loc[
    lambda frame: (frame['numDataPoints'] > 10000) & (frame['test_r2'] > 0.65)
].sort_values(
    by=['numDataPoints', 'test_r2'],
    ascending=[False, False],
)

Unnamed: 0,ICD,train_r2,test_r2,rmse,numDataPoints
1,I6359,0.82997,0.734719,0.142705,76015
2,J9600,0.867169,0.68303,0.164396,43480
2,I5023,0.925535,0.701115,0.16031,25995
3,I2609,0.955789,0.775159,0.113045,19893
1,I120,0.972863,0.819341,0.113076,19571
2,I5033,0.952846,0.707591,0.151422,19228
3,J9620,0.969057,0.812585,0.124266,17354
2,M179,0.979074,0.809314,0.130626,14478
1,I10,0.99194,0.924063,0.069936,13632
2,R001,0.987341,0.790339,0.111663,12188


In [15]:
# Show the first 20 rows when ordered by sample count, then test R2
# (both descending).
t3_dense_df.sort_values(
    by=['numDataPoints', 'test_r2'],
    ascending=[False, False],
).head(20)

Unnamed: 0,ICD,train_r2,test_r2,rmse,numDataPoints
1,I6359,0.82997,0.734719,0.142705,76015
0,Z3801,0.716187,0.424072,0.193486,70445
0,J189,0.788243,0.571939,0.239422,56823
0,I214,0.645151,0.406624,0.234941,44343
2,J9600,0.867169,0.68303,0.164396,43480
4,N179,0.631579,0.369633,0.227456,43185
6,N390,0.72561,0.398141,0.231358,41986
2,O80,0.780372,0.520245,0.217057,36843
0,I4891,0.682917,0.407541,0.235097,30702
3,Z3800,0.741891,0.351621,0.188528,30244
