In [None]:
# == Motivations ==
# What would you fight for?
# Berceau story and PSC
# https://raredisease.net/diagnosis
# https://www.rarediseasesinternational.org/living-with-a-rare-disease/#:~:text=Over%20300%20million%20persons%20live%20with%20a%20rare%20disease%20worldwide.
# https://www.genomicseducation.hee.nhs.uk/genotes/knowledge-hub/the-diagnostic-odyssey-in-rare-disease/#:~:text=The%20'diagnostic%20odyssey'%20is%20a,%2C%20on%20average%2C%205.6%20years.


==========

Near the bottom is where the more cleaned up code is. If you search for the definition of the get_model_data function, everything after that is really what ran the models

==========


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import polars as pl


In [2]:
import time

def time_function(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        display(f"Function name: '{func.__name__}' started at {start_time}")
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed_time = end_time - start_time
        display(f"Function name: '{func.__name__}' took {elapsed_time:.4f} seconds to complete.")
        return result
    return wrapper

In [8]:
chunk_size = 400_000
charts_path = './data/CHARTEVENTS.csv.gz'
chart_columns = ['HADM_ID', 'SUBJECT_ID', 'ITEMID', 'VALUENUM']

In [9]:
tmp = pd.read_csv(charts_path, nrows=10)
tmp

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ICUSTAY_ID,ITEMID,CHARTTIME,STORETIME,CGID,VALUE,VALUENUM,VALUEUOM,WARNING,ERROR,RESULTSTATUS,STOPPED
0,788,36,165660,241249,223834,2134-05-12 12:00:00,2134-05-12 13:56:00,17525,15.0,15.0,L/min,0,0,,
1,789,36,165660,241249,223835,2134-05-12 12:00:00,2134-05-12 13:56:00,17525,100.0,100.0,,0,0,,
2,790,36,165660,241249,224328,2134-05-12 12:00:00,2134-05-12 12:18:00,20823,0.37,0.37,,0,0,,
3,791,36,165660,241249,224329,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,6.0,6.0,min,0,0,,
4,792,36,165660,241249,224330,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,2.5,2.5,,0,0,,
5,793,36,165660,241249,224331,2134-05-12 12:00:00,2134-05-12 12:19:00,20823,0.0,0.0,ml/hr,0,0,,
6,794,36,165660,241249,224332,2134-05-12 12:00:00,2134-05-12 14:44:00,17525,3.0,3.0,,0,0,,
7,795,36,165660,241249,224663,2134-05-12 12:00:00,2134-05-12 14:44:00,17525,8.0,8.0,,0,0,,
8,796,36,165660,241249,224665,2134-05-12 12:00:00,2134-05-12 14:44:00,17525,1.11,1.11,,0,0,,
9,797,36,165660,241249,220224,2134-05-12 12:35:00,2134-05-12 12:38:00,20889,58.0,58.0,mmHg,1,0,,


In [10]:
@time_function
def read_large_file(file_path, columns, val_col):
    # empty df to store the results
    df = pl.DataFrame()

    # read file in chunks
    for chunk in pl.read_csv(
        file_path,
        batch_size=chunk_size,
        schema_overrides={val_col: pl.Utf8}, 
        ignore_errors=True  # ignore parsing errors during reading
    ):
        # check that 'chunk' is a df
        if isinstance(chunk, pl.Series):
            chunk = chunk.to_frame()

        # process each chunk
        chunk = chunk.select(columns)

        # sub non-numeric values to NaN
        chunk = chunk.with_columns(
            pl.col(val_col).apply(
                lambda x: None if not x.replace(".", "", 1).isdigit() else float(x)
            ).alias(val_col)
        )

        # append the processed chunk to main df
        df = df.vstack(chunk)

    return df

In [11]:
# chart_events = read_large_file(charts_path, chart_columns, 'VALUENUM')

In [12]:
# Display the DataFrame
# display(chart_events.head())

In [13]:
diagnosis_data = pd.read_csv("./data/DIAGNOSES_ICD.csv.gz")
diagnosis_data = diagnosis_data.drop(columns=["ROW_ID"])  
display(diagnosis_data.head())

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,109,172335,1.0,40301
1,109,172335,2.0,486
2,109,172335,3.0,58281
3,109,172335,4.0,5855
4,109,172335,5.0,4254


In [14]:
diagnosis_labels = pd.read_csv("./data/D_ICD_DIAGNOSES.csv.gz")
diagnosis_labels = diagnosis_labels.drop(columns=["ROW_ID"])  

display(diagnosis_labels.head())

Unnamed: 0,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [15]:
diagnoses = pd.merge(diagnosis_data, diagnosis_labels, on="ICD9_CODE", how="left")
display(diagnoses.head())

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,109,172335,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
1,109,172335,2.0,486,"Pneumonia, organism NOS","Pneumonia, organism unspecified"
2,109,172335,3.0,58281,Chr nephritis in oth dis,Chronic glomerulonephritis in diseases classif...
3,109,172335,4.0,5855,Chron kidney dis stage V,"Chronic kidney disease, Stage V"
4,109,172335,5.0,4254,Prim cardiomyopathy NEC,Other primary cardiomyopathies


Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
34709,4957,191463,7.0,E9408,Adv eff cns stimulnt NEC,Other specified central nervous system stimula...


In [37]:
display(diagnoses.shape)  
display(diagnoses.info())  
display(diagnoses.describe())  

(651047, 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651047 entries, 0 to 651046
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   SUBJECT_ID   651047 non-null  int64  
 1   HADM_ID      651047 non-null  int64  
 2   SEQ_NUM      651000 non-null  float64
 3   ICD9_CODE    651000 non-null  object 
 4   SHORT_TITLE  634709 non-null  object 
 5   LONG_TITLE   634709 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 29.8+ MB


None

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM
count,651047.0,651047.0,651000.0
mean,38971.159758,150017.744819,7.913836
std,29372.198841,28878.068648,6.072633
min,2.0,100001.0,1.0
25%,14562.5,125028.0,3.0
50%,28671.0,150140.0,6.0
75%,63715.0,174978.0,11.0
max,99999.0,199999.0,39.0


In [38]:
print(diagnoses.isnull().sum())  # get counts of missing values per column

SUBJECT_ID         0
HADM_ID            0
SEQ_NUM           47
ICD9_CODE         47
SHORT_TITLE    16338
LONG_TITLE     16338
dtype: int64


In [39]:
top_diagnoses = icd9_counts_sorted.sort_values(by='COUNT', ascending=False).head(10)
display(top_diagnoses)

Unnamed: 0,ICD9_CODE,COUNT
6983,4019,20703
6982,4280,13111
6981,42731,12891
6980,41401,12429
6979,5849,9119
6978,25000,9058
6977,2724,8690
6976,51881,7497
6975,5990,6555
6974,53081,6326


In [16]:
# counts of each ICD9_CODE
icd9_counts = diagnoses['ICD9_CODE'].value_counts().reset_index()

# rename to make it more easy to read
icd9_counts.columns = ['ICD9_CODE', 'COUNT']

# sort df by counts in ascending order
icd9_counts_sorted = icd9_counts.sort_values(by='COUNT', ascending=True)
icd9_counts_sorted.reset_index(drop=True, inplace=True)

print(icd9_counts_sorted)

     ICD9_CODE  COUNT
0        20930      1
1        E9250      1
2        37886      1
3        32724      1
4        37214      1
...        ...    ...
6979      5849   9119
6980     41401  12429
6981     42731  12891
6982      4280  13111
6983      4019  20703

[6984 rows x 2 columns]


In [17]:
single_diagnosis = icd9_counts_sorted[icd9_counts_sorted["COUNT"] == 1]
single_diagnosis

Unnamed: 0,ICD9_CODE,COUNT
0,20930,1
1,E9250,1
2,37886,1
3,32724,1
4,37214,1
...,...,...
1498,E9408,1
1499,49301,1
1500,55202,1
1501,1118,1


In [18]:
#check to make sure diagnoses left merge worked properly
display(len(diagnosis_data))
display(len(diagnoses))

651047

651047

In [24]:
f"{np.round((len(single_diagnosis)/len(diagnoses))*100,2)}%"

'0.23%'

In [78]:
rare_codes = single_diagnosis["ICD9_CODE"].tolist()
rare_codes

['20930',
 'E9250',
 '37886',
 '32724',
 '37214',
 'E0011',
 'E0299',
 '11519',
 '73345',
 '62570',
 'V537',
 '72706',
 '36544',
 '81404',
 '9594',
 '07998',
 '55841',
 '85184',
 '66541',
 '66131',
 'E9051',
 '9065',
 '6203',
 '01505',
 '1703',
 '7358',
 '5260',
 '94524',
 'V789',
 '20480',
 'E9295',
 '5646',
 '78499',
 '71296',
 '80196',
 '9102',
 '9114',
 '71891',
 '27709',
 'E857',
 '9610',
 '72882',
 '27702',
 '53171',
 '1765',
 '82534',
 '9617',
 '80046',
 '37990',
 'V8489',
 'V8709',
 'V5842',
 '9309',
 '2882',
 '80080',
 '37520',
 '55129',
 '8053',
 '72665',
 '71195',
 '5931',
 '71985',
 '53401',
 'E9419',
 'E0064',
 'E9001',
 '7723',
 '4878',
 '33701',
 '45181',
 '20148',
 'E8669',
 '5161',
 '9619',
 '87333',
 '64101',
 '6232',
 '68101',
 '7791',
 '69279',
 '0919',
 '71918',
 '78932',
 'E8318',
 '71237',
 '0540',
 'V1381',
 'E8839',
 '20005',
 '2692',
 'E8041',
 'E0062',
 '37182',
 'V9103',
 '65701',
 '20903',
 '37733',
 '94100',
 '95219',
 '78053',
 '2458',
 '71104',
 '33709',

In [79]:
diagnoses["rare"] = diagnoses["ICD9_CODE"].isin(rare_codes)

In [80]:
diagnoses

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,rare
0,109,172335,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant...",False
1,109,172335,2.0,486,"Pneumonia, organism NOS","Pneumonia, organism unspecified",False
2,109,172335,3.0,58281,Chr nephritis in oth dis,Chronic glomerulonephritis in diseases classif...,False
3,109,172335,4.0,5855,Chron kidney dis stage V,"Chronic kidney disease, Stage V",False
4,109,172335,5.0,4254,Prim cardiomyopathy NEC,Other primary cardiomyopathies,False
...,...,...,...,...,...,...,...
651042,97503,188195,2.0,20280,Oth lymp unsp xtrndl org,"Other malignant lymphomas, unspecified site, e...",False
651043,97503,188195,3.0,V5869,Long-term use meds NEC,Long-term (current) use of other medications,False
651044,97503,188195,4.0,V1279,Prsnl hst ot spf dgst ds,Personal history of other diseases of digestiv...,False
651045,97503,188195,5.0,5275,Sialolithiasis,Sialolithiasis,False


In [26]:
rare_diseases = diagnoses[diagnoses["ICD9_CODE"].isin(rare_codes)]

In [27]:
rare_diseases

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,109,172335,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
1,109,172335,2.0,486,"Pneumonia, organism NOS","Pneumonia, organism unspecified"
2,109,172335,3.0,58281,Chr nephritis in oth dis,Chronic glomerulonephritis in diseases classif...
3,109,172335,4.0,5855,Chron kidney dis stage V,"Chronic kidney disease, Stage V"
4,109,172335,5.0,4254,Prim cardiomyopathy NEC,Other primary cardiomyopathies
...,...,...,...,...,...,...
651042,97503,188195,2.0,20280,Oth lymp unsp xtrndl org,"Other malignant lymphomas, unspecified site, e..."
651043,97503,188195,3.0,V5869,Long-term use meds NEC,Long-term (current) use of other medications
651044,97503,188195,4.0,V1279,Prsnl hst ot spf dgst ds,Personal history of other diseases of digestiv...
651045,97503,188195,5.0,5275,Sialolithiasis,Sialolithiasis


In [28]:
len(rare_diseases)

651000

In [36]:
rare_diseases[rare_diseases["ICD9_CODE"] == "40301"]

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,109,172335,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
14,109,173633,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
1130,109,131345,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
1150,109,131376,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
1170,109,135923,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
...,...,...,...,...,...,...
563380,81543,172553,5.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
567559,79804,146876,2.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
585297,83418,118975,2.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."
641749,98015,129709,3.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant..."


In [29]:
lab_events = pd.read_csv('./data/LABEVENTS.csv.gz')
lab_events = lab_events.drop(columns=["ROW_ID"])  
# lab_events.columns = lab_events.columns.str.lower()
lab_labels = pd.read_csv('./data/D_LABITEMS.csv.gz')
lab_labels = lab_labels.drop(columns=["ROW_ID"])  
# lab_labels.columns = lab_events.columns.str.lower()
display(lab_events.head())
display(lab_labels.head())

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,
1,3,,50800,2101-10-12 18:17:00,ART,,,
2,3,,50802,2101-10-12 18:17:00,-1,-1.0,mEq/L,
3,3,,50804,2101-10-12 18:17:00,22,22.0,mEq/L,
4,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal


Unnamed: 0,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6
4,51350,Immunophenotyping,Cerebrospinal Fluid (CSF),Hematology,


In [43]:
print(lab_events.HADM_ID.isnull().sum()/len(lab_events))  # Count missing values per column

0.2013717930836282


In [30]:
labs = pd.merge(lab_events, lab_labels, on = "ITEMID", how="left")

In [31]:
labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL,FLUID,CATEGORY,LOINC_CODE
0,3,,50820,2101-10-12 16:07:00,7.39,7.39,units,,pH,Blood,Blood Gas,11558-4
1,3,,50800,2101-10-12 18:17:00,ART,,,,SPECIMEN TYPE,BLOOD,BLOOD GAS,
2,3,,50802,2101-10-12 18:17:00,-1,-1.0,mEq/L,,Base Excess,Blood,Blood Gas,11555-0
3,3,,50804,2101-10-12 18:17:00,22,22.0,mEq/L,,Calculated Total CO2,Blood,Blood Gas,34728-6
4,3,,50808,2101-10-12 18:17:00,0.93,0.93,mmol/L,abnormal,Free Calcium,Blood,Blood Gas,1994-3


In [32]:

lab_test_counts = labs['ITEMID'].value_counts().reset_index()

lab_test_counts.columns = ['ITEMID', 'COUNT']

lab_test_counts_sorted = lab_test_counts.sort_values(by='COUNT', ascending=False)

lab_test_counts_sorted.reset_index(drop=True, inplace=True)

print(lab_test_counts_sorted)

     ITEMID   COUNT
0     51221  881764
1     50971  845737
2     50983  808401
3     50912  797389
4     50902  795480
..      ...     ...
721   51537       1
722   51126       1
723   51378       1
724   51531       1
725   51483       1

[726 rows x 2 columns]


In [33]:
common_lab_ids = lab_test_counts_sorted["ITEMID"].head(10).tolist()
common_lab_ids

[51221, 50971, 50983, 50912, 50902, 51006, 50882, 51265, 50868, 51301]

In [34]:
common_labs = labs[labs["ITEMID"].isin(common_lab_ids)]

In [35]:
common_labs.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL,FLUID,CATEGORY,LOINC_CODE
11,3,,50868,2101-10-13 03:00:00,13.0,13.0,mEq/L,,Anion Gap,Blood,Chemistry,1863-0
12,3,,50882,2101-10-13 03:00:00,23.0,23.0,mEq/L,,Bicarbonate,Blood,Chemistry,1963-8
14,3,,50902,2101-10-13 03:00:00,109.0,109.0,mEq/L,,Chloride,Blood,Chemistry,2075-0
15,3,,50912,2101-10-13 03:00:00,1.7,1.7,mg/dL,abnormal,Creatinine,Blood,Chemistry,2160-0
19,3,,50971,2101-10-13 03:00:00,4.3,4.3,mEq/L,,Potassium,Blood,Chemistry,2823-3


In [44]:
print(common_labs.HADM_ID.isnull().sum()/len(common_labs))  # count missing values per column

0.21553552303217446


In [45]:
len(common_labs)

8002653

In [50]:
admit_common_labs = common_labs[["SUBJECT_ID","HADM_ID","ITEMID","VALUE","VALUENUM"]]

In [51]:
len(admit_common_labs)

8002653

In [52]:
admit_common_labs.isnull().sum()

SUBJECT_ID          0
HADM_ID       1724856
ITEMID              0
VALUE              22
VALUENUM         1653
dtype: int64

In [57]:
admit_common_labs[admit_common_labs["VALUENUM"].isnull() & admit_common_labs["VALUE"].notnull()]

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,VALUE,VALUENUM
2145,3,,50971,GREATER THAN 10,
19822,52,190797.0,51221,ERROR,
19834,52,190797.0,51265,ERROR,
19840,52,190797.0,51301,ERROR,
25149,61,176332.0,51301,<0.1,
...,...,...,...,...,...
27422974,91090,,51301,ERROR,
27623343,96686,175190.0,50882,LESS THAN 5,
27718970,96688,,51265,ERROR,
27817223,99100,199665.0,50971,HEMOLYSIS FALSELY ELEVATES K,


In [59]:
admit_common_labs = common_labs[["SUBJECT_ID","HADM_ID","ITEMID","VALUENUM"]].dropna()

In [69]:
lab_features = admit_common_labs.pivot_table(
    index=['SUBJECT_ID','HADM_ID'], 
    columns='ITEMID', 
    values='VALUENUM', 
    # aggfunc=np.mean
).reset_index()

In [72]:
lab_features

ITEMID,SUBJECT_ID,HADM_ID,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301
0,2,163353.0,,,,,,,,24.800000,153.500000,11.050000
1,3,145834.0,15.250000,18.937500,107.235294,1.826667,4.105263,137.555556,28.133333,30.226667,210.866667,12.533333
2,4,185777.0,17.250000,23.000000,101.000000,0.450000,3.537500,137.750000,16.125000,32.300000,286.428571,9.442857
3,5,178980.0,,,,,,,,43.000000,309.000000,13.900000
4,6,107064.0,16.235294,17.941176,105.117647,3.741176,4.911765,134.411765,53.941176,31.168750,247.947368,13.589474
...,...,...,...,...,...,...,...,...,...,...,...,...
57214,99985,176670.0,12.166667,25.000000,101.722222,0.905882,3.905263,134.833333,17.333333,27.833333,558.166667,18.727778
57215,99991,151118.0,12.500000,24.833333,109.500000,1.208333,3.773333,143.000000,20.333333,30.360000,254.500000,7.150000
57216,99992,197084.0,12.000000,24.500000,105.250000,0.975000,4.550000,137.250000,47.500000,26.287500,126.800000,15.480000
57217,99995,137810.0,9.500000,32.750000,99.000000,0.425000,3.775000,137.500000,11.500000,27.766667,159.666667,7.300000


In [None]:
lab_features.dropna()

In [85]:
display(lab_features.columns)
display(diagnoses.columns)
display(diagnoses[diagnoses["HADM_ID"] == 172335])

Index(['SUBJECT_ID',    'HADM_ID',        50868,        50882,        50902,
              50912,        50971,        50983,        51006,        51221,
              51265,        51301],
      dtype='object', name='ITEMID')

Index(['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE', 'SHORT_TITLE',
       'LONG_TITLE', 'rare'],
      dtype='object')

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,rare
0,109,172335,1.0,40301,Mal hyp kid w cr kid V,"Hypertensive chronic kidney disease, malignant...",False
1,109,172335,2.0,486,"Pneumonia, organism NOS","Pneumonia, organism unspecified",False
2,109,172335,3.0,58281,Chr nephritis in oth dis,Chronic glomerulonephritis in diseases classif...,False
3,109,172335,4.0,5855,Chron kidney dis stage V,"Chronic kidney disease, Stage V",False
4,109,172335,5.0,4254,Prim cardiomyopathy NEC,Other primary cardiomyopathies,False
5,109,172335,6.0,2762,Acidosis,Acidosis,False
6,109,172335,7.0,7100,Syst lupus erythematosus,Systemic lupus erythematosus,False
7,109,172335,8.0,2767,Hyperpotassemia,Hyperpotassemia,False
8,109,172335,9.0,7243,Sciatica,Sciatica,False
9,109,172335,10.0,45829,Iatrogenc hypotnsion NEC,Other iatrogenic hypotension,False


In [86]:
grouped_diagnoses = diagnoses.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False).agg({
    'SEQ_NUM': 'first',  
    'ICD9_CODE': list,   # combine all ICD9_CODE into a list
    'SHORT_TITLE': list, # combine SHORT_TITLEs into a list
    'LONG_TITLE': list,  # combine LONG_TITLEs into a list
    'rare': 'max'        # Keep True if any row has True; False otherwise
})

In [87]:
grouped_diagnoses

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,rare
0,2,163353,1.0,"[V3001, V053, V290]","[Single lb in-hosp w cs, Need prphyl vc vrl he...","[Single liveborn, born in hospital, delivered ...",False
1,3,145834,1.0,"[0389, 78559, 5849, 4275, 41071, 4280, 6826, 4...","[Septicemia NOS, Shock w/o trauma NEC, Acute k...","[Unspecified septicemia, Other shock without m...",False
2,4,185777,1.0,"[042, 1363, 7994, 2763, 7907, 5715, 04111, V09...","[Human immuno virus dis, Pneumocystosis, Cache...","[Human immunodeficiency virus [HIV] disease, P...",False
3,5,178980,1.0,"[V3000, V053, V290]","[Single lb in-hosp w/o cs, Need prphyl vc vrl ...","[Single liveborn, born in hospital, delivered ...",False
4,6,107064,1.0,"[40391, 4440, 9972, 2766, 2767, 2859, 2753, V1...","[Hyp kid NOS w cr kid V, nan, Surg comp-peri v...","[Hypertensive chronic kidney disease, unspecif...",False
...,...,...,...,...,...,...,...
58971,99985,176670,1.0,"[0389, 51881, 48241, 4870, 78552, V4281, 99592...","[Septicemia NOS, Acute respiratry failure, Met...","[Unspecified septicemia, Acute respiratory fai...",False
58972,99991,151118,1.0,"[56211, 0389, 5570, 5849, 99592, 56081, 78959,...","[Dvrtcli colon w/o hmrhg, Septicemia NOS, Ac v...",[Diverticulitis of colon (without mention of h...,True
58973,99992,197084,1.0,"[9999, 56881, 5772, 2851, 5849, 5799, 72992, 5...","[Complic med care NEC/NOS, Hemoperitoneum, Pan...",[Other and unspecified complications of medica...,False
58974,99995,137810,1.0,"[4414, 42833, 99812, 2851, 4241, 25000, 99811,...","[Abdom aortic aneurysm, Ac on chr diast hrt fa...",[Abdominal aneurysm without mention of rupture...,False


In [88]:
grouped_diagnoses[grouped_diagnoses["HADM_ID"] == 172335]

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,rare
140,109,172335,1.0,"[40301, 486, 58281, 5855, 4254, 2762, 7100, 27...","[Mal hyp kid w cr kid V, Pneumonia, organism N...","[Hypertensive chronic kidney disease, malignan...",False


In [89]:
diagnoses_final = grouped_diagnoses[["SUBJECT_ID", "HADM_ID", "rare"]]

In [130]:
patients = pd.read_csv("./data/PATIENTS.csv.gz")
patients = patients[["SUBJECT_ID", "GENDER", "DOB"]]

In [125]:
common_labs_copy = common_labs.copy()

In [135]:
patients_labs = pd.merge(common_labs_copy, patients, on="SUBJECT_ID", how="left")

In [179]:
patients_labs = pd.merge(common_labs_copy, patients, on="SUBJECT_ID", how="left")
patients_labs['CHARTTIME'] = pd.to_datetime(patients_labs['CHARTTIME']).dt.date
patients_labs['DOB'] = pd.to_datetime(patients_labs['DOB']).dt.date
patients_labs["AGE"] = patients_labs.apply(lambda e: (e['CHARTTIME'] - e['DOB']).days/365, axis=1)


# apply bounds: Set values under 0 to 0 and values over 100 to 100
patients_labs["AGE"] = patients_labs["AGE"].clip(lower=0, upper=100)
# map 'M' to 0 and 'F' to 1
patients_labs['GENDER'] = patients_labs['GENDER'].map({'M': 0, 'F': 1})

In [180]:
patients_labs["AGE"].describe()

count    8.002653e+06
mean     6.324067e+01
std      1.873476e+01
min      0.000000e+00
25%      5.208767e+01
50%      6.472329e+01
75%      7.673151e+01
max      1.000000e+02
Name: AGE, dtype: float64

In [181]:
patients_labs

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL,FLUID,CATEGORY,LOINC_CODE,GENDER,DOB,AGE
0,3,,50868,2101-10-13,13,13.0,mEq/L,,Anion Gap,Blood,Chemistry,1863-0,0,2025-04-11,76.556164
1,3,,50882,2101-10-13,23,23.0,mEq/L,,Bicarbonate,Blood,Chemistry,1963-8,0,2025-04-11,76.556164
2,3,,50902,2101-10-13,109,109.0,mEq/L,,Chloride,Blood,Chemistry,2075-0,0,2025-04-11,76.556164
3,3,,50912,2101-10-13,1.7,1.7,mg/dL,abnormal,Creatinine,Blood,Chemistry,2160-0,0,2025-04-11,76.556164
4,3,,50971,2101-10-13,4.3,4.3,mEq/L,,Potassium,Blood,Chemistry,2823-3,0,2025-04-11,76.556164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8002648,96442,120151.0,51221,2115-06-29,29.0,29.0,%,abnormal,Hematocrit,Blood,Hematology,4544-3,1,2070-05-14,45.153425
8002649,96443,103219.0,50868,2109-12-30,15,15.0,mEq/L,,Anion Gap,Blood,Chemistry,1863-0,1,2045-02-20,64.898630
8002650,96443,103219.0,50882,2109-12-30,26,26.0,mEq/L,,Bicarbonate,Blood,Chemistry,1963-8,1,2045-02-20,64.898630
8002651,96443,103219.0,50902,2109-12-30,97,97.0,mEq/L,,Chloride,Blood,Chemistry,2075-0,1,2045-02-20,64.898630


In [182]:
admit_patients_labs = patients_labs[["SUBJECT_ID","HADM_ID","ITEMID","VALUENUM", "AGE", "GENDER"]].dropna()

In [183]:
patient_demo = admit_patients_labs[["SUBJECT_ID","HADM_ID", "AGE", "GENDER"]].drop_duplicates(subset=["SUBJECT_ID", "HADM_ID"])

In [184]:
patient_demo

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER
54,3,145834.0,76.575342,0
111,2,163353.0,0.000000,0
353,4,185777.0,47.873973,1
663,5,178980.0,0.000000,0
857,6,107064.0,66.000000,1
...,...,...,...,...
7997689,98939,115549.0,22.597260,1
7998423,98883,144680.0,23.446575,0
8000584,98759,175386.0,69.134247,1
8000941,98982,167146.0,79.915068,1


In [185]:
admit = admit_patients_labs.drop(columns=["AGE", "GENDER"])

In [186]:
admit

Unnamed: 0,SUBJECT_ID,HADM_ID,ITEMID,VALUENUM
54,3,145834.0,50868,17.0
55,3,145834.0,50882,25.0
56,3,145834.0,50902,99.0
57,3,145834.0,50912,3.2
58,3,145834.0,50971,5.4
...,...,...,...,...
8002648,96442,120151.0,51221,29.0
8002649,96443,103219.0,50868,15.0
8002650,96443,103219.0,50882,26.0
8002651,96443,103219.0,50902,97.0


In [187]:
lab_patients_features = admit.pivot_table(
    index=['SUBJECT_ID','HADM_ID'], 
    columns='ITEMID', 
    values='VALUENUM', 
    # aggfunc=np.mean
).reset_index()

In [188]:
lab_patients_features

ITEMID,SUBJECT_ID,HADM_ID,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301
0,2,163353.0,,,,,,,,24.800000,153.500000,11.050000
1,3,145834.0,15.250000,18.937500,107.235294,1.826667,4.105263,137.555556,28.133333,30.226667,210.866667,12.533333
2,4,185777.0,17.250000,23.000000,101.000000,0.450000,3.537500,137.750000,16.125000,32.300000,286.428571,9.442857
3,5,178980.0,,,,,,,,43.000000,309.000000,13.900000
4,6,107064.0,16.235294,17.941176,105.117647,3.741176,4.911765,134.411765,53.941176,31.168750,247.947368,13.589474
...,...,...,...,...,...,...,...,...,...,...,...,...
57214,99985,176670.0,12.166667,25.000000,101.722222,0.905882,3.905263,134.833333,17.333333,27.833333,558.166667,18.727778
57215,99991,151118.0,12.500000,24.833333,109.500000,1.208333,3.773333,143.000000,20.333333,30.360000,254.500000,7.150000
57216,99992,197084.0,12.000000,24.500000,105.250000,0.975000,4.550000,137.250000,47.500000,26.287500,126.800000,15.480000
57217,99995,137810.0,9.500000,32.750000,99.000000,0.425000,3.775000,137.500000,11.500000,27.766667,159.666667,7.300000


In [189]:
lab_patients_features = pd.merge(lab_patients_features, patient_demo, on=["SUBJECT_ID", "HADM_ID"], how="left")

In [190]:
lab_patients_features


Unnamed: 0,SUBJECT_ID,HADM_ID,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301,AGE,GENDER
0,2,163353.0,,,,,,,,24.800000,153.500000,11.050000,0.000000,0
1,3,145834.0,15.250000,18.937500,107.235294,1.826667,4.105263,137.555556,28.133333,30.226667,210.866667,12.533333,76.575342,0
2,4,185777.0,17.250000,23.000000,101.000000,0.450000,3.537500,137.750000,16.125000,32.300000,286.428571,9.442857,47.873973,1
3,5,178980.0,,,,,,,,43.000000,309.000000,13.900000,0.000000,0
4,6,107064.0,16.235294,17.941176,105.117647,3.741176,4.911765,134.411765,53.941176,31.168750,247.947368,13.589474,66.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57214,99985,176670.0,12.166667,25.000000,101.722222,0.905882,3.905263,134.833333,17.333333,27.833333,558.166667,18.727778,53.841096,0
57215,99991,151118.0,12.500000,24.833333,109.500000,1.208333,3.773333,143.000000,20.333333,30.360000,254.500000,7.150000,47.761644,0
57216,99992,197084.0,12.000000,24.500000,105.250000,0.975000,4.550000,137.250000,47.500000,26.287500,126.800000,15.480000,65.813699,1
57217,99995,137810.0,9.500000,32.750000,99.000000,0.425000,3.775000,137.500000,11.500000,27.766667,159.666667,7.300000,88.758904,1


In [92]:
final_data = pd.merge(diagnoses_final, lab_features, on=["SUBJECT_ID", "HADM_ID"], how="left")

In [93]:
final_data

Unnamed: 0,SUBJECT_ID,HADM_ID,rare,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301
0,2,163353,False,,,,,,,,24.800000,153.500000,11.050000
1,3,145834,False,15.250000,18.937500,107.235294,1.826667,4.105263,137.555556,28.133333,30.226667,210.866667,12.533333
2,4,185777,False,17.250000,23.000000,101.000000,0.450000,3.537500,137.750000,16.125000,32.300000,286.428571,9.442857
3,5,178980,False,,,,,,,,43.000000,309.000000,13.900000
4,6,107064,False,16.235294,17.941176,105.117647,3.741176,4.911765,134.411765,53.941176,31.168750,247.947368,13.589474
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,99985,176670,False,12.166667,25.000000,101.722222,0.905882,3.905263,134.833333,17.333333,27.833333,558.166667,18.727778
58972,99991,151118,True,12.500000,24.833333,109.500000,1.208333,3.773333,143.000000,20.333333,30.360000,254.500000,7.150000
58973,99992,197084,False,12.000000,24.500000,105.250000,0.975000,4.550000,137.250000,47.500000,26.287500,126.800000,15.480000
58974,99995,137810,False,9.500000,32.750000,99.000000,0.425000,3.775000,137.500000,11.500000,27.766667,159.666667,7.300000


In [94]:
model_data = final_data.drop(['SUBJECT_ID', 'HADM_ID'], axis=1)

In [95]:
model_data

Unnamed: 0,rare,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301
0,False,,,,,,,,24.800000,153.500000,11.050000
1,False,15.250000,18.937500,107.235294,1.826667,4.105263,137.555556,28.133333,30.226667,210.866667,12.533333
2,False,17.250000,23.000000,101.000000,0.450000,3.537500,137.750000,16.125000,32.300000,286.428571,9.442857
3,False,,,,,,,,43.000000,309.000000,13.900000
4,False,16.235294,17.941176,105.117647,3.741176,4.911765,134.411765,53.941176,31.168750,247.947368,13.589474
...,...,...,...,...,...,...,...,...,...,...,...
58971,False,12.166667,25.000000,101.722222,0.905882,3.905263,134.833333,17.333333,27.833333,558.166667,18.727778
58972,True,12.500000,24.833333,109.500000,1.208333,3.773333,143.000000,20.333333,30.360000,254.500000,7.150000
58973,False,12.000000,24.500000,105.250000,0.975000,4.550000,137.250000,47.500000,26.287500,126.800000,15.480000
58974,False,9.500000,32.750000,99.000000,0.425000,3.775000,137.500000,11.500000,27.766667,159.666667,7.300000


In [None]:
# I want to do 2 models. one with the labs and diagnoses, one with demographics + labs + diagnoses. and see if we see any difference
# then try the 20 most common labs

In [97]:
model_data.columns = model_data.columns.map(str)

In [98]:
import numpy as np

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
model_data_normalized = pd.DataFrame(
    scaler.fit_transform(model_data),
    index=model_data.index,
    columns=model_data.columns
)

print("Normalized Lab Features:\n", model_data_normalized.head())


Normalized Lab Features:
    rare     50868     50882     50902     50912     50971     50983     51006  \
0   0.0       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
1   0.0  0.266304  0.281250  0.493548  0.008759  0.338549  0.521280  0.115829   
2   0.0  0.309783  0.369565  0.402054  0.002158  0.261824  0.523779  0.063759   
3   0.0       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
4   0.0  0.287724  0.259591  0.462475  0.017939  0.447536  0.480871  0.227736   

      51221     51265     51301  
0  0.252129  0.095640  0.023419  
1  0.344577  0.132754  0.026592  
2  0.379898  0.181638  0.019982  
3  0.562181  0.196241  0.029515  
4  0.360626  0.156743  0.028850  


In [99]:
model_data_normalized[model_data_normalized["rare"] != 0]

Unnamed: 0,rare,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301
56,1.0,,,,,,,,0.660988,0.197535,0.037856
60,1.0,0.169437,0.394501,0.479306,0.001847,0.266813,0.521700,0.086683,0.256753,0.011789,0.001243
63,1.0,0.245652,0.345652,0.443140,0.004577,0.254505,0.507712,0.049026,0.254357,0.132134,0.028620
64,1.0,0.213043,0.408696,0.399120,0.004076,0.289189,0.481087,0.045872,0.598240,0.179312,0.036002
80,1.0,0.253623,0.376812,0.465640,0.004156,0.283784,0.544130,0.044427,0.503691,0.128527,0.036358
...,...,...,...,...,...,...,...,...,...,...,...
58834,1.0,0.228261,0.304348,0.490095,0.005994,0.371622,0.507712,0.080561,0.282794,0.253657,0.019320
58852,1.0,0.244802,0.451796,0.389555,0.006525,0.350851,0.511759,0.098283,0.350827,0.142295,0.029814
58936,1.0,0.204969,0.391304,0.482759,0.004110,0.319257,0.544666,0.054544,0.381911,0.116379,0.018702
58939,1.0,0.191304,0.543478,0.413793,0.004795,0.413514,0.550129,0.121755,0.371948,0.194171,0.020318


In [3]:
@time_function
def read_gz_data(filename):
    display(f"reading filename: {filename}")
    return pd.read_csv(filename)

In [6]:
read_gz_data("./data/DIAGNOSES_ICD.csv.gz")
read_gz_data("./data/D_ICD_DIAGNOSES.csv.gz")
read_gz_data("./data/LABEVENTS.csv.gz")
read_gz_data("./data/D_LABITEMS.csv.gz")
read_gz_data("./data/PATIENTS.csv.gz")

'reading filename: ./data/DIAGNOSES_ICD.csv.gz'

"Function name: 'read_gz_data' took 0.3713 seconds to complete."

'reading filename: ./data/D_ICD_DIAGNOSES.csv.gz'

"Function name: 'read_gz_data' took 0.0480 seconds to complete."

'reading filename: ./data/LABEVENTS.csv.gz'

"Function name: 'read_gz_data' took 27.7117 seconds to complete."

'reading filename: ./data/D_LABITEMS.csv.gz'

"Function name: 'read_gz_data' took 0.0122 seconds to complete."

'reading filename: ./data/PATIENTS.csv.gz'

"Function name: 'read_gz_data' took 0.0797 seconds to complete."

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,234,249,F,2075-03-13 00:00:00,,,,0
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1
2,236,251,M,2090-03-15 00:00:00,,,,0
3,237,252,M,2078-03-06 00:00:00,,,,0
4,238,253,F,2089-11-26 00:00:00,,,,0
...,...,...,...,...,...,...,...,...
46515,31840,44089,M,2026-05-25 00:00:00,,,,0
46516,31841,44115,F,2124-07-27 00:00:00,,,,0
46517,31842,44123,F,2049-11-26 00:00:00,2135-01-12 00:00:00,2135-01-12 00:00:00,,1
46518,31843,44126,F,2076-07-25 00:00:00,,,,0


In [5]:
def get_model_data(common_lab_count=10, rare_diagnosis_count=1):
    print(f"rare_diagnosis_count: {rare_diagnosis_count}")
    print(f"common_lab_count: {common_lab_count}")
    diagnosis_data = pd.read_csv("./data/DIAGNOSES_ICD.csv.gz")
    diagnosis_data = diagnosis_data.drop(columns=["ROW_ID"])  
    diagnosis_labels = pd.read_csv("./data/D_ICD_DIAGNOSES.csv.gz")
    diagnosis_labels = diagnosis_labels.drop(columns=["ROW_ID"])  
    diagnoses = pd.merge(diagnosis_data, diagnosis_labels, on="ICD9_CODE", how="left")

    icd9_counts = diagnoses['ICD9_CODE'].value_counts().reset_index()
    
    # Rename the columns for clarity
    icd9_counts.columns = ['ICD9_CODE', 'COUNT']
    
    # Sort the DataFrame by counts in ascending order
    icd9_counts_sorted = icd9_counts.sort_values(by='COUNT', ascending=True)
    
    # Reset the index for clean output
    icd9_counts_sorted.reset_index(drop=True, inplace=True)
    single_diagnosis = icd9_counts_sorted[icd9_counts_sorted["COUNT"] == rare_diagnosis_count]
    rare_codes = single_diagnosis["ICD9_CODE"].tolist()

    diagnoses["rare"] = diagnoses["ICD9_CODE"].isin(rare_codes)
    lab_events = pd.read_csv('./data/LABEVENTS.csv.gz')
    lab_events = lab_events.drop(columns=["ROW_ID"])  
    # lab_events.columns = lab_events.columns.str.lower()
    lab_labels = pd.read_csv('./data/D_LABITEMS.csv.gz')
    lab_labels = lab_labels.drop(columns=["ROW_ID"]) 
    labs = pd.merge(lab_events, lab_labels, on = "ITEMID", how="left")
    lab_test_counts = labs['ITEMID'].value_counts().reset_index()
    
    lab_test_counts.columns = ['ITEMID', 'COUNT']
    
    lab_test_counts_sorted = lab_test_counts.sort_values(by='COUNT', ascending=False)
    
    lab_test_counts_sorted.reset_index(drop=True, inplace=True)
    common_lab_ids = lab_test_counts_sorted["ITEMID"].head(common_lab_count).tolist()
    common_labs = labs[labs["ITEMID"].isin(common_lab_ids)]
    
    
    
    patients = pd.read_csv("./data/PATIENTS.csv.gz")
    patients = patients[["SUBJECT_ID", "GENDER", "DOB"]]
    patients_labs = pd.merge(common_labs, patients, on="SUBJECT_ID", how="left")
    patients_labs['CHARTTIME'] = pd.to_datetime(patients_labs['CHARTTIME']).dt.date
    patients_labs['DOB'] = pd.to_datetime(patients_labs['DOB']).dt.date
    patients_labs["AGE"] = patients_labs.apply(lambda e: (e['CHARTTIME'] - e['DOB']).days/365, axis=1)
    
    # Apply bounds: Set values under 0 to 0 and values over 100 to 100
    patients_labs["AGE"] = patients_labs["AGE"].clip(lower=0, upper=100)
    # Map 'M' to 0 and 'F' to 1
    patients_labs['GENDER'] = patients_labs['GENDER'].map({'M': 0, 'F': 1})
    
    admit_patients_labs = patients_labs[["SUBJECT_ID","HADM_ID","ITEMID","VALUENUM", "AGE", "GENDER"]].dropna()
    patient_demo = admit_patients_labs[["SUBJECT_ID","HADM_ID", "AGE", "GENDER"]].drop_duplicates(subset=["SUBJECT_ID", "HADM_ID"])
    admit = admit_patients_labs.drop(columns=["AGE", "GENDER"])

    lab_features = admit.pivot_table(
        index=['SUBJECT_ID','HADM_ID'], 
        columns='ITEMID', 
        values='VALUENUM', 
        # aggfunc=np.mean
    ).reset_index()

    lab_features = pd.merge(lab_features, patient_demo, on=["SUBJECT_ID", "HADM_ID"], how="left")

    grouped_diagnoses = diagnoses.groupby(['SUBJECT_ID', 'HADM_ID'], as_index=False).agg({
        'SEQ_NUM': 'first',  # Example aggregation, adjust as needed
        'ICD9_CODE': list,   # Combine all ICD9_CODEs into a list
        'SHORT_TITLE': list, # Combine SHORT_TITLEs into a list
        'LONG_TITLE': list,  # Combine LONG_TITLEs into a list
        'rare': 'max'        # Keep True if any row has True; False otherwise
    })
    diagnoses_final = grouped_diagnoses[["SUBJECT_ID", "HADM_ID", "rare"]]
    final_data = pd.merge(diagnoses_final, lab_features, on=["SUBJECT_ID", "HADM_ID"], how="left")
    model_data = final_data.drop(['SUBJECT_ID', 'HADM_ID'], axis=1)
    model_data.columns = model_data.columns.map(str)
    scaler = MinMaxScaler()
    model_data_normalized = pd.DataFrame(
        scaler.fit_transform(model_data),
        index=model_data.index,
        columns=model_data.columns
    )
    model_data_normalized = model_data_normalized.apply(lambda col: col.fillna(col.median()), axis=0)
    return model_data_normalized

In [13]:
model_data = get_model_data(10,1)
model_data

rare_diagnosis_count: 1
common_lab_count: 10


Unnamed: 0,rare,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301,AGE,GENDER
0,0.0,0.225753,0.418060,0.448407,0.004475,0.337838,0.537704,0.077092,0.252129,0.095640,0.023419,0.000000,0.0
1,0.0,0.266304,0.281250,0.493548,0.008759,0.338549,0.521280,0.115829,0.344577,0.132754,0.026592,0.765753,0.0
2,0.0,0.309783,0.369565,0.402054,0.002158,0.261824,0.523779,0.063759,0.379898,0.181638,0.019982,0.478740,1.0
3,0.0,0.225753,0.418060,0.448407,0.004475,0.337838,0.537704,0.077092,0.562181,0.196241,0.029515,0.000000,0.0
4,0.0,0.287724,0.259591,0.462475,0.017939,0.447536,0.480871,0.227736,0.360626,0.156743,0.028850,0.660000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,0.0,0.199275,0.413043,0.412652,0.004344,0.311522,0.486290,0.068998,0.303805,0.357439,0.039840,0.538411,0.0
58972,1.0,0.206522,0.409420,0.526779,0.005794,0.293694,0.591260,0.082007,0.346848,0.160982,0.015078,0.477616,0.0
58973,0.0,0.195652,0.402174,0.464417,0.004675,0.398649,0.517352,0.199806,0.277470,0.078367,0.032894,0.658137,1.0
58974,0.0,0.141304,0.581522,0.372707,0.002038,0.293919,0.520566,0.043704,0.302669,0.099630,0.015399,0.887589,1.0


Unnamed: 0,rare,50868,50882,50902,50912,50971,50983,51006,51221,51265,51301,AGE,GENDER
1,0.0,0.266304,0.281250,0.493548,0.008759,0.338549,0.521280,0.115829,0.344577,0.132754,0.026592,0.765753,0.0
2,0.0,0.309783,0.369565,0.402054,0.002158,0.261824,0.523779,0.063759,0.379898,0.181638,0.019982,0.478740,1.0
4,0.0,0.287724,0.259591,0.462475,0.017939,0.447536,0.480871,0.227736,0.360626,0.156743,0.028850,0.660000,1.0
7,0.0,0.208075,0.394410,0.408343,0.008152,0.231081,0.479434,0.092331,0.476055,0.173759,0.028659,0.418301,0.0
9,0.0,0.241419,0.449657,0.422906,0.003028,0.330725,0.544344,0.063217,0.468484,0.154632,0.020870,0.501781,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58971,0.0,0.199275,0.413043,0.412652,0.004344,0.311522,0.486290,0.068998,0.303805,0.357439,0.039840,0.538411,0.0
58972,1.0,0.206522,0.409420,0.526779,0.005794,0.293694,0.591260,0.082007,0.346848,0.160982,0.015078,0.477616,0.0
58973,0.0,0.195652,0.402174,0.464417,0.004675,0.398649,0.517352,0.199806,0.277470,0.078367,0.032894,0.658137,1.0
58974,0.0,0.141304,0.581522,0.372707,0.002038,0.293919,0.520566,0.043704,0.302669,0.099630,0.015399,0.887589,1.0


In [7]:
@time_function
def run_model(model_name, model_info, X_train, y_train):
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(
        estimator=model_info["model"],
        param_grid=model_info["params"],
        scoring="accuracy",
        cv=5,  # 5-fold cross-validation
        n_jobs=1
    )
    grid_search.fit(X_train, y_train)
    return grid_search

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import ADASYN
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Step 1: Handle Missing Data
# imputer = SimpleImputer(strategy='mean')  # Impute NaNs with the column mean
X = model_data.drop(columns=['rare'])  # Features
y = model_data['rare']  # Target

# X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Step 2: Normalize Data
# scaler = MinMaxScaler()
# X_normalized = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X.columns)

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Apply SMOTE to Handle Imbalance
# smote = SMOTE(random_state=42)
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Apply ADASYN for oversampling
adasyn = ADASYN(random_state=42, sampling_strategy='minority')  # You can adjust sampling_strategy as needed
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# print("Original dataset shape:", {sum(y_train == 0): sum(y_train == 1)})
# print("Resampled dataset shape:", {sum(y_train_adasyn == 0): sum(y_train_adasyn == 1)})

# Step 5: Define Models and Hyperparameter Grids
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

models_and_parameters = {
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [200, 250],
            "max_depth": [10,14,12],
            "min_samples_split": [5, 8]
        }
    },
    # "LogisticRegression": {
    #     "model": LogisticRegression(random_state=42, max_iter=100),
    #     "params": {
    #         "C": [0.1, 1, 10],
    #         "solver": ["liblinear", "lbfgs"]
    #     }
    # },
    # "SVC": {
    #     "model": SVC(random_state=42),
    #     "params": {
    #         "C": [1],
    #         "kernel": [ "poly"],
    #         "gamma": ["scale"]
    #     }
    # },
    # "GradientBoosting": {
    #     "model": GradientBoostingClassifier(random_state=42),
    #     "params": {
    #         "n_estimators": [200],
    #         "learning_rate": [0.1],
    #         "max_depth": [5]
    #     }
    # },
    # "XGBoost": {
    #     "model": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss"),
    #     "params": {
    #         "n_estimators": [200],
    #         "learning_rate": [0.1],
    #         "max_depth": [5],
    #         "subsample": [1.0]
    #         # "n_estimators": [100, 200],
    #         # "learning_rate": [0.01, 0.1],
    #         # "max_depth": [3, 5],
    #         # "subsample": [0.8, 1.0]
    #     }
    # }
}


# Step 6: Grid Search for Each Model
best_models = {}
for model_name, model_info in models_and_parameters.items():
    # print(f"Running GridSearchCV for {model_name}...")
    # grid_search = GridSearchCV(
    #     estimator=model_info["model"],
    #     param_grid=model_info["params"],
    #     scoring="accuracy",
    #     cv=5,  # 5-fold cross-validation
    #     n_jobs=1
    # )
    # grid_search.fit(X_train_adasyn, y_train_adasyn)
    grid_search = run_model(model_name, model_info, X_train_adasyn, y_train_adasyn)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_:.4f}")

# Step 7: Evaluate the Best Model on Test Data
print("\nEvaluating Best Models on Test Data:")
for model_name, model in best_models.items():
    print(f"\nEvaluating {model_name}...")
    y_pred = model.predict(X_test)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nAccuracy Score:")
    print(accuracy_score(y_test, y_pred))


Running GridSearchCV for RandomForest...


"Function name: 'run_model' took 34964.1540 seconds to complete."

Best parameters for RandomForest: {'max_depth': 14, 'min_samples_split': 5, 'n_estimators': 250}
Best cross-validation accuracy for RandomForest: 0.8224

Evaluating Best Models on Test Data:

Evaluating RandomForest...
Confusion Matrix:
[[10183  1340]
 [  198    75]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.88      0.93     11523
         1.0       0.05      0.27      0.09       273

    accuracy                           0.87     11796
   macro avg       0.52      0.58      0.51     11796
weighted avg       0.96      0.87      0.91     11796


Accuracy Score:
0.8696168192607664


In [None]:
# Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
# Best cross-validation accuracy for RandomForest: 0.7454

In [195]:
X = model_data.drop(columns="rare")
y = model_data["rare"]

In [197]:
y

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
58971    0.0
58972    1.0
58973    0.0
58974    0.0
58975    0.0
Name: rare, Length: 58976, dtype: float64

In [12]:
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split

# Prepare data
X = model_data.drop(columns="rare")
y = model_data["rare"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# quick simple model
class RareDiseasePredictor(nn.Module):
    def __init__(self, input_size):
        super(RareDiseasePredictor, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

input_size = X_train.shape[1]
model = RareDiseasePredictor(input_size)

# train
criterion = nn.BCELoss()  # Binary Cross-Entropy for classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}")

# evaluate
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    accuracy = (predictions.round() == y_test_tensor).float().mean()
    print(f"Accuracy: {accuracy.item():.4f}")


ValueError: could not determine the shape of object type 'DataFrame'