In [6]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [7]:
# Load the diagnoses table (required) and the demographics table (optional).
diagnoses_path = "PROJECT/Data/diagnoses_icd.csv"
demographics_path = "PROJECT/Data/demographics.csv"

# Diagnoses are mandatory: stop right away when the CSV is absent.
if not os.path.exists(diagnoses_path):
    raise FileNotFoundError(f"Diagnoses file not found at {diagnoses_path}")
diagnoses_df = pd.read_csv(diagnoses_path)
print(f"Loaded {len(diagnoses_df)} diagnosis records")

# Demographics are optional: a missing file only produces a message.
# NOTE: in that case `demographics_df` is never assigned, so any cell that
# uses it has to check that the name exists first.
if not os.path.exists(demographics_path):
    print("Demographics file not found. Will only analyze diagnosis data.")
else:
    demographics_df = pd.read_csv(demographics_path)
    print(f"Loaded {len(demographics_df)} patient demographic records")

Loaded 200694 diagnosis records
Loaded 223452 patient demographic records


In [8]:
# Quick look at the diagnoses table: cardinalities, the most frequent ICD
# codes, and how many diagnosis rows each patient has.
print(
    f"Number of unique patients: {diagnoses_df['subject_id'].nunique()}",
    f"Number of unique ICD codes: {diagnoses_df['icd_code'].nunique()}",
    sep="\n",
)

# Ten most frequent ICD codes (value_counts sorts by descending frequency).
top_diagnoses = diagnoses_df['icd_code'].value_counts().head(10)
print("\nTop 10 most common diagnoses:", top_diagnoses, sep="\n")

# Number of diagnosis rows per patient, followed by summary statistics.
diagnoses_per_patient = diagnoses_df.groupby('subject_id').size()
print(
    "\nDiagnoses per patient statistics:",
    f"Mean: {diagnoses_per_patient.mean():.2f}",
    f"Median: {diagnoses_per_patient.median():.2f}",
    f"Min: {diagnoses_per_patient.min()}",
    f"Max: {diagnoses_per_patient.max()}",
    sep="\n",
)

Number of unique patients: 10318
Number of unique ICD codes: 11080

Top 10 most common diagnoses:
icd_code
4019      3463
I10       3173
E785      2918
2724      2142
Z87891    2042
K219      1721
53081     1409
25000     1294
F419      1278
F329      1278
Name: count, dtype: int64

Diagnoses per patient statistics:
Mean: 19.45
Median: 12.00
Min: 1
Max: 100


In [9]:
'''
223452 patients
gender
F    117736
M    105716

insurance
Private      86139
Medicare     85524
Medicaid     38022
Other         6952
No charge      187

language
English         202713
Spanish           6783
Chinese           3369
Russian           2415
Kabuverdianu      1750
Portuguese        1452
Haitian           1003
Other              608
Vietnamese         494
Arabic             303

marital_status
MARRIED     95614
SINGLE      82797
WIDOWED     20226
DIVORCED    14119

primary aggregation
race
WHITE                                        147528
BLACK/AFRICAN AMERICAN                        23400
UNKNOWN                                       10352
ASIAN                                          9660
OTHER                                          9469
HISPANIC/LATINO                                8758
HISPANIC OR LATINO                             3223
UNABLE TO OBTAIN                               2610
BLACK/CAPE VERDEAN                             2285
BLACK/CARIBBEAN ISLAND                         1499
BLACK/AFRICAN                                  1475
PATIENT DECLINED TO ANSWER                     1161
PORTUGUESE                                      684
AMERICAN INDIAN/ALASKA NATIVE                   481
SOUTH AMERICAN                                  343
MULTIPLE RACE/ETHNICITY                         270
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER       254

secondary aggregation
race
WHITE                                        147528
BLACK                                         28659
UNKNOWN                                       10352
ASIAN                                          9660
OTHER                                          9469
HISPANIC                                       8758
HISPANIC OR LATINO                             3223
UNABLE TO OBTAIN                               2610
PATIENT DECLINED TO ANSWER                     1161
PORTUGUESE                                      684
AMERICAN INDIAN                                 481
SOUTH AMERICAN                                  343
MULTIPLE RACE                                   270
NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER       254
Name: count, dtype: int64

'''
# Summarise the demographics table: size, missing values per column, and the
# value distribution of every categorical column.
#
# The demographics file is optional: the loading cell only prints a message
# when it is missing and does not assign `demographics_df`. Look the name up
# defensively so this cell skips cleanly instead of raising NameError.
if globals().get("demographics_df") is None:
    print("\nDemographics data not loaded; skipping demographics exploration.")
else:
    print("\nDemographics Data Information:")
    print(f"Number of rows: {len(demographics_df)}")
    print(f"Number of columns: {len(demographics_df.columns)}")
    print("\nMissing values per column:")
    print(demographics_df.isnull().sum())

    # Display distribution of categorical variables if present
    categorical_cols = demographics_df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        print("\nDistribution of categorical variables:")
        for col in categorical_cols:
            # Prints every category of the column (no truncation).
            # value_counts() leaves out missing values by default, so the
            # counts can add up to less than the number of rows.
            print(demographics_df[col].value_counts())


Demographics Data Information:
Number of rows: 223452
Number of columns: 6

Missing values per column:
subject_id            0
gender                0
insurance          6628
language            634
marital_status    10696
race                  0
dtype: int64

Distribution of categorical variables:
gender
F    117736
M    105716
Name: count, dtype: int64
insurance
Private      86139
Medicare     85524
Medicaid     38022
Other         6952
No charge      187
Name: count, dtype: int64
language
English                   202713
Spanish                     6783
Chinese                     3369
Russian                     2415
Kabuverdianu                1750
Portuguese                  1452
Haitian                     1003
Other                        608
Vietnamese                   494
Arabic                       303
Italian                      302
Modern Greek (1453-)         277
Persian                      166
Korean                       160
American Sign Language       152
Thai   

##  cross validation to find the best k value for precision-recall tradeoff

k=11 gives the best performance:

- Mean AUC-ROC: 0.865 ± 0.039
- Mean PR-AUC: 0.584 ± 0.076
- Mean Accuracy: 0.732 ± 0.047
- Mean F1 Score: 0.594 ± 0.053

A similarity threshold between 0.12 and 0.17 gives the best results; 0.15 is a good midpoint:
- Mean AUC-ROC: 0.867 ± 0.021
- Mean PR-AUC: 0.626 ± 0.042
- Mean Accuracy: 0.755 ± 0.019
- Mean F1 Score: 0.611 ± 0.020

'''