In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the input tables: diagnoses are required, demographics are optional.
diagnoses_path = "Data/diagnoses_icd.csv"
demographics_path = "Data/demographics.csv"

# Diagnoses are mandatory: stop right away if the file is missing.
if not os.path.exists(diagnoses_path):
    raise FileNotFoundError(f"Diagnoses file not found at {diagnoses_path}")
diagnoses_df = pd.read_csv(diagnoses_path)
print(f"Loaded {len(diagnoses_df)} diagnosis records")

# Demographics are optional. `demographics_df` is ALWAYS bound by this cell
# (None when the file is absent) so that a re-run cannot leave a stale frame
# from an earlier execution in the kernel, and the name is never undefined.
if os.path.exists(demographics_path):
    demographics_df = pd.read_csv(demographics_path)
    print(f"Loaded {len(demographics_df)} patient demographic records")
else:
    demographics_df = None
    print("Demographics file not found. Will only analyze diagnosis data.")

In [None]:
# Quick profile of the diagnoses table: cohort size, code vocabulary,
# the most frequent codes, and how many diagnosis rows each patient has.
patient_ids = diagnoses_df['subject_id']
print(f"Number of unique patients: {patient_ids.nunique()}")
icd_codes = diagnoses_df['icd_code']
print(f"Number of unique ICD codes: {icd_codes.nunique()}")

# Ten most frequent ICD codes (value_counts is already sorted descending).
top_diagnoses = icd_codes.value_counts().head(10)
print("\nTop 10 most common diagnoses:", top_diagnoses, sep="\n")

# Number of diagnosis rows per patient, then its summary statistics.
diagnoses_per_patient = diagnoses_df.groupby('subject_id').size()
print("\nDiagnoses per patient statistics:")
# (label, Series method, format spec) - an empty spec formats like a plain f-string field.
for stat_label, stat_method, stat_spec in (
    ("Mean", "mean", ".2f"),
    ("Median", "median", ".2f"),
    ("Min", "min", ""),
    ("Max", "max", ""),
):
    stat_value = getattr(diagnoses_per_patient, stat_method)()
    print(f"{stat_label}: {format(stat_value, stat_spec)}")

In [None]:
'''
223452 patients
gender
F    117736
M    105716

insurance
Private      86139
Medicare     85524
Medicaid     38022
Other         6952
No charge      187

language
English         202713
Spanish           6783
Chinese           3369
Russian           2415
Kabuverdianu      1750
Portuguese        1452
Haitian           1003
Other              608
Vietnamese         494
Arabic             303

marital_status
MARRIED     95614
SINGLE      82797
WIDOWED     20226
DIVORCED    14119

race
WHITE                             138346
BLACK/AFRICAN AMERICAN             23400
UNKNOWN                            10352
OTHER                               9469
WHITE - OTHER EUROPEAN              5717
ASIAN                               4302
HISPANIC/LATINO - PUERTO RICAN      3530
ASIAN - CHINESE                     3398
HISPANIC OR LATINO                  3223
UNABLE TO OBTAIN                    2610
'''
# Summarise the demographics table: row/column counts, missing values per
# column, and the ten most common values of every object/category column.
#
# The demographics CSV is optional: the loading step only prints a notice when
# the file is absent, so `demographics_df` may be unbound (or None) here.
# Look it up defensively instead of failing with a NameError.
demographics_df = globals().get("demographics_df")

if demographics_df is None:
    print("Demographics data not loaded. Skipping demographics exploration.")
else:
    # Get information about data types and missing values
    print("\nDemographics Data Information:")
    print(f"Number of rows: {len(demographics_df)}")
    print(f"Number of columns: {len(demographics_df.columns)}")
    print("\nMissing values per column:")
    print(demographics_df.isnull().sum())

    # Display distribution of categorical variables if present
    categorical_cols = demographics_df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) > 0:
        print("\nDistribution of categorical variables:")
        for col in categorical_cols:
            print(demographics_df[col].value_counts().head(10))  # Show top 10 categories

In [None]:
from EVAL.knn_diagnosis_eval import diagnosis_evaluation
# Cross-validation to find the k value that gives the best precision-recall trade-off.
# NOTE(review): `diagnosis_evaluation` is imported but never called in this cell, so the
# figures recorded below cannot be regenerated from here - presumably the call was removed
# after the run; confirm, then restore it or move these notes into a markdown cell.
# The string literal that follows is not assigned to anything; it only keeps a record of
# results (two "Overall Results" summaries plus notes on k and the similarity threshold).
'''
k=11 gives the best performance
=== Overall Results ===
Mean AUC-ROC: 0.865 ± 0.039
Mean PR-AUC: 0.584 ± 0.076
Mean Accuracy: 0.732 ± 0.047
Mean F1 Score: 0.594 ± 0.053
a similarity threshold between 0.12-0.17 gives the best results. 0.15 is a good middle point

=== Overall Results ===
Mean AUC-ROC: 0.867 ± 0.021
Mean PR-AUC: 0.626 ± 0.042
Mean Accuracy: 0.755 ± 0.019
Mean F1 Score: 0.611 ± 0.020

'''

In [None]:
from EVAL.knn_diagnosis_eval import evaluate_threshold_curve

# Run the threshold-curve evaluation for a single ICD code over the diagnoses table.
# NOTE(review): "I10" is presumably the ICD-10 code for essential hypertension - confirm.
target_icd_code = "I10"
evaluate_threshold_curve(diagnoses_df, target_icd_code)