In [None]:
import pandas as pd
import numpy as np


import os

import pyrootutils
root = pyrootutils.setup_root(
    search_from=os.path.abspath(''),
    indicator=[".git"],
    pythonpath=True, # add root directory to the PYTHONPATH (helps with imports)
    dotenv=True, # load environment variables from .env if exists in root directory
)

from utils.file_management.config_loader import load_yaml, process_config_values
from utils.file_management.file_manager import FileManager
from utils.query_utils.extractor import Extractor

from tableone import TableOne

In [None]:
# Cohort configuration (YAML) lives under the project root found by pyrootutils
cohort_cfg_path = f"{root}/code/config/LBP_cohort.yaml"
config = process_config_values(load_yaml(cohort_cfg_path))

# Object used throughout the notebook to look up dataset paths by name
PlumsFiles = FileManager(config.get('file_directory'))

# Cohort patients: one durable key per patient
patientdurablekey_list = pd.read_csv(
    PlumsFiles.get_datapath('patientdurablekey_csv')
)['patientdurablekey'].tolist()
print(len(patientdurablekey_list))

# Imaging studies (accession numbers) belonging to the cohort
accessionnumber_list = pd.read_csv(
    PlumsFiles.get_datapath('accessionnumber_csv')
)['accessionnumber'].tolist()
print(len(accessionnumber_list))

# Query helper; check_query_flag turns on the auxiliary sanity checks run after each query
check_query_flag = True
PlumsExtractor = Extractor(num_results_flag=True, display_results_flag=True)

# Load Demographic Predictors

In [None]:
# Demographic predictors: one row per cohort patient from the patient dimension table
dataQuery = f'''
/*
Description: Patient demographics
*/

SELECT DISTINCT
    patientdurablekey,
    ageatfirstimaging,
    yearatfirstimaging,
    sex,
    preferredlanguage,
    raceethnicity,
    smokingstatus,
    religion,
    socialsupport
FROM 
  read_parquet('{PlumsFiles.get_datapath('patdurabledim_analysis_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}

ORDER BY
  patientdurablekey
'''

# Execute the query and get a pandas DataFrame back
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity check on the returned patient keys
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Keep an independent copy; results_df_pd is reused by the following queries
df_predictors1 = results_df_pd.copy()

results_df_pd.head()

In [None]:
# Insurance predictor.
# The inner query keeps, per cohort patient, the largest primaryinsurancekey found in the
# billing table; the outer join looks that key up in the same table to obtain the
# primaryinsurance label.
# NOTE(review): MAX(primaryinsurancekey) is presumably a proxy for the most recent
# insurance record -- confirm that keys increase over time.
dataQuery = f'''
/*
Description: Patient insurance
*/

SELECT DISTINCT
    a.patientdurablekey,
    b.primaryinsurance
FROM
(
    SELECT DISTINCT
        patientdurablekey,
        MAX(primaryinsurancekey) AS primaryinsurancekey, 
    FROM 
      read_parquet('{PlumsFiles.get_datapath('billingaccountfact_analysis_parquet')}') 
    WHERE 
      patientdurablekey IN {tuple(patientdurablekey_list)}
    GROUP BY
      patientdurablekey
) as a
INNER JOIN 
    read_parquet('{PlumsFiles.get_datapath('billingaccountfact_analysis_parquet')}') as b
ON a.primaryinsurancekey = b.primaryinsurancekey

ORDER BY
  a.patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery,runtime_flag=True,df_type='pandas')

# The patient-key sanity check is intentionally disabled for this query
# (patients without a billing record are added in the next cell).
# if check_query_flag==True:
#     PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Independent copy of the insurance table; results_df_pd is overwritten by later queries
df_predictors2 = results_df_pd.copy()

results_df_pd.head()

In [None]:
# Cohort patients that the insurance query returned no row for
missing_patients = list(set(patientdurablekey_list) - set(df_predictors2['patientdurablekey']))
print(len(missing_patients))

# Give those patients an explicit 'unknown' insurance so that every cohort
# patient has exactly one entry in df_predictors2
df_missing = pd.DataFrame({'patientdurablekey': missing_patients}).assign(primaryinsurance='unknown')
df_predictors2 = pd.concat([df_predictors2, df_missing], ignore_index=True)
df_predictors2

In [None]:
# Diagnosis predictors: every diagnosis-event row for the cohort patients
dataQuery = f'''
/*
Description: Patient diagnoses
*/

SELECT DISTINCT
    *
FROM 
  read_parquet('{PlumsFiles.get_datapath('diagnosiseventfact_analysis_parquet')}')
WHERE 
  patientdurablekey IN {tuple(patientdurablekey_list)}
        
ORDER BY
  patientdurablekey
'''

# Run query and update relevant keys
results_df_pd = PlumsExtractor.run_query(dataQuery,runtime_flag=True,df_type='pandas')

#Check whether query makes sense
if check_query_flag==True:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

df_predictors3 = results_df_pd.copy()

# Ordinal-encode LBP duration so that the per-patient max() below keeps the
# most specific label (unspecified < acute < chronic).
# A single column assignment is used instead of chained indexing
# (df[col][mask] = value), which raises SettingWithCopyWarning and silently
# does nothing when pandas Copy-on-Write is enabled.
lbpduration_codes = {'unspecified': 0, 'acute': 1, 'chronic': 2}
df_predictors3['lbpduration'] = df_predictors3['lbpduration'].map(
    lambda value: lbpduration_codes.get(value, value)  # unmapped values are left untouched
)

# Collapse to one row per patient, taking the maximum of every column
df_predictors3 = df_predictors3.groupby('patientdurablekey', as_index=True).max().reset_index()

results_df_pd.head()

In [None]:
# Patient profile: demographics + insurance + diagnoses joined on the patient key.
# The query references the in-memory frames df_predictors1/2/3 by name.
dataQuery = f'''
/*
Description: Combine patient info into a patient profile
*/

SELECT DISTINCT
    *
FROM 
  (SELECT DISTINCT
  a.*,
  b.primaryinsurance
  FROM df_predictors1 as a
  INNER JOIN df_predictors2 as b
    ON a.patientdurablekey = b.patientdurablekey
  ) as d
INNER JOIN df_predictors3 as c
ON d.patientdurablekey = c.patientdurablekey
        
ORDER BY
  c.patientdurablekey
'''

# Execute the combined query
results_df_pd = PlumsExtractor.run_query(dataQuery, runtime_flag=True, df_type='pandas')

# Optional sanity check on the returned patient keys
if check_query_flag:
    PlumsExtractor.col_to_list(results_df_pd, 'patientdurablekey')

# Drop the duplicated join key produced by SELECT * and the two diagnosis
# columns that are not analysed further
df = results_df_pd.drop(columns=['patientdurablekey_1', 'anxiety', 'depression'])
results_df_pd.head()

In [None]:
# Descriptive "Table 1" of the combined patient profile
categorical_columns = [
    'sex', 'preferredlanguage', 'raceethnicity', 'smokingstatus', 'religion',
    'socialsupport', 'primaryinsurance', 'negativepsychstate', 'obesity',
    'lbpduration', 'sciatica', 'facetjointarthropathy', 'scoliosis',
    'discpathology', 'spinalstenosis', 'sacroiliacjoint', 'diabetes',
]
continuous_columns = ['ageatfirstimaging', 'yearatfirstimaging']

df_summary = TableOne(df, categorical=categorical_columns, continuous=continuous_columns)
df_summary

# Determine type of missing data

In [None]:
import pandas as pd
from collections import Counter

#from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

from missforest import MissForest

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

import seaborn as sns
import matplotlib.pyplot as plt



## Data preparation

In [None]:
# Variables whose 'unknown' entries are treated as missing data
# ('religion' is deliberately left out).
missing_vars = [
    'smokingstatus',
    'primaryinsurance',
    'socialsupport',
    'raceethnicity',
]

# Demographic predictors used to model missingness. Alternatives that were
# considered: the four fully observed demographics only, or those four plus
# the diagnosis flags (negativepsychstate, obesity, lbpduration, sciatica,
# facetjointarthropathy, scoliosis, discpathology, spinalstenosis,
# sacroiliacjoint, diabetes).
predictors = [
    'sex',
    'ageatfirstimaging',
    'yearatfirstimaging',
    'preferredlanguage',
    'smokingstatus',
    'primaryinsurance',
    'socialsupport',
    'raceethnicity',
]

In [None]:
# Number of 'unknown' entries in each patient row
unknown_counts = df.eq('unknown').sum(axis=1)

# How many rows have 0, 1, 2, ... unknown fields
summary = Counter(unknown_counts)

# Tabulate the tally, ordered by the number of unknowns
summary_df = (
    pd.DataFrame.from_dict(summary, orient='index', columns=['Row Count'])
    .sort_index()
    .rename_axis('Number of Unknowns')
)

print(summary_df)

In [None]:
def count_unknowns_when_col_unknown(row, col):
    """Count the other 'unknown' fields of a row whose `col` field is 'unknown'.

    Parameters:
        row (pd.Series): One record, indexed by column name.
        col (str): Column whose value decides whether the row is counted.

    Returns:
        int: Number of fields other than `col` equal to 'unknown' when
        row[col] is 'unknown'; 0 when row[col] is anything else.
    """
    if row[col] != 'unknown':
        return 0
    other_fields = row.drop(col)
    return other_fields.eq('unknown').sum()

# For every variable with missing data, report how many other fields are
# also 'unknown' in the rows where that variable is 'unknown'
for col in missing_vars:
    total_other_unknowns = df.apply(count_unknowns_when_col_unknown, axis=1, args=(col,))
    print(f"When column '{col}' is 'unknown', total number of other 'unknown' fields:")
    print(total_other_unknowns.value_counts())
    print("\n")

In [None]:
for var in missing_vars:
    # Recode the 'unknown' sentinel as NaN so that pandas' missing-data
    # helpers (isna, dropna) can be used on these columns
    df[var] = [np.nan if entry == 'unknown' else entry for entry in df[var]]
    # Binary missingness indicator: 1 = missing, 0 = observed
    df[f'{var}_missing'] = pd.isna(df[var]).astype(int)

## Little's MCAR test
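Little's test evaluates the null hypothesis that the data are MCAR; a significant result suggests the data are not MCAR. The cell below approximates this check by comparing, for each variable, the observed predictors between rows where the variable is missing and rows where it is present (TableOne p-values).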

In [None]:
continuous_predictors = ['ageatfirstimaging', 'yearatfirstimaging']

for var in missing_vars:
    # Every predictor except the variable whose missingness is examined
    adjusted_predictors = [name for name in predictors if name != var]
    categorical_predictors = [name for name in adjusted_predictors if name not in continuous_predictors]

    # Complete cases with respect to the remaining predictors
    df2 = df.dropna(subset=adjusted_predictors)

    # Compare predictor distributions between rows where `var` is missing vs observed
    df_summary = TableOne(
        df2,
        groupby=f'{var}_missing',
        categorical=categorical_predictors,
        continuous=list(continuous_predictors),
        pval=True,
    )
    print(df_summary)

## Logistic regression for each variable


Create binary indicators (missing = 1, non-missing = 0) for each variable.
Run logistic regression with observed variables as predictors. If missingness is significantly associated with observed variables, the data is likely MAR.

Hypotheses
* Patients who are smokers may refuse to answer (MNAR).
* Missingness may depend on observed variables like age or language (MAR).
* Patients refusing to disclose religion may be systematic (MNAR).
* Missingness could relate to observed variables like sex or age (MAR).
* Non-response might depend on language or demographic reluctance (MAR).

Interpretation
Logistic Regression Coefficients
* Coefficients: Indicate the direction and strength of the relationship between a predictor and the outcome (missingness).
    * Positive (+): Increases the likelihood of the event (missing data).
    * Negative (−): Decreases the likelihood of the event (missing data).
* P-value: Shows the statistical significance of the predictor.
    * 𝑝<0.05: Statistically significant.
    * 𝑝≥0.05: Not statistically significant.
* Meaning:
    * +β, p<0.05 = The predictor increases the likelihood of missingness.
    * -β, p<0.05 = The predictor decreases the likelihood of missingness.
    * +β, p>0.05 = The predictor has a weak or no association with missingness (trend not significant).
    * -β, p>0.05 = The predictor has a weak or no association with missingness (trend not significant).
    * β ≈ 0 = The predictor has almost no impact on missingness.
    
Metrics like Accuracy, Precision, and Recall from the classification_report give insight into how well the model predicts missingness.
* Good model performance: Supports MAR, as the missingness is explainable by observed data.
* Poor model performance: Suggests missingness may be completely random (MCAR) or dependent on unobserved factors (MNAR).


In [None]:
# Variance inflation factor (VIF)
#
# VIF is a metric used to detect multicollinearity in a regression model.
# Multicollinearity occurs when predictor variables are highly correlated,
# which makes it difficult to estimate the effect of each predictor accurately:
# it inflates the standard errors of the coefficients, reduces statistical
# power and makes the regression estimates unreliable.
#
# Interpreting VIF:
# * VIF=1: No correlation with other variables.
# * 1<VIF<5: Moderate correlation; generally acceptable.
# * VIF>=5: High multicollinearity; consider addressing it.
# * VIF>=10: Severe multicollinearity; problematic.

# Defined here so the cell runs on a fresh kernel (it was previously only
# created by a later cell); it is re-fit for every column.
label_encoder = LabelEncoder()

for var in missing_vars:
    # Every predictor except the variable whose missingness is examined
    adjusted_predictors = [val for val in predictors if val!=var]
    categorical_predictors = [val for val in adjusted_predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

    # Complete cases with respect to the remaining predictors
    df2 = df.dropna(subset=adjusted_predictors)

    y = df2[f'{var}_missing'].to_numpy()
    # Explicit copy: the columns are overwritten below and df2 must stay untouched
    X = df2[adjusted_predictors].copy()

    # Encode categorical predictors as integer codes
    for cat_var in categorical_predictors:
        X[cat_var] = label_encoder.fit_transform(X[cat_var].astype('str'))

    # Min-max normalise each column (a constant column yields NaN here)
    X = (X-X.min())/(X.max()-X.min())

    # Add a constant for the intercept
    X = sm.add_constant(X)

    # VIF of every design-matrix column, including the constant
    vif_data = pd.DataFrame({
        "Variable": X.columns,
        "VIF": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    })

    print(f"VIF of the predictors of missingness in {var}:")
    print(vif_data)


In [None]:
# PREDICTING MISSINGNESS WITH LOGISTIC REGRESSION
# For each variable with missing data, model its missingness indicator from the
# remaining predictors and report coefficients, p-values and hold-out metrics.
for var in missing_vars:
    # Every predictor except the variable whose missingness is modelled
    adjusted_predictors = [val for val in predictors if val!=var]
    categorical_predictors = [val for val in adjusted_predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

    # Complete cases with respect to the remaining predictors
    df2 = df.dropna(subset=adjusted_predictors)

    y = df2[f'{var}_missing'].to_numpy()
    # Explicit copy: the columns are overwritten below and df2 must stay untouched
    X = df2[adjusted_predictors].copy()

    # Encode categorical predictors as integer codes, one encoder per column.
    # The encoder must be looked up by column name: calling fit_transform on
    # the dict itself raises AttributeError.
    label_encoders = {}
    for cat_var in categorical_predictors:
        label_encoders[cat_var] = LabelEncoder()
        X[cat_var] = label_encoders[cat_var].fit_transform(X[cat_var].astype('str'))

    # Min-max normalise each column
    X = (X-X.min())/(X.max()-X.min())

    # Add a constant for the intercept
    X = sm.add_constant(X)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the logistic regression model
    logit_model = sm.Logit(y_train, X_train)
    result = logit_model.fit()

    # Extract coefficients and p-values
    summary_table = pd.DataFrame({
        "Predictor": X.columns,
        "Coefficient": result.params.values,
        "P-value": result.pvalues.values
    })

    # Format table for better readability
    summary_table['P-value'] = summary_table['P-value'].apply(lambda p: f"{p:.3f}")
    summary_table['Coefficient'] = summary_table['Coefficient'].apply(lambda c: f"{c:.3f}")

    print(f"Missingness Analysis for {var}:")
    print(summary_table)

    # Predict probabilities on the test set
    y_pred_prob = result.predict(X_test)
    # Convert probabilities to binary predictions using a 0.5 threshold
    y_pred = (y_pred_prob >= 0.5).astype(int)

    print(classification_report(y_test, y_pred))
    print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred, adjusted=False):.2f}")
    print(f"Balanced Accuracy adjusted for chance: {balanced_accuracy_score(y_test, y_pred, adjusted=True):.2f}")

In [None]:
# PREDICTING MISSINGNESS WITH TREES
# Same set-up as the logistic-regression cell, but with tree-based classifiers.

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

for var in missing_vars:
    # Every predictor except the variable whose missingness is modelled
    adjusted_predictors = [val for val in predictors if val!=var]
    categorical_predictors = [val for val in adjusted_predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

    # Complete cases with respect to the remaining predictors
    df2 = df.dropna(subset=adjusted_predictors)

    y = df2[f'{var}_missing'].to_numpy()
    # Explicit copy: assigning encoded columns to a slice of df2 triggers
    # SettingWithCopyWarning
    X = df2[adjusted_predictors].copy()

    # Encode categorical predictors as integer codes (encoder is re-fit per column)
    label_encoder = LabelEncoder()
    for cat_var in categorical_predictors:
        X[cat_var] = label_encoder.fit_transform(X[cat_var].astype('str'))

    # Min-max normalise each column
    X = (X-X.min())/(X.max()-X.min())

    # Constant column kept so the design matrix matches the logistic-regression cell
    X = sm.add_constant(X)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train and evaluate tree-based models
    models = {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
    }

    print(f"Missingness Analysis for {var}:\n")
    for model_name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Print performance metrics
        print(f"Model: {model_name}")
        print(classification_report(y_test, y_pred, zero_division=0))
        print(f"Balanced Accuracy: {balanced_accuracy_score(y_test, y_pred, adjusted=False):.2f}")
        print(f"Balanced Accuracy adjusted for chance: {balanced_accuracy_score(y_test, y_pred, adjusted=True):.2f}")
        # Generate the confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print("Confusion Matrix:\n", cm)

## Imputation Experimentation with Artificial Missingness

In [None]:


def calculate_accuracy(true_values, predicted_values, missing_values):
    """
    Score imputed values at the positions that were missing before imputation.

    Parameters:
        true_values (array-like): Ground truth values.
        predicted_values (array-like): Values after imputation.
        missing_values (array-like): The data before imputation; positions
            holding NaN are the ones that get scored.

    Returns:
        list: [accuracy, chance-adjusted balanced accuracy], both computed
        only over the positions where `missing_values` is NaN.
    """
    was_masked = pd.isna(missing_values)  # True where a value had to be imputed
    truth = true_values[was_masked]
    imputed = predicted_values[was_masked]
    plain_accuracy = accuracy_score(truth, imputed)
    adjusted_balanced_accuracy = balanced_accuracy_score(truth, imputed, adjusted=True)
    return [plain_accuracy, adjusted_balanced_accuracy]

In [None]:
# Wider predictor set that also contains the diagnosis flags. Kept under its
# own name for reference; it used to be assigned to `predictors` and then
# immediately overwritten, so it never took effect.
predictors_extended = ['sex', 'ageatfirstimaging', 'yearatfirstimaging', 'preferredlanguage',
                       'smokingstatus', 'primaryinsurance', 'socialsupport', 'raceethnicity',
                       'negativepsychstate', 'obesity', 'lbpduration', 'sciatica', 'facetjointarthropathy',
                       'scoliosis', 'discpathology', 'spinalstenosis', 'sacroiliacjoint', 'diabetes']

# Predictors actually used by the imputation experiments below
predictors = ['sex', 'ageatfirstimaging', 'yearatfirstimaging', 'preferredlanguage',
              'smokingstatus', 'primaryinsurance', 'socialsupport', 'raceethnicity']

### MissForest Imputation

In [None]:
# Complete cases only: these serve as ground truth for the experiment
df2 = df.dropna(subset=predictors)

# Everything except the two numeric columns is treated as categorical
categorical_predictors = [val for val in predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

# Ground truth (X) and a copy that receives artificial missingness (X_missing).
# dict.fromkeys de-duplicates while keeping a deterministic column order
# (a set would reorder the columns from run to run).
X = df2[list(dict.fromkeys(predictors + missing_vars))]
X_missing = X.copy()

# Mask 10% of the rows of each variable. The generator is seeded so the
# experiment is reproducible, and rows are drawn without replacement so that
# exactly n distinct rows are masked.
rng = np.random.default_rng(42)
for var in missing_vars:
    n = int(len(X_missing) * 0.1)
    rand_idx = rng.choice(X_missing.index, size=n, replace=False)
    X_missing.loc[rand_idx, var] = np.nan

# Identical random_state => both frames are split into the same rows
train_gt, test_gt = train_test_split(X, test_size=.3, shuffle=True,
                               random_state=42)
train_missing, test_missing = train_test_split(X_missing, test_size=.3, shuffle=True,
                               random_state=42)

# Default estimators are lgbm classifier and regressor (gradient boosting framework based on decision tree algorithms)
mf = MissForest()
mf.fit_transform(
    x=train_missing,
    categorical=categorical_predictors
)
train_imputed = mf.transform(x=train_missing)
test_imputed = mf.transform(x=test_missing)

# Accuracy / chance-adjusted balanced accuracy on the artificially masked entries
for var in missing_vars:
    acc = calculate_accuracy(train_gt[var].values, train_imputed[var].values, train_missing[var].values)
    print(f"Train Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")
    acc = calculate_accuracy(test_gt[var].values, test_imputed[var].values, test_missing[var].values)
    print(f"Test Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")

In [None]:
# Compare distributions of observed vs. imputed data.
# One column of panels per variable; top row = training split, bottom row = test split.
n_vars = len(missing_vars)
fig, axes = plt.subplots(2, n_vars, figsize=(4 * n_vars, 6), squeeze=False)

splits = [
    ("Train", train_missing, train_gt, train_imputed),
    ("Test", test_missing, test_gt, test_imputed),
]

for idx, var in enumerate(missing_vars):
    for row_idx, (split_name, split_missing, split_gt, split_imputed) in enumerate(splits):
        ax = axes[row_idx, idx]

        # Integer-code the categories so a density can be drawn.
        # Each series is encoded independently, as before.
        label_encoder = LabelEncoder()
        x_missing = label_encoder.fit_transform(split_missing[var].dropna().astype('str'))
        x_gt      = label_encoder.fit_transform(split_gt[var].astype('str'))
        x_imputed = label_encoder.fit_transform(split_imputed[var].astype('str'))

        # The green curve is the non-missing data of *this* split; it used to
        # be labelled "Training Data" on the test row as well.
        sns.kdeplot(x_missing, label="Observed Data", color='green', ax=ax)
        sns.kdeplot(x_imputed, label="Imputed Data", color='orange', ax=ax)

        # One tick per category code of the ground truth
        ax.set_xticks(np.arange(x_gt.min(), x_gt.max()+1, 1))
        ax.set_title(split_name)
        if row_idx == 1:
            ax.set_xlabel(var)

axes[-1, -1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

### KNNImputer Imputation

In [None]:
# Complete cases only: these serve as ground truth for the experiment.
# Explicit copy because the categorical columns are overwritten below.
df2 = df.dropna(subset=predictors).copy()

# Everything except the two numeric columns is treated as categorical
categorical_predictors = [val for val in predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

# KNNImputer needs numeric input: encode categories as integer codes
label_encoder = LabelEncoder()
for cat_var in categorical_predictors:
    df2[cat_var] = label_encoder.fit_transform(df2[cat_var].astype('str'))

# Ground truth (X) and a copy that receives artificial missingness (X_missing).
# dict.fromkeys de-duplicates while keeping a deterministic column order.
X = df2[list(dict.fromkeys(predictors + missing_vars))]
X_missing = X.copy()

# Mask 10% of the rows of each variable (seeded, without replacement, so that
# exactly n distinct rows are masked and the run is reproducible)
rng = np.random.default_rng(42)
for var in missing_vars:
    n = int(len(X_missing) * 0.1)
    rand_idx = rng.choice(X_missing.index, size=n, replace=False)
    X_missing.loc[rand_idx, var] = np.nan

# Identical random_state => both frames are split into the same rows
train_gt, test_gt = train_test_split(X, test_size=.3, shuffle=True,
                               random_state=42)
train_missing, test_missing = train_test_split(X_missing, test_size=.3, shuffle=True,
                               random_state=42)

# k-nearest-neighbours imputer, fitted on the training split only
mf = KNNImputer(n_neighbors=3)
mf.fit(X=train_missing)
train_imputed = mf.transform(X=train_missing)
train_imputed = pd.DataFrame(train_imputed, columns=X.columns)
test_imputed = mf.transform(X=test_missing)
test_imputed = pd.DataFrame(test_imputed, columns=X.columns)

# Imputed values are neighbour averages: round to the nearest category code
# (plain astype(int) would truncate, e.g. 1.67 -> 1)
for cat_var in categorical_predictors:
    train_imputed[cat_var] = train_imputed[cat_var].round().astype(int)
    test_imputed[cat_var] = test_imputed[cat_var].round().astype(int)

# Accuracy / chance-adjusted balanced accuracy on the artificially masked entries
for var in missing_vars:
    acc = calculate_accuracy(train_gt[var].values, train_imputed[var].values, train_missing[var].values)
    print(f"Train Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")
    acc = calculate_accuracy(test_gt[var].values, test_imputed[var].values, test_missing[var].values)
    print(f"Test Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")

In [None]:
# Compare distributions of observed vs. imputed data.
# One column of panels per variable; top row = training split, bottom row = test split.
n_vars = len(missing_vars)
fig, axes = plt.subplots(2, n_vars, figsize=(4 * n_vars, 6), squeeze=False)

splits = [
    ("Train", train_missing, train_gt, train_imputed),
    ("Test", test_missing, test_gt, test_imputed),
]

for idx, var in enumerate(missing_vars):
    for row_idx, (split_name, split_missing, split_gt, split_imputed) in enumerate(splits):
        ax = axes[row_idx, idx]

        # Integer-code the categories so a density can be drawn.
        # Each series is encoded independently, as before.
        label_encoder = LabelEncoder()
        x_missing = label_encoder.fit_transform(split_missing[var].dropna().astype('str'))
        x_gt      = label_encoder.fit_transform(split_gt[var].astype('str'))
        x_imputed = label_encoder.fit_transform(split_imputed[var].astype('str'))

        # The green curve is the non-missing data of *this* split; it used to
        # be labelled "Training Data" on the test row as well.
        sns.kdeplot(x_missing, label="Observed Data", color='green', ax=ax)
        sns.kdeplot(x_imputed, label="Imputed Data", color='orange', ax=ax)

        # One tick per category code of the ground truth
        ax.set_xticks(np.arange(x_gt.min(), x_gt.max()+1, 1))
        ax.set_title(split_name)
        if row_idx == 1:
            ax.set_xlabel(var)

axes[-1, -1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

### Iterative Imputer Imputation

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer


# Complete cases only: these serve as ground truth for the experiment.
# Explicit copy because the categorical columns are overwritten below.
df2 = df.dropna(subset=predictors).copy()

# Everything except the two numeric columns is treated as categorical
categorical_predictors = [val for val in predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

# IterativeImputer needs numeric input: encode categories as integer codes.
# The encoding must be applied to df2 *before* X is built from it; the
# previous version encoded the stale X left over from an earlier cell and then
# replaced it with the unencoded columns of df2.
label_encoder = LabelEncoder()
for cat_var in categorical_predictors:
    df2[cat_var] = label_encoder.fit_transform(df2[cat_var].astype('str'))

# Ground truth (X) and a copy that receives artificial missingness (X_missing).
# dict.fromkeys de-duplicates while keeping a deterministic column order.
X = df2[list(dict.fromkeys(predictors + missing_vars))]
X_missing = X.copy()

# Mask 10% of the rows of each variable (seeded, without replacement, so that
# exactly n distinct rows are masked and the run is reproducible)
rng = np.random.default_rng(42)
for var in missing_vars:
    n = int(len(X_missing) * 0.1)
    rand_idx = rng.choice(X_missing.index, size=n, replace=False)
    X_missing.loc[rand_idx, var] = np.nan

# Identical random_state => both frames are split into the same rows
train_gt, test_gt = train_test_split(X, test_size=.3, shuffle=True,
                               random_state=42)
train_missing, test_missing = train_test_split(X_missing, test_size=.3, shuffle=True,
                               random_state=42)

# Iterative (round-robin regression) imputer, fitted on the training split only
mf = IterativeImputer(random_state=42)
mf.fit(X=train_missing)
train_imputed = mf.transform(X=train_missing)
train_imputed = pd.DataFrame(train_imputed, columns=X.columns)
test_imputed = mf.transform(X=test_missing)
test_imputed = pd.DataFrame(test_imputed, columns=X.columns)

# Imputed values are continuous: round to the nearest category code
# (plain astype(int) would truncate)
for cat_var in categorical_predictors:
    train_imputed[cat_var] = train_imputed[cat_var].round().astype(int)
    test_imputed[cat_var] = test_imputed[cat_var].round().astype(int)

# Accuracy / chance-adjusted balanced accuracy on the artificially masked entries
for var in missing_vars:
    acc = calculate_accuracy(train_gt[var].values, train_imputed[var].values, train_missing[var].values)
    print(f"Train Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")
    acc = calculate_accuracy(test_gt[var].values, test_imputed[var].values, test_missing[var].values)
    print(f"Test Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")

### Miceforest Imputation

In [None]:
import miceforest as micef


# Complete cases only: these serve as ground truth for the experiment.
# Explicit copy because the categorical columns are overwritten below.
df2 = df.dropna(subset=predictors).copy()

# Everything except the two numeric columns is treated as categorical
categorical_predictors = [val for val in predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

# Encode categories as integer codes
label_encoder = LabelEncoder()
for cat_var in categorical_predictors:
    df2[cat_var] = label_encoder.fit_transform(df2[cat_var].astype('str'))

# Ground truth (X) and a copy that receives artificial missingness (X_missing).
# dict.fromkeys de-duplicates while keeping a deterministic column order.
X = df2[list(dict.fromkeys(predictors + missing_vars))]
X_missing = X.copy()

# Mask 10% of the rows of each variable (seeded, without replacement, so that
# exactly n distinct rows are masked and the run is reproducible)
rng = np.random.default_rng(42)
for var in missing_vars:
    n = int(len(X_missing) * 0.1)
    rand_idx = rng.choice(X_missing.index, size=n, replace=False)
    X_missing.loc[rand_idx, var] = np.nan

# Identical random_state => both frames are split into the same rows
train_gt, test_gt = train_test_split(X, test_size=.3, shuffle=True,
                               random_state=42)
train_missing, test_missing = train_test_split(X_missing, test_size=.3, shuffle=True,
                               random_state=42)

# Create ImputationKernel on the training split
kernel = micef.ImputationKernel(
    train_missing.reset_index(drop=True),  # positional index; rows stay in split order
    save_all_iterations_data=True,
    random_state=42
)

# Perform MICE imputation (2 iterations)
kernel.mice(2)

train_imputed = kernel.complete_data()
train_imputed = pd.DataFrame(train_imputed, columns=X.columns)
# Apply the fitted kernel to the held-out split
test_imputed = kernel.impute_new_data(test_missing.reset_index(drop=True))
test_imputed = test_imputed.complete_data()

# Cast the category codes back to integers
for cat_var in categorical_predictors:
    train_imputed[cat_var] = train_imputed[cat_var].astype(int)
    test_imputed[cat_var] = test_imputed[cat_var].astype(int)

# Accuracy / chance-adjusted balanced accuracy on the artificially masked entries
for var in missing_vars:
    acc = calculate_accuracy(train_gt[var].values, train_imputed[var].values, train_missing[var].values)
    print(f"Train Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")
    acc = calculate_accuracy(test_gt[var].values, test_imputed[var].values, test_missing[var].values)
    print(f"Test Accuracy for {var}: {acc[0]:.2f}, {acc[1]:.2f}")

In [None]:
# Inspection only: re-runs the test-split imputation to display the returned object.
# NOTE(review): the result is not stored and duplicates the call in the previous cell -- consider removing.
kernel.impute_new_data(test_missing.reset_index(drop=True))

In [None]:


# # Default estimators are lightgbm classifier and regressor (gradient boosting framework based on decision tree algorithms)
# mf = MissForest()
# mf.fit(
#     x=X.dropna(subset=predictors),  # Drop rows with missing predictors,
#     categorical=categorical_predictors
# )
# LGBM_imputed_df = mf.transform(x=X)



# Mean absolute percentage error


## Imputation Exploration - Imputation Effect on Demographic Distributions

In [None]:
# Columns used for the imputation exploration
predictors = ['sex', 'ageatfirstimaging', 'yearatfirstimaging', 'preferredlanguage',
             'smokingstatus', 'primaryinsurance', 'socialsupport', 'raceethnicity']

# Everything except the two numeric columns is categorical
categorical_predictors = [val for val in predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

# Work on a copy so that df keeps its original labels
X = df[predictors].copy()

# Integer-code the observed categories only; NaN entries stay NaN so the
# imputers can find them
label_encoder = LabelEncoder()
for cat_var in categorical_predictors:
    non_nan_mask = X[cat_var].notna()
    observed_labels = X.loc[non_nan_mask, cat_var].astype('str')
    X.loc[non_nan_mask, cat_var] = label_encoder.fit_transform(observed_labels)

X

### KNN

In [None]:
# KNN Imputation
# KNNImputer fills in missing values using the k-Nearest Neighbors algorithm:
# a missing value is estimated by averaging the values of the k nearest
# neighbors in the feature space.
imputer = KNNImputer(n_neighbors=3)
imputed_data = imputer.fit_transform(X)

# Back to a labelled frame (the imputer returns a bare array)
imputed_df = pd.DataFrame(data=imputed_data, columns=X.columns)
imputed_df

### Iterative Impute

In [None]:
# Iterative Imputer
# IterativeImputer models each feature with missing values as a function of
# the other features and estimates the missing values iteratively. It is more
# sophisticated than SimpleImputer and can be useful for complex datasets.
iter_imputer = IterativeImputer(random_state=42)
imputed_data_iter = iter_imputer.fit_transform(X)

# Back to a labelled frame (the imputer returns a bare array)
imputed_df_iter = pd.DataFrame(data=imputed_data_iter, columns=X.columns)
imputed_df_iter

### MissForest

In [None]:
# Add artificial missingness on a COPY of X. Masking X in place made this cell
# non-idempotent (every re-run removed another 10%) and altered the "observed"
# data that later cells read from X.
X_masked = X.copy()
rng = np.random.default_rng(42)  # seeded for reproducibility
for c in X_masked.columns:
    n = int(len(df) * 0.1)
    # Without replacement so that exactly n distinct rows are masked
    rand_idx = rng.choice(X_masked.index, size=n, replace=False)
    X_masked.loc[rand_idx, c] = np.nan

# Default estimators are lightgbm classifier and regressor (gradient boosting framework based on decision tree algorithms)
mf = MissForest()
mf.fit(
    x=X_masked.dropna(subset=predictors),  # fit on rows with all predictors present
    categorical=categorical_predictors
)
LGBM_imputed_df = mf.transform(x=X_masked)


### Sensitivity Analysis

In [None]:
# Compare distributions of observed vs. imputed data across the three imputers
import seaborn as sns
import matplotlib.pyplot as plt

missing_vars = ['smokingstatus', 'primaryinsurance', 'socialsupport', 'raceethnicity']

for idx, var in enumerate(missing_vars):
    plt.subplot(2,3,idx+1)

    # (data, legend label, colour) for each curve, drawn in this order
    curves = (
        (X[var].dropna(), "Observed Data", 'green'),
        (imputed_df[var], "Imputed Data", 'blue'),
        (imputed_df_iter[var], "Iter Imputed Data", 'orange'),
        (LGBM_imputed_df[var], "Light GBM Imputed Data", 'brown'),
    )
    for series, curve_label, curve_color in curves:
        sns.kdeplot(series, label=curve_label, color=curve_color)

    # One tick per category code
    plt.xticks(np.arange(X[var].min(), X[var].max()+1, 1))

# Legend is attached to the last panel and placed outside it
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Demographics Imputation for analysis
KNNImputer is used as the method to fill in the missing values, since it performed best in the experiments above.

In [None]:
# Everything except the two numeric columns is categorical
categorical_predictors = [val for val in predictors if val!='ageatfirstimaging' and val!='yearatfirstimaging']

# Work on a copy so that df keeps its original labels
X = df[predictors].copy()

# One encoder per column, kept so the codes can be decoded after imputation
label_encoders = {cat_var: LabelEncoder() for cat_var in categorical_predictors}
for cat_var, encoder in label_encoders.items():
    # Integer-code the observed categories only; NaN entries stay NaN
    non_nan_mask = X[cat_var].notna()
    observed_labels = X.loc[non_nan_mask, cat_var].astype('str')
    X.loc[non_nan_mask, cat_var] = encoder.fit_transform(observed_labels)

X

In [None]:
# KNN Imputation
# KNNImputer fills in missing values using the k-Nearest Neighbors algorithm:
# a missing value is estimated by averaging the values of the k nearest
# neighbors in the feature space.
imputer = KNNImputer(n_neighbors=3)
imputed_data = imputer.fit_transform(X)

# Back to a labelled frame (the imputer returns a bare array)
imputed_df = pd.DataFrame(data=imputed_data, columns=X.columns)
imputed_df

In [None]:
# Decode the imputed category codes back to their original labels
imputed_X = X.copy()
for cat_var in categorical_predictors:
    # KNN averages neighbour codes, so round to the nearest valid code first.
    # Rows of imputed_df correspond to rows of X by position, hence to_numpy().
    codes = imputed_df[cat_var].round().astype('int').to_numpy()
    # Replace the whole column at once: after imputation no value is missing,
    # so every row is decoded (the former mask compared labels with booleans
    # and was always True), and writing strings into an integer column through
    # .loc is deprecated in pandas.
    imputed_X[cat_var] = label_encoders[cat_var].inverse_transform(codes)

# Re-attach the patient key (aligned on the shared index of df and X)
imputed_X['patientdurablekey'] = df['patientdurablekey']
imputed_X

In [None]:
# Persist the imputed demographics (including insurance) as parquet and CSV
imputed_parquet_path = PlumsFiles.get_datapath('patdurabledim_analysis_imputed_parquet')
PlumsFiles.save_df_to_parquet(imputed_X, imputed_parquet_path)

imputed_csv_path = PlumsFiles.get_datapath('patdurabledim_analysis_imputed_csv')
PlumsFiles.save_df_to_csv(imputed_X, imputed_csv_path)

# Optional sanity check on the saved patient keys
if check_query_flag:
    PlumsExtractor.col_to_list(imputed_X, 'patientdurablekey')

imputed_X.head()