In [None]:
import importlib

import pdc_functions.assumption_utils as assumptions
import pdc_functions.class_utils as class_utils
import pdc_functions.data_utils as data
import pdc_functions.explore_utils as explore
import pdc_functions.pdc_1and2_utils as pdc
import pdc_functions.stats_utils as stats
from dotenv import load_dotenv

from phmlondon.snow_utils import SnowflakeConnection


In [None]:
# Re-execute the assumption helpers module so that edits made to it on disk
# take effect in this kernel without a restart.
importlib.reload(assumptions)

In [None]:
# Session targets, named once so they are easy to find and change.
SNOWFLAKE_DATABASE = "INTELLIGENCE_DEV"
SNOWFLAKE_SCHEMA = "AI_CENTRE_FEATURE_STORE"

# Load settings from a local .env file into the process environment
# (presumably the Snowflake credentials -- confirm against SnowflakeConnection).
load_dotenv()

# Open the session and point it at the working database and schema.
snowsesh = SnowflakeConnection()
snowsesh.use_database(SNOWFLAKE_DATABASE)
snowsesh.use_schema(SNOWFLAKE_SCHEMA)

In [None]:
# Fully qualified name of the cohort table to analyse.
COHORT_TABLE = "intelligence_dev.ai_centre_dev.compliance_cohort_1yr_top7"

# Fetch the cohort through the project data helper, using the open session.
df = data.get_data_by_cohort(snowsesh, COHORT_TABLE)



In [None]:
# Wrap the loaded cohort in the project's MedicationTable helper; the
# validation and cleaning steps that follow are methods on this object.
medication_table = class_utils.MedicationTable(df)

Run validation and cleaning checks on the dataframe using the `MedicationTable` class methods.

In [None]:
# Preparation pipeline for the medication table. Each entry is the name of a
# MedicationTable method; they are invoked one after another, in this order.
preparation_steps = (
    "validate_columns",          # validate columns
    "normalise_missing_values",  # standardise representation of missing data
    "convert_dates",             # convert date columns
    "validate_data_types",       # validate data types
    "clean_dose",                # clean and interpret the 'dose' column
    "calculate_covered_days",    # covered-days calculation (see MedicationTable)
)
for step_name in preparation_steps:
    # Resolve lazily so each method is looked up only when its turn comes.
    getattr(medication_table, step_name)()

# Reached only if none of the steps above raised.
print("All validations passed successfully!")

In [None]:
# Compute the overall PDC figures from the prepared medication dataframe
# (PDC presumably = proportion of days covered -- confirm in pdc_1and2_utils).
pdc_data = pdc.compute_pdc_overall(medication_table.df)


In [None]:
# Preview the head of the overall PDC output. Left as the cell's final
# expression so Jupyter renders it as a table instead of plain printed text.
pdc_data.head()



In [None]:
# Derive the interval PDC figures from the overall PDC output
# (interval definition lives in pdc_1and2_utils -- not visible here).
pdc_data_interval = pdc.compute_pdc_intervals(pdc_data)

In [None]:
# List the columns produced by the interval step, as a quick schema check
# before demographic data is added.
print(pdc_data_interval.columns)

In [None]:
# Attach demographic data to the interval PDC table. The helper receives the
# Snowflake session, so it presumably queries the demographics itself -- confirm.
cohort_table = data.add_demographic_data(snowsesh, pdc_data_interval)


In [None]:
# Aggregate the cohort table (per the helper's name, to person/drug level --
# confirm the grouping keys in data_utils).
cohort_table_agg = data.agg_data_person_drug(cohort_table)

In [None]:
# Build the descriptive "table one" summary of the aggregated cohort. The
# second argument names the medication_compliance column (presumably the
# grouping variable -- confirm in explore_utils).
table_one = explore.get_table_one(
    cohort_table_agg,
    "medication_compliance",
)

# Note on missingness
Missingness in the pre- and post-period PDC is due to participants who were included but had no orders before or after the compliance status date.

In [None]:
# Flag the rows that have no overall inclusive PDC value, then keep only those.
pdc_is_missing = cohort_table_agg['overall_inclusive_pdc'].isna()
missing_pdc = cohort_table_agg.loc[pdc_is_missing]

# Distribution of total exposed days among those rows (NaN counted as well).
missing_pdc['total_exposed_days'].value_counts(dropna=False)

Bug fixing

In [None]:
# Pairwise correlations between overall inclusive PDC, age at start and IMD.
# NOTE(review): DataFrame.corr needs numeric columns -- confirm 'imd' is stored
# as a number here (it is listed alongside gender/ethnicity in the next cell).
cohort_table_agg[['overall_inclusive_pdc', 'age_at_start', 'imd']].corr()

In [None]:
# Frequency table (missing values included) for each of the selected columns.
columns_to_tabulate = ['gender', 'ethnicity', 'imd', 'drug_class']
for col in columns_to_tabulate:
    print(f"\nColumn: {col}")
    level_counts = cohort_table_agg[col].value_counts(dropna=False)
    print(level_counts)

In [None]:
# Covariates handed to the preparation step; unlike the list used when fitting,
# this one also carries the pre/post exposure-day totals.
covariates_to_clean = [
    'gender',
    'ethnicity',
    'imd',
    'drug_class',
    'age_at_start',
    'total_pre_exposure_days',
    'total_post_exposure_days',
]

# The six PDC predictors: each window (overall, pre, post) in its inclusive
# and then exclusive variant, generated in that order.
pdc_predictors = [
    f"{window}_{variant}_pdc"
    for window in ('overall', 'pre', 'post')
    for variant in ('inclusive', 'exclusive')
]

# Build the regression-ready frame from the aggregated cohort.
analysis_df = stats.prep_for_regression(
    cohort_table_agg,
    outcome_col='medication_compliance',
    predictor_col=pdc_predictors,
    covariates=covariates_to_clean,
    cluster_col='person_id',
)

Running multiple models. Each model is run multiple times, once for each of the PDC measures.

In [None]:
# Covariates included in every fitted model.
covariates_to_fit = [
    'gender',
    'ethnicity',
    'imd',
    'drug_class',
    'age_at_start',
]

# PDC columns to fit against, one model set per column: each window
# (overall, pre, post) in its inclusive and then exclusive variant.
pdc_columns_to_fit = [
    f"{window}_{variant}_pdc"
    for window in ('overall', 'pre', 'post')
    for variant in ('inclusive', 'exclusive')
]

# Fit (and, per the helper's name, save) the models for every PDC column.
models = stats.fit_and_save_models_for_pdc(
    analysis_df,
    outcome_col='outcome_binary',
    pdc_cols=pdc_columns_to_fit,
    covariates=covariates_to_fit,
    cluster_col='person_id',
)

In [None]:
# Pick the multilevel adjusted model fitted on the pre-period exclusive PDC.
multiadj_model = models['multilevel_adjusted_pre_exclusive_pdc']

# Compare the prepared frame's size with the rows the model actually kept.
print(f"Original df shape: {analysis_df.shape}")
n_rows_fitted = len(multiadj_model.model.data.row_labels)
print(f"Rows used in model fitting: {n_rows_fitted}")
print(f"Difference (dropped rows): {analysis_df.shape[0] - n_rows_fitted}")

# Check model assumptions

In [None]:

# 1. Linearity of the log-odds, checked for the continuous predictors.
# NOTE(review): this runs on cohort_table_agg / 'medication_compliance', whereas
# the models were fitted on analysis_df / 'outcome_binary' -- confirm the check
# is meant to use the unprepared frame.
assumptions.check_linearity_log_odds(
    cohort_table_agg,
    outcome_col='medication_compliance',
    continuous_predictors=['pre_exclusive_pdc', 'age_at_start'],
)

In [None]:
# 2. Multicollinearity: variance inflation check across the numeric predictors.
vif_predictors = ['pre_exclusive_pdc', 'age_at_start', 'total_exposed_days']
assumptions.check_vif(
    cohort_table_agg,
    predictors=vif_predictors,
)

In [None]:
# 3. Within-cluster variation of the outcome, with person_id as the cluster.
assumptions.check_within_cluster_variation(
    cohort_table_agg,
    cluster_col='person_id',
    outcome_col='medication_compliance',
)

In [None]:
# 4. ICC (intraclass correlation) of the outcome across person_id clusters.
assumptions.estimate_icc(
    cohort_table_agg,
    outcome_col='medication_compliance',
    cluster_col='person_id',
)

In [None]:
# Preview the head of the regression-ready frame. Left as the cell's final
# expression so Jupyter renders it as a table instead of plain printed text.
analysis_df.head()

In [None]:
# 5. Residuals vs fitted values for the selected multilevel model.
# Use the frame stored on the fitted model (presumably the rows the fit
# actually used, which may be fewer than analysis_df -- confirm).
fitted_data = multiadj_model.model.data.frame

assumptions.plot_residuals_vs_fitted(
    multiadj_model,
    fitted_data,
    outcome_col='outcome_binary',
)