In [None]:
# Standard library
import importlib

# Third-party
from dotenv import load_dotenv

# pdc_functions / phmlondon packages
import pdc_functions.assumption_utils as assumptions
import pdc_functions.class_utils as class_utils
import pdc_functions.clean_utils as clean
import pdc_functions.data_utils as data
import pdc_functions.explore_utils as explore
import pdc_functions.pdc_1and2_utils as pdc
import pdc_functions.stats_utils as stats
from phmlondon.snow_utils import SnowflakeConnection

# Development aid: re-import class_utils so edits made to that module are
# picked up without restarting the kernel. Kept after all imports so that no
# import statement follows executable code. Only class_utils is reloaded.
importlib.reload(class_utils)


In [None]:
# Load variables from a .env file into the environment before connecting
# (presumably this is where the Snowflake connection settings live).
load_dotenv()

# Database and schema the session is pointed at
DATABASE_NAME = "INTELLIGENCE_DEV"
SCHEMA_NAME = "AI_CENTRE_FEATURE_STORE"

snowsesh = SnowflakeConnection()
snowsesh.use_database(DATABASE_NAME)
snowsesh.use_schema(SCHEMA_NAME)

In [None]:
# Cohort table to analyse, given as database.schema.table
COHORT_TABLE = "intelligence_dev.ai_centre_dev.compliance_cohort_1yr_top7"

# Fetch the cohort through the open Snowflake session
df = data.get_data_by_cohort(snowsesh, COHORT_TABLE)



In [None]:
# Wrap the loaded cohort data in the project's MedicationTable class so its
# validation/normalisation methods can be run on it in the next cell.
medication_table = class_utils.MedicationTable(df)

Run validation checks on the dataframe using the `MedicationTable` class methods.

In [None]:
# Run the MedicationTable checks in a fixed order:
#   1. validate columns
#   2. standardise the representation of missing data
#   3. convert date columns
#   4. validate data types
# NOTE(review): the success message below is only meaningful if these methods
# raise on failure — confirm in class_utils.
validation_steps = (
    medication_table.validate_columns,
    medication_table.normalise_missing_values,
    medication_table.convert_dates,
    medication_table.validate_data_types,
)
for run_step in validation_steps:
    run_step()

print("All validations passed successfully!")

In [None]:
# Clean the dose information of the validated medication table, then attach
# the covered-days values computed from the cleaned frame as a new column.
clean_df = clean.clean_dose(medication_table)
clean_df['covered_days'] = clean.covered_days(clean_df)

# Preview the first rows. Ending the cell on the bare expression uses the
# notebook's rich table display rather than the plain-text print() output.
clean_df.head()

In [None]:
# Compute PDC (presumably "proportion of days covered") from the cleaned data.
# NOTE(review): the helper name suggests overlapping supplies are accounted
# for — confirm the exact rule in pdc_1and2_utils.
pdc_data = pdc.compute_pdc_with_overlap(clean_df)


In [None]:
# Preview the PDC results; the bare expression gives the notebook's rich
# table display instead of plain-text print() output.
pdc_data.head()



In [None]:
# Add demographic data to the PDC results via the Snowflake session.
# NOTE(review): which demographic columns are added is defined in data_utils.
cohort_table = data.add_demographic_data(snowsesh, pdc_data)


In [None]:
# Aggregate the cohort table; per the helper's name this is presumably to one
# row per person and drug — TODO confirm in data_utils.
cohort_table_agg = data.agg_data_person_drug(cohort_table)

In [None]:
# Build the "table one" summary from the aggregated cohort, passing the
# medication_compliance column name to the helper.
table_one = explore.get_table_one(
    cohort_table_agg,
    "medication_compliance",
)

In [None]:
# Unadjusted logistic regression on the aggregated cohort.
# NOTE(review): outcome and predictor columns are chosen inside stats_utils.
stats.unadjusted_logr(cohort_table_agg)


In [None]:
# Covariates passed to the adjusted logistic regression
covariates = [
    'gender',
    'ethnicity',
    'imd',
    'drug_class',
    'age_at_start',
    'total_exposed_days',
]
stats.adjusted_logr(cohort_table_agg, covariates=covariates)


# Multilevel logistic regression

In [None]:
# Unadjusted multilevel logistic regression on the aggregated cohort.
# NOTE(review): the grouping/cluster variable is chosen inside stats_utils.
stats.multilevel_unadjusted_logr(cohort_table_agg)

In [None]:
# Same covariate set as the single-level adjusted model
covariates = [
    'gender',
    'ethnicity',
    'imd',
    'drug_class',
    'age_at_start',
    'total_exposed_days',
]

# Keep both return values: the first is passed to the residuals plot later in
# the notebook, the second is displayed in the next cell.
multiadj_model, multiadj_or = stats.multilevel_adjusted_logr(
    cohort_table_agg,
    covariates=covariates,
)

In [None]:
# Display the second value returned by multilevel_adjusted_logr
# (presumably the odds ratios of the adjusted multilevel model — TODO confirm).
multiadj_or

# Check model assumptions

In [None]:

# 1. Linearity of the log-odds for the continuous predictors
assumptions.check_linearity_log_odds(
    cohort_table_agg,
    outcome_col='medication_compliance',
    continuous_predictors=['dynamic_pdc', 'age_at_start'],
)

In [None]:
# 2. Multicollinearity (VIF) across the numeric predictors
assumptions.check_vif(
    cohort_table_agg,
    predictors=['dynamic_pdc', 'age_at_start', 'total_exposed_days'],
)

In [None]:
# 3. Within-cluster variation of the outcome, clustering on person_id
assumptions.check_within_cluster_variation(
    cohort_table_agg,
    cluster_col='person_id',
    outcome_col='medication_compliance',
)

In [None]:
# 4. ICC of the outcome, clustering on person_id
assumptions.estimate_icc(
    cohort_table_agg,
    outcome_col='medication_compliance',
    cluster_col='person_id',
)

In [None]:
# 5. Residuals plot
# Numeric coding of the outcome: 'good' -> 0, 'poor' -> 1.
# Any other label becomes NaN (behaviour of Series.map with a dict).
OUTCOME_CODES = {'good': 0, 'poor': 1}

# Drop rows with any missing value, then recode the outcome with .assign() so
# a new frame is built instead of writing a column into the result of
# dropna() (which triggers pandas' SettingWithCopyWarning when rows were
# dropped). cohort_table_agg itself is left untouched.
cohort_table_agg_clean = cohort_table_agg.dropna().assign(
    medication_compliance=lambda frame: frame['medication_compliance'].map(OUTCOME_CODES)
)

# NOTE(review): multiadj_model was fitted from cohort_table_agg, while this
# frame drops rows with a missing value in *any* column — confirm the rows
# passed here line up with the rows the model was actually fitted on.
assumptions.plot_residuals_vs_fitted(
    multiadj_model,
    cohort_table_agg_clean,
    outcome_col='medication_compliance',
)