In [None]:
import sys
import os

# Append the parent of the current working directory to sys.path, presumably so
# that the `pdc_functions` package imported below can be resolved.
# NOTE(review): os.getcwd() is the kernel's working directory; it equals the
# notebook's own folder only when the kernel was started there — confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import importlib

import pdc_functions.assumption_utils as assumptions
import pdc_functions.data_utils as data
import pdc_functions.explore_utils as explore
import pdc_functions.pdc_1and2_utils as pdc
import pdc_functions.stats_utils as stats
from dotenv import load_dotenv
import pdc_functions.class_utils as class_utils
import pandas as pd


from phmlondon.snow_utils import SnowflakeConnection


In [None]:
# Re-execute the already-imported `stats` module (pdc_functions.stats_utils) so
# edits to its source are picked up without restarting the kernel.
# Only `stats` is reloaded here; the other pdc_functions modules are not.
importlib.reload(stats)

In [None]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where SnowflakeConnection gets its
# credentials from — confirm.
load_dotenv()

# Open the Snowflake session and select its database and schema.
# The tables read later in this notebook are addressed by fully-qualified
# names under `intelligence_dev.ai_centre_dev`, not under this schema.
snowsesh = SnowflakeConnection()
snowsesh.use_database("INTELLIGENCE_DEV")
snowsesh.use_schema("AI_CENTRE_FEATURE_STORE")

In [None]:
# Fetch the `ldl_all` table through the shared session. `ldl` is later passed
# as the results table to data.attach_closest_results.
ldl = data.get_data_by_cohort(snowsesh, "intelligence_dev.ai_centre_dev.ldl_all")


Clean the ldl column first

In [None]:
# Preview the loaded LDL data. A bare last expression lets Jupyter choose the
# object's rich representation (an HTML table if this is a pandas DataFrame —
# TODO confirm the type) instead of the plain-text repr that print() forces.
ldl

In [None]:
# Fetch the `statin_all` table through the shared session. It is wrapped in a
# MedicationTable in the next cell.
statin = data.get_data_by_cohort(snowsesh, "intelligence_dev.ai_centre_dev.statin_all")

In [None]:
# Wrap the statin data in a MedicationTable; its validation and cleaning
# methods are called in the next cell and the result is read back via `.df`.
drug_table = class_utils.MedicationTable(statin)


In [None]:
# Run the MedicationTable validation / cleaning steps in sequence.
# NOTE(review): the steps look order-dependent (e.g. dates are converted before
# data types are validated) — keep the order unless class_utils says otherwise.
drug_table.validate_columns()  # Validate columns
drug_table.normalise_missing_values()  # Standardise representation of missing data
drug_table.convert_dates()  # Convert date columns
drug_table.validate_data_types()  # Validate data types
drug_table.clean_dose()  # Clean and interpret the 'dose' column
# NOTE(review): presumably derives the days covered by each prescription, which
# the PDC computation in the next cell would rely on — confirm in class_utils.
drug_table.calculate_covered_days()

# Reached only if none of the calls above raised.
# NOTE(review): the message is accurate only if the validators signal failure
# by raising rather than by returning a status — confirm.
print("All validations passed successfully!")

In [None]:
# Compute the overall PDC figures from the cleaned medication frame held on
# `drug_table.df`.
# NOTE(review): PDC presumably stands for "proportion of days covered", and the
# name suggests one row per person and drug — confirm in pdc_1and2_utils.
person_drug_pdc = pdc.compute_pdc_overall(drug_table.df)

In [None]:
# Add demographic data to the PDC table using the Snowflake session.
# This rebinds `person_drug_pdc` to the function's output, so re-running this
# cell on its own feeds the already-augmented table back in; re-run from the
# compute_pdc_overall cell to start from a clean state.
person_drug_pdc = data.add_demographic_data(snowsesh, person_drug_pdc)

In [None]:
# Apply data.general_agg to the PDC table.
# NOTE(review): the aggregation level is not visible from this notebook — see
# data_utils.general_agg.
# This rebinds `person_drug_pdc` again, so re-running this cell on its own
# aggregates already-aggregated data.
person_drug_pdc = data.general_agg(person_drug_pdc)

In [None]:
# Preview the PDC table after demographics and aggregation. A bare last
# expression lets Jupyter use the object's rich representation (an HTML table
# if this is a pandas DataFrame — TODO confirm) rather than print()'s plain text.
person_drug_pdc

Join the PDC table to the LDL results table

In [None]:
# Attach LDL results to each row of the PDC table.
# NOTE(review): going by the function name and the before_result_* /
# after_result_* columns referenced later, this presumably picks the results
# closest to the medication period — confirm in data_utils.
data_joined = data.attach_closest_results(person_drug_pdc,ldl)

In [None]:
# Preview the joined PDC + LDL data. A bare last expression lets Jupyter use
# the object's rich representation (an HTML table if this is a pandas
# DataFrame — TODO confirm) rather than print()'s plain text.
data_joined


In [None]:
# Alternative join using data.attach_closest_results_2 on the same inputs.
# NOTE(review): `data_joined_2` is not used by any later cell visible in this
# notebook — the analysis continues from `data_joined`. Confirm whether this
# is a leftover comparison and which variant is the intended one.
data_joined_2 =  data.attach_closest_results_2(person_drug_pdc,ldl)

Clean the joined data:
- remove people without at least a year of medication
- remove people with missing bloods - need at least 2 results before and 2 after to compute the averages


In [None]:
# Apply the cohort exclusions to the joined data.
# NOTE(review): the criteria live in data_utils.cohort_exclusions; verify they
# match the rules listed in the markdown cell above.
clean_data = data.cohort_exclusions(data_joined)

In [None]:
# Preview the cohort after exclusions. A bare last expression lets Jupyter use
# the object's rich representation (an HTML table if this is a pandas
# DataFrame — TODO confirm) rather than print()'s plain text.
clean_data

Average the results before/after

In [None]:
# Average the before / after results for the cleaned cohort.
# NOTE(review): presumably this is where the before_avg, after_avg and
# result_diff columns referenced later are produced — confirm in data_utils.
analysis_data = data.avg_results(clean_data)

# Show the result as the cell's last expression so Jupyter can use its rich
# representation (an HTML table if this is a pandas DataFrame — TODO confirm)
# rather than the plain-text repr that print() forces.
analysis_data

In [None]:
# Unadjusted linear regression: `result_diff` modelled on
# `overall_exclusive_pdc` alone, with no covariates (fitted in the next cell).

In [None]:
# Fit the unadjusted model. Going by the adjusted call later in this notebook,
# the arguments are (data, predictor(s), outcome).
# NOTE(review): here the predictor is a single string, there it is a list —
# confirm stats.linear_reg accepts both.
linr_unadj = stats.linear_reg(analysis_data, "overall_exclusive_pdc", "result_diff")

In [None]:
# Print the summary of the unadjusted fit.
# NOTE(review): presumably a statsmodels-style results object — confirm.
print(linr_unadj.summary())

In [None]:
# Columns handed to prepare_data_for_regression as categorical, and the name
# of the outcome column used by the adjusted model below.
categorical_cols = ['gender', 'ethnicity', 'drug_name', 'imd']
outcome_col = 'result_diff'

# Prepare the data
# NOTE(review): what the preparation does (e.g. encoding the categorical
# columns, dropping missing rows) is defined in stats_utils, not visible here.
df_clean = stats.prepare_data_for_regression(analysis_data, categorical_cols, outcome_col)


In [None]:
# List the columns of the prepared frame; the next cell builds the covariate
# list from these names minus an explicit exclusion list.
print(df_clean.columns)

In [None]:
# Columns that must not be used as predictors in the adjusted model: the
# outcome itself plus the explicitly listed columns. 'result_diff' is listed
# literally as well, so it stays excluded even if `outcome_col` is changed.
exclude_cols = [
    outcome_col,
    'person_id',
    'min_start_date',
    'max_start_date',
    'overall_inclusive_pdc',
    'total_covered_days',
    'total_exposed_days',
    'before_result_1',
    'after_result_1',
    'before_result_2',
    'after_result_2',
    'before_result_3',
    'after_result_3',
    'before_avg',
    'after_avg',
    'result_diff',
]

# Every other column of df_clean, in its existing order, becomes a covariate.
covariates = list(filter(lambda name: name not in exclude_cols, df_clean.columns))

# Fit the adjusted model on all covariates and print its summary.
linr_adj = stats.linear_reg(df_clean, covariates, outcome_col)
print(linr_adj.summary())



In [None]:
# Inspect the column dtypes of the prepared frame. This cell sits after the
# one that fits the adjusted model, so it does not gate that fit.
print(df_clean.dtypes)

In [None]:
# Print the adjusted-model summary again. The same summary is already printed
# by the cell that fits `linr_adj`, so this cell is redundant on a full run.
print(linr_adj.summary())