In [205]:
import importlib

import pdc_functions.assumption_utils as assumptions
import pdc_functions.data_utils as data
import pdc_functions.explore_utils as explore
import pdc_functions.pdc_1and2_utils as pdc
import pdc_functions.stats_utils as stats
from dotenv import load_dotenv
import pdc_functions.class_utils as class_utils
import pandas as pd


from phmlondon.snow_utils import SnowflakeConnection


In [None]:
# Development aid: re-import the stats helper module so that edits made to it on
# disk are picked up without restarting the kernel. Only `stats` is reloaded here;
# the other pdc_functions modules are not.
importlib.reload(stats) # Use when updating the functions

In [None]:
# Snowflake connection settings must be available in the environment;
# load_dotenv() reads them from a local .env file when one is present.
load_dotenv()

# Working database and schema for this analysis session.
SNOWFLAKE_DATABASE = "INTELLIGENCE_DEV"
SNOWFLAKE_SCHEMA = "AI_CENTRE_FEATURE_STORE"

# Open the session, then point it at the database and schema above.
snowsesh = SnowflakeConnection()
snowsesh.use_database(SNOWFLAKE_DATABASE)
snowsesh.use_schema(SNOWFLAKE_SCHEMA)

In [None]:
# Select every row from the pre-made Snowflake LDL table (the SQL that generates
# the cohort tables is kept in the sql folder).
# ldl_all holds all LDL measurements, defined using the OpenCodelists LDL
# measurements definition ID.
LDL_TABLE = "intelligence_dev.ai_centre_dev.ldl_all"
ldl = data.get_data_by_cohort(snowsesh, LDL_TABLE)


**TODO:** Clean the LDL column first (discussed with Dan; awaiting pipeline updates).

In [None]:
# Preview the LDL table: report its size, then show only the first rows using the
# notebook's rich display rather than printing the frame as plain text.
print(ldl.shape)
ldl.head()

In [None]:
# Select every row from the pre-made Snowflake statin table (the SQL that generates
# the cohort tables is kept in the sql folder).
# statin_all holds all orders of a lipid-regulating drug, as per the BNF class definition.
STATIN_TABLE = "intelligence_dev.ai_centre_dev.statin_all"
statin = data.get_data_by_cohort(snowsesh, STATIN_TABLE)

In [None]:
# Number of distinct people in the statin orders and in the LDL measurements
# (printed in that order).
for cohort_df in (statin, ldl):
    print(cohort_df['person_id'].nunique())

In [None]:
# Wrap the statin orders in class_utils.MedicationTable, which is used to ensure
# all the order information is present in the table and in the correct format
# (the validation/cleaning methods themselves are called in the next cell).
drug_table = class_utils.MedicationTable(statin)


In [None]:
# Run the MedicationTable preparation methods in a fixed order. Each entry is the
# name of a method on drug_table; the trailing comment says what it is for.
preparation_steps = (
    "validate_columns",          # Validate columns
    "normalise_missing_values",  # Standardise representation of missing data
    "convert_dates",             # Convert date columns
    "validate_data_types",       # Validate data types
    "clean_dose",                # Clean and interpret the 'dose' column
    "calculate_covered_days",
)
for step_name in preparation_steps:
    # Look the method up only when its turn comes, then call it with no arguments.
    getattr(drug_table, step_name)()

print("All validations passed successfully!")

In [None]:
# Compute the overall PDC: calculates the inclusive and exclusive PDCs per person
# per drug over the time frame available.
# The values are added to the one-row-per-order table, so the PDCs will be the same
# on every row for the same person/drug combination.
person_drug_pdc = pdc.compute_pdc_overall(drug_table.df)

Running checks: the overall PDC calculation works.

Note: any negative duration days are ignored when calculating `covered_days`.

In [None]:
# Sanity check: list any rows whose total_covered_days is negative.
has_negative_cover = person_drug_pdc['total_covered_days'] < 0
print(person_drug_pdc[has_negative_cover])


In [None]:
# Adds demographic data (updated to use the new IMD source).
# NOTE(review): person_drug_pdc is overwritten with the result, so re-running this
# cell passes the already-augmented frame back in. Confirm add_demographic_data
# tolerates that, or re-run from the cell that computes the PDC first.

person_drug_pdc = data.add_demographic_data(snowsesh, person_drug_pdc)

In [None]:
# Aggregates the table into one row per person per drug, including columns such as
# total_covered_days.
# NOTE(review): person_drug_pdc is overwritten again here, so this cell is not safe
# to re-run on its own; the order-level frame is no longer available afterwards.
person_drug_pdc = data.general_agg(person_drug_pdc)

In [None]:
# Preview the aggregated person/drug table: report its size, then show only the
# first rows using the notebook's rich display rather than a plain-text print.
print(person_drug_pdc.shape)
person_drug_pdc.head()

Join to ldl results table

In [None]:
# Attaches the 3 closest LDL results within +/- one year of the drug start and
# drug end dates for each person/drug combination.
data_joined = data.attach_closest_results(person_drug_pdc,ldl)

In [None]:
# Preview the joined PDC + LDL results table: report its size, then show only the
# first rows using the notebook's rich display rather than a plain-text print.
print(data_joined.shape)
data_joined.head()


In [None]:
# Alternative attachment: selects the 3 results ONLY before the start date and
# ONLY after the end date.
# NOTE(review): data_joined_2 is not used by the later cells visible in this
# notebook; the cohort exclusions are applied to data_joined.
data_joined_2 =  data.attach_closest_results_2(person_drug_pdc,ldl)

Clean:
- remove people without at least a year of medication
- remove people with missing blood results (need at least 2 before and 2 after to compute an average)


TODO: check the results with `data_joined_2` as well.

In [None]:
# Apply the cohort exclusions to the table built with attach_closest_results
# (data_joined). The exclusion rules themselves live in data.cohort_exclusions;
# presumably they implement the criteria listed in the notes above - TODO confirm.
clean_data = data.cohort_exclusions(data_joined)

In [None]:
# Preview the cohort after exclusions: report its size, then show only the first
# rows using the notebook's rich display rather than a plain-text print.
print(clean_data.shape)
clean_data.head()

Average the results before/after

In [None]:
# Adds the averaged before/after results and their difference per person per drug
# (later cells refer to these columns as before_avg, after_avg and result_diff).
analysis_data = data.avg_results(clean_data)

# Report the size, then show only the first rows using the notebook's rich display
# rather than printing the whole frame as plain text.
print(analysis_data.shape)
analysis_data.head()

EXPLORE THE ANALYSIS TABLE

In [None]:
# Summary statistics of the analysis table, as produced by DataFrame.describe().
summary_stats = analysis_data.describe()
print(summary_stats)


In [None]:
# Count the missing values in each column of the analysis table.
missing_mask = analysis_data.isna()
nan_counts = missing_mask.sum()
print(nan_counts)

Errors:
- negative and very large total covered days - sorted
- these led to negative and very large `overall_inclusive_pdc` values - sorted
- maximum LDL result can be >100?! - awaiting Dan's cleaning?
- some people appear to be on medication for 79 years - to do
  
Plan
- look at and ?remove rows where total covered days are negative - DONE (added flag)
- ldl needs cleaning - await dan
- 

In [None]:
# Unadjusted Linear regression



Need to remove the rows with no exclusive PDC (`overall_exclusive_pdc` is missing).

In [None]:
# Unadjusted model: the two columns involved, in the positional order that
# stats.linear_reg receives them (after the data frame).
unadjusted_cols = ["overall_exclusive_pdc", "result_diff"]

# Keep only rows where both of those columns are present.
df_clean_1 = analysis_data.dropna(subset=unadjusted_cols)

# Fit the regression on the complete-case rows.
linr_unadj = stats.linear_reg(df_clean_1, *unadjusted_cols)

In [None]:
# Print the summary of the unadjusted fit (stats.linear_reg called with
# overall_exclusive_pdc and result_diff).
print(linr_unadj.summary())

In [None]:
# Outcome and categorical covariates for the adjusted model.
outcome_col = 'result_diff'
categorical_cols = [
    'gender',
    'ethnicity',
    'drug_name',
    'imd',
]

# Build the regression-ready frame from the analysis table; the preparation logic
# itself lives in stats.prepare_data_for_regression.
df_clean = stats.prepare_data_for_regression(
    analysis_data,
    categorical_cols,
    outcome_col,
)


In [None]:
# List the columns returned by prepare_data_for_regression, to help decide which
# ones to use as covariates.
print(df_clean.columns)

In [None]:
# Columns that must not be used as predictors: the outcome, the person identifier,
# the start-date columns, the other PDC-related columns and the before/after
# result columns. Every remaining column of df_clean is treated as a covariate.
exclude_cols = [
    outcome_col,
    'person_id',
    'min_start_date',
    'max_start_date',
    'overall_inclusive_pdc',
    'total_covered_days',
    'total_exposed_days',
    'before_result_1',
    'after_result_1',
    'before_result_2',
    'after_result_2',
    'before_result_3',
    'after_result_3',
    'before_avg',
    'after_avg',
    'result_diff',
]

# Set copy used only for the membership test; exclude_cols itself stays a list.
excluded_lookup = set(exclude_cols)
covariates = [name for name in df_clean.columns if name not in excluded_lookup]

# Adjusted model: outcome regressed on all remaining columns.
linr_adj = stats.linear_reg(df_clean, covariates, outcome_col)

print(linr_adj.summary())



In [None]:
# Inspect the dtype of each column in the regression input frame.
print(df_clean.dtypes)

In [None]:
# Print the summary of the adjusted fit.
# NOTE(review): the cell that fits linr_adj already prints this same summary.
print(linr_adj.summary())