# Initial investigation of some survival analysis

In [None]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from phmlondon.snow_utils import SnowflakeConnection
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Read in the data
First we have to make a Snowflake connection and join the admissions table onto the feature table / yearly feature table.

In [None]:
# Load variables from a local .env file into the process environment
# (presumably the credentials SnowflakeConnection needs — TODO confirm).
load_dotenv()
# Open the session and select the database/schema that the query in this
# notebook reads from.
snowsesh = SnowflakeConnection()
snowsesh.use_database("INTELLIGENCE_DEV")
snowsesh.use_schema("AI_CENTRE_FEATURE_STORE")


In [None]:
# Build the modelling cohort and pull it into a DataFrame.
#   1. The `pheno_year` CTE finds, per person, the latest observation_year
#      within 2019-2020.
#   2. That year's row of PERSON_PHENOTYPE_BY_YEAR is joined to COHORT_TABLE
#      on person_id.
#   3. Rows are kept when admission_date falls in the 2021 window or is null.
# NOTE(review): BETWEEN is inclusive at both ends, so admission_date =
#   '2022-01-01' is also kept — confirm that is intended.
# NOTE(review): `select *` spans three sources that each expose person_id, so
#   that column comes back more than once — confirm how the resulting
#   DataFrame names the repeats.
# NOTE(review): the result presumably holds one row per matching COHORT_TABLE
#   row, i.e. possibly several per person — verify against the table.
cohort_query = """
with pheno_year as

-- Pull out the most recent year of observations for our cohort
(select distinct 
person_id,
max(observation_year) observation_year
from INTELLIGENCE_DEV.AI_CENTRE_FEATURE_STORE.PERSON_PHENOTYPE_BY_YEAR
where observation_year between 2019 and 2020
group by person_id)

select * from 
INTELLIGENCE_DEV.AI_CENTRE_FEATURE_STORE.PERSON_PHENOTYPE_BY_YEAR pheno
inner join pheno_year on pheno_year.person_id = pheno.person_id and pheno_year.observation_year = pheno.observation_year
join INTELLIGENCE_DEV.AI_CENTRE_FEATURE_STORE.COHORT_TABLE cohort on pheno.person_id = cohort.person_id
where cohort.admission_date between '2021-01-01' and '2022-01-01' or cohort.admission_date is null

"""
cohort_table = snowsesh.execute_query_to_df(cohort_query)


In [None]:
# Keep a single row per person: the first row seen for each PERSON_ID, in the
# order the query returned them; every later repeat is discarded.
one_admission = cohort_table.loc[
    ~cohort_table["PERSON_ID"].duplicated(keep="first")
]

In [None]:
# Exploratory check: distinct values of the "admission date is missing" flag
# among the rows selected by ~missing_rows.
# NOTE: `missing_rows` is only assigned in the modelling cell further down,
# so that cell has to be executed before this one.
(
    one_admission["ADMISSION_DATE"]
    .isna()
    .loc[~missing_rows]
    .unique()
)

In [None]:
# Columns fed to the logistic regression as predictors.
modelling_columns = [
    'LONDON_IMD_DECILE',
    'ASTHMA',
    'COPD',
    'DIABETES_TYPE2',
    'DIABETES_TYPE1',
    'HYPERTENSION',
    'CORONARY_HEART_DISEASE',
    'STROKE',
    'CKD_STAGE3',
    'SEVERE_MENTAL_ILLNESS',
    'CANCER',
    'DEMENTIA',
    'ATRIAL_FIBRILLATION',
    'PALLIATIVE_CARE',
    'HEART_FAILURE',
    'PATIENT_AGE_AT_ACTIVITY',
]

#Make dummy cols for modelling - commented out as duplicate columns here
# (note: ETHNIC_AIC_CATEGORY and GENDER are not in modelling_columns, so they
#  would have to be added to the selection before this can be re-enabled)
#one_admission_inputs = pd.get_dummies(one_admission.loc[:, modelling_columns],
#                                      columns=['ETHNIC_AIC_CATEGORY', 'GENDER'],
#                                      drop_first = True)
one_admission_inputs = one_admission.loc[:, modelling_columns]

# Flag rows (people) with a missing value in any predictor; those rows are
# excluded from both the outcome and the design matrix below.
missing_rows = one_admission_inputs.isna().any(axis=1)

# Outcome: 1 when the row has an admission date, 0 when it is null.
was_admitted = one_admission.ADMISSION_DATE.notna()[~missing_rows].astype(int)

# statsmodels' Logit does not fit an intercept on its own; without a constant
# column the model is forced through the origin on the log-odds scale, which
# biases every coefficient. Add the constant explicitly.
design_matrix = sm.add_constant(
    one_admission_inputs[~missing_rows].astype(float)
)

logit_model = sm.Logit(was_admitted, design_matrix)

logit_res = logit_model.fit()

print(logit_res.summary())

In [None]:
# Show the fitted model's summary as this cell's output (the modelling cell
# above already printed the same summary as plain text).
logit_res.summary()