### Import Statements

In [1]:
import os
import pyodbc
import pandas as pd
from IPython.display import display, Markdown
from ebmdatalab import bq
from lib.data_gathering import codes_to_sql_where

### Server connection

In [2]:
server = 'covid.ebmdatalab.net,1433'
database = 'OPENCoronaExport' 
username = 'SA'
password = 'my_secret_password' 
cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cursor = cnxn.cursor()

### Population 

Our study population is everyone in CHESS (dummy) dataset that had a positive covid-19 swab

In [3]:
chess = pd.read_csv('../data/chess.csv')
chess.head()

Unnamed: 0,Patient_ID,symptom_onset,swab_date,lab_test_date,result,admitted_itu,admission_date,died,death_date
0,6041,2020-03-17,2020-03-24,2020-03-24,A/non-subtyped,0,,0,
1,1421431,2020-02-03,2020-02-27,2020-02-27,COVID-19,0,,0,
2,50459,2020-02-29,2020-03-03,2020-03-03,COVID-19,0,,0,
3,1359784,2020-03-06,2020-03-13,2020-03-13,COVID-19,0,,0,
4,1358280,2020-03-10,2020-02-09,2020-02-09,A/unsubtypeable,0,,0,


In [4]:
study_population = chess[chess['result'] == 'COVID-19']

In [5]:
study_population.head()

Unnamed: 0,Patient_ID,symptom_onset,swab_date,lab_test_date,result,admitted_itu,admission_date,died,death_date
1,1421431,2020-02-03,2020-02-27,2020-02-27,COVID-19,0,,0,
2,50459,2020-02-29,2020-03-03,2020-03-03,COVID-19,0,,0,
3,1359784,2020-03-06,2020-03-13,2020-03-13,COVID-19,0,,0,
6,1031144,2020-02-20,2020-03-12,2020-03-12,COVID-19,0,,0,
10,1316105,2020-03-05,2020-03-09,2020-03-09,COVID-19,0,,0,


In [6]:
study_population.to_csv("../data/analysis/study_pop.csv")

### Outcome and Exposure definitions

#### Exposure: CVD disease ever

In this analysis, our exposure is Cardiovascular disease. We define this as a code on a patient's records ever that is for cardiovascular disease such as angina or a prescription of a cardiovascular drugs (from bnf)

##### Read codes

In [7]:
qof_clusters = pd.read_csv('../data/QoFClusteres_CTV3Codes - Sheet1.csv')
chd_codes = qof_clusters.loc[qof_clusters['ClusterId']=='CHD','CTV3Code']
chd_codes.head()

3562    14A3.
3563    14A4.
3564    G300.
3565    G301.
3566    G3010
Name: CTV3Code, dtype: object

##### Medicine

In [8]:
sql = '''
WITH bnf_codes AS (
  SELECT bnf_code FROM hscic.presentation WHERE 
    bnf_code LIKE '02%' #BNF cvd chapter 
)

SELECT "vmp" AS type, id, bnf_code, nm
FROM dmd.vmp
WHERE bnf_code IN (SELECT * FROM bnf_codes)

UNION ALL

SELECT "amp" AS type, id, bnf_code, descr
FROM dmd.amp
WHERE bnf_code IN (SELECT * FROM bnf_codes)

ORDER BY type, bnf_code, id
'''

cvd_medcodes = bq.cached_read(sql, csv_path=os.path.join('..','data','cvd_medcodes.csv'))
cvd_medcodes.head()

Unnamed: 0,type,id,bnf_code,nm
0,amp,4783111000001104,0201010AABBAAAA,Digibind 38mg powder for solution for injectio...
1,amp,20477811000001103,0201010AABCAAAB,DigiFab 40mg powder for solution for infusion ...
2,amp,3726311000001104,0201010D0AAAAAA,Digitoxin 100microgram tablets (A A H Pharmace...
3,amp,3726811000001108,0201010D0AAAAAA,Digitoxin 100microgram tablets (Focus Pharmace...
4,amp,5626411000001106,0201010D0AAAAAA,Digitoxin 100microgram tablets (Alliance Healt...


#### Find patients with coded events where code matches cvd disease

In [9]:
codes_where = codes_to_sql_where("CTV3Code", chd_codes)

query = f'''
SELECT DISTINCT Patient_ID, 1 AS chd_code
FROM CodedEvent
WHERE {codes_where}
ORDER BY Patient_ID
'''

clin_df = pd.read_sql(query, cnxn, index_col='Patient_ID')
clin_df.head()

Unnamed: 0_level_0,chd_code
Patient_ID,Unnamed: 1_level_1
84,1
201,1
228,1
301,1
401,1


#### Find all patients where cvd code on record

In [10]:
codes_where = codes_to_sql_where("DMD_ID", cvd_medcodes['id'])

query = f'''
SELECT 
  med.Patient_ID,
  COUNT(med.Patient_ID) AS cvd_meds

FROM MedicationDictionary AS dict

INNER JOIN MedicationIssue AS med
ON dict.MultilexDrug_ID = med.MultilexDrug_ID

WHERE ({codes_where})

GROUP BY med.Patient_ID

ORDER BY med.Patient_ID
'''

med_df = pd.read_sql(query, cnxn, index_col='Patient_ID')
med_df.head()

Unnamed: 0_level_0,cvd_meds
Patient_ID,Unnamed: 1_level_1
2,1
18,3
28,7
34,44
41,2


In [11]:
clin_df.join(med_df, how='outer').fillna(0).head()

Unnamed: 0_level_0,chd_code,cvd_meds
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.0,1.0
18,0.0,3.0
28,0.0,7.0
34,0.0,44.0
41,0.0,2.0


In [12]:
clin_df.to_csv("../data/analysis/cvd_dis.csv")

#### Outcome: Death

Our outcome of interest is death

1 - death
0 - alive

This is contained within the study pop csv

### Covariates defintions

- Age
- Gender
- Smoking status

In [15]:
df = pd.read_csv('../data/analysis/study_pop.csv')

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,Patient_ID,symptom_onset,swab_date,lab_test_date,result,admitted_itu,admission_date,died,death_date
0,1,1421431,2020-02-03,2020-02-27,2020-02-27,COVID-19,0,,0,
1,2,50459,2020-02-29,2020-03-03,2020-03-03,COVID-19,0,,0,
2,3,1359784,2020-03-06,2020-03-13,2020-03-13,COVID-19,0,,0,
3,6,1031144,2020-02-20,2020-03-12,2020-03-12,COVID-19,0,,0,
4,10,1316105,2020-03-05,2020-03-09,2020-03-09,COVID-19,0,,0,


#### Demographics 

In [17]:
codes_where = codes_to_sql_where("Patient_ID", df['Patient_ID'])

In [18]:
query = f'''
SELECT * FROM Patient
WHERE {codes_where}
ORDER BY Patient_ID
'''

In [19]:
co_df = pd.read_sql(query, cnxn, index_col='Patient_ID')
co_df.head()

Unnamed: 0_level_0,DateOfBirth,DateOfDeath,Sex
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
254,1961-04-01,1961-04-01,F
572,1981-04-01,1981-04-01,M
592,1970-08-01,1970-08-01,M
647,1925-11-01,1925-11-01,F
659,1960-07-01,1960-07-01,F


In [20]:
import datetime 
now = datetime.date.today()
now

datetime.date(2020, 3, 30)

In [21]:
co_df['Age'] = (now - co_df['DateOfBirth']).astype('<m8[Y]')

In [22]:
co_df.head()

Unnamed: 0_level_0,DateOfBirth,DateOfDeath,Sex,Age
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
254,1961-04-01,1961-04-01,F,58.0
572,1981-04-01,1981-04-01,M,38.0
592,1970-08-01,1970-08-01,M,49.0
647,1925-11-01,1925-11-01,F,94.0
659,1960-07-01,1960-07-01,F,59.0


In [23]:
co_df.reset_index(inplace=True)

In [24]:
co_df = co_df[['Patient_ID', 'Sex', 'Age']]

In [25]:
co_df.to_csv('../data/analysis/demo.csv')

#### Smoking status

In [26]:
smoking_stat = pd.read_csv('../data/smoking_codes.csv')

In [27]:
smoking_stat.head()

Unnamed: 0.1,Unnamed: 0,ClusterId,ClusterName,CTV3Code,Description
0,3495,CESS,Smoking cessation codes,6791.,Health education - smoking
1,3496,CESS,Smoking cessation codes,67A3.,Pregnancy smoking advice
2,3497,CESS,Smoking cessation codes,9OO1.,Attends stop smoking monitoring
3,3498,CESS,Smoking cessation codes,9OO2.,Refuses stop smoking monitor
4,3499,CESS,Smoking cessation codes,9OO3.,Stop smoking monitor default


In [28]:
codes_where = codes_to_sql_where("CTV3Code", smoking_stat['CTV3Code'])

In [29]:
query = f'''
SELECT * --DISTINCT Patient_ID
FROM CodedEvent
WHERE ({codes_where}) AND ConsultationDate BETWEEN '2015-01-01' AND '2020-03-31'
'''
smok_df = pd.read_sql(query, cnxn, index_col='Patient_ID')
smok_df.head()

Unnamed: 0_level_0,Consultation_ID,CodedEvent_ID,CTV3Code,NumericValue,ConsultationDate,SnomedConceptId
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2406,2757977433,2199085828,Ua1Nz,0.0,2019-10-01 12:49:20,225323000
2428,2757948122,2198685810,Ua1Nz,0.0,2019-10-01 12:29:08,225323000
2428,2757977430,2199085825,Ua1Nz,0.0,2019-10-01 12:49:20,225323000
2686,2757977415,2199085870,Ua1Nz,0.0,2019-10-01 12:49:20,225323000
2709,2757948107,2198685795,Ua1Nz,0.0,2019-10-01 12:29:08,225323000


In [30]:
smok_df.reset_index(inplace=True)

In [31]:
smok_df['smoking_status'] = 1

In [32]:
smok_df = smok_df[['Patient_ID', 'smoking_status']]

In [33]:
smok_df.head()

Unnamed: 0,Patient_ID,smoking_status
0,2406,1
1,2428,1
2,2428,1
3,2686,1
4,2709,1


In [34]:
smok_df.to_csv('../data/analysis/smok.csv')

### Making final dataset 

In [52]:
study_pop = pd.read_csv('../data/analysis/study_pop.csv')

In [53]:
study_pop = study_pop[['Patient_ID', 'admitted_itu', 'died']]

In [54]:
study_pop.head()

Unnamed: 0,Patient_ID,admitted_itu,died
0,1421431,0,0
1,50459,0,0
2,1359784,0,0
3,1031144,0,0
4,1316105,0,0


##### Add in exposure

In [55]:
cvd = pd.read_csv('../data/analysis/cvd_dis.csv')

In [56]:
cvd.head()

Unnamed: 0,Patient_ID,chd_code
0,84,1
1,201,1
2,228,1
3,301,1
4,401,1


In [57]:
study_pop = study_pop.merge(cvd, how='left', on='Patient_ID')

In [58]:
study_pop.fillna(0, inplace=True)

In [59]:
study_pop.head()

Unnamed: 0,Patient_ID,admitted_itu,died,chd_code
0,1421431,0,0,1.0
1,50459,0,0,0.0
2,1359784,0,0,0.0
3,1031144,0,0,0.0
4,1316105,0,0,1.0


##### Add in demographics and other covariates

In [60]:
demo = pd.read_csv('../data/analysis/demo.csv')

In [61]:
demo = demo[['Patient_ID', 'Sex', 'Age']]

In [62]:
demo.head()

Unnamed: 0,Patient_ID,Sex,Age
0,254,F,58.0
1,572,M,38.0
2,592,M,49.0
3,647,F,94.0
4,659,F,59.0


In [63]:
study_pop = study_pop.merge(demo, how='left', on='Patient_ID')

In [64]:
study_pop.head()

Unnamed: 0,Patient_ID,admitted_itu,died,chd_code,Sex,Age
0,1421431,0,0,1.0,F,22.0
1,50459,0,0,0.0,M,18.0
2,1359784,0,0,0.0,M,55.0
3,1031144,0,0,0.0,F,56.0
4,1316105,0,0,1.0,F,22.0


##### Add in smoking

In [65]:
smok = pd.read_csv('../data/analysis/smok.csv')

In [66]:
smok = smok[['Patient_ID', 'smoking_status']]

In [67]:
smok.head()

Unnamed: 0,Patient_ID,smoking_status
0,2406,1
1,2428,1
2,2428,1
3,2686,1
4,2709,1


In [68]:
study_pop = study_pop.merge(smok, how='left', on='Patient_ID')
study_pop.fillna(0, inplace=True)

In [69]:
study_pop.head()

Unnamed: 0,Patient_ID,admitted_itu,died,chd_code,Sex,Age,smoking_status
0,1421431,0,0,1.0,F,22.0,0.0
1,50459,0,0,0.0,M,18.0,0.0
2,1359784,0,0,0.0,M,55.0,0.0
3,1031144,0,0,0.0,F,56.0,0.0
4,1316105,0,0,1.0,F,22.0,0.0


In [70]:
study_pop.to_csv('../data/analysis/final_dataset.csv')