In [1]:
import pyodbc
import pandas as pd
from IPython.display import display, Markdown
import numpy as np
from datetime import date
from datetime import datetime
import os

In [2]:
# Connection settings for the SQL Server export database.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are the
# values this notebook was originally written with.
# NOTE(review): the fallback password is still stored in plain text here (and a
# later cell compares `password` against the same literal) - rotate it and drop
# the fallback once the environment variables are set wherever this runs.
server = os.environ.get('OPENCORONA_DB_SERVER', 'covid.ebmdatalab.net,1433')
database = os.environ.get('OPENCORONA_DB_NAME', 'OPENCoronaExport')
username = os.environ.get('OPENCORONA_DB_USER', 'SA')
password = os.environ.get('OPENCORONA_DB_PASSWORD', 'ahsjdkaJAMSHDA123[')

# Same ODBC connection-string layout as before, assembled in one place.
connection_string = (
    'DRIVER={ODBC Driver 17 for SQL Server};'
    f'SERVER={server};DATABASE={database};UID={username};PWD={password}'
)
cnxn = pyodbc.connect(connection_string)
cursor = cnxn.cursor()

### Inspect columns

In [3]:
# Raise the row display limit so that long summaries are shown in full.
pd.set_option('display.max_rows', 200)

# Only the column names are needed, so a single row is fetched.
# NOTE(review): the SQL comment says "ecds" but the query reads SGSS_Positive.
sql = '''-- main ecds table
select TOP 1 *
from SGSS_Positive
'''
ecds = pd.read_sql(sql, con=cnxn)

# Displayed value of the cell: the table's column names as a Series.
pd.Series(ecds.columns)

0                Patient_ID
1                    PHE_ID
2     Organism_Species_Name
3    Earliest_Specimen_Date
4           Lab_Report_Date
5              Age_in_Years
6               Patient_Sex
7        County_Description
8           PostCode_Source
dtype: object

### Positive in SGSS

In [4]:
# One row per patient: the date of their most recent positive lab report and
# the number of positive results recorded for them.
sql = '''-- SGSS positives:
select Patient_ID, max(Lab_Report_Date) as pos_Lab_Report_Date_latest, count(*) AS positives
from SGSS_Positive
group by Patient_ID
'''
p = pd.read_sql(sql, con=cnxn).assign(
    # marks every patient in this frame as having at least one positive result
    positive_flag=1,
    # datetime copy of the latest report date under a table-independent name
    Lab_Report_Date_latest=lambda frame: pd.to_datetime(frame["pos_Lab_Report_Date_latest"]),
)

display(Markdown("**SGSS Positive Summary**"))
positive_patient_total = p["Patient_ID"].nunique()
positive_latest_report = p["Lab_Report_Date_latest"].max()
print("No of patients: ", positive_patient_total)
print("Latest lab result date: ", positive_latest_report)

**SGSS Positive Summary**

No of patients:  7
Latest lab result date:  2020-04-23 00:00:00


### Negative in SGSS

In [5]:
# One row per patient: the date of their most recent negative lab report and
# the number of negative results recorded for them.
sql = '''
-- SGSS negatives:
select Patient_ID, 
max(Lab_Report_Date) as neg_Lab_Report_Date_latest, count(*) AS negatives
from SGSS_Negative
group by Patient_ID
'''

n = pd.read_sql(sql, con=cnxn).assign(
    # marks every patient in this frame as having at least one negative result
    negative_flag=1,
    # datetime copy of the latest report date under a table-independent name
    Lab_Report_Date_latest=lambda frame: pd.to_datetime(frame["neg_Lab_Report_Date_latest"]),
)

display(Markdown("**SGSS Negative Summary**"))
negative_patient_total = n["Patient_ID"].nunique()
negative_latest_report = n["Lab_Report_Date_latest"].max()
print("No of patients: ", negative_patient_total)
print("Latest lab result date: ", negative_latest_report)

**SGSS Negative Summary**

No of patients:  7
Latest lab result date:  2020-04-23 00:00:00


In [6]:
# Combine the per-patient positive and negative summaries.
# The outer join keys on patient AND latest report date, so a row that exists
# on only one side keeps that side's values and gets NaN for the other side's
# count and flag columns.
sgss = p.merge(n, on=["Patient_ID","Lab_Report_Date_latest"], how="outer")

# Replace those NaNs with 0 so that the sums below are meaningful.
# The filled columns are assigned back explicitly: calling
# `sgss[c].fillna(0, inplace=True)` operates on a column selection and is not
# guaranteed to write through to `sgss` (it does not under copy-on-write).
cols = ["positives","positive_flag","negatives","negative_flag"]
sgss[cols] = sgss[cols].fillna(0)

# Distinct patients overall, patients per result type, and total results.
sgss.agg({"Patient_ID":"nunique", "positive_flag":"sum", "negative_flag":"sum", "positives":"sum", "negatives":"sum"})


Patient_ID       14.0
positive_flag     7.0
negative_flag     7.0
positives         7.0
negatives         7.0
dtype: float64

# What is the split of ethnicities, gender and ages in SGSS?

In [7]:
# Every coded ethnicity event per patient, alongside the date of that patient's
# most recent event (codes listed in the NOT IN clause are excluded).
sql = '''-- Ethnicity:
select Patient_ID, 
CTV3Code, 
ConsultationDate,
MAX(ConsultationDate) OVER (PARTITION BY Patient_ID) as latest_date
from EthnicityCodedEvent
WHERE CTV3Code NOT IN ('9SZ..','XaJRB','XE0oc','XaE4B','XactD','9S...','XaBEN','134O.')-- exclude unknown codes
group by Patient_ID, CTV3Code, ConsultationDate
'''
eth = pd.read_sql(sql, cnxn)
# take only the latest recorded ethnicity for each patient
# NOTE(review): a patient with several different codes on their latest date
# keeps one row per code, so Patient_ID is not guaranteed unique afterwards.
eth = eth.loc[eth['ConsultationDate']==eth['latest_date']]
eth = eth.drop(['ConsultationDate','latest_date'], axis=1)

# Codelist mapping each code to a numbered ethnicity grouping.
eth_groups = pd.read_csv(os.path.join('..','data','opensafely-ethnicity.csv'))
# descriptions:
eth_groups2 = pd.DataFrame([[1, "White"], [2, "Mixed"], [3, "Black"], [4, "Asian or Asian British"], [5, "Other"]], columns=["Grouping_6","ethnicity"])
eth_groups = eth_groups.merge(eth_groups2, on="Grouping_6").drop(["Grouping_16","Grouping_6"], axis=1)

# find patient ethnicity groups
eth2 = eth.merge(eth_groups[["Code", "ethnicity"]], left_on="CTV3Code", right_on="Code", how="left").drop(["Code","CTV3Code"], axis=1)
# Codes absent from the codelist have no description; label them explicitly.
# Assigned back rather than filled in place on a column selection, which is
# not guaranteed to update `eth2`.
eth2["ethnicity"] = eth2["ethnicity"].fillna("Unknown")

# add useful dummy data
# The extra row is only added when connected with this specific password.
dummy = pd.DataFrame([[10007,"Asian or Asian British"]], columns=['Patient_ID', 'ethnicity'])
if password == 'ahsjdkaJAMSHDA123[':
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    eth2 = pd.concat([eth2, dummy])

# Heading corrected: this table summarises ethnicity, not age groups.
display(Markdown("**Population Ethnicity Summary**"))
eth2.groupby("ethnicity").count()

**Population Age Group Summary**

Unnamed: 0_level_0,Patient_ID
ethnicity,Unnamed: 1_level_1
Asian or Asian British,27
Black,107
Mixed,182
Other,10035
White,10399


In [8]:
# Date at which every patient's age is evaluated.
AGE_REFERENCE_DATE = pd.Timestamp('2020-04-01')

sql = '''-- DOB:
select Patient_ID, 
MIN(DateOfBirth) AS DateOfBirth -- use min in case of any duplicate entries
from Patient
GROUP BY Patient_ID
'''
age = pd.read_sql(sql, cnxn)

# Age in completed years at the reference date, computed from calendar fields.
# This replaces dividing a timedelta by np.timedelta64(1, "Y"): that unit is
# ambiguous (an average year length), pandas 2.x rejects it, and the following
# astype(int) raised on any missing date of birth.
dob = pd.to_datetime(age["DateOfBirth"])
birthday_passed = (dob.dt.month < AGE_REFERENCE_DATE.month) | (
    (dob.dt.month == AGE_REFERENCE_DATE.month) & (dob.dt.day <= AGE_REFERENCE_DATE.day)
)
# Subtract one year when the birthday has not yet occurred in the reference
# year; the result is NaN where the date of birth is missing.
age["age"] = AGE_REFERENCE_DATE.year - dob.dt.year - (~birthday_passed).astype(int)

# assign age groups
# np.select takes the first matching condition, so order matters. Missing ages
# are caught first; otherwise they would fall through to the '80+' default.
conditions = [
    age['age'].isna(),
    (age['age'] < 18 ),
    (age['age'] < 65 ),
    (age['age'] < 80 )]
choices = ['unknown', '0_<18', '18_<65', '65_<80']
age['age_group'] = np.select(conditions, choices, default='80+')

# Only the patient id and the age group are needed downstream.
age = age.drop(["DateOfBirth","age"], axis=1)

display(Markdown("**Population Age Group Summary**"))
age.groupby("age_group").count()

**Population Age Group Summary**

Unnamed: 0_level_0,Patient_ID
age_group,Unnamed: 1_level_1
0_<18,1755
18_<65,72898
65_<80,14622
80+,24174


In [9]:
# One Sex value per patient. Rows whose Sex is 'I' or 'U' are excluded by the
# query, and MAX() collapses any duplicate patient rows to a single value.
# (`datetime` is already imported in the notebook's first cell and is not used
# here, so it is not imported again.)
# NOTE(review): the SQL comment says "DOB" but this query selects Sex.
sql = '''-- DOB:
select Patient_ID, 
MAX(Sex)  AS Sex -- in case of duplicates
from Patient
WHERE Sex not in ('I','U')
GROUP BY Patient_ID
'''
sex = pd.read_sql(sql, con=cnxn)

# Heading, then the number of patients recorded under each Sex value.
display(Markdown("**Population Sex Summary**"))
sex.groupby(["Sex"]).count()

**Population Sex Summary**

Unnamed: 0_level_0,Patient_ID
Sex,Unnamed: 1_level_1
F,60961
M,51170


In [10]:
# Attach age group, sex and ethnicity to every test-summary row. Left joins keep
# rows for patients with no match in a demographic table.
# NOTE(review): if `eth2` holds more than one row for a patient, that patient's
# rows are duplicated here and the summed columns below are inflated.
sgss2 = (
    sgss
    .merge(age, on="Patient_ID", how="left")
    .merge(sex, on="Patient_ID", how="left")
    .merge(eth2, on="Patient_ID", how="left")
    .rename(columns={"Patient_ID":"patient count"})
)

# groupby drops rows whose key is NaN, so unmatched demographics are labelled
# explicitly. Only the grouping columns are filled: filling the whole frame
# would also write the string "unknown" into the date columns.
demographic_cols = ["age_group", "Sex", "ethnicity"]
sgss2[demographic_cols] = sgss2[demographic_cols].fillna("unknown")

cols = ["ethnicity"]
def summ(sgss2, cols):
    """Display a per-group summary of the test data.

    Parameters
    ----------
    sgss2 : DataFrame
        Must contain the columns named in `cols` plus "patient count",
        "positive_flag", "negative_flag", "positives" and "negatives".
    cols : list of str
        Column(s) to group by.

    Displays (does not return) one row per group with the number of distinct
    values in "patient count" and the sums of the flag and result-count columns.
    """
    summary = sgss2.groupby(cols).agg({"patient count":"nunique", "positive_flag":"sum", "negative_flag":"sum", "positives":"sum", "negatives":"sum"})
    display(summary)

display(Markdown("**Summary of Ages, Genders and Ethnicities in tests**"))
summ(sgss2, cols)
summ(sgss2,["age_group","ethnicity"])
summ(sgss2,["age_group","Sex"])
summ(sgss2,["Sex"])

**Summary of Ages, Genders and Ethnicities in tests**

Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Asian or Asian British,1,1.0,0.0,1.0,0.0
unknown,13,6.0,7.0,6.0,7.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,Asian or Asian British,1,1.0,0.0,1.0,0.0
18_<65,unknown,13,6.0,7.0,6.0,7.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,F,10,6.0,4.0,6.0,4.0
18_<65,M,4,1.0,3.0,1.0,3.0


Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,10,6.0,4.0,6.0,4.0
M,4,1.0,3.0,1.0,3.0


# Weekly date cut-offs

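We know that negative results were not well reported in the early data, so the summaries below are repeated for tests reported in the most recent 1, 2, 3 and 4 weeks only.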

In [12]:
# create list of weekly dates to assess

# Number of weekly look-back windows to report.
N_WEEKS = 4

# Most recent report date across positive and negative results, as a plain date
# so that it prints without a time component in the headings below.
latest_date = sgss["Lab_Report_Date_latest"].max().date()

# One row per window: week_no is the (negative) number of weeks back and date
# is the cut-off that many weeks before the latest report date.
l = pd.DataFrame(
    [[-weeks_back, latest_date + pd.Timedelta(days=-7 * weeks_back)]
     for weeks_back in range(1, N_WEEKS + 1)],
    columns=["week_no", "date"],
)

# The report dates do not change between windows, so convert them once.
report_dates = pd.to_datetime(sgss2["Lab_Report_Date_latest"])

# Groupings shown for every window, in display order.
groupings = (["ethnicity"], ["age_group","ethnicity"], ["age_group","Sex"], ["Sex"])

# add previous weeks' summaries
for weeks_back, w in enumerate(l["date"], start=1):
    # keep rows whose latest report is strictly after the cut-off date
    result = sgss2.loc[report_dates > pd.to_datetime(w)]
    week_label = "week" if weeks_back == 1 else "weeks"
    display(Markdown(f"## Summary of Ages, Genders and Ethnicities in tests since {w} ({weeks_back} {week_label})"))
    # legend for the table columns (spelling of "individuals" corrected)
    display(Markdown("`patient count` = no of individuals tested"),
            Markdown("`positive/negative flag` = total patients testing pos/neg"),
            Markdown("`positives`/`negatives` = total pos/neg results including repeats"))
    for grouping in groupings:
        summ(result, grouping)


## Summary of Ages, Genders and Ethnicities in tests since 2020-04-16 (1 weeks)

`patient count` = no of indiduals tested

`positive/negative flag` = total patients testing pos/neg

`positives`/`negatives` = total pos/neg results including repeats

Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
unknown,4,2.0,2.0,2.0,2.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,unknown,4,2.0,2.0,2.0,2.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,F,3,2.0,1.0,2.0,1.0
18_<65,M,1,0.0,1.0,0.0,1.0


Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,3,2.0,1.0,2.0,1.0
M,1,0.0,1.0,0.0,1.0


## Summary of Ages, Genders and Ethnicities in tests since 2020-04-09 (2 weeks)

`patient count` = no of indiduals tested

`positive/negative flag` = total patients testing pos/neg

`positives`/`negatives` = total pos/neg results including repeats

Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
unknown,6,3.0,3.0,3.0,3.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,unknown,6,3.0,3.0,3.0,3.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,F,4,3.0,1.0,3.0,1.0
18_<65,M,2,0.0,2.0,0.0,2.0


Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,4,3.0,1.0,3.0,1.0
M,2,0.0,2.0,0.0,2.0


## Summary of Ages, Genders and Ethnicities in tests since 2020-04-02 (3 weeks)

`patient count` = no of indiduals tested

`positive/negative flag` = total patients testing pos/neg

`positives`/`negatives` = total pos/neg results including repeats

Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
unknown,10,5.0,5.0,5.0,5.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,unknown,10,5.0,5.0,5.0,5.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,F,7,5.0,2.0,5.0,2.0
18_<65,M,3,0.0,3.0,0.0,3.0


Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,7,5.0,2.0,5.0,2.0
M,3,0.0,3.0,0.0,3.0


## Summary of Ages, Genders and Ethnicities in tests since 2020-03-26 (4 weeks)

`patient count` = no of indiduals tested

`positive/negative flag` = total patients testing pos/neg

`positives`/`negatives` = total pos/neg results including repeats

Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
unknown,12,6.0,6.0,6.0,6.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,ethnicity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,unknown,12,6.0,6.0,6.0,6.0


Unnamed: 0_level_0,Unnamed: 1_level_0,patient count,positive_flag,negative_flag,positives,negatives
age_group,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18_<65,F,8,5.0,3.0,5.0,3.0
18_<65,M,4,1.0,3.0,1.0,3.0


Unnamed: 0_level_0,patient count,positive_flag,negative_flag,positives,negatives
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,8,5.0,3.0,5.0,3.0
M,4,1.0,3.0,1.0,3.0
