# Create dummy CHESS table
- While we are waiting for the real thing.
- Fields are taken from:
https://www.england.nhs.uk/coronavirus/wp-content/uploads/sites/52/2020/03/phe-letter-to-trusts-re-daily-covid-19-hospital-surveillance-11-march-2020.pdf
- Rather than try to replicate the duplication/missingness/chaos of the raw table, I've just created what I imagine is a tidied up version with a handful of useful fields for cohort selection.
- We can add to these fields as needed.

In [2]:
import pyodbc
import pandas as pd
from IPython.display import display, Markdown

In [3]:
server = 'covid.ebmdatalab.net,1433'
database = 'OPENCoronaExport' 
username = 'SA'
password = 'ahsjdkaJAMSHDA123[' 
cnxn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+ password)
cursor = cnxn.cursor()

In [4]:
# select command
query = '''select name from sys.objects where type_desc='USER_TABLE' order by name'''
df = pd.read_sql(query, cnxn)
df

Unnamed: 0,name
0,CodedEvent
1,CodedEventRange
2,Consultation
3,DataDictionary
4,MedicationDictionary
5,MedicationIssue
6,MedicationSensitivity
7,Organisation
8,Patient
9,PatientAddress


## Get random sample of patient ids from TPP data

In [82]:
patient

Unnamed: 0_level_0,DateOfBirth,DateOfDeath,Sex
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1920-01-01,1920-01-01,M
2,1982-12-01,1982-12-01,F
3,1919-02-01,1919-02-01,F
6,1949-07-01,1949-07-01,M
8,1933-06-01,1933-06-01,M
...,...,...,...
247409351,1987-05-01,1987-05-01,F
247409504,1976-10-01,1976-10-01,F
247409521,1969-06-01,1969-06-01,F
247409522,2013-11-01,2013-11-01,M


In [84]:
sql = f"select * from Patient"
patient = pd.read_sql(sql, cnxn).set_index('Patient_ID')
rand_samp = patient.sample(n=1000,random_state=1234).index
rand_samp

Int64Index([   6041, 1421431,   50459, 1359784, 1358280,    8393, 1031144,
              48234, 1328276, 1361713,
            ...
               3363, 1327186, 1221078, 1211471, 1444400, 1316582, 1224315,
            1227866,   53091, 1317783],
           dtype='int64', name='Patient_ID', length=1000)

## Make random dates

In [60]:
import numpy as np
from datetime import datetime, date
def random_dates(start, end, n, unit='D', seed=None):
    if seed:
        np.random.seed(seed)
    else:
        np.random.seed(0)

    ndays = (end - start).days + 1
    return start + pd.to_timedelta(
        np.random.randint(0, ndays, n), unit=unit
    )

In [45]:
start = datetime(2020, 2, 1, 0, 0)
end = datetime.combine(date.today(), datetime.min.time())

## Make table and columns
- Estimated date of onset of symptoms
- Swab/specimen date
- Laboratory test date
- Result of laboratory tests (select all that apply): COVID-19, A/H1N1pdm2009, A/H3N2, B, A/non-subtyped, A/unsubtypeable, RSV, other (specify)

In [89]:
results = ['COVID-19', 'COVID-19', 'COVID-19', 'COVID-19', 'COVID-19', 'COVID-19', 'COVID-19', #so it's mostly covid
           'A/H1N1pdm2009',
           'A/H3N2',
           'B',
           'A/non-subtyped',
           'A/unsubtypeable',
           'RSV',
           'other (specify)'
          ]

chess = pd.DataFrame(index= rand_samp)
chess['symptom_onset'] = random_dates(start, end, len(chess),seed=123)
chess['swab_date'] = random_dates(start, end, len(chess),seed=321)
chess['lab_test_date'] = random_dates(start, end, len(chess),seed=321)
chess['result'] = np.random.randint(0, len(results), len(chess))
chess['result'] = chess['result'].apply(lambda i: results[i])
chess.to_csv('dummy_chess.csv')
chess.head()

Unnamed: 0_level_0,symptom_onset,swab_date,lab_test_date,result
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6041,2020-03-17,2020-03-24,2020-03-24,A/H1N1pdm2009
1421431,2020-02-03,2020-02-27,2020-02-27,COVID-19
50459,2020-02-29,2020-03-03,2020-03-03,COVID-19
1359784,2020-03-06,2020-03-13,2020-03-13,A/H1N1pdm2009
1358280,2020-03-10,2020-02-09,2020-02-09,A/H3N2
