# describe_duration.ipynb

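This notebook provides descriptive statistics (median and IQR) of the duration of sick notes associated with long COVID. The statistics describe the duration of the first sick note and are computed for each cohort, both overall and broken down by age group, sex, ethnicity, IMD and region. Each table is also saved as a CSV file under `../output/tabfig/`.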

In [28]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns

from functools import reduce
from glob import glob

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

## Import and Clean Data

In [29]:
# Read in and append input files.
# Each file is named input_<cohort>_with_duration.csv. The cohort label is taken
# from the file name itself (not from fixed character offsets into the full
# path), so it does not depend on the length of the directory part.
file_prefix = 'input_'
file_suffix = '_with_duration.csv'
li = []

# sorted() makes the row order of df_input independent of filesystem listing order
for file in sorted(glob('../output/cohorts/input*_with_duration.csv')):
    df_temp = pd.read_csv(file, low_memory=False)
    # Label every row with the cohort encoded in the file name
    file_name = Path(file).name
    df_temp['cohort'] = file_name[len(file_prefix):-len(file_suffix)]
    # Flag rows that have a first sick note date with 1 (other rows are not
    # assigned) and count every row once towards the population
    df_temp.loc[~df_temp.sick_note_1_date.isna(), 'first_sick_note_count'] = 1
    df_temp['population'] = 1
    li.append(df_temp)

# Fail with an explicit message instead of pandas' "No objects to concatenate"
if not li:
    raise FileNotFoundError(
        'No input files matching ../output/cohorts/input*_with_duration.csv'
    )

# ignore_index=True gives the stacked frame a fresh 0..n-1 index
df_input = pd.concat(li, axis=0, ignore_index=True)

In [30]:
# Keep only the cohort label, the demographic grouping variables and the
# measures that are summarised further down
subset_cols = [
    'cohort',
    'age_group',
    'sex',
    'ethnicity',
    'imd',
    'region',
    'first_sick_note_duration',
    'first_sick_note_count',
    'population',
]
df_clean = df_input.loc[:, subset_cols]

_____

## Median & IQR

In [39]:
def compute_med_iqr(path, demo=''):
    """Summarise first sick note duration per cohort and save the table as CSV.

    Reads the notebook-level ``df_clean`` frame, groups it by ``cohort`` (and
    additionally by ``demo`` when one is given) and reports for each group the
    summed ``first_sick_note_count`` and ``population`` columns together with
    the median, 25th percentile, 75th percentile and interquartile range of
    ``first_sick_note_duration``. Missing durations are ignored.

    Parameters
    ----------
    path : str
        File name, without extension, of the CSV written to
        ``../output/tabfig/`` (the directory is created if needed).
    demo : str, optional
        Name of a ``df_clean`` column to stratify by in addition to
        ``cohort``. The default ``''`` (or ``None``) produces one row per
        cohort.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping column(s) followed by
        ``first_sick_note_count``, ``population``, ``median``, ``pct25``,
        ``pct75`` and ``iqr``.
    """
    # A single code path serves both the overall and the stratified summary
    group_cols = ['cohort', demo] if demo else ['cohort']
    grouped = df_clean.groupby(group_cols)
    duration = grouped['first_sick_note_duration']

    # Every statistic is indexed by the same group keys, so the columns can be
    # attached by name instead of being recovered from merge suffixes
    df_out = grouped[['first_sick_note_count', 'population']].sum().assign(
        median=duration.median(),
        pct25=duration.quantile(0.25),
        pct75=duration.quantile(0.75),
    )
    df_out['iqr'] = df_out['pct75'] - df_out['pct25']
    df_out = df_out.reset_index()

    out_dir = Path("../output/tabfig/")
    out_dir.mkdir(parents=True, exist_ok=True)
    df_out.to_csv(out_dir / (path + '.csv'), index=False)
    return df_out

In [40]:
# Overall: one row per cohort, no demographic breakdown (saved as ../output/tabfig/med_iqr_overall.csv)
compute_med_iqr('med_iqr_overall')

Unnamed: 0,cohort,first_sick_note_count,population,median,pct25,pct75,iqr
0,covid_2020,9001.0,10000,271.0,223.0,320.0,97.0
1,general_2019,9001.0,10000,269.0,222.0,318.0,96.0
2,general_2020,9001.0,10000,271.0,224.0,322.0,98.0
3,pneumonia_2019,9001.0,10000,269.0,221.0,318.0,97.0


In [41]:
# Age group: one row per cohort and age_group (saved as ../output/tabfig/med_iqr_age_group.csv)
compute_med_iqr('med_iqr_age_group', 'age_group')

Unnamed: 0,cohort,age_group,first_sick_note_count,population,median,pct25,pct75,iqr
0,covid_2020,0-17,895.0,993,272.0,222.0,317.75,95.75
1,covid_2020,18-24,915.0,1016,270.0,219.25,325.0,105.75
2,covid_2020,25-34,897.0,1000,270.0,227.0,319.0,92.0
3,covid_2020,35-44,916.0,1022,274.0,229.0,323.0,94.0
4,covid_2020,45-54,1818.0,2010,268.0,223.5,317.0,93.5
5,covid_2020,55-69,1799.0,2010,273.0,223.0,324.0,101.0
6,covid_2020,70-79,888.0,976,265.5,221.0,310.0,89.0
7,covid_2020,80+,873.0,973,266.0,220.0,319.0,99.0
8,general_2019,0-17,962.0,1076,272.5,222.0,321.0,99.0
9,general_2019,18-24,852.0,937,266.0,224.75,310.25,85.5


In [42]:
# Sex: one row per cohort and sex (saved as ../output/tabfig/med_iqr_sex.csv)
compute_med_iqr('med_iqr_sex', 'sex')

Unnamed: 0,cohort,sex,first_sick_note_count,population,median,pct25,pct75,iqr
0,covid_2020,F,4615.0,5131,272.0,224.0,319.0,95.0
1,covid_2020,M,4386.0,4869,269.0,222.25,321.0,98.75
2,general_2019,F,4515.0,5006,269.0,222.0,317.0,95.0
3,general_2019,M,4486.0,4994,268.0,222.0,318.0,96.0
4,general_2020,F,4615.0,5129,270.0,224.0,320.0,96.0
5,general_2020,M,4386.0,4871,273.0,224.0,323.0,99.0
6,pneumonia_2019,F,4569.0,5062,270.0,222.0,318.0,96.0
7,pneumonia_2019,M,4432.0,4938,269.0,220.0,317.0,97.0


In [43]:
# Ethnicity: one row per cohort and ethnicity code (saved as ../output/tabfig/med_iqr_ethnicity.csv)
compute_med_iqr('med_iqr_ethnicity', 'ethnicity')

Unnamed: 0,cohort,ethnicity,first_sick_note_count,population,median,pct25,pct75,iqr
0,covid_2020,1.0,5405.0,6006,271.0,223.0,319.0,96.0
1,covid_2020,3.0,661.0,738,272.0,222.0,321.0,99.0
2,covid_2020,5.0,687.0,756,271.0,225.5,326.0,100.5
3,general_2019,1.0,5310.0,5925,267.0,221.0,316.0,95.0
4,general_2019,3.0,717.0,792,269.0,223.0,317.0,94.0
5,general_2019,5.0,700.0,783,269.0,220.0,317.0,97.0
6,general_2020,1.0,5388.0,5969,271.0,225.0,322.0,97.0
7,general_2020,3.0,698.0,774,280.0,228.0,328.0,100.0
8,general_2020,5.0,684.0,757,267.5,218.25,318.0,99.75
9,pneumonia_2019,1.0,5417.0,6001,269.0,221.0,318.0,97.0


In [44]:
# IMD: one row per cohort and imd value (saved as ../output/tabfig/med_iqr_imd.csv)
compute_med_iqr('med_iqr_imd', 'imd')

Unnamed: 0,cohort,imd,first_sick_note_count,population,median,pct25,pct75,iqr
0,covid_2020,0,439.0,492,273.0,227.0,312.75,85.75
1,covid_2020,1,1732.0,1915,268.0,221.25,317.75,96.5
2,covid_2020,2,1756.0,1953,269.0,221.0,318.0,97.0
3,covid_2020,3,1691.0,1872,271.0,227.0,321.0,94.0
4,covid_2020,4,1642.0,1839,273.0,226.0,325.0,99.0
5,covid_2020,5,1741.0,1929,270.0,222.0,318.0,96.0
6,general_2019,0,420.0,471,269.0,225.0,315.25,90.25
7,general_2019,1,1647.0,1846,268.0,220.0,318.0,98.0
8,general_2019,2,1694.0,1862,267.0,219.5,317.0,97.5
9,general_2019,3,1819.0,2005,270.0,224.0,314.0,90.0


In [45]:
# Region: one row per cohort and region (saved as ../output/tabfig/med_iqr_region.csv)
compute_med_iqr('med_iqr_region', 'region')

Unnamed: 0,cohort,region,first_sick_note_count,population,median,pct25,pct75,iqr
0,covid_2020,East,844.0,953,266.0,220.0,310.5,90.5
1,covid_2020,East Midlands,853.0,949,271.0,226.0,316.0,90.0
2,covid_2020,London,1809.0,2036,273.0,225.0,319.0,94.0
3,covid_2020,North East,923.0,1013,269.0,221.5,320.0,98.5
4,covid_2020,North West,925.0,991,270.0,220.5,320.0,99.5
5,covid_2020,South East,900.0,993,266.0,216.0,316.0,100.0
6,covid_2020,South West,898.0,1004,271.0,226.0,321.0,95.0
7,covid_2020,West Midlands,936.0,1051,270.0,225.0,324.25,99.25
8,covid_2020,Yorkshire and The Humber,913.0,1010,273.0,227.0,323.5,96.5
9,general_2019,East,900.0,1011,271.0,220.75,321.0,100.25
