# describe_duration.ipynb

This notebook provides descriptive statistics (median and IQR) of the duration of sick notes associated with long COVID.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import seaborn as sns

from functools import reduce
from glob import glob

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

## Import and Clean Data

In [2]:
# Helper to locate the n-th occurrence of a substring
def find_nth(haystack, needle, n):
    """Return the index of the n-th non-overlapping occurrence of `needle`.

    Occurrences are counted from the left, starting at 1; any `n` below 1
    behaves like 1. The result is -1 when `haystack` contains fewer than
    `n` occurrences of `needle`.
    """
    position = haystack.find(needle)
    for _ in range(n - 1):
        if position < 0:
            # Ran out of matches before reaching the requested one
            break
        # Resume the search just past the previous match
        position = haystack.find(needle, position + len(needle))
    return position

In [3]:
# Read in and append input files
# Sorted so the row order of df_input does not depend on the OS directory order
cohort_files = sorted(glob('../output/cohorts/cohort_rates*.dta'))
if not cohort_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(
        "No input files match '../output/cohorts/cohort_rates*.dta'"
    )

li = []
for file in cohort_files:
    df_temp = pd.read_stata(file)
    # Cohort label = part of the file name after its second underscore,
    # e.g. 'cohort_rates_covid_2020.dta' -> 'covid_2020'. The file stem is
    # used (not the full path) so underscores in directory names are ignored.
    stem = Path(file).stem
    df_temp['cohort'] = stem[find_nth(stem, '_', 2)+1:]
    # Create population count (each row contributes 1 when summed)
    df_temp['population'] = 1
    li.append(df_temp)

df_input = pd.concat(li, axis=0, ignore_index=True)

In [4]:
# Rows flagged sick_note == 0 get a missing duration so they drop out of the
# duration statistics (original note: sick notes beyond end dates are not counted)
df_input.loc[df_input['sick_note'].eq(0), 'first_sick_note_duration'] = np.nan

In [5]:
# Columns kept for the descriptive statistics
subset_cols = [
    'cohort',
    'age_group',
    'sex',
    'ethnicity',
    'imd',
    'region_string',
    'first_sick_note_duration',
    'sick_note',
    'population',
]

# Keep only the relevant columns and expose 'region_string' as 'region'
df_clean = (
    df_input[subset_cols]
    .rename(columns={'region_string':'region'})
)

_____

## Median & IQR

In [6]:
def compute_med_iqr(path, demo='', df=None, out_dir='../output/tabfig/'):
    """Summarise first sick note duration per cohort and save the table as CSV.

    For every cohort (optionally split by one extra column) the result holds
    the summed `sick_note` and `population` columns plus the median, 25th and
    75th percentiles and IQR of `first_sick_note_duration`. Missing durations
    are skipped by the NaN-aware NumPy reducers.

    Parameters
    ----------
    path : str
        Output file name without extension; the table is written to
        `<out_dir>/<path>.csv`.
    demo : str, default ''
        Extra column to group by alongside `cohort`. The empty string groups
        by cohort only.
    df : pandas.DataFrame, optional
        Data to summarise. Defaults to the notebook-level `df_clean`.
    out_dir : str or pathlib.Path, default '../output/tabfig/'
        Directory that receives the CSV; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        Columns: the group keys, `sick_note`, `population`,
        `median_duration`, `pct25`, `pct75`, `iqr`.
    """
    if df is None:
        df = df_clean

    # Same pipeline for both cases; only the grouping keys differ
    keys = ['cohort'] if demo == '' else ['cohort', demo]
    grouped = df.groupby(keys)

    def _duration_stat(func, name):
        # One row per group with the reduced duration in a column called `name`.
        # Naming the column here avoids relying on merge-generated suffixes.
        return (
            grouped[['first_sick_note_duration']]
            .apply(func)
            .rename(name)
            .reset_index()
        )

    dfs = [
        grouped[['sick_note', 'population']].sum().reset_index(),
        _duration_stat(np.nanmedian, 'median_duration'),
        _duration_stat(lambda x: np.nanpercentile(x, 25), 'pct25'),
        _duration_stat(lambda x: np.nanpercentile(x, 75), 'pct75'),
    ]
    df_out = reduce(lambda left, right: pd.merge(left, right, on=keys), dfs)
    df_out['iqr'] = df_out['pct75'] - df_out['pct25']

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    df_out.to_csv(out_dir / (path + '.csv'), index=False)
    return df_out

In [7]:
# Overall: one row per cohort; table is also saved as med_iqr_overall.csv
compute_med_iqr('med_iqr_overall')

Unnamed: 0,cohort,sick_note,population,median_duration,pct25,pct75,iqr
0,covid_2020,878.0,1470,268.0,226.0,318.75,92.75
1,general_2019,4625.0,10000,271.0,225.0,319.0,94.0
2,general_2020,6392.0,10000,271.0,223.0,321.0,98.0
3,pneumonia_2019,240.0,512,263.0,222.5,318.25,95.75


In [8]:
# Age group: one row per cohort and age_group; saved as med_iqr_age_group.csv
compute_med_iqr('med_iqr_age_group', 'age_group')

Unnamed: 0,cohort,age_group,sick_note,population,median_duration,pct25,pct75,iqr
0,covid_2020,0-17,78.0,137,278.0,220.5,316.5,96.0
1,covid_2020,18-24,82.0,142,269.0,242.0,315.0,73.0
2,covid_2020,25-34,69.0,140,265.0,225.0,335.0,110.0
3,covid_2020,35-44,90.0,145,271.0,221.25,367.25,146.0
4,covid_2020,45-54,199.0,307,264.0,221.5,317.0,95.5
5,covid_2020,55-69,165.0,273,267.0,222.0,316.0,94.0
6,covid_2020,70-79,101.0,157,276.0,232.0,323.0,91.0
7,covid_2020,80+,94.0,169,267.0,239.75,306.75,67.0
8,general_2019,0-17,470.0,1065,272.0,225.75,318.0,92.25
9,general_2019,18-24,428.0,991,274.0,219.0,324.0,105.0


In [9]:
# Sex: one row per cohort and sex; saved as med_iqr_sex.csv
compute_med_iqr('med_iqr_sex', 'sex')

Unnamed: 0,cohort,sex,sick_note,population,median_duration,pct25,pct75,iqr
0,covid_2020,F,455.0,749,266.0,224.5,315.0,90.5
1,covid_2020,M,423.0,721,273.0,229.5,322.5,93.0
2,general_2019,F,2367.0,5044,270.0,225.0,317.25,92.25
3,general_2019,M,2258.0,4956,273.0,224.0,322.0,98.0
4,general_2020,F,3272.0,5158,271.0,223.0,322.0,99.0
5,general_2020,M,3120.0,4842,270.0,223.0,320.0,97.0
6,pneumonia_2019,F,128.0,270,263.0,222.5,314.0,91.5
7,pneumonia_2019,M,112.0,242,264.5,220.75,321.0,100.25


In [10]:
# Ethnicity: one row per cohort and ethnicity; saved as med_iqr_ethnicity.csv
compute_med_iqr('med_iqr_ethnicity', 'ethnicity')

Unnamed: 0,cohort,ethnicity,sick_note,population,median_duration,pct25,pct75,iqr
0,covid_2020,White,512.0,860,266.5,225.75,315.0,89.25
1,covid_2020,Asian or Asian British,63.0,115,272.0,221.5,318.0,96.5
2,covid_2020,Other,71.0,113,256.0,212.5,300.5,88.0
3,covid_2020,Unknown,232.0,382,281.0,238.5,334.0,95.5
4,general_2019,White,2786.0,6004,272.0,224.0,319.0,95.0
5,general_2019,Asian or Asian British,348.0,753,278.0,223.75,321.0,97.25
6,general_2019,Other,341.0,743,268.0,226.0,318.0,92.0
7,general_2019,Unknown,1150.0,2500,268.0,225.0,319.0,94.0
8,general_2020,White,3812.0,5955,270.5,222.0,321.0,99.0
9,general_2020,Asian or Asian British,524.0,800,273.0,228.0,314.75,86.75


In [11]:
# IMD: one row per cohort and imd value; saved as med_iqr_imd.csv
compute_med_iqr('med_iqr_imd', 'imd')

Unnamed: 0,cohort,imd,sick_note,population,median_duration,pct25,pct75,iqr
0,covid_2020,0,49.0,85,296.0,225.0,334.0,109.0
1,covid_2020,1,172.0,284,270.0,223.5,311.5,88.0
2,covid_2020,2,166.0,273,261.5,215.0,308.75,93.75
3,covid_2020,3,181.0,305,266.0,228.0,323.0,95.0
4,covid_2020,4,153.0,269,284.0,239.0,326.0,87.0
5,covid_2020,5,157.0,254,265.0,226.0,305.0,79.0
6,general_2019,0,224.0,483,280.0,231.0,317.0,86.0
7,general_2019,1,862.0,1897,270.0,228.25,317.0,88.75
8,general_2019,2,880.0,1895,270.0,224.0,319.5,95.5
9,general_2019,3,871.0,1880,272.0,221.25,321.0,99.75


In [12]:
# Region: one row per cohort and region; saved as med_iqr_region.csv
compute_med_iqr('med_iqr_region', 'region')

Unnamed: 0,cohort,region,sick_note,population,median_duration,pct25,pct75,iqr
0,covid_2020,East,91.0,162,269.0,233.5,303.0,69.5
1,covid_2020,East Midlands,86.0,156,281.0,235.25,332.25,97.0
2,covid_2020,London,171.0,272,268.0,204.5,325.0,120.5
3,covid_2020,North East,94.0,154,275.0,231.25,323.75,92.5
4,covid_2020,North West,85.0,151,261.0,229.0,308.0,79.0
5,covid_2020,South East,84.0,130,268.0,233.5,325.5,92.0
6,covid_2020,South West,92.0,160,289.5,253.25,320.75,67.5
7,covid_2020,West Midlands,86.0,130,254.5,218.25,294.5,76.25
8,covid_2020,Yorkshire and The Humber,89.0,155,262.0,215.0,314.0,99.0
9,general_2019,East,492.0,1030,270.0,223.5,317.0,93.5
