# describe_duration.ipynb

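This notebook provides descriptive statistics (median and interquartile range, IQR) of the duration of sick notes associated with long COVID. The statistics are computed from the `first_sick_note_duration` column for each cohort, overall and broken down by age group, sex, ethnicity, IMD and region; each table is also saved as a CSV file under `../output/tabfig/`.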

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns

from functools import reduce
from glob import glob

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

## Import and Clean Data

In [2]:
# Read in and append input files
input_pattern = '../output/cohorts/input*_with_duration.csv'
li = []

# sorted() keeps the row order of df_input reproducible, since glob() returns
# files in whatever order the filesystem lists them
for file in sorted(glob(input_pattern)):
    df_temp = pd.read_csv(file)
    # Derive the cohort label from the file name by dropping the leading
    # 'input_' and the trailing '_with_duration.csv'. Working on the base name
    # (rather than fixed offsets into the full path) keeps the label correct
    # if the directory part of the pattern ever changes.
    file_name = os.path.basename(file)
    df_temp['cohort'] = file_name[len('input_'):-len('_with_duration.csv')]
    li.append(df_temp)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not li:
    raise FileNotFoundError('No input files matched ' + input_pattern)

# ignore_index=True gives the combined frame a fresh 0..n-1 index
df_input = pd.concat(li, axis=0, ignore_index=True)

In [3]:
# Columns to subset
subset_cols = ['cohort','age_group','sex',
               'ethnicity','imd','region',
               'first_sick_note_duration']
# .copy() gives df_clean its own data, so later column assignments act on an
# independent frame rather than on a selection taken from df_input
df_clean = df_input[subset_cols].copy()

In [4]:
# Replace missing first sick note durations with 0
duration_filled = df_clean['first_sick_note_duration'].fillna(0)
df_clean = df_clean.assign(first_sick_note_duration=duration_filled)

_____

## Median & IQR

In [5]:
def compute_med_iqr(path, demo='', df=None, out_dir='../output/tabfig/'):
    """Summarise first sick note duration by cohort (and optionally one demographic).

    For every group, computes the median, the 25th and 75th percentiles and the
    interquartile range (pct75 - pct25) of ``first_sick_note_duration``, writes
    the resulting table to ``<out_dir>/<path>.csv`` and returns it.

    Parameters
    ----------
    path : str
        Output file name without the ``.csv`` extension.
    demo : str, optional
        Name of an additional column to group by alongside ``cohort``.
        When empty (the default), statistics are computed per cohort only.
    df : pandas.DataFrame, optional
        Data to summarise. Must contain ``cohort``,
        ``first_sick_note_duration`` and, when ``demo`` is given, that column.
        Defaults to the notebook-level ``df_clean``.
    out_dir : str, optional
        Directory the CSV is written to; it is created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns ``cohort``, ``demo`` (if given),
        ``median``, ``pct25``, ``pct75`` and ``iqr``.
    """
    if df is None:
        # Fall back to the notebook-level frame so existing calls keep working
        df = df_clean

    group_cols = ['cohort', demo] if demo else ['cohort']
    durations = df.groupby(group_cols)['first_sick_note_duration']

    # Each statistic is a Series indexed by the group keys; building the frame
    # from a dict names the columns explicitly instead of relying on the
    # suffixes pandas adds when merging identically named columns.
    df_out = pd.DataFrame({
        'median': durations.apply(lambda x: np.median(x)),
        'pct25': durations.apply(lambda x: np.percentile(x, 25)),
        'pct75': durations.apply(lambda x: np.percentile(x, 75)),
    }).reset_index()
    df_out['iqr'] = df_out['pct75'] - df_out['pct25']

    # Make sure the target directory exists before writing
    os.makedirs(out_dir, exist_ok=True)
    df_out.to_csv(os.path.join(out_dir, path + '.csv'), index=False)
    return df_out

In [6]:
# Per-cohort summary with no demographic breakdown
compute_med_iqr(path='med_iqr_overall')

Unnamed: 0,cohort,median,pct25,pct75,iqr
0,covid_2020,262.0,199.0,312.0,113.0
1,general_2019,259.0,197.0,311.0,114.0
2,general_2020,262.0,199.0,315.0,116.0
3,pneumonia_2019,260.0,196.0,311.0,115.0


In [7]:
# Summary by cohort and age group
compute_med_iqr(path='med_iqr_age_group', demo='age_group')

Unnamed: 0,cohort,age_group,median,pct25,pct75,iqr
0,covid_2020,0-17,264.0,200.0,310.0,110.0
1,covid_2020,18-24,261.0,191.0,315.0,124.0
2,covid_2020,25-34,260.0,204.5,314.0,109.5
3,covid_2020,35-44,266.0,195.0,315.75,120.75
4,covid_2020,45-54,261.0,201.25,311.0,109.75
5,covid_2020,55-69,265.0,197.0,316.75,119.75
6,covid_2020,70-79,260.0,200.75,305.0,104.25
7,covid_2020,80+,256.0,197.0,309.0,112.0
8,general_2019,0-17,261.0,196.5,313.25,116.75
9,general_2019,18-24,259.0,203.0,304.0,101.0


In [8]:
# Summary by cohort and sex
compute_med_iqr(path='med_iqr_sex', demo='sex')

Unnamed: 0,cohort,sex,median,pct25,pct75,iqr
0,covid_2020,F,262.0,199.0,312.0,113.0
1,covid_2020,M,261.0,198.0,313.0,115.0
2,general_2019,F,259.5,198.0,310.0,112.0
3,general_2019,M,259.0,196.0,311.0,115.0
4,general_2020,F,261.0,199.0,313.0,114.0
5,general_2020,M,263.0,199.0,316.0,117.0
6,pneumonia_2019,F,261.0,197.0,312.0,115.0
7,pneumonia_2019,M,260.0,194.0,311.0,117.0


In [9]:
# Summary by cohort and ethnicity
compute_med_iqr(path='med_iqr_ethnicity', demo='ethnicity')

Unnamed: 0,cohort,ethnicity,median,pct25,pct75,iqr
0,covid_2020,1.0,262.0,199.25,312.0,112.75
1,covid_2020,3.0,263.0,194.0,311.75,117.75
2,covid_2020,5.0,264.5,206.0,320.0,114.0
3,general_2019,1.0,258.0,194.0,309.0,115.0
4,general_2019,3.0,259.0,204.75,310.25,105.5
5,general_2019,5.0,257.0,196.0,309.5,113.5
6,general_2020,1.0,262.0,201.0,315.0,114.0
7,general_2020,3.0,268.5,202.0,320.0,118.0
8,general_2020,5.0,258.0,194.0,311.0,117.0
9,pneumonia_2019,1.0,260.0,196.0,311.0,115.0


In [10]:
# Summary by cohort and IMD
compute_med_iqr(path='med_iqr_imd', demo='imd')

Unnamed: 0,cohort,imd,median,pct25,pct75,iqr
0,covid_2020,0,264.0,200.75,305.0,104.25
1,covid_2020,1,260.0,197.0,311.0,114.0
2,covid_2020,2,259.0,198.0,310.0,112.0
3,covid_2020,3,264.0,201.0,314.0,113.0
4,covid_2020,4,264.0,197.0,317.0,120.0
5,covid_2020,5,262.0,200.0,312.0,112.0
6,general_2019,0,261.0,198.5,310.0,111.5
7,general_2019,1,257.0,192.0,311.0,119.0
8,general_2019,2,259.0,193.25,311.0,117.75
9,general_2019,3,260.0,201.0,307.0,106.0


In [11]:
# Summary by cohort and region
compute_med_iqr(path='med_iqr_region', demo='region')

Unnamed: 0,cohort,region,median,pct25,pct75,iqr
0,covid_2020,East,257.0,184.0,303.0,119.0
1,covid_2020,East Midlands,261.0,196.0,309.0,113.0
2,covid_2020,London,264.0,196.0,312.0,116.0
3,covid_2020,North East,261.0,203.0,314.0,111.0
4,covid_2020,North West,264.0,204.0,316.0,112.0
5,covid_2020,South East,257.0,191.0,310.0,119.0
6,covid_2020,South West,262.0,203.0,314.0,111.0
7,covid_2020,West Midlands,260.0,198.0,318.0,120.0
8,covid_2020,Yorkshire and The Humber,264.0,205.25,318.0,112.75
9,general_2019,East,259.0,190.5,313.0,122.5
