# demographics.ipynb

This notebook generates tables that describe the population included in the analyses.

In [1]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from datetime import datetime
from functools import reduce
from glob import glob

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None

## Data Import

In [2]:
# Columns kept from every input file (all other columns are discarded on load)
import_vars = [
    'patient_id',
    'took_hba1c',
    'diabetes_type',
    'hba1c_mmol_per_mol',
    'age_group',
    'sex',
    'ethnicity',
    'region',
    'imd',
    'learning_disability',
    'mental_illness',
]

In [3]:
# Read in and append input files.
# glob() returns paths in arbitrary (filesystem-dependent) order, so the paths are
# sorted: drop_duplicates(keep='last') below then keeps the same row per patient on
# every run. NOTE(review): presumably the file names sort chronologically, making
# "last" the most recent extract -- confirm against the naming of input_all*.csv.
li = []

for file in sorted(glob('../output/data/input_all*.csv')):
    # Keep only the analysis variables from this file
    df_temp = pd.read_csv(file)[import_vars]
    # Generates a count column (1 per row) that is summed later to get population sizes
    df_temp['population'] = 1
    li.append(df_temp)

# Get only unique patients by ID; when a patient appears in several files,
# the row from the last file (in sorted order) is the one retained
df_input = pd.concat(li, axis=0, ignore_index=False).drop_duplicates(
        subset=['patient_id'], keep='last').reset_index(drop=True)

In [4]:
# Recode variables: lookup tables mapping raw codes to display labels

# Ethnicity: codes 1-5 are named groups; 0 and missing both map to 'Unknown'
dict_eth = {
    1: 'White',
    2: 'Mixed',
    3: 'Asian',
    4: 'Black',
    5: 'Other',
    np.nan: 'Unknown',
    0: 'Unknown',
}

# IMD: 1 is labelled most deprived, 5 least deprived, 0 unknown
dict_imd = {
    0: 'Unknown',
    1: '1 Most deprived',
    2: '2',
    3: '3',
    4: '4',
    5: '5 Least deprived',
}

# Learning disability flag
dict_ld = {
    1: 'Yes',
    0: 'No',
}

# Diabetes type codes
dict_diabetes = {
    'NO_DM': 'No Diabetes',
    'T1DM': 'Type 1 Diabetes',
    'T2DM': 'Type 2 Diabetes',
    'UNKNOWN_DM': 'Unknown Diabetes',
}

In [5]:
# Apply each lookup table to its own column, then capitalise the count column's name
recode_by_column = {
    "ethnicity": dict_eth,
    "imd": dict_imd,
    "learning_disability": dict_ld,
    "diabetes_type": dict_diabetes,
}
df_input = df_input.replace(recode_by_column)
df_input = df_input.rename(columns={'population': 'Population'})

### Population and HbA1c by DM Type

In [6]:
# Columns needed for this table: diabetes type, HbA1c test flag and the count column
df_pop_hba1c = df_input[['diabetes_type', 'took_hba1c', 'Population']]

# Sum by DM
df_pop_hba1c_agg = df_pop_hba1c.groupby(['diabetes_type']).sum().reset_index()

# Sum all and append with by DM.
# The total row is built explicitly from the two numeric columns: summing the whole
# frame would also "sum" (concatenate) the diabetes_type strings on pandas >= 2.0.
df_pop_hba1c_total = pd.DataFrame({
    'Characteristic': ['Total'],
    'took_hba1c': [df_pop_hba1c_agg['took_hba1c'].sum()],
    'Population': [df_pop_hba1c_agg['Population'].sum()],
    'diabetes_type': [''],  # the total row has no category label
})
df_pop_hba1c_agg['Characteristic'] = 'Diabetes Status'
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The total row goes first because the percentages below use .iloc[0] as denominator.
df_pop_hba1c_all = pd.concat([df_pop_hba1c_total, df_pop_hba1c_agg], sort=False)

# Create percentages
# pct_pop: share of the total population (row 0); pct_hba1c: share of each row's own population
df_pop_hba1c_all['pct_pop'] = round((df_pop_hba1c_all['Population']/df_pop_hba1c_all['Population'].iloc[0])*100,1)
df_pop_hba1c_all['pct_hba1c'] = round((df_pop_hba1c_all['took_hba1c']/df_pop_hba1c_all['Population'])*100,1)
# Display strings of the form "count (percent)"
df_pop_hba1c_all['population_2'] = df_pop_hba1c_all['Population'].astype(str) + " (" + df_pop_hba1c_all['pct_pop'].astype(str) + ")"
df_pop_hba1c_all['took_hba1c_2'] = df_pop_hba1c_all['took_hba1c'].astype(str) + " (" + df_pop_hba1c_all['pct_hba1c'].astype(str) + ")"

# Presentation table: keep only the label and formatted columns, with readable headers
df_by_dm = df_pop_hba1c_all[['Characteristic','diabetes_type','population_2','took_hba1c_2']].rename(
    columns={'diabetes_type':'Category', 'population_2':'Population (Column %)', 'took_hba1c_2':'Took HbA1c (% of Population)'}).reset_index(drop=True)
display(df_by_dm)

Unnamed: 0,Characteristic,Category,Population (Column %),Took HbA1c (% of Population)
0,Total,,95813 (100.0),9579 (10.0)
1,Diabetes Status,No Diabetes,72074 (75.2),7241 (10.0)
2,Diabetes Status,Type 1 Diabetes,2784 (2.9),287 (10.3)
3,Diabetes Status,Type 2 Diabetes,19091 (19.9),1888 (9.9)
4,Diabetes Status,Unknown Diabetes,1864 (1.9),163 (8.7)


In [7]:
# Type 2 diabetes
# Seed row for the T2DM demographics table: the Type 2 Diabetes row of the by-DM
# summary, keeping the raw Population count (not the formatted string) so that
# column percentages can be recomputed against it later.
df_t2dm_all = df_pop_hba1c_all.loc[
        df_pop_hba1c_all['diabetes_type'] == 'Type 2 Diabetes',
        ['Characteristic', 'diabetes_type', 'Population', 'took_hba1c_2']
    ].rename(columns={'diabetes_type': "Category", 'took_hba1c_2': 'Took HbA1c (% of Population)'})

In [8]:
# Accumulates one summary table per call to sub_df()
sub_dfs = []

# Make sub-df for T2DM: only rows whose recoded diabetes type is 'Type 2 Diabetes'
df_t2dm = df_input[df_input['diabetes_type'] == 'Type 2 Diabetes']

def sub_df(var, group):
    """Summarise the T2DM cohort by one variable and store the resulting table.

    Groups the module-level ``df_t2dm`` frame by ``var``, sums ``took_hba1c`` and
    ``Population`` within each group, and formats the HbA1c count as
    "count (percent of the group's population)".

    Parameters
    ----------
    var : str
        Column of ``df_t2dm`` to group by; it becomes the 'Category' column.
    group : str
        Label written to the 'Characteristic' column of every output row.

    Returns
    -------
    None
        The table is appended to the module-level list ``sub_dfs``.
    """
    counts = (
        df_t2dm[[var, 'took_hba1c', 'Population']]
        .groupby(var)
        .sum()
        .reset_index()
    )

    # Percentage of each group's population with an HbA1c test, to one decimal place
    pct_tested = round((counts['took_hba1c'] / counts['Population']) * 100, 1)

    table = pd.DataFrame({
        'Characteristic': group,
        'Category': counts[var],
        'Population': counts['Population'],
        'Took HbA1c (% of Population)':
            counts['took_hba1c'].astype(str) + " (" + pct_tested.astype(str) + ")",
    }, index=counts.index)
    sub_dfs.append(table)

# Build one summary table per characteristic: (column in df_t2dm, display label).
# The order here is the order in which the tables are stored in sub_dfs.
for _column, _label in [
    ('age_group', 'Age Group'),
    ('sex', 'Sex'),
    ('ethnicity', 'Ethnicity'),
    ('region', 'Region'),
    ('imd', 'IMD'),
    ('learning_disability', 'Learning Disability'),
    ('mental_illness', 'Mental Illness'),
]:
    sub_df(_column, _label)

In [9]:
# Append
# Stack the T2DM total row and every per-characteristic table in a single pd.concat
# call (DataFrame.append was removed in pandas 2.0, and appending in a loop copies
# the growing frame on every iteration).
# NOTE: this cell reassigns df_t2dm_all, so re-running it without first re-running
# the cell that creates df_t2dm_all stacks the tables a second time.
df_t2dm_all = pd.concat([df_t2dm_all, *sub_dfs])

# Column percentage: each row's population relative to the first row (the T2DM total)
df_t2dm_all['pct_pop'] = round((df_t2dm_all['Population']/df_t2dm_all['Population'].iloc[0])*100,1)
# Display string of the form "count (percent)"
df_t2dm_all['population_2'] = df_t2dm_all['Population'].astype(str) + " (" + df_t2dm_all['pct_pop'].astype(str) + ")"

# Presentation table: label columns plus the two formatted columns
df_by_demo = df_t2dm_all[['Characteristic','Category','population_2','Took HbA1c (% of Population)']].rename(
    columns={'population_2':'Population (Column %)'}).reset_index(drop=True)
display(df_by_demo)

Unnamed: 0,Characteristic,Category,Population (Column %),Took HbA1c (% of Population)
0,Diabetes Status,Type 2 Diabetes,19091 (100.0),1888 (9.9)
1,Age Group,0-15,3723 (19.5),389 (10.4)
2,Age Group,16-24,1858 (9.7),177 (9.5)
3,Age Group,25-34,1885 (9.9),188 (10.0)
4,Age Group,35-44,2882 (15.1),280 (9.7)
5,Age Group,45-54,1912 (10.0),180 (9.4)
6,Age Group,55-64,1926 (10.1),197 (10.2)
7,Age Group,65-74,1944 (10.2),180 (9.3)
8,Age Group,75+,2579 (13.5),258 (10.0)
9,Age Group,missing,382 (2.0),39 (10.2)
