In [25]:
import glob
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import math
import numpy as np
import os
import seaborn as sns
from IPython.display import display, Markdown

pd.options.mode.chained_assignment = None

%matplotlib inline
plt.rcParams["figure.figsize"] = [16, 12]

# Folder the summary CSVs are read from.
path = "../output/released"
# Second output folder (not referenced in the cells shown here).
path2 = "../output/validation/tables"

# Raw ethnicity subcategory code -> display label.
# Several raw codes share a label, so aliases are grouped together.
ethnicity_dict = {
    **dict.fromkeys(["True", "1"], "White"),
    "2": "Mixed",
    "3": "Asian",
    "4": "Black",
    "5": "Other",
    **dict.fromkeys([np.nan, "0"], "Unknown"),
}

# Raw IMD subcategory code -> display label.
imd_dict = {
    **dict.fromkeys(["False", "0"], "Unknown"),
    **dict.fromkeys(["1", "True"], "1 Most deprived"),
    **{quintile: quintile for quintile in ("2", "3", "4")},
    "5": "5 Least deprived",
}

### Out-of-range Values

The monthly means of the CTV3-calculated and SNOMED-calculated BMI derivations were often unexpectedly high, with spikes at particular time points. For instance, the mean SNOMED-calculated BMI was 1.8 million in July 2015. To investigate how many unexpected BMI values each definition contained, values below 4 or above 200 were counted as out-of-range. A standard BMI chart covers values from 9 to 65, but we deliberately chose wider cut-offs so that only implausibly extreme values are flagged. The mean of the out-of-range values was also computed for each definition to gauge the magnitude of the errors.

In [26]:
def display_oob(unit, scale):
    """Load, tidy, export and display one out-of-range BMI summary table.

    Reads ``{path}/{unit}.csv``, relabels the ethnicity and IMD
    subcategories, title-cases the category/subcategory labels, puts the
    population row first, formats the count and mean columns for
    presentation, writes the formatted table to
    ``{path}/made_locally/local_{unit}.csv`` and displays it.

    Parameters
    ----------
    unit : str
        Stem of the input CSV under ``path`` (e.g. ``"greater_than_max"``);
        also used to name the exported file.
    scale : str
        Formatting applied to columns whose name contains ``"mean"``:
        ``"greater"`` renders whole numbers with thousands separators,
        ``"less"`` rounds to two decimal places. Any other value leaves
        the mean columns unformatted.

    Returns
    -------
    None
        The table is written to disk and displayed, not returned.
    """

    def _format_present(series, formatter):
        # Apply `formatter` to non-null cells only; null cells stay NaN so
        # they can be rendered as "-" in one place further down.
        return series.apply(
            lambda cell: np.nan if pd.isnull(cell) else formatter(cell)
        )

    df_ct = pd.read_csv(f"{path}/{unit}.csv", index_col=0)

    # Only the population row gets a sort key. Every other row keeps NaN,
    # which sort_values() places last, so the population row leads the table.
    df_ct.loc[df_ct["category"] == "population", "sort"] = 1

    # Rename subcategories
    for raw_category, labels in (("ethnicity", ethnicity_dict), ("imd", imd_dict)):
        in_category = df_ct["category"] == raw_category
        # A code missing from the lookup maps to NaN; label it "Unknown"
        # (the same label the lookups use for their own unknown codes)
        # instead of letting a NaN reach the label clean-up below.
        df_ct.loc[in_category, "subcategory"] = (
            df_ct.loc[in_category, "subcategory"].map(labels).fillna("Unknown")
        )

    # Clean category: "age_band" -> "Age Band". The .str accessor skips
    # missing labels rather than raising on them.
    for label_col in ("category", "subcategory"):
        df_ct[label_col] = (
            df_ct[label_col].str.replace("_", " ", regex=False).str.title()
        )
    # Title-casing turns the acronym into "Imd"; restore it.
    df_ct.loc[df_ct["category"] == "Imd", "category"] = "IMD"

    # Sort
    df_ct = (
        df_ct.sort_values(by=["sort", "category", "subcategory"])
        .drop(columns=["sort"])
        .rename(columns={"category": "Category", "subcategory": "Subcategory"})
        .set_index(["Category", "Subcategory"])
    )

    # "-" cells in the input are treated as missing while formatting and
    # written back as "-" afterwards.
    df_ct = df_ct.replace("-", np.nan)

    # Format column values
    for col in df_ct.columns:
        if "count" in col:
            df_ct[col] = _format_present(
                df_ct[col], lambda cell: "{:,.0f}".format(round(float(cell)))
            )
        if "mean" in col:
            if scale == "greater":
                df_ct[col] = _format_present(
                    df_ct[col], lambda cell: "{:,.0f}".format(float(cell))
                )
            if scale == "less":
                df_ct[col] = _format_present(
                    df_ct[col], lambda cell: round(float(cell), 2)
                )

    df_ct = df_ct.fillna("-")

    df_ct = df_ct.rename(
        columns={
            "count_derived_bmi": "Composite BMI Count",
            "mean_derived_bmi": "Composite BMI Mean",
            "count_recorded_bmi": "SNOMED-recorded BMI Count",
            "mean_recorded_bmi": "SNOMED-recorded BMI Mean",
            "count_computed_bmi": "SNOMED-calculated BMI Count",
            "mean_computed_bmi": "SNOMED-calculated BMI Mean",
            "count_backend_computed_bmi": "CTV3-calculated BMI Count",
            "mean_backend_computed_bmi": "CTV3-calculated BMI Mean",
        }
    )

    # Write next to the input data, creating the folder on first use so a
    # fresh checkout does not fail at to_csv().
    out_dir = f"{path}/made_locally"
    os.makedirs(out_dir, exist_ok=True)
    df_ct.to_csv(f"{out_dir}/local_{unit}.csv")
    display(df_ct)

In [27]:
# Out-of-range summary for the "greater_than_max" table; means shown as whole numbers.
display_oob(unit="greater_than_max", scale="greater")

Unnamed: 0_level_0,Unnamed: 1_level_0,Composite BMI Count,Composite BMI Mean,SNOMED-recorded BMI Count,SNOMED-recorded BMI Mean,CTV3-calculated BMI Count,CTV3-calculated BMI Mean,SNOMED-calculated BMI Count,SNOMED-calculated BMI Mean
Category,Subcategory,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Population,All,123265,13811688,8250,812168,127620,167655907,127675,167578597
Age Band,18-29,26580,376774,3460,1586504,21125,161195,21105,161198
Age Band,30-39,23820,189554,1715,180555,22395,38584597,22395,38581260
Age Band,40-49,18790,89137730,950,192588,19490,193697274,19505,193528729
Age Band,50-59,15710,213431,750,199504,17595,231337470,17605,231193047
Age Band,60-69,14800,276182,585,189942,17615,356421777,17650,355735737
Age Band,70-79,12805,245619,380,206865,16585,336145003,16585,336124778
Age Band,80+,10760,214397,410,924416,12815,64774618,12830,64714312
Age Band,Missing,-,-,-,-,-,-,-,-
Dementia,False,122475,13897094,8180,780682,126770,168777428,126825,168701735


In [28]:
# Out-of-range summary for the "less_than_min" table; means rounded to two decimals.
display_oob(unit="less_than_min", scale="less")

Unnamed: 0_level_0,Unnamed: 1_level_0,Composite BMI Count,Composite BMI Mean,SNOMED-recorded BMI Count,SNOMED-recorded BMI Mean,CTV3-calculated BMI Count,CTV3-calculated BMI Mean,SNOMED-calculated BMI Count,SNOMED-calculated BMI Mean
Category,Subcategory,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Population,All,149830,1.23,110505,1.9,145955,0.78,152620,0.47
Age Band,18-29,34745,1.56,25255,2.08,26280,1.36,27370,1.29
Age Band,30-39,29180,1.51,19455,1.9,26990,1.18,28025,1.01
Age Band,40-49,21510,1.33,16180,1.8,21950,1.06,23100,0.72
Age Band,50-59,19365,0.5,15330,1.75,20505,0.25,21435,0.21
Age Band,60-69,18125,1.37,14370,1.78,19430,0.7,20330,0.66
Age Band,70-79,15440,1.18,12050,1.89,17820,0.23,18570,-0.5
Age Band,80+,11475,0.46,7865,2.01,12985,0.01,13790,-1.22
Age Band,Missing,-,-,-,-,-,-,-,-
Dementia,False,149220,1.23,110075,1.89,145360,0.78,151975,0.47
