Merge pull request #6 from opensafely/charts

Tables and charts
opensafely · Jun 13, 2023 · 5167dc7 · 5167dc7
2 parents 94f45ce + 8b15894
commit 5167dc7
Show file tree

Hide file tree

Showing 15 changed files with 2,455 additions and 58 deletions.
diff --git a/analysis/data_presentation.py b/analysis/data_presentation.py
@@ -5,92 +5,136 @@
 import pandas as pd
 import numpy as np
 import os
-import matplotlib
 import matplotlib.pyplot as plt
 
+
+custom_sort = ["Glomerular filtration rate",
+                "GFR calculated by abbreviated MDRD",
+                "EGFR calculated using creatinine CKDEC equation",
+                "Nucleated red blood cell count",
+                "Rheumatoid factor",
+                "Tissue transglutaminase immunoglobulin A level",
+                "Plasma C reactive protein",
+                "Urine albumin level",
+                "Urine microalbumin level",
+                "Quantitative faecal immunochemical test",
+                "Faecal calprotectin content"]
+
 BASE_DIR = Path(__file__).parents[1]
-OUTPUT_DIR = BASE_DIR / "output"
+INPUT_DIR = BASE_DIR / "released_outputs"
+OUTPUT_DIR = BASE_DIR / "publication_outputs" 
+if not os.path.exists(OUTPUT_DIR):
+   os.makedirs(OUTPUT_DIR)
 
 # import path test codelist to get code descriptions
-path_tests = pd.read_csv(os.path.join(f'codelists/user-helen-curtis-tests-with-comparators.csv'))
+path_tests = pd.read_csv(os.path.join(f'codelists/user-helen-curtis-tests-with-comparators.csv')).set_index("code")
 # replace very long names with shorter ones
-path_tests.loc[path_tests["code"]==1011481000000105, "term"] = "EGFR calculated using creatinine CKDEC equation"
-path_tests.loc[path_tests["code"]==1020291000000106, "term"] = "GFR calculated by abbreviated MDRD"
-#measures_df = measures_df.merge(path_tests, on="code")
+path_tests.loc[1011481000000105, "term"] = "EGFR calculated using creatinine CKDEC equation"
+path_tests.loc[1020291000000106, "term"] = "GFR calculated by abbreviated MDRD"
 
 # # 1. comparator_rate_per_test
-#def simple_plot(factor=None):
 # # load data 
-# df = pd.read_csv(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test.csv')).sort_values(by="denominator", ascending=False)
-
-# fig, ax1 = plt.subplots()
-# # plot numerator and denominator as bars
-# ax1.bar(df["description"], df["denominator"])
-# ax1.bar(df["description"], df["numerator"])
-# ax1.set_ylabel("N")
-# ax1.legend(title='Legend', labels=["Without comparators", "With comparators"], bbox_to_anchor=(1.1, 0.5))
+df = pd.read_csv(os.path.join(INPUT_DIR, 'comparator_rate_per_test.csv'), index_col=0).sort_values(by="denominator", ascending=False)
+df = df.join(path_tests)
+df.to_csv(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test_with_names.csv'))
 
-# ax2 = ax1.twinx()
-# ax2.scatter(df["description"], df["rate"], c='k', marker='x')
-# ax2.set_ylim([0, 1])
-# ax2.set_ylabel("proportion with comparator")
-# plt.xticks(rotation=45)
-# plt.savefig(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test.png'))
 
 
 # 2. comparator_rate_per_test by region
 # load data 
 def rate_breakdown(factor="region", indices=2):
     '''Produce a chart by tests and by specified factor (region, comparator,)'''
 
-    df = pd.read_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_{factor}_per_test.csv'), index_col=[0,1])
-    #flatten index
-    df.index = df.index.map('{0[0]}|{0[1]}'.format) 
-
-    #df = df.set_index("item", append=True)
-    df.index = df.index.rename("item")
-    print(df)
-
-    # keep "rate" only (i.e. proportion with comparators) and rearrange dataframe
-    idx = pd.IndexSlice
-    df = df.loc[idx[:, 'rate'], :]
-    df = df.stack().unstack(1).reset_index()
-    df = df.rename(columns={'level_1':factor})
+    df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_{factor}_per_test.csv'), index_col=[0,1])
+    df = df.stack().unstack(level=1).rename(columns={"flag":"total"})
+    df = df.join(path_tests, on=None)
+
+    # filter tests to only keep regions with >=100 tests
+    df = df.loc[df["total"]>=100]
+    df.to_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_{factor}_per_test_with_names.csv'))
+
+    df2 = df.copy()[["rate","term"]].reset_index().drop("code", 1)
+    df2 = df2.rename(columns={"level_1":factor})
+    df2 = df2.set_index(["term",factor]).unstack()
+    df2.columns = df2.columns.droplevel()
+    #df2["max"] = df2.max(axis=1)
+
+    # sort tests:
+    #df2 = df2.sort_values(by="max").drop("max", axis=1)
+    df2 = df2.transpose()[custom_sort].transpose()
 
-    # count tests and number of different values for specified factor
-    ntests = df["test"].nunique()
-    length = df[factor].nunique()
+    print(df2)
 
+    # plot heatmap    
     fig, ax = plt.subplots(figsize=(15,8))
+    plt.imshow(df2, cmap='autumn_r', interpolation='nearest')
+
+    plt.xticks(np.arange(0,9), df2.columns, size=8, rotation=90)
+    plt.yticks(np.arange(0,len(df2.index)), df2.index, size=8)
+    plt.colorbar()
+
+    plt.tight_layout()
+    plt.savefig(os.path.join(OUTPUT_DIR, f'comparator_rate_per_{factor}_per_test.png'))
+    plt.show()
 
-    # create a number sequence with the number of points on x axis required (allowing for space between groups)
-    ind = np.arange(0,df["test"].count()+ df["test"].nunique()-1)
-    # create a sequence of the positions where no data will be plotted (ie. the gaps between categories)
-    spaces = np.arange(length,len(ind), length+1)
-    new_ind = np.delete(ind, spaces)
 
-    width = 0.5 # bar width
+rate_breakdown(factor="region")
+
+
+######### Comparators
+df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_comparator_per_test.csv'), index_col=0)
+df = df.join(path_tests) # join test names
+df = df[["term", "<=_rate", ">=_rate"]]
+df.to_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_comparator_per_test_with_names.csv'))
+
+
+######### Values
+df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_value_per_test.csv'), index_col=[0,1])
+df = df.join(path_tests) # join test names
+
+# make two different output dfs (one for tests with mostly greater-than comparators, and one for less-than)
+df_out_gt = pd.DataFrame(columns=["num_value","term","total",">=",">=_rate"])
+df_out_lt = pd.DataFrame(columns=["num_value","term","total","<=","<=_rate"])
+
+for n, t in enumerate(custom_sort):
+    if n<3:
+        # for GFR tests include only greater-than comparators
+        main_comparator = ">="
+    else:
+        main_comparator = "<="
+
+    main_comparator_rate = f"{main_comparator}_rate"
+    comparator_subset = [main_comparator_rate,"="]
+    df_t = df.loc[df["term"]==t].sort_values(by=main_comparator_rate)
+    df_t = df_t[["term", "total", main_comparator, main_comparator_rate]]
 
-    for n, r in enumerate(df[factor].unique()):
-        # filter df
-        dfp = df.loc[df[factor]==r]
+    # filter out low count values / those not associated with comparators
+    df_t = df_t.loc[(df_t["total"]>=100)]
+    df_t = df_t.loc[(df_t[main_comparator_rate]>0)]
+    df_t = df_t.reset_index().set_index("code")
 
-        # starting from n, plot data every 'n'th position
-        locs = new_ind[n::length]
-        # plot chart
-        plt.bar(locs, dfp["value"], width, label=r)
+    if n<3:
+        df_out_gt = df_out_gt.append(df_t)
+    else:
+        df_out_lt = df_out_lt.append(df_t)
 
-    plt.legend(loc="lower right")
-    plt.xticks(new_ind[4::length], path_tests, size=8, rotation=45)
-    plt.xlabel("Test code", size=20)
-    plt.ylabel("Proportion with comparators", size=16)
+df_out_gt.to_csv(os.path.join(OUTPUT_DIR, f'gfr_tests_comparator_associated_values_with_names.csv'), index=False)
+df_out_lt.to_csv(os.path.join(OUTPUT_DIR, f'all_non_gfr_tests_comparator_associated_values_with_names.csv'), index=False)
 
 
-    plt.savefig(os.path.join(OUTPUT_DIR, f'comparator_rate_per_{factor}_per_test.png'))
 
-rate_breakdown(factor="region")
-#rate_breakdown(factor="value")
-#rate_breakdown(factor="comparator")
 
+######### Upper and lower limits (reference values returned alongside tests)
+for limit in ["upper", "lower"]:
+    df = pd.read_csv(os.path.join(INPUT_DIR, f'{limit}_bound_rate_per_test.csv'), index_col=[0,1])
+    df = df.join(path_tests) # join test names
+
+    # calculate rank for each limit value to find top 2 only
+    df = df.reset_index()
+    df["rank"] = df.groupby('code')['rate'].rank(method='dense', ascending=False).astype(int)
+    df = df[["code","term","rank",limit,"rate"]].loc[df["rank"]<3]
+    df = df.set_index(["code","term","rank"]).stack().unstack(level=2).unstack()
+
+    df.to_csv(os.path.join(OUTPUT_DIR, f'all_tests_{limit}_limit_with_names.csv'))
 
 
diff --git a/publication_outputs/all_non_gfr_tests_comparator_associated_values_with_names.csv b/publication_outputs/all_non_gfr_tests_comparator_associated_values_with_names.csv
@@ -0,0 +1,49 @@
+num_value,term,total,<=,<=_rate
+0.2,Nucleated red blood cell count,12060.0,11970.0,0.9925373134328358
+0.5,Nucleated red blood cell count,6630.0,6610.0,0.9969834087481146
+10.5,Rheumatoid factor,280.0,30.0,0.10714285714285714
+11.0,Rheumatoid factor,280.0,160.0,0.5714285714285714
+9.0,Rheumatoid factor,260.0,160.0,0.6153846153846154
+12.0,Rheumatoid factor,260.0,180.0,0.6923076923076923
+13.0,Rheumatoid factor,190.0,140.0,0.7368421052631579
+10.0,Rheumatoid factor,2120.0,1930.0,0.9103773584905659
+7.0,Rheumatoid factor,380.0,350.0,0.9210526315789472
+20.0,Rheumatoid factor,1030.0,1010.0,0.9805825242718448
+8.4,Rheumatoid factor,210.0,210.0,1.0
+0.0,Tissue transglutaminase immunoglobulin A level,368640.0,10.0,2.7126736111111113e-05
+2.0,Tissue transglutaminase immunoglobulin A level,170.0,10.0,0.05882352941176471
+0.2,Tissue transglutaminase immunoglobulin A level,780.0,300.0,0.38461538461538464
+1.0,Tissue transglutaminase immunoglobulin A level,1980.0,1300.0,0.6565656565656566
+0.1,Tissue transglutaminase immunoglobulin A level,390.0,270.0,0.6923076923076923
+0.5,Tissue transglutaminase immunoglobulin A level,1680.0,1180.0,0.7023809523809523
+1.9,Tissue transglutaminase immunoglobulin A level,300.0,260.0,0.8666666666666667
+5.0,Tissue transglutaminase immunoglobulin A level,230.0,220.0,0.9565217391304348
+4.0,Plasma C reactive protein,510.0,90.0,0.17647058823529413
+1.0,Plasma C reactive protein,2100.0,1370.0,0.6523809523809524
+5.0,Plasma C reactive protein,3580.0,3170.0,0.8854748603351955
+0.5,Plasma C reactive protein,400.0,370.0,0.925
+6.0,Urine albumin level,580.0,250.0,0.4310344827586207
+10.0,Urine albumin level,630.0,350.0,0.5555555555555556
+4.0,Urine albumin level,700.0,440.0,0.6285714285714286
+7.0,Urine albumin level,2000.0,1600.0,0.8
+6.6,Urine albumin level,450.0,370.0,0.8222222222222222
+3.2,Urine albumin level,640.0,540.0,0.84375
+5.0,Urine albumin level,2570.0,2180.0,0.8482490272373541
+3.0,Urine albumin level,4200.0,3860.0,0.919047619047619
+6.0,Urine microalbumin level,550.0,270.0,0.4909090909090909
+7.0,Urine microalbumin level,790.0,570.0,0.7215189873417721
+5.0,Urine microalbumin level,1550.0,1120.0,0.7225806451612903
+2.0,Urine microalbumin level,520.0,410.0,0.7884615384615384
+3.0,Urine microalbumin level,2000.0,1760.0,0.88
+6.0,Quantitative faecal immunochemical test,120.0,40.0,0.3333333333333333
+4.0,Quantitative faecal immunochemical test,1100.0,990.0,0.9
+10.0,Quantitative faecal immunochemical test,970.0,930.0,0.9587628865979382
+2.0,Quantitative faecal immunochemical test,1480.0,1420.0,0.9594594594594594
+7.0,Quantitative faecal immunochemical test,1990.0,1920.0,0.9648241206030149
+10.0,Faecal calprotectin content,100.0,60.0,0.6
+5.0,Faecal calprotectin content,100.0,70.0,0.7
+15.0,Faecal calprotectin content,130.0,100.0,0.7692307692307693
+4.0,Faecal calprotectin content,110.0,100.0,0.9090909090909092
+26.0,Faecal calprotectin content,400.0,370.0,0.925
+30.0,Faecal calprotectin content,560.0,520.0,0.9285714285714286
+20.0,Faecal calprotectin content,430.0,400.0,0.9302325581395348
diff --git a/publication_outputs/all_tests_lower_limit_with_names.csv b/publication_outputs/all_tests_lower_limit_with_names.csv
@@ -0,0 +1,14 @@
+rank,,1,1,2,2
+,,lower,rate,lower,rate
+code,term,,,,
+80274001,Glomerular filtration rate,0.0,0.6551724137931034,60.0,0.3448275862068966
+992081000000101,Rheumatoid factor,0.0,0.9852941176470588,10.0,0.013071895424836602
+999651000000107,Plasma C reactive protein,0.0,0.8697604790419161,0.1,0.13023952095808386
+1002571000000102,Faecal calprotectin content,0.0,1.0,,
+1003301000000109,Urine albumin level,0.0,0.9569300518134716,3.0,0.02590673575129533
+1010251000000109,Urine microalbumin level,0.0,0.9986149584487536,0.2,0.0013850415512465374
+1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.5763478945297127,60.0,0.24330972058244785
+1013671000000106,Tissue transglutaminase immunoglobulin A level,0.0,0.8983425414364641,0.1,0.055248618784530384
+1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.7903539759209862,60.0,0.09473001225578544
+1022461000000100,Nucleated red blood cell count,0.0,1.0,,
+1049361000000101,Quantitative faecal immunochemical test,0.0,1.0,,
diff --git a/publication_outputs/all_tests_upper_limit_with_names.csv b/publication_outputs/all_tests_upper_limit_with_names.csv
@@ -0,0 +1,14 @@
+rank,,1,1,2,2
+,,upper,rate,upper,rate
+code,term,,,,
+80274001,Glomerular filtration rate,0.0,0.6551724137931034,150.0,0.3448275862068966
+992081000000101,Rheumatoid factor,14.0,0.4599018003273322,15.0,0.10474631751227496
+999651000000107,Plasma C reactive protein,10.0,0.4801795063575168,5.0,0.4637247569184742
+1002571000000102,Faecal calprotectin content,50.0,0.5181347150259067,0.0,0.2176165803108808
+1003301000000109,Urine albumin level,0.0,0.7924894787957267,20.0,0.11395273551311105
+1010251000000109,Urine microalbumin level,0.0,0.8400277008310251,20.0,0.1523545706371191
+1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.7519307393378917,200.0,0.12504304195976193
+1013671000000106,Tissue transglutaminase immunoglobulin A level,6.9,0.2560706401766004,7.0,0.12472406181015452
+1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.9286332179930796,500.0,0.030493079584775082
+1022461000000100,Nucleated red blood cell count,0.2,0.551660516605166,0.5,0.3058118081180812
+1049361000000101,Quantitative faecal immunochemical test,10.0,0.4625850340136055,0.0,0.4462585034013605
diff --git a/publication_outputs/comparator_rate_by_comparator_per_test_with_names.csv b/publication_outputs/comparator_rate_by_comparator_per_test_with_names.csv
@@ -0,0 +1,12 @@
+,term,<=_rate,>=_rate
+1002571000000102,Faecal calprotectin content,0.32068965517241377,0.013793103448275862
+1003301000000109,Urine albumin level,0.3107801877630301,0.0012949174490126255
+1010251000000109,Urine microalbumin level,0.2853185595567867,0.0041551246537396115
+1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.3132962762555955
+1013671000000106,Tissue transglutaminase immunoglobulin A level,0.3907284768211921,0.004415011037527594
+1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.32840663302090844
+1022461000000100,Nucleated red blood cell count,0.8566159520516367,
+1049361000000101,Quantitative faecal immunochemical test,0.7210884353741497,0.0217687074829932
+80274001,Glomerular filtration rate,,0.4482758620689655
+992081000000101,Rheumatoid factor,0.7058823529411765,0.0016339869281045752
+999651000000107,Plasma C reactive protein,0.37471952131638,