Skip to content

Commit

Permalink
Merge pull request #6 from opensafely/charts
Browse files Browse the repository at this point in the history
Tables and charts
  • Loading branch information
HelenCEBM committed Jun 13, 2023
2 parents 94f45ce + 8b15894 commit 5167dc7
Show file tree
Hide file tree
Showing 15 changed files with 2,455 additions and 58 deletions.
160 changes: 102 additions & 58 deletions analysis/data_presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,92 +5,136 @@
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt


custom_sort = ["Glomerular filtration rate",
"GFR calculated by abbreviated MDRD",
"EGFR calculated using creatinine CKDEC equation",
"Nucleated red blood cell count",
"Rheumatoid factor",
"Tissue transglutaminase immunoglobulin A level",
"Plasma C reactive protein",
"Urine albumin level",
"Urine microalbumin level",
"Quantitative faecal immunochemical test",
"Faecal calprotectin content"]

BASE_DIR = Path(__file__).parents[1]
OUTPUT_DIR = BASE_DIR / "output"
INPUT_DIR = BASE_DIR / "released_outputs"
OUTPUT_DIR = BASE_DIR / "publication_outputs"
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)

# import path test codelist to get code descriptions
path_tests = pd.read_csv(os.path.join(f'codelists/user-helen-curtis-tests-with-comparators.csv'))
path_tests = pd.read_csv(os.path.join(f'codelists/user-helen-curtis-tests-with-comparators.csv')).set_index("code")
# replace very long names with shorter ones
path_tests.loc[path_tests["code"]==1011481000000105, "term"] = "EGFR calculated using creatinine CKDEC equation"
path_tests.loc[path_tests["code"]==1020291000000106, "term"] = "GFR calculated by abbreviated MDRD"
#measures_df = measures_df.merge(path_tests, on="code")
path_tests.loc[1011481000000105, "term"] = "EGFR calculated using creatinine CKDEC equation"
path_tests.loc[1020291000000106, "term"] = "GFR calculated by abbreviated MDRD"

# # 1. comparator_rate_per_test
#def simple_plot(factor=None):
# # load data
# df = pd.read_csv(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test.csv')).sort_values(by="denominator", ascending=False)

# fig, ax1 = plt.subplots()
# # plot numerator and denominator as bars
# ax1.bar(df["description"], df["denominator"])
# ax1.bar(df["description"], df["numerator"])
# ax1.set_ylabel("N")
# ax1.legend(title='Legend', labels=["Without comparators", "With comparators"], bbox_to_anchor=(1.1, 0.5))
df = pd.read_csv(os.path.join(INPUT_DIR, 'comparator_rate_per_test.csv'), index_col=0).sort_values(by="denominator", ascending=False)
df = df.join(path_tests)
df.to_csv(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test_with_names.csv'))

# ax2 = ax1.twinx()
# ax2.scatter(df["description"], df["rate"], c='k', marker='x')
# ax2.set_ylim([0, 1])
# ax2.set_ylabel("proportion with comparator")
# plt.xticks(rotation=45)
# plt.savefig(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test.png'))


# 2. comparator_rate_per_test by region
# load data
def rate_breakdown(factor="region", indices=2):
'''Produce a chart by tests and by specified factor (region, comparator,)'''

df = pd.read_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_{factor}_per_test.csv'), index_col=[0,1])
#flatten index
df.index = df.index.map('{0[0]}|{0[1]}'.format)

#df = df.set_index("item", append=True)
df.index = df.index.rename("item")
print(df)

# keep "rate" only (i.e. proportion with comparators) and rearrange dataframe
idx = pd.IndexSlice
df = df.loc[idx[:, 'rate'], :]
df = df.stack().unstack(1).reset_index()
df = df.rename(columns={'level_1':factor})
df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_{factor}_per_test.csv'), index_col=[0,1])
df = df.stack().unstack(level=1).rename(columns={"flag":"total"})
df = df.join(path_tests, on=None)

# filter tests to only keep regions with >=100 tests
df = df.loc[df["total"]>=100]
df.to_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_{factor}_per_test_with_names.csv'))

df2 = df.copy()[["rate","term"]].reset_index().drop("code", 1)
df2 = df2.rename(columns={"level_1":factor})
df2 = df2.set_index(["term",factor]).unstack()
df2.columns = df2.columns.droplevel()
#df2["max"] = df2.max(axis=1)

# sort tests:
#df2 = df2.sort_values(by="max").drop("max", axis=1)
df2 = df2.transpose()[custom_sort].transpose()

# count tests and number of different values for specified factor
ntests = df["test"].nunique()
length = df[factor].nunique()
print(df2)

# plot heatmap
fig, ax = plt.subplots(figsize=(15,8))
plt.imshow(df2, cmap='autumn_r', interpolation='nearest')

plt.xticks(np.arange(0,9), df2.columns, size=8, rotation=90)
plt.yticks(np.arange(0,len(df2.index)), df2.index, size=8)
plt.colorbar()

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, f'comparator_rate_per_{factor}_per_test.png'))
plt.show()

# create a number sequence with the number of points on x axis required (allowing for space between groups)
ind = np.arange(0,df["test"].count()+ df["test"].nunique()-1)
# create a sequence of the positions where no data will be plotted (ie. the gaps between categories)
spaces = np.arange(length,len(ind), length+1)
new_ind = np.delete(ind, spaces)

width = 0.5 # bar width
rate_breakdown(factor="region")


######### Comparators
df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_comparator_per_test.csv'), index_col=0)
df = df.join(path_tests) # join test names
df = df[["term", "<=_rate", ">=_rate"]]
df.to_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_comparator_per_test_with_names.csv'))


######### Values
df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_value_per_test.csv'), index_col=[0,1])
df = df.join(path_tests) # join test names

# make two different output dfs (one for tests with mostly greater-than comparators, and one for less-than)
df_out_gt = pd.DataFrame(columns=["num_value","term","total",">=",">=_rate"])
df_out_lt = pd.DataFrame(columns=["num_value","term","total","<=","<=_rate"])

for n, t in enumerate(custom_sort):
if n<3:
# for GFR tests include only greater-than comparators
main_comparator = ">="
else:
main_comparator = "<="

main_comparator_rate = f"{main_comparator}_rate"
comparator_subset = [main_comparator_rate,"="]
df_t = df.loc[df["term"]==t].sort_values(by=main_comparator_rate)
df_t = df_t[["term", "total", main_comparator, main_comparator_rate]]

for n, r in enumerate(df[factor].unique()):
# filter df
dfp = df.loc[df[factor]==r]
# filter out low count values / those not associated with comparators
df_t = df_t.loc[(df_t["total"]>=100)]
df_t = df_t.loc[(df_t[main_comparator_rate]>0)]
df_t = df_t.reset_index().set_index("code")

# starting from n, plot data every 'n'th position
locs = new_ind[n::length]
# plot chart
plt.bar(locs, dfp["value"], width, label=r)
if n<3:
df_out_gt = df_out_gt.append(df_t)
else:
df_out_lt = df_out_lt.append(df_t)

plt.legend(loc="lower right")
plt.xticks(new_ind[4::length], path_tests, size=8, rotation=45)
plt.xlabel("Test code", size=20)
plt.ylabel("Proportion with comparators", size=16)
df_out_gt.to_csv(os.path.join(OUTPUT_DIR, f'gfr_tests_comparator_associated_values_with_names.csv'), index=False)
df_out_lt.to_csv(os.path.join(OUTPUT_DIR, f'all_non_gfr_tests_comparator_associated_values_with_names.csv'), index=False)


plt.savefig(os.path.join(OUTPUT_DIR, f'comparator_rate_per_{factor}_per_test.png'))

rate_breakdown(factor="region")
#rate_breakdown(factor="value")
#rate_breakdown(factor="comparator")

######### Upper and lower limits (reference values returned alongside tests)
for limit in ["upper", "lower"]:
df = pd.read_csv(os.path.join(INPUT_DIR, f'{limit}_bound_rate_per_test.csv'), index_col=[0,1])
df = df.join(path_tests) # join test names

# calculate rank for each limit value to find top 2 only
df = df.reset_index()
df["rank"] = df.groupby('code')['rate'].rank(method='dense', ascending=False).astype(int)
df = df[["code","term","rank",limit,"rate"]].loc[df["rank"]<3]
df = df.set_index(["code","term","rank"]).stack().unstack(level=2).unstack()

df.to_csv(os.path.join(OUTPUT_DIR, f'all_tests_{limit}_limit_with_names.csv'))


Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
num_value,term,total,<=,<=_rate
0.2,Nucleated red blood cell count,12060.0,11970.0,0.9925373134328358
0.5,Nucleated red blood cell count,6630.0,6610.0,0.9969834087481146
10.5,Rheumatoid factor,280.0,30.0,0.10714285714285714
11.0,Rheumatoid factor,280.0,160.0,0.5714285714285714
9.0,Rheumatoid factor,260.0,160.0,0.6153846153846154
12.0,Rheumatoid factor,260.0,180.0,0.6923076923076923
13.0,Rheumatoid factor,190.0,140.0,0.7368421052631579
10.0,Rheumatoid factor,2120.0,1930.0,0.9103773584905659
7.0,Rheumatoid factor,380.0,350.0,0.9210526315789472
20.0,Rheumatoid factor,1030.0,1010.0,0.9805825242718448
8.4,Rheumatoid factor,210.0,210.0,1.0
0.0,Tissue transglutaminase immunoglobulin A level,368640.0,10.0,2.7126736111111113e-05
2.0,Tissue transglutaminase immunoglobulin A level,170.0,10.0,0.05882352941176471
0.2,Tissue transglutaminase immunoglobulin A level,780.0,300.0,0.38461538461538464
1.0,Tissue transglutaminase immunoglobulin A level,1980.0,1300.0,0.6565656565656566
0.1,Tissue transglutaminase immunoglobulin A level,390.0,270.0,0.6923076923076923
0.5,Tissue transglutaminase immunoglobulin A level,1680.0,1180.0,0.7023809523809523
1.9,Tissue transglutaminase immunoglobulin A level,300.0,260.0,0.8666666666666667
5.0,Tissue transglutaminase immunoglobulin A level,230.0,220.0,0.9565217391304348
4.0,Plasma C reactive protein,510.0,90.0,0.17647058823529413
1.0,Plasma C reactive protein,2100.0,1370.0,0.6523809523809524
5.0,Plasma C reactive protein,3580.0,3170.0,0.8854748603351955
0.5,Plasma C reactive protein,400.0,370.0,0.925
6.0,Urine albumin level,580.0,250.0,0.4310344827586207
10.0,Urine albumin level,630.0,350.0,0.5555555555555556
4.0,Urine albumin level,700.0,440.0,0.6285714285714286
7.0,Urine albumin level,2000.0,1600.0,0.8
6.6,Urine albumin level,450.0,370.0,0.8222222222222222
3.2,Urine albumin level,640.0,540.0,0.84375
5.0,Urine albumin level,2570.0,2180.0,0.8482490272373541
3.0,Urine albumin level,4200.0,3860.0,0.919047619047619
6.0,Urine microalbumin level,550.0,270.0,0.4909090909090909
7.0,Urine microalbumin level,790.0,570.0,0.7215189873417721
5.0,Urine microalbumin level,1550.0,1120.0,0.7225806451612903
2.0,Urine microalbumin level,520.0,410.0,0.7884615384615384
3.0,Urine microalbumin level,2000.0,1760.0,0.88
6.0,Quantitative faecal immunochemical test,120.0,40.0,0.3333333333333333
4.0,Quantitative faecal immunochemical test,1100.0,990.0,0.9
10.0,Quantitative faecal immunochemical test,970.0,930.0,0.9587628865979382
2.0,Quantitative faecal immunochemical test,1480.0,1420.0,0.9594594594594594
7.0,Quantitative faecal immunochemical test,1990.0,1920.0,0.9648241206030149
10.0,Faecal calprotectin content,100.0,60.0,0.6
5.0,Faecal calprotectin content,100.0,70.0,0.7
15.0,Faecal calprotectin content,130.0,100.0,0.7692307692307693
4.0,Faecal calprotectin content,110.0,100.0,0.9090909090909092
26.0,Faecal calprotectin content,400.0,370.0,0.925
30.0,Faecal calprotectin content,560.0,520.0,0.9285714285714286
20.0,Faecal calprotectin content,430.0,400.0,0.9302325581395348
14 changes: 14 additions & 0 deletions publication_outputs/all_tests_lower_limit_with_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rank,,1,1,2,2
,,lower,rate,lower,rate
code,term,,,,
80274001,Glomerular filtration rate,0.0,0.6551724137931034,60.0,0.3448275862068966
992081000000101,Rheumatoid factor,0.0,0.9852941176470588,10.0,0.013071895424836602
999651000000107,Plasma C reactive protein,0.0,0.8697604790419161,0.1,0.13023952095808386
1002571000000102,Faecal calprotectin content,0.0,1.0,,
1003301000000109,Urine albumin level,0.0,0.9569300518134716,3.0,0.02590673575129533
1010251000000109,Urine microalbumin level,0.0,0.9986149584487536,0.2,0.0013850415512465374
1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.5763478945297127,60.0,0.24330972058244785
1013671000000106,Tissue transglutaminase immunoglobulin A level,0.0,0.8983425414364641,0.1,0.055248618784530384
1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.7903539759209862,60.0,0.09473001225578544
1022461000000100,Nucleated red blood cell count,0.0,1.0,,
1049361000000101,Quantitative faecal immunochemical test,0.0,1.0,,
14 changes: 14 additions & 0 deletions publication_outputs/all_tests_upper_limit_with_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rank,,1,1,2,2
,,upper,rate,upper,rate
code,term,,,,
80274001,Glomerular filtration rate,0.0,0.6551724137931034,150.0,0.3448275862068966
992081000000101,Rheumatoid factor,14.0,0.4599018003273322,15.0,0.10474631751227496
999651000000107,Plasma C reactive protein,10.0,0.4801795063575168,5.0,0.4637247569184742
1002571000000102,Faecal calprotectin content,50.0,0.5181347150259067,0.0,0.2176165803108808
1003301000000109,Urine albumin level,0.0,0.7924894787957267,20.0,0.11395273551311105
1010251000000109,Urine microalbumin level,0.0,0.8400277008310251,20.0,0.1523545706371191
1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.7519307393378917,200.0,0.12504304195976193
1013671000000106,Tissue transglutaminase immunoglobulin A level,6.9,0.2560706401766004,7.0,0.12472406181015452
1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.9286332179930796,500.0,0.030493079584775082
1022461000000100,Nucleated red blood cell count,0.2,0.551660516605166,0.5,0.3058118081180812
1049361000000101,Quantitative faecal immunochemical test,10.0,0.4625850340136055,0.0,0.4462585034013605
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
,term,<=_rate,>=_rate
1002571000000102,Faecal calprotectin content,0.32068965517241377,0.013793103448275862
1003301000000109,Urine albumin level,0.3107801877630301,0.0012949174490126255
1010251000000109,Urine microalbumin level,0.2853185595567867,0.0041551246537396115
1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.3132962762555955
1013671000000106,Tissue transglutaminase immunoglobulin A level,0.3907284768211921,0.004415011037527594
1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.32840663302090844
1022461000000100,Nucleated red blood cell count,0.8566159520516367,
1049361000000101,Quantitative faecal immunochemical test,0.7210884353741497,0.0217687074829932
80274001,Glomerular filtration rate,,0.4482758620689655
992081000000101,Rheumatoid factor,0.7058823529411765,0.0016339869281045752
999651000000107,Plasma C reactive protein,0.37471952131638,

0 comments on commit 5167dc7

Please sign in to comment.