Skip to content

Commit

Permalink
Adjust fit of axis labels in figure
Browse files Browse the repository at this point in the history
Also save all processed outputs to new folder for publication
  • Loading branch information
HelenCEBM committed Jun 8, 2023
1 parent 75c22cd commit d9f1972
Show file tree
Hide file tree
Showing 9 changed files with 206 additions and 9 deletions.
23 changes: 14 additions & 9 deletions analysis/data_presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
"Faecal calprotectin content"]

BASE_DIR = Path(__file__).parents[1]
OUTPUT_DIR = BASE_DIR / "output/released_outputs"
INPUT_DIR = BASE_DIR / "output/released_outputs"
OUTPUT_DIR = BASE_DIR / "publication_outputs"
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)

# import path test codelist to get code descriptions
path_tests = pd.read_csv(os.path.join(f'codelists/user-helen-curtis-tests-with-comparators.csv')).set_index("code")
Expand All @@ -34,7 +37,7 @@
# # 1. comparator_rate_per_test
#def simple_plot(factor=None):
# # load data
df = pd.read_csv(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test.csv'), index_col=0).sort_values(by="denominator", ascending=False)
df = pd.read_csv(os.path.join(INPUT_DIR, 'comparator_rate_per_test.csv'), index_col=0).sort_values(by="denominator", ascending=False)
df = df.join(path_tests)
df.to_csv(os.path.join(OUTPUT_DIR, 'comparator_rate_per_test_with_names.csv'))

Expand All @@ -58,7 +61,7 @@
def rate_breakdown(factor="region", indices=2):
'''Produce a chart by tests and by specified factor (region, comparator,)'''

df = pd.read_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_{factor}_per_test.csv'), index_col=[0,1])
df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_{factor}_per_test.csv'), index_col=[0,1])
df = df.stack().unstack(level=1).rename(columns={"flag":"total"})
df = df.join(path_tests, on=None)

Expand All @@ -70,7 +73,7 @@ def rate_breakdown(factor="region", indices=2):
df2 = df2.rename(columns={"level_1":factor})
df2 = df2.set_index(["term",factor]).unstack()
df2.columns = df2.columns.droplevel()
df2["max"] = df2.max(axis=1)
#df2["max"] = df2.max(axis=1)

# sort tests:
#df2 = df2.sort_values(by="max").drop("max", axis=1)
Expand All @@ -86,6 +89,8 @@ def rate_breakdown(factor="region", indices=2):
plt.yticks(np.arange(0,len(df2.index)), df2.index, size=8)
plt.colorbar()

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, f'comparator_rate_per_{factor}_per_test.png'))
plt.show()

# # count tests and number of different values for specified factor
Expand Down Expand Up @@ -118,20 +123,20 @@ def rate_breakdown(factor="region", indices=2):
# plt.xlabel("Test code", size=20)
# plt.ylabel("Proportion with comparators", size=16)

plt.savefig(os.path.join(OUTPUT_DIR, f'comparator_rate_per_{factor}_per_test.png'))


#rate_breakdown(factor="region")
rate_breakdown(factor="region")


######### Comparators
df = pd.read_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_comparator_per_test.csv'), index_col=0)
df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_comparator_per_test.csv'), index_col=0)
df = df.join(path_tests) # join test names
df = df[["term", "<=_rate", ">=_rate"]]
df.to_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_comparator_per_test_with_names.csv'))


######### Values
df = pd.read_csv(os.path.join(OUTPUT_DIR, f'comparator_rate_by_value_per_test.csv'), index_col=[0,1])
df = pd.read_csv(os.path.join(INPUT_DIR, f'comparator_rate_by_value_per_test.csv'), index_col=[0,1])
df = df.join(path_tests) # join test names

# make two different output dfs (one for tests with mostly greater-than comparators, and one for less-than)
Expand Down Expand Up @@ -168,7 +173,7 @@ def rate_breakdown(factor="region", indices=2):

######### Upper and lower limits (reference values returned alongside tests)
for limit in ["upper", "lower"]:
df = pd.read_csv(os.path.join(OUTPUT_DIR, f'{limit}_bound_rate_per_test.csv'), index_col=[0,1])
df = pd.read_csv(os.path.join(INPUT_DIR, f'{limit}_bound_rate_per_test.csv'), index_col=[0,1])
df = df.join(path_tests) # join test names

# calculate rank for each limit value to find top 2 only
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
num_value,term,total,<=,<=_rate
0.2,Nucleated red blood cell count,12060.0,11970.0,0.9925373134328358
0.5,Nucleated red blood cell count,6630.0,6610.0,0.9969834087481146
10.5,Rheumatoid factor,280.0,30.0,0.10714285714285714
11.0,Rheumatoid factor,280.0,160.0,0.5714285714285714
9.0,Rheumatoid factor,260.0,160.0,0.6153846153846154
12.0,Rheumatoid factor,260.0,180.0,0.6923076923076923
13.0,Rheumatoid factor,190.0,140.0,0.7368421052631579
10.0,Rheumatoid factor,2120.0,1930.0,0.9103773584905659
7.0,Rheumatoid factor,380.0,350.0,0.9210526315789472
20.0,Rheumatoid factor,1030.0,1010.0,0.9805825242718448
8.4,Rheumatoid factor,210.0,210.0,1.0
0.0,Tissue transglutaminase immunoglobulin A level,368640.0,10.0,2.7126736111111113e-05
2.0,Tissue transglutaminase immunoglobulin A level,170.0,10.0,0.05882352941176471
0.2,Tissue transglutaminase immunoglobulin A level,780.0,300.0,0.38461538461538464
1.0,Tissue transglutaminase immunoglobulin A level,1980.0,1300.0,0.6565656565656566
0.1,Tissue transglutaminase immunoglobulin A level,390.0,270.0,0.6923076923076923
0.5,Tissue transglutaminase immunoglobulin A level,1680.0,1180.0,0.7023809523809523
1.9,Tissue transglutaminase immunoglobulin A level,300.0,260.0,0.8666666666666667
5.0,Tissue transglutaminase immunoglobulin A level,230.0,220.0,0.9565217391304348
4.0,Plasma C reactive protein,510.0,90.0,0.17647058823529413
1.0,Plasma C reactive protein,2100.0,1370.0,0.6523809523809524
5.0,Plasma C reactive protein,3580.0,3170.0,0.8854748603351955
0.5,Plasma C reactive protein,400.0,370.0,0.925
6.0,Urine albumin level,580.0,250.0,0.4310344827586207
10.0,Urine albumin level,630.0,350.0,0.5555555555555556
4.0,Urine albumin level,700.0,440.0,0.6285714285714286
7.0,Urine albumin level,2000.0,1600.0,0.8
6.6,Urine albumin level,450.0,370.0,0.8222222222222222
3.2,Urine albumin level,640.0,540.0,0.84375
5.0,Urine albumin level,2570.0,2180.0,0.8482490272373541
3.0,Urine albumin level,4200.0,3860.0,0.919047619047619
6.0,Urine microalbumin level,550.0,270.0,0.4909090909090909
7.0,Urine microalbumin level,790.0,570.0,0.7215189873417721
5.0,Urine microalbumin level,1550.0,1120.0,0.7225806451612903
2.0,Urine microalbumin level,520.0,410.0,0.7884615384615384
3.0,Urine microalbumin level,2000.0,1760.0,0.88
6.0,Quantitative faecal immunochemical test,120.0,40.0,0.3333333333333333
4.0,Quantitative faecal immunochemical test,1100.0,990.0,0.9
10.0,Quantitative faecal immunochemical test,970.0,930.0,0.9587628865979382
2.0,Quantitative faecal immunochemical test,1480.0,1420.0,0.9594594594594594
7.0,Quantitative faecal immunochemical test,1990.0,1920.0,0.9648241206030149
10.0,Faecal calprotectin content,100.0,60.0,0.6
5.0,Faecal calprotectin content,100.0,70.0,0.7
15.0,Faecal calprotectin content,130.0,100.0,0.7692307692307693
4.0,Faecal calprotectin content,110.0,100.0,0.9090909090909092
26.0,Faecal calprotectin content,400.0,370.0,0.925
30.0,Faecal calprotectin content,560.0,520.0,0.9285714285714286
20.0,Faecal calprotectin content,430.0,400.0,0.9302325581395348
14 changes: 14 additions & 0 deletions publication_outputs/all_tests_lower_limit_with_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rank,,1,1,2,2
,,lower,rate,lower,rate
code,term,,,,
80274001,Glomerular filtration rate,0.0,0.6551724137931034,60.0,0.3448275862068966
992081000000101,Rheumatoid factor,0.0,0.9852941176470588,10.0,0.013071895424836602
999651000000107,Plasma C reactive protein,0.0,0.8697604790419161,0.1,0.13023952095808386
1002571000000102,Faecal calprotectin content,0.0,1.0,,
1003301000000109,Urine albumin level,0.0,0.9569300518134716,3.0,0.02590673575129533
1010251000000109,Urine microalbumin level,0.0,0.9986149584487536,0.2,0.0013850415512465374
1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.5763478945297127,60.0,0.24330972058244785
1013671000000106,Tissue transglutaminase immunoglobulin A level,0.0,0.8983425414364641,0.1,0.055248618784530384
1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.7903539759209862,60.0,0.09473001225578544
1022461000000100,Nucleated red blood cell count,0.0,1.0,,
1049361000000101,Quantitative faecal immunochemical test,0.0,1.0,,
14 changes: 14 additions & 0 deletions publication_outputs/all_tests_upper_limit_with_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
rank,,1,1,2,2
,,upper,rate,upper,rate
code,term,,,,
80274001,Glomerular filtration rate,0.0,0.6551724137931034,150.0,0.3448275862068966
992081000000101,Rheumatoid factor,14.0,0.4599018003273322,15.0,0.10474631751227496
999651000000107,Plasma C reactive protein,10.0,0.4801795063575168,5.0,0.4637247569184742
1002571000000102,Faecal calprotectin content,50.0,0.5181347150259067,0.0,0.2176165803108808
1003301000000109,Urine albumin level,0.0,0.7924894787957267,20.0,0.11395273551311105
1010251000000109,Urine microalbumin level,0.0,0.8400277008310251,20.0,0.1523545706371191
1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.7519307393378917,200.0,0.12504304195976193
1013671000000106,Tissue transglutaminase immunoglobulin A level,6.9,0.2560706401766004,7.0,0.12472406181015452
1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.9286332179930796,500.0,0.030493079584775082
1022461000000100,Nucleated red blood cell count,0.2,0.551660516605166,0.5,0.3058118081180812
1049361000000101,Quantitative faecal immunochemical test,10.0,0.4625850340136055,0.0,0.4462585034013605
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
,term,<=_rate,>=_rate
1002571000000102,Faecal calprotectin content,0.32068965517241377,0.013793103448275862
1003301000000109,Urine albumin level,0.3107801877630301,0.0012949174490126255
1010251000000109,Urine microalbumin level,0.2853185595567867,0.0041551246537396115
1011481000000105,EGFR calculated using creatinine CKDEC equation,0.0,0.3132962762555955
1013671000000106,Tissue transglutaminase immunoglobulin A level,0.3907284768211921,0.004415011037527594
1020291000000106,GFR calculated by abbreviated MDRD,0.0,0.32840663302090844
1022461000000100,Nucleated red blood cell count,0.8566159520516367,
1049361000000101,Quantitative faecal immunochemical test,0.7210884353741497,0.0217687074829932
80274001,Glomerular filtration rate,,0.4482758620689655
992081000000101,Rheumatoid factor,0.7058823529411765,0.0016339869281045752
999651000000107,Plasma C reactive protein,0.37471952131638,
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
code,,comparator_flag,total,rate,term
80274001,East,3060.0,3730.0,0.8203753351206434,Glomerular filtration rate
80274001,South West,1750.0,6910.0,0.2532561505065123,Glomerular filtration rate
992081000000101,Yorkshire and The Humber,510.0,990.0,0.5151515151515151,Rheumatoid factor
992081000000101,East Midlands,820.0,1000.0,0.82,Rheumatoid factor
992081000000101,East,1180.0,1540.0,0.7662337662337663,Rheumatoid factor
992081000000101,London,410.0,430.0,0.9534883720930232,Rheumatoid factor
992081000000101,North West,180.0,610.0,0.29508196721311475,Rheumatoid factor
992081000000101,South East,190.0,240.0,0.7916666666666666,Rheumatoid factor
992081000000101,South West,460.0,620.0,0.7419354838709677,Rheumatoid factor
992081000000101,North East,250.0,330.0,0.7575757575757576,Rheumatoid factor
992081000000101,West Midlands,340.0,360.0,0.9444444444444444,Rheumatoid factor
999651000000107,Yorkshire and The Humber,30.0,140.0,0.21428571428571427,Plasma C reactive protein
999651000000107,East Midlands,3780.0,7480.0,0.5053475935828877,Plasma C reactive protein
999651000000107,East,30.0,130.0,0.2307692307692308,Plasma C reactive protein
999651000000107,South East,130.0,720.0,0.18055555555555555,Plasma C reactive protein
999651000000107,South West,600.0,3070.0,0.1954397394136808,Plasma C reactive protein
999651000000107,North East,340.0,1550.0,0.21935483870967745,Plasma C reactive protein
999651000000107,West Midlands,100.0,180.0,0.5555555555555556,Plasma C reactive protein
1002571000000102,Yorkshire and The Humber,270.0,880.0,0.3068181818181818,Faecal calprotectin content
1002571000000102,East Midlands,320.0,1140.0,0.2807017543859649,Faecal calprotectin content
1002571000000102,East,520.0,1530.0,0.33986928104575165,Faecal calprotectin content
1002571000000102,London,280.0,620.0,0.4516129032258064,Faecal calprotectin content
1002571000000102,North West,160.0,390.0,0.41025641025641024,Faecal calprotectin content
1002571000000102,South East,100.0,290.0,0.3448275862068966,Faecal calprotectin content
1002571000000102,South West,120.0,480.0,0.25,Faecal calprotectin content
1002571000000102,North East,20.0,220.0,0.09090909090909093,Faecal calprotectin content
1002571000000102,West Midlands,160.0,250.0,0.64,Faecal calprotectin content
1003301000000109,Yorkshire and The Humber,2110.0,6130.0,0.3442088091353997,Urine albumin level
1003301000000109,East Midlands,1510.0,6200.0,0.2435483870967742,Urine albumin level
1003301000000109,East,2150.0,6750.0,0.3185185185185185,Urine albumin level
1003301000000109,London,930.0,2970.0,0.31313131313131315,Urine albumin level
1003301000000109,North West,820.0,2460.0,0.3333333333333333,Urine albumin level
1003301000000109,South East,810.0,2360.0,0.3432203389830508,Urine albumin level
1003301000000109,South West,750.0,2690.0,0.2788104089219331,Urine albumin level
1003301000000109,North East,250.0,460.0,0.5434782608695652,Urine albumin level
1003301000000109,West Midlands,300.0,860.0,0.3488372093023256,Urine albumin level
1010251000000109,Yorkshire and The Humber,240.0,1270.0,0.1889763779527559,Urine microalbumin level
1010251000000109,East Midlands,740.0,2450.0,0.3020408163265306,Urine microalbumin level
1010251000000109,East,800.0,2800.0,0.2857142857142857,Urine microalbumin level
1010251000000109,North West,780.0,2280.0,0.34210526315789475,Urine microalbumin level
1010251000000109,South East,260.0,580.0,0.4482758620689655,Urine microalbumin level
1010251000000109,South West,310.0,1640.0,0.18902439024390244,Urine microalbumin level
1010251000000109,North East,660.0,2170.0,0.30414746543778803,Urine microalbumin level
1010251000000109,West Midlands,390.0,1240.0,0.3145161290322581,Urine microalbumin level
1011481000000105,Yorkshire and The Humber,2410.0,25030.0,0.09628445864962046,EGFR calculated using creatinine CKDEC equation
1011481000000105,East Midlands,14320.0,44600.0,0.3210762331838565,EGFR calculated using creatinine CKDEC equation
1011481000000105,East,19790.0,58420.0,0.3387538514207463,EGFR calculated using creatinine CKDEC equation
1011481000000105,London,8180.0,16020.0,0.5106117353308365,EGFR calculated using creatinine CKDEC equation
1011481000000105,North West,2700.0,8210.0,0.32886723507917176,EGFR calculated using creatinine CKDEC equation
1011481000000105,South East,3250.0,9090.0,0.35753575357535755,EGFR calculated using creatinine CKDEC equation
1011481000000105,South West,8180.0,28340.0,0.2886379675370501,EGFR calculated using creatinine CKDEC equation
1011481000000105,North East,4850.0,13080.0,0.3707951070336392,EGFR calculated using creatinine CKDEC equation
1011481000000105,West Midlands,0.0,440.0,0.0,EGFR calculated using creatinine CKDEC equation
1013671000000106,Yorkshire and The Humber,1680.0,2010.0,0.8358208955223879,Tissue transglutaminase immunoglobulin A level
1013671000000106,East Midlands,1150.0,2010.0,0.572139303482587,Tissue transglutaminase immunoglobulin A level
1013671000000106,East,280.0,2580.0,0.10852713178294572,Tissue transglutaminase immunoglobulin A level
1013671000000106,London,20.0,170.0,0.1176470588235294,Tissue transglutaminase immunoglobulin A level
1013671000000106,North West,100.0,550.0,0.18181818181818185,Tissue transglutaminase immunoglobulin A level
1013671000000106,South East,150.0,460.0,0.32608695652173914,Tissue transglutaminase immunoglobulin A level
1013671000000106,South West,140.0,300.0,0.4666666666666667,Tissue transglutaminase immunoglobulin A level
1013671000000106,North East,10.0,460.0,0.021739130434782608,Tissue transglutaminase immunoglobulin A level
1013671000000106,West Midlands,70.0,510.0,0.13725490196078433,Tissue transglutaminase immunoglobulin A level
1020291000000106,Yorkshire and The Humber,5980.0,23060.0,0.2593235039028621,GFR calculated by abbreviated MDRD
1020291000000106,East Midlands,17080.0,31560.0,0.541191381495564,GFR calculated by abbreviated MDRD
1020291000000106,East,1770.0,16770.0,0.10554561717352416,GFR calculated by abbreviated MDRD
1020291000000106,London,260.0,860.0,0.3023255813953488,GFR calculated by abbreviated MDRD
1020291000000106,North West,7380.0,24700.0,0.2987854251012146,GFR calculated by abbreviated MDRD
1020291000000106,South East,4010.0,9940.0,0.403420523138833,GFR calculated by abbreviated MDRD
1020291000000106,South West,3570.0,14540.0,0.2455295735900963,GFR calculated by abbreviated MDRD
1020291000000106,North East,1420.0,4490.0,0.31625835189309576,GFR calculated by abbreviated MDRD
1020291000000106,West Midlands,4030.0,12660.0,0.3183254344391785,GFR calculated by abbreviated MDRD
1022461000000100,Yorkshire and The Humber,0.0,380.0,0.0,Nucleated red blood cell count
1022461000000100,East Midlands,11970.0,13560.0,0.8827433628318584,Nucleated red blood cell count
1022461000000100,East,6610.0,6900.0,0.9579710144927536,Nucleated red blood cell count
1022461000000100,South West,0.0,180.0,0.0,Nucleated red blood cell count
1022461000000100,West Midlands,0.0,460.0,0.0,Nucleated red blood cell count
1049361000000101,Yorkshire and The Humber,920.0,1050.0,0.8761904761904762,Quantitative faecal immunochemical test
1049361000000101,East Midlands,500.0,810.0,0.6172839506172839,Quantitative faecal immunochemical test
1049361000000101,East,850.0,1340.0,0.6343283582089553,Quantitative faecal immunochemical test
1049361000000101,London,30.0,120.0,0.25,Quantitative faecal immunochemical test
1049361000000101,North West,1560.0,1890.0,0.8253968253968254,Quantitative faecal immunochemical test
1049361000000101,South East,430.0,570.0,0.7543859649122807,Quantitative faecal immunochemical test
1049361000000101,South West,1160.0,1540.0,0.7532467532467533,Quantitative faecal immunochemical test
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
12 changes: 12 additions & 0 deletions publication_outputs/comparator_rate_per_test_with_names.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
,numerator,denominator,rate,term
1011481000000105,63690.0,203290.0,0.3132962762555955,EGFR calculated using creatinine CKDEC equation
1020291000000106,45560.0,138710.0,0.32845505010453463,GFR calculated by abbreviated MDRD
1003301000000109,9640.0,30880.0,0.3121761658031088,Urine albumin level
1022461000000100,18580.0,21690.0,0.8566159520516367,Nucleated red blood cell count
1010251000000109,4180.0,14440.0,0.2894736842105263,Urine microalbumin level
999651000000107,5010.0,13370.0,0.37471952131638,Plasma C reactive protein
80274001,4810.0,10730.0,0.4482758620689655,Glomerular filtration rate
1013671000000106,3590.0,9060.0,0.3962472406181016,Tissue transglutaminase immunoglobulin A level
1049361000000101,5460.0,7350.0,0.7428571428571429,Quantitative faecal immunochemical test
992081000000101,4340.0,6120.0,0.7091503267973857,Rheumatoid factor
1002571000000102,1940.0,5790.0,0.3350604490500864,Faecal calprotectin content
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
num_value,term,total,>=,>=_rate
90.0,Glomerular filtration rate,1890.0,1750.0,0.925925925925926
60.0,Glomerular filtration rate,3190.0,3050.0,0.9561128526645768
120.0,GFR calculated by abbreviated MDRD,130.0,110.0,0.8461538461538461
60.0,GFR calculated by abbreviated MDRD,14790.0,13240.0,0.8951994590939825
90.0,GFR calculated by abbreviated MDRD,33830.0,32210.0,0.9521135087200708
90.0,EGFR calculated using creatinine CKDEC equation,66230.0,63690.0,0.9616487996376264

0 comments on commit d9f1972

Please sign in to comment.