generated from opensafely/research-template
/
check_EGFR.py
152 lines (128 loc) · 6.15 KB
/
check_EGFR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import pandas as pd
import re
import seaborn as sns
from utilities import OUTPUT_DIR, match_input_files
import numpy as np
import matplotlib.pyplot as plt
all_egfr_data = []
#print(f"Output directory is: {OUTPUT_DIR}")
# Collecting data for EGFR plots
for file in OUTPUT_DIR.iterdir():
if match_input_files(file.name):
df = pd.read_feather(OUTPUT_DIR / file.name)
df['source'] = file.stem
#print(f"Reading EGFR data from {file.stem} ({len(df)})")
all_egfr_data.append(df[['egfr_code', 'egfr', 'egfr_binary_flag',
'egfr_less_than_45_including_binary_flag', 'egfr_less_than_45', 'source']])
df = pd.concat(all_egfr_data)
#print(f"Final data frame size: {df.shape}")
#####################################################################
### Examining EGFR values across full dataset
#####################################################################
# Plot all EGFR data in one boxplot
plt.figure(figsize=(8,8))
df.boxplot(column='egfr')
plt.title("EGFR values (all months)", weight='bold')
plt.ylabel('EGFR value', weight='bold')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/BOXPLOT_EGFRvalue.png", format="png")
plt.clf()
# Plot all EGFR data in one histogram
plt.figure(figsize=(8, 8))
df['egfr'].plot(kind='hist')
plt.title("EGFR values (all months)", weight='bold')
plt.ylabel('EGFR value', weight='bold')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/HISTOGRAM_EGFRvalue.png", format="png")
plt.clf()
# Plot all EGFR data in one histogram, using a log scale
plt.figure(figsize=(8, 8))
df['egfr'].plot(kind='hist')
plt.xscale('log')
plt.title("EGFR values (all months), logscale", weight='bold')
plt.xlabel('EGFR value (log)', weight='bold')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/HISTOGRAM_EGFRvalue-log.png", format="png")
plt.clf()
#####################################################################
### Examining EGFR values, separated by egfr_less_than_45 flag
#####################################################################
# Plot EGFR data separated by the less than 45 flag
plt.figure(figsize=(8, 8))
df.boxplot(column='egfr', by=['egfr_less_than_45'])
plt.title("EGFR values (separated by 'less_than_45' flag)", weight='bold')
plt.suptitle('')
plt.xlabel('')
plt.ylabel('EGFR value', weight='bold')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/BOXPLOT_EGFRvalue-by-EGFR45flag.png", format="png")
plt.clf()
#####################################################################
### Examining EGFR values, separated by month
#####################################################################
### NB. The values will look similar over time as the cohort
### extractor requests values from any time in the past.
# Plot EGFR data separated using groupby() to group by input
# file source (i.e., month)
plt.figure(figsize=(16, 8))
df['date_labels'] = df['source']
df['date_labels'].replace(to_replace="input_(.*)",
value=r"\1", regex=True, inplace=True)
df.boxplot(column='egfr', by='date_labels')
plt.xticks(rotation='vertical')
plt.xlabel('Month', weight='bold')
plt.ylabel('EGFR value', weight='bold')
plt.title("EGFR values (separated by month)", weight='bold')
plt.suptitle('')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/BOXPLOT_EGFRvalue-by-month.png", format="png")
plt.clf()
#####################################################################
### Examining EGFR values, separated by egfr_code
#####################################################################
# Plot EGFR data separated using groupby() to group by egfr_code - boxplot
plt.figure(figsize=(16, 8))
df.boxplot(column='egfr', by='egfr_code')
plt.xticks(rotation='vertical')
plt.xlabel('EFGR code', weight='bold')
plt.ylabel('EGFR value', weight='bold')
plt.title("EGFR values (separated by EGFR code)", weight='bold')
plt.suptitle('')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/BOXPLOT_EGFRvalue-by-EGFRcode.png", format="png")
plt.clf()
# Plot EGFR data separated using groupby() to group by egfr_code - histogram
plt.figure(figsize=(16, 8))
egfr_hist = df.hist(column='egfr', by='egfr_code')
for ax in egfr_hist.flatten():
ax.set_xlabel('EFGR value', weight='bold')
ax.set_ylabel("Frequency", weight='bold')
plt.xticks(rotation='vertical')
plt.suptitle("EGFR values (separated by EGFR code)", weight='bold')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/HISTOGRAM_EGFRvalue-by-EGFRcode.png", format="png")
plt.clf()
#####################################################################
### Generating crosstabulations
#####################################################################
### https://towardsdatascience.com/meet-the-hardest-functions-of-pandas-part-ii-f8029a2b0c9b
# Crosstabulations of source by egfr_less_than_45 flag
# Q: does the count of <= 45 vary by month?
EGFR_less_than_45_crosstabs = pd.crosstab(index=df['source'], columns=df['egfr_less_than_45'])
EGFR_less_than_45_crosstabs.to_csv( f"{OUTPUT_DIR}/CROSSTAB_source-by-EGFR45flag.csv")
# Crosstabulations of source by egfr_binary_flag
# Q: does the count of <= 45 vary by month?
EGFR_missing_crosstabs = pd.crosstab(index=df['source'], columns=df['egfr_binary_flag'])
EGFR_missing_crosstabs.to_csv(f"{OUTPUT_DIR}/CROSSTAB_source-by-EGFRbinary.csv")
# Crosstabulations of egfr_code by egfr_binary_flag
# Q: does the amount of missing data vary by egfr_code?
EGFRcode_by_EGFRbinary_crosstabs = pd.crosstab(index=df['egfr_code'], columns=df['egfr_binary_flag'])
EGFRcode_by_EGFRbinary_crosstabs.to_csv(f"{OUTPUT_DIR}/CROSSTAB_EGFRcode-by-EGFRbinary.csv")
# Crosstabulations of egfr_less_than_45 by egfr_less_than_45_including_binary_flag
# Q: to what extent do egfr_less_than_45 and egfr_less_than_45_including_binary_flag agree?
EGFRcode_by_EGFRbinary_crosstabs = pd.crosstab(index=df['egfr_less_than_45'], columns=df['egfr_less_than_45_including_binary_flag'])
EGFRcode_by_EGFRbinary_crosstabs.to_csv(f"{OUTPUT_DIR}/CROSSTAB_EGFR45flag-by-EGFR45flagbinary.csv")
df['negative_egfr'] = df['egfr']<0
df['egfr_over_200'] = df['egfr']>200
df['egfr_0'] = df['egfr']==0
df.groupby(by=['egfr_code','egfr_binary_flag','egfr_less_than_45','egfr_less_than_45_including_binary_flag', 'negative_egfr', 'egfr_over_200', 'egfr_0'])[['egfr_code']].count().to_csv(f"{OUTPUT_DIR}/CROSSTAB_combined.csv")