generated from opensafely/research-template
/
all_time_counts.py
124 lines (106 loc) · 4.13 KB
/
all_time_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from common_variables import demographic_variables, clinical_variables
pd.set_option("display.max_rows", 50)
results_path = "output/practice_summ.txt"
stratifiers = list(demographic_variables.keys())
long_covid_codelists = [
"opensafely-nice-managing-the-long-term-effects-of-covid-19",
"opensafely-referral-and-signposting-for-long-covid",
"opensafely-assessment-instruments-and-outcome-measures-for-long-covid",
"user-alex-walker-post-viral-syndrome",
]
combined_codelists = [
pd.read_csv(f"codelists/{path}.csv", index_col="code")
for path in long_covid_codelists
]
combined_codelists = pd.concat(combined_codelists)
def crosstab(idx):
cols = ["No long COVID", "Long COVID", "Rate per 100,000", "%"]
counts = pd.crosstab(idx, df["long_covid"], normalize=False, dropna=False)
rates = (
pd.crosstab(idx, df["long_covid"], normalize="index", dropna=False)[1] * 100000
).round(1)
percentages = (
pd.crosstab(idx, df["long_covid"], normalize="columns", dropna=False)[1] * 100
).round(1)
all_cols = pd.concat([counts, rates, percentages], axis=1)
all_cols.columns = cols
return all_cols
def redact_small_numbers(df, column):
mask = df[column].isin([1, 2, 3, 4, 5])
df.loc[mask, :] = np.nan
return df
def write_to_file(text_to_write, erase=False):
if erase and os.path.isfile(results_path):
os.remove(results_path)
with open(results_path, "a") as txt:
txt.writelines(f"{text_to_write}\n")
print(text_to_write)
txt.writelines("\n")
print("\n")
with open("output/input_cohort.csv") as f:
reader = csv.DictReader(f)
usecols = [
col
for col in reader.fieldnames
if not (col.startswith("snomed_") and col.endswith("_date"))
]
df = pd.read_csv(
"output/input_cohort.csv",
index_col="patient_id",
parse_dates=["first_long_covid_date", "first_post_viral_fatigue_date"],
usecols=usecols,
dtype={
"first_long_covid_code": "category",
# "age_group": "category",
"sex": "category",
"imd": "category",
"ethnicity": "category",
},
)
## Crosstabs
crosstabs = [crosstab(df[v]) for v in stratifiers]
all_together = pd.concat(
crosstabs, axis=0, keys=stratifiers + ["imd"], names=["Attribute", "Category"]
)
print(all_together)
redact_small_numbers(all_together, "Long COVID").to_csv("output/counts_table.csv")
## All long-covid codes table
codes = [str(code) for code in combined_codelists.index]
all_codes = df
all_codes.columns = all_codes.columns.str.lstrip("snomed_")
all_codes = all_codes[codes].sum().T
all_codes = all_codes.rename("Total records")
all_codes.index = all_codes.index.astype("int64")
all_codes = combined_codelists.join(all_codes)
all_codes["%"] = (all_codes["Total records"] / all_codes["Total records"].sum()) * 100
redact_small_numbers(all_codes, "Total records").to_csv(
"output/all_long_covid_codes.csv"
)
print(all_codes.columns)
## Descriptives by practice
by_practice = (
df[["long_covid", "practice_id"]].groupby("practice_id").sum()["long_covid"]
)
write_to_file(f"Total patients coded: {by_practice.sum()}", erase=True)
top_10_count = by_practice.sort_values().tail(10).sum()
write_to_file(f"Patients coded in the highest 10 practices: {top_10_count}")
practice_summ = by_practice.describe()
write_to_file(f"Summary stats by practice:\n{practice_summ}")
ranges = [-1, 0, 1, 2, 3, 4, 5, 10, 10000]
practice_distribution = by_practice.groupby(pd.cut(by_practice, ranges)).count()
write_to_file(f"Distribution of coding within practices: {practice_distribution}")
practice_distribution.to_csv("output/practice_distribution.csv")
def weekly_counts(variable):
weekly_counts = df.set_index(f"first_{variable}_date")[variable]
weekly_counts = weekly_counts.resample("W").count()
weekly_counts = weekly_counts.loc["2020-01-01":]
weekly_counts.loc[weekly_counts.isin([1, 2, 3, 4, 5])] = np.nan
print(weekly_counts)
weekly_counts.to_csv(f"output/code_use_per_week_{variable}.csv")
weekly_counts("long_covid")
weekly_counts("post_viral_fatigue")