generated from opensafely/research-template
/
combine_operators.py
105 lines (83 loc) · 3.45 KB
/
combine_operators.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import numpy as np
from variables import tests
from utilities import (
OUTPUT_DIR,
drop_and_round,
round_column,
match_input_files,
get_date_input_file,
combine_value_with_operator,
group_low_values_series,
)
# Per-test accumulators: each dict maps a test name to a list that the
# per-file loop below appends one result to for every input file read.
numeric_value_operator_counts = {test: [] for test in tests}
numeric_value_counts = {test: [] for test in tests}
code_counts = {test: [] for test in tests}
operator_counts = {test: [] for test in tests}
for file in (OUTPUT_DIR / "joined").iterdir():
if match_input_files(file.name):
df = pd.read_csv((OUTPUT_DIR / "joined") / file.name)
date = get_date_input_file(file.name)
# replace null operator with missing
for test in tests:
df[f"{test}_operator"].fillna("missing", inplace=True)
# how many numeric values have a matched operator?
num_with_numeric_value_and_operator = (
df.loc[
(
(df[f"{test}_numeric_value"].notnull())
& (df[f"{test}_numeric_value"] > 0)
),
:,
]
.groupby(f"{test}_operator")[[test]]
.sum()
)
operator_counts[test].append(
num_with_numeric_value_and_operator.replace(np.nan, "missing")
)
# combine value with operator (after rounding to nearest int)
df[f"{test}_numeric_value"] = round_column(df[f"{test}_numeric_value"], 1)
combine_value_with_operator(df, f"{test}_numeric_value", f"{test}_operator")
numeric_value_operator_counts[test].append(
df[f"{test}_numeric_value_with_operator"].value_counts(sort=True)
)
# find codes where attached numeric value
numeric_value_counts[test].append(
df.loc[
(
(df[f"{test}_numeric_value"].notnull())
& (df[f"{test}_numeric_value"] > 0)
),
f"{test}_code",
].value_counts()
)
code_counts[test].append(df[f"{test}_code"].value_counts())
def _sum_counts_by_label(per_file_counts):
    """Concatenate per-file count Series and total them by index label."""
    stacked = pd.concat(per_file_counts)
    return stacked.groupby(stacked.index).sum()


# Merge the per-file results for each test and write one CSV per summary
# into OUTPUT_DIR.
for test in tests:
    # combine numeric value operator counts
    # (written without group_low_values_series, unlike the outputs below)
    test_count = _sum_counts_by_label(numeric_value_operator_counts[test])
    # to_csv returns None when given a path, so its result is not kept.
    drop_and_round(test_count).to_csv(
        OUTPUT_DIR / f"{test}_numeric_value_operator_count.csv"
    )

    # combine numeric value counts
    test_codes_count = group_low_values_series(
        _sum_counts_by_label(numeric_value_counts[test])
    )
    drop_and_round(test_codes_count).to_csv(
        OUTPUT_DIR / f"{test}_numeric_value_count.csv"
    )

    # combine code counts
    test_codes_count_all = group_low_values_series(
        _sum_counts_by_label(code_counts[test])
    )
    drop_and_round(test_codes_count_all).to_csv(
        OUTPUT_DIR / f"{test}_codes_count.csv"
    )

    # combine operator counts
    # Each per-file entry is a one-column frame indexed by operator; place
    # them side by side and add across columns to get a total per operator.
    test_operators = pd.concat(operator_counts[test], axis=1, sort=False).sum(axis=1)
    test_operators = group_low_values_series(test_operators)
    drop_and_round(test_operators).to_csv(
        OUTPUT_DIR / f"{test}_operators_count.csv"
    )