generated from opensafely/research-template
/
process.R
134 lines (85 loc) · 3.22 KB
/
process.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# # # # # # # # # # # # # # # # # # # # #
# This script:
# defines the covid infection cases and the potential control group
#
#
# # # # # # # # # # # # # # # # # # # # #
## Import libraries---
library('tidyverse')
library("ggplot2")
library('dplyr')
library('lubridate')
#### COVID infection ----
# Builds the infection case/control files from the SGSS and primary-care
# extracts:
#   output/case_covid_infection.csv          (no cal_YM column)
#   output/control_covid_infection.csv       (with cal_YM column)
#   output/case_covid_infection_<YYYY-MM>.csv (one file per index month)

# Import data.
sgss_cases <- read_csv(here::here("output", "input_covid_SGSS.csv"))
primary_care_cases <- read_csv(
  here::here("output", "input_covid_primarycare.csv")
)

# Keep rows that have an infection record (a missing patient_index_date is
# dropped by the `> 0` comparison), stack both sources, then keep the earliest
# infection date per patient. No group_by() is needed: arrange() sorts the
# whole table and distinct() keeps the first row for each patient_id, so the
# result stays an ungrouped tibble.
infection <- rbind(
  filter(sgss_cases, patient_index_date > 0),       # SGSS case
  filter(primary_care_cases, patient_index_date > 0) # primary care case
) %>%
  arrange(patient_id, patient_index_date) %>%
  distinct(patient_id, .keep_all = TRUE)

# Exclude cases with previous covid-related history (variables recorded
# before patient_index_date).
infection <- infection %>%
  filter(
    is.na(covid_admission_date),
    # is.na(icu_date_admitted),
    is.na(died_date_cpns),
    is.na(died_date_ons_covid)
  )

# The case file is written before the calendar-month column is added.
write_csv(infection, here::here("output", "case_covid_infection.csv"))

infection$cal_YM <- format(infection$patient_index_date, "%Y-%m")
write_csv(infection, here::here("output", "control_covid_infection.csv"))

# Split data by month (for matching general population). Iterating over the
# values themselves (instead of 1:length(...)) means nothing is written when
# no rows remain, rather than stray "_NA.csv" / "_.csv" files.
index_months <- sort(unique(infection$cal_YM))
for (index_month in index_months) {
  write_csv(
    filter(infection, cal_YM == index_month),
    here::here(
      "output",
      paste0("case_covid_infection_", index_month, ".csv")
    )
  )
}

# Clear every object in the workspace before the next section starts.
rm(list = ls())
#### COVID admission ----
# Builds output/case_covid_admission.csv from the hospital-admission extract.

admission_path <- here::here("output", "input_covid_admission.csv")

admission_cases <- read_csv(admission_path) %>%
  # has covid admission record (hosp admission case)
  filter(patient_index_date > 0) %>%
  # exclude cases with previous covid-related history
  # (variables recorded before patient_index_date)
  filter(
    # is.na(icu_date_admitted) &
    is.na(died_date_cpns) & is.na(died_date_ons_covid)
  ) %>%
  # calendar year-month of the index date
  mutate(cal_YM = format(patient_index_date, "%Y-%m"))

write_csv(admission_cases, here::here("output", "case_covid_admission.csv"))

# Clear every object in the workspace before the next section starts.
rm(list = ls())
#### COVID severe outcome (icu or death) ----
# Builds output/case_covid_icu_death.csv. The ICU extract is currently
# commented out, so only the CPNS and ONS death extracts are combined.

# Import data.
# icu_cases <- read_csv(here::here("output", "input_covid_icu.csv"))
cpns_deaths <- read_csv(here::here("output", "input_covid_death_cpns.csv"))
ons_deaths <- read_csv(here::here("output", "input_covid_death_ons.csv"))

# Keep rows that have a severe-outcome record (a missing patient_index_date
# is dropped by the `> 0` comparison), stack both sources, then keep the
# earliest severe-outcome date per patient. arrange() + distinct() already
# yields one row per patient_id, so no group_by() is used and the result is
# an ungrouped tibble.
# icu_cases <- icu_cases %>% filter(patient_index_date > 0) # icu
severe_cases <- rbind(
  filter(cpns_deaths, patient_index_date > 0), # cpns
  filter(ons_deaths, patient_index_date > 0)   # ons_covid
) %>%
  arrange(patient_id, patient_index_date) %>%
  distinct(patient_id, .keep_all = TRUE) %>%
  # calendar year-month of the index date
  mutate(cal_YM = format(patient_index_date, "%Y-%m"))

write_csv(severe_cases, here::here("output", "case_covid_icu_death.csv"))
# #### general population
# rm(list=ls())
# list=seq(as.Date("2020-02-01"), as.Date("2021-12-01"), "month")
# for (i in 1:length(list)){
# df=read_csv(here::here("output","measures", paste0("input_covid_general_population_",list[i],".csv.gz")))
# df$patient_index_date=as.Date(list[i])
# df=df%>%
# filter(is.na(covid_admission_date),
# is.na(icu_date_admitted),
# is.na(died_date_cpns),
# is.na(died_date_ons_covid))
# df$cal_YM=format(df$patient_index_date,"%Y-%m")
# write_csv(df, here::here("output", paste0("control_general_population_",list[i],".csv")))
# }