generated from opensafely/research-template
/
check_hhid.R
101 lines (78 loc) · 3.27 KB
/
check_hhid.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
################################################################################
# Description: Check uniqueness of household ID
#
################################################################################
################################################################################
#----------------------#
# SETUP ENVIRONMENT #
#----------------------#
library(tidyverse)
library(data.table)
sink("./check_hhid.txt", type = "output")
options(datatable.old.fread.datetime.character = TRUE)
# ---------------------------------------------------------------------------- #
#----------------------#
# LOAD DATA #
#----------------------#
# args <- c("input.csv")
args = commandArgs(trailingOnly = TRUE)
input <- fread(args[1], data.table = FALSE, na.strings = "") %>%
# Filter just to records from England
filter(grepl("E",msoa)) %>%
mutate(across(c(imd, rural_urban), function(x) na_if(x,-1)),
across(c(imd, household_size, household_id), function(x) na_if(x,0)),
HHID = paste(msoa, household_id, sep = ":"))
#----------------------------------------#
# UNIQUENESS OF HOUSEHOLD ID #
#----------------------------------------#
print("No. with missing household id")
summary(input$household_id == 0)
summary(is.na(input$household_id))
input <- filter(input, household_id > 0 & !is.na(household_id))
print("No. households, by household_id alone and by household_ID + MSOA")
input %>%
summarise(N_hhID = n_distinct(household_id),
N_msoa_hhID = n_distinct(HHID))
print("Uniqueness of household characteristics over residents:")
input %>%
group_by(household_id) %>%
summarise(msoa = n_distinct(msoa),
region = n_distinct(region),
household_size = n_distinct(household_size),
imd = n_distinct(imd),
rural_urban = n_distinct(rural_urban)) -> n_distinct_chars
# Should be one distinct value for every household
summary(n_distinct_chars)
# ---------------------------------------------------------------------------- #
#----------------------------------#
# CHECK HOUSEHOLD SIZES #
#----------------------------------#
print("Household size by care home type:")
input %>%
filter(!is.na(household_size)) %>%
group_by(care_home_type) %>%
summarise(mean = mean(household_size),
sd = sd(household_size),
median = median(household_size),
minmax = paste(min(household_size), max(household_size), sep = ", "))
print("Number of records by care home type (household_id):")
input %>%
group_by(care_home_type, household_id) %>%
summarise(n_resid = n()) %>%
group_by(care_home_type) %>%
summarise(mean = mean(n_resid),
sd = sd(n_resid),
median = median(n_resid),
minmax = paste(min(n_resid), max(n_resid), sep = ", "))
print("Number of records by care home type (HHID):")
input %>%
group_by(care_home_type, HHID) %>%
summarise(n_resid = n()) %>%
group_by(care_home_type) %>%
summarise(mean = mean(n_resid),
sd = sd(n_resid),
median = median(n_resid),
minmax = paste(min(n_resid), max(n_resid), sep = ", "))
################################################################################
sink()
################################################################################