-
Notifications
You must be signed in to change notification settings - Fork 21
/
wb_data.R
347 lines (305 loc) · 13 KB
/
wb_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#' Download Data from the World Bank API
#'
#' This function downloads the requested information using the World Bank API
#'
#' @param indicator Character vector of indicator codes. These codes correspond
#' to the `indicator_id` column from the `indicators` tibble of [wb_cache()], [wb_cachelist], or
#' the result of running [wb_indicators()] directly
#' @param country Character vector of country, region, or special value codes for the
#' locations you want to return data for. Permissible values can be found in the
#' countries tibble in [wb_cachelist] or by running [wb_countries()] directly.
#' Specifically, values listed in the following fields `iso3c`, `iso2c`, `country`,
#' `region`, `admin_region`, `income_level` and all of the `region_*`,
#' `admin_region_*`, `income_level_*`, columns. As well as the following special values
#' * `"countries_only"` (Default)
#' * `"regions_only"`
#' * `"admin_regions_only"`
#' * `"income_levels_only"`
#' * `"aggregates_only"`
#' * `"all"`
#' @param start_date Numeric or character. If numeric it must be in `%Y` form (i.e. four digit year).
#' For data at the subannual granularity the API supports a format as follows: for monthly data, "2016M01"
#' and for quarterly data, "2016Q1". This also accepts a special value of "YTD", useful for more frequently
#' updated subannual indicators.
#' @param end_date Numeric or character. If numeric it must be in `%Y` form (i.e. four digit year).
#' For data at the subannual granularity the API supports a format as follows: for monthly data, "2016M01"
#' and for quarterly data, "2016Q1".
#' @param return_wide Logical. If `TRUE` data is returned in a wide format instead of long,
#' with a column named for each `indicator_id` or if the `indicator` argument is a named vector,
#' the [names()] given to the indicator will be the column names. To necessitate this transformation,
#' the `indicator` column that provides the human readable description is dropped, but provided as a column label.
#' Default is `TRUE`
#' @param mrv Numeric. The number of Most Recent Values to return. A replacement
#' of `start_date` and `end_date`, this number represents the number of observations
#' you which to return starting from the most recent date of collection. This may include missing values.
#' Useful in conjuction with `freq`
#' @param mrnev Numeric. The number of Most Recent Non Empty Values to return. A replacement
#' of `start_date` and `end_date`, similar in behavior as `mrv` but excludes locations with missing values.
#' Useful in conjuction with `freq`
#' @param cache List of tibbles returned from [wb_cache()]. If omitted, [wb_cachelist] is used
#' @param freq Character String. For fetching quarterly ("Q"), monthly("M") or yearly ("Y") values.
#' Useful for querying high frequency data.
#' @param gapfill Logical. If `TRUE` fills in missing values by carrying forward the last
#' available value until the next available period (max number of periods back tracked will be limited by `mrv` number).
#' Default is `FALSE`
#' @param date_as_class_date Logical. If `TRUE` the date field is returned as class [Date], useful when working with
#' non-annual data or data at mixed resolutions. Default is `FALSE`
#' available value until the next available period (max number of periods back tracked will be limited by `mrv` number).
#' Default is `FALSE`
#' @inheritParams wb_cache
#'
#' @return a [tibble][tibble::tibble-package] of all available requested data.
#'
#' @details
#' ## `obs_status` column
#' Indicates the observation status for location, indicator and date combination.
#' For example `"F"` in the response indicates that the observation status for
#' that data point is "forecast".
#'
#' @export
#' @importFrom rlang .data
#' @md
#'
#'
#' @examples
#'
#'
#' # gdp for all countries for all available dates
#' \donttest{df_gdp <- wb_data("NY.GDP.MKTP.CD")}
#'
#' # Brazilian gdp for all available dates
#' \donttest{df_brazil <- wb_data("NY.GDP.MKTP.CD", country = "br")}
#'
#' # Brazilian gdp for 2006
#' \donttest{
#' df_brazil_1 <- wb_data("NY.GDP.MKTP.CD", country = "brazil", start_date = 2006)
#' }
#'
#' # Brazilian gdp for 2006-2010
#' \donttest{
#' df_brazil_2 <- wb_data("NY.GDP.MKTP.CD", country = "BRA",
#' start_date = 2006, end_date = 2010)
#'}
#'
#' # Population, GDP, Unemployment Rate, Birth Rate (per 1000 people)
#' \donttest{
#' my_indicators <- c("SP.POP.TOTL",
#' "NY.GDP.MKTP.CD",
#' "SL.UEM.TOTL.ZS",
#' "SP.DYN.CBRT.IN")
#'}
#'
#' \donttest{df <- wb_data(my_indicators)}
#'
#' # you pass multiple country ids of different types
#' # Albania (iso2c), Georgia (iso3c), and Mongolia
#' \donttest{
#' my_countries <- c("AL", "Geo", "mongolia")
#' df <- wb_data(my_indicators, country = my_countries,
#' start_date = 2005, end_date = 2007)
#'}
#'
#' # same data as above, but in long format
#' \donttest{
#' df_long <- wb_data(my_indicators, country = my_countries,
#' start_date = 2005, end_date = 2007,
#' return_wide = FALSE)
#'}
#'
#' # regional population totals
#' # regions correspond to the region column in wb_cachelist$countries
#' \donttest{
#' df_region <- wb_data("SP.POP.TOTL", country = "regions_only",
#' start_date = 2010, end_date = 2014)
#' }
#'
#' # a specific region
#' \donttest{
#' df_world <- wb_data("SP.POP.TOTL", country = "world",
#' start_date = 2010, end_date = 2014)
#'}
#'
#' # if the indicator is part of a named vector the name will be the column name
#' my_indicators <- c("pop" = "SP.POP.TOTL",
#' "gdp" = "NY.GDP.MKTP.CD",
#' "unemployment_rate" = "SL.UEM.TOTL.ZS",
#' "birth_rate" = "SP.DYN.CBRT.IN")
#'\donttest{
#' df_names <- wb_data(my_indicators, country = "world",
#' start_date = 2010, end_date = 2014)
#'}
#'
#' # custom names are ignored if returning in long format
#' \donttest{
#' df_names_long <- wb_data(my_indicators, country = "world",
#' start_date = 2010, end_date = 2014,
#' return_wide = FALSE)
#'}
#'
#' # same as above but in Bulgarian
#' # note that not all indicators have translations for all languages
#' \donttest{
#' df_names_long_bg <- wb_data(my_indicators, country = "world",
#' start_date = 2010, end_date = 2014,
#' return_wide = FALSE, lang = "bg")
#'}
wb_data <- function(indicator, country = "countries_only", start_date, end_date,
return_wide = TRUE, mrv, mrnev, cache, freq, gapfill = FALSE,
date_as_class_date = FALSE, lang) {
if (missing(cache)) cache <- wbstats::wb_cachelist
base_url <- wb_api_parameters$base_url
# format country ----------------------------------------------------------
country_param <- format_wb_country(country, cache = cache)
country_path <- paste0(wb_api_parameters$country, "/", country_param)
# check dates ----------
date_query <- NULL
if (!missing(start_date) && missing(end_date)) date_query <- paste0(start_date, ":", start_date)
if (missing(start_date) && !missing(end_date)) date_query <- paste0(end_date, ":", end_date)
if (missing(start_date) == FALSE && missing(end_date) == FALSE) date_query <- paste0(start_date, ":", end_date)
# check freq ----------
freq_query <- NULL
if (!missing(freq)) {
if (!freq %in% c("Y", "Q", "M"))
stop("If supplied, values for freq must be one of the following 'Y' (yearly), 'Q' (Quarterly), or 'M' (Monthly)")
freq_query <- freq
}
# check mrv ----------
mrv_query <- NULL
if (!missing(mrv)) {
if (!is.numeric(mrv)) stop("If supplied, mrv must be numeric")
mrv_query <- paste0(round(mrv, digits = 0)) # just to make sure its a whole number
}
# check mrnev ----------
mrnev_query <- NULL
if (!missing(mrnev)) {
if (!is.numeric(mrnev)) stop("If supplied, mrnev must be numeric")
mrnev_query <- paste0(round(mrnev, digits = 0)) # just to make sure its a whole number
}
# check gapfill ----------
gapfill_query <- NULL
if (!missing(gapfill)) {
if (!is.logical(gapfill)) stop("If supplied, values for gapfill must be TRUE or FALSE")
if (missing(mrv)) stop("mrv must be supplied for gapfill to be used")
gapfill_query <- ifelse(gapfill, "Y", "N")
}
# check scale ----------
scale_query <- NULL
# remove support for scale parameter, it causes more problems that in solves
# if (!missing(scale)) {
# if (!is.logical(scale)) stop("Values for scale must be TRUE or FALSE")
#
# scale_query <- ifelse(scale, "Y", "N")
# }
# country should be part of the path list b/c other endpoint don't require it or need more things
path_list <- list(
version = wb_api_parameters$version,
lang = if_missing(lang, wb_default_lang(), lang),
country = country_path
)
# what is NULL just gets dropped in the build url step
query_list <- list(
date = date_query,
scale = scale_query,
frequency = freq_query,
mrv = mrv_query,
mrnev = mrnev_query,
gapfill = gapfill_query,
footnote = ifelse(return_wide, "y", "n"),
cntrycode = "y",
per_page = wb_api_parameters$per_page,
format = wb_api_parameters$format
)
# be able to return this for debugging
ind_url <- build_wb_url(
base_url = base_url, indicator = indicator,
path_list = path_list, query_list = query_list
)
# d_list <- lapply(ind_url, fetch_wb_url)
d_list <- lapply(seq_along(ind_url),
FUN = function(i){
fetch_wb_url(
url_string = ind_url[i],
indicator = indicator[i]
)
}
)
d_list <- d_list[sapply(d_list, is.data.frame)]
d <- dplyr::bind_rows(d_list)
if(!is.data.frame(d) | nrow(d) == 0) {
warning("No data was returned for your query. Returning an empty tibble")
return(tibble::tibble())
}
d <- format_wb_data(d, end_point = "data")
if (any(is.na(d$iso3c))) {
# country names are not replaced with with cached versions b/c
# some country names are subnational values where the iso3c and iso2c would
# be the same value across mutliple subnational units
d <- d %>%
dplyr::mutate(
iso3c = as.character(.data$iso3c),
iso2c = as.character(.data$iso2c),
iso3c = dplyr::if_else(is.na(.data$iso3c), .data$iso2c, .data$iso3c)
) %>%
dplyr::select(-.data$iso2c) %>%
dplyr::left_join(
dplyr::select(cache$countries, .data$iso3c, .data$iso2c),
by = "iso3c"
)
}
# country_only actually requests 'all' from the API, now remove non-countries
if (any(tolower(country) %in% c("countries", "countries_only", "countries only"))) {
country_only_iso3c <- unique_na(cache$countries$iso3c[cache$countries$region != "Aggregates"])
d <- d[d$iso3c %in% country_only_iso3c, ]
}
if(nrow(d) == 0) {
warning("No data was returned for your query. Returning an empty tibble")
return(tibble::tibble())
}
if (return_wide) {
context_cols <- c("iso2c", "iso3c", "country", "date")
extra_cols <- c("unit", "obs_status","footnote", "last_updated")
ind_names <- as.data.frame(unique(d[, c("indicator", "indicator_id")]))
# the decimal column is dropped here b/c support for scale=TRUE was removed
cols_to_keep <- setdiff(names(d), c("indicator", "decimal"))
# when you return a wide data frame with more than one indicator
# the extra_cols are dropped b/c they are specific to each location, indicator, date
# instance and they have to be rowwise so if you transpose with them it messes
# everything up
if (length(unique(ind_names$indicator_id)) > 1) {
cols_to_keep <- setdiff(cols_to_keep, extra_cols)
}
d <- d[ , cols_to_keep]
d <- tidyr::spread(d, key = "indicator_id", value = "value")
# column labels
for (i in 1:nrow(ind_names)) {
d_col_name <- ind_names$indicator_id[i]
d_col_label <- ind_names$indicator[i]
attr(d[[d_col_name]], "label") <- d_col_label
}
# named vector for indicators
if (!is.null(names(indicator))) {
for (i in 1:nrow(ind_names)) {
d_col_old_name <- ind_names$indicator_id[i]
d_col_new_name <- names(indicator[indicator == d_col_old_name])
if(! (d_col_new_name == "" || is.null(d_col_new_name)) )
names(d)[which(names(d) == d_col_old_name)] <- d_col_new_name
}
}
indicator_cols <- setdiff(names(d), c(context_cols, extra_cols))
d <- dplyr::select(d,
context_cols,
indicator_cols,
dplyr::everything()
)
} # end return_wide
else {
# these columns are reordered for readability
col_order <- c("indicator_id", "indicator", "iso2c", "iso3c", "country", "date",
"value", "unit", "obs_status", "footnote", "last_updated")
col_order2 <- col_order[col_order %in% names(d)]
d <- d[ , col_order2]
}
if (date_as_class_date) d <- format_wb_dates(d)
else if (!any(grepl("M|Q", d$date, ignore.case = TRUE))) d$date <- as.numeric(d$date)
d
}