/
createSSURGO.R
384 lines (324 loc) · 15.7 KB
/
createSSURGO.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
#' Get SSURGO ZIP files from Web Soil Survey 'Download Soils Data'
#'
#' Download ZIP files containing spatial (ESRI shapefile) and tabular (TXT) files with standard SSURGO format; optionally including the corresponding SSURGO Template Database with `include_template=TRUE`.
#'
#' To specify the Soil Survey Areas you would like to obtain data you use a `WHERE` clause for query of `sacatalog` table such as `areasymbol = 'CA067'`, `"areasymbol IN ('CA628', 'CA067')"` or `areasymbol LIKE 'CT%'`.
#'
#' @param WHERE A SQL `WHERE` clause expression used to filter records in `sacatalog` table. Alternately `WHERE` can be any spatial object supported by `SDA_spatialQuery()` for defining the target extent.
#' @param areasymbols Character vector of soil survey area symbols e.g. `c("CA067", "CA077")`. Used in lieu of `WHERE` argument.
#' @param destdir Directory to download ZIP files into. Default `tempdir()`.
#' @param exdir Directory to extract ZIP archives into. May be a directory that does not yet exist. Each ZIP file will extract to a folder labeled with `areasymbol` in this directory. Default: `destdir`
#' @param include_template Include the (possibly state-specific) MS Access template database? Default: `FALSE`
#' @param db Either `"SSURGO"` (default; detailed soil map) or `"STATSGO"` (general soil map).
#' @param extract Logical. Extract ZIP files to `exdir`? Default: `TRUE`
#' @param remove_zip Logical. Remove ZIP files after extracting? Default: `FALSE`
#' @param overwrite Logical. Overwrite by re-extracting if directory already exists? Default: `FALSE`
#' @param quiet Logical. Passed to `curl::curl_download()`.
#' @details
#' When `db="STATSGO"` the `WHERE` argument is not supported. Allowed `areasymbols` include `"US"` and two-letter state codes e.g. `"WY"` for the Wyoming general soils map.
#'
#' @export
#'
#' @details Pipe-delimited TXT files are found in _/tabular/_ folder extracted from a SSURGO ZIP. The files are named for tables in the SSURGO schema. There is no header / the files do not have column names. See the _Soil Data Access Tables and Columns Report_: \url{https://sdmdataaccess.nrcs.usda.gov/documents/TablesAndColumnsReport.pdf} for details on tables, column names and metadata including the default sequence of columns used in TXT files. The function returns a `try-error` (invisibly) if the `WHERE`/`areasymbols` arguments result in no download URLs being generated.
#'
#' Several ESRI shapefiles are found in the _/spatial/_ folder extracted from a SSURGO ZIP. These have prefix `soilmu_` (mapunit), `soilsa_` (survey area), `soilsf_` (special features). There will also be a TXT file with prefix `soilsf_` describing any special features. Shapefile names then have an `a_` (polygon), `l_` (line), `p_` (point) followed by the soil survey area symbol.
#'
#' @return Character. Paths to downloaded ZIP files (invisibly). May not exist if `remove_zip = TRUE`.
downloadSSURGO <- function(WHERE = NULL,
                           areasymbols = NULL,
                           destdir = tempdir(),
                           exdir = destdir,
                           include_template = FALSE,
                           db = c('SSURGO', 'STATSGO'),
                           extract = TRUE,
                           remove_zip = FALSE,
                           overwrite = FALSE,
                           quiet = FALSE) {

  db <- match.arg(toupper(db), c('SSURGO', 'STATSGO'))

  if (!is.null(WHERE) && db == "STATSGO") {
    stop('custom WHERE clause not supported with db="STATSGO"', call. = FALSE)
  }

  # for STATSGO, the areasymbols vector (e.g. "US", "WY") is passed through
  # as-is to .make_WSS_download_url() rather than converted to a WHERE clause
  if (!is.null(areasymbols) && db == "STATSGO") {
    WHERE <- areasymbols
  }

  if (is.null(WHERE) && is.null(areasymbols)) {
    stop('must specify either `WHERE` or `areasymbols` argument', call. = FALSE)
  }

  if (is.null(WHERE) && !is.null(areasymbols)) {
    WHERE <- sprintf("areasymbol IN %s", format_SQL_in_statement(areasymbols))
  }

  if (!is.character(WHERE)) {
    # non-character WHERE: attempt passing WHERE to SDA_spatialQuery to derive
    # the set of intersecting soil survey area symbols
    res <- suppressMessages(SDA_spatialQuery(WHERE, what = 'areasymbol'))
    WHERE <- paste("areasymbol IN", format_SQL_in_statement(res$areasymbol))
  }

  # make WSS download URLs from areasymbol, template, date
  urls <- .make_WSS_download_url(WHERE, include_template = include_template, db = db)

  if (inherits(urls, 'try-error')) {
    message(urls[1])
    return(invisible(urls))
  }

  if (!dir.exists(destdir)) {
    dir.create(destdir, recursive = TRUE)
  }

  # download files; skip any ZIP already present in destdir (failed downloads
  # are tolerated here -- missing files are simply not extracted below)
  for (i in seq_along(urls)) {
    destfile <- file.path(destdir, basename(urls[i]))
    if (!file.exists(destfile)) {
      try(curl::curl_download(urls[i], destfile = destfile, quiet = quiet,
                              mode = "wb", handle = .soilDB_curl_handle()),
          silent = quiet)
    }
  }

  paths <- list.files(destdir, pattern = "\\.zip$", full.names = TRUE)
  paths2 <- paths[grep(".*wss_(SSA|gsmsoil)_(.*)_.*", paths)]

  if (extract) {
    if (!quiet) {
      message("Extracting downloaded ZIP files...")
    }

    if (length(paths2) == 0) {
      stop("Could not find SSURGO ZIP files in `destdir`: ", destdir, call. = FALSE)
    }

    if (!dir.exists(exdir)) {
      dir.create(exdir, recursive = TRUE)
    }

    for (i in seq_along(paths2)) {
      # BUG FIX: the original pattern (".*wss_SSA_(.*)_.*") only matched
      # SSURGO archives; STATSGO ("gsmsoil") archives pass the filter above
      # but would fail to parse here, breaking the overwrite/dir check
      ssa <- gsub(".*wss_(SSA|gsmsoil)_(.*)_.*", "\\2", paths2[i])
      if ((!dir.exists(file.path(exdir, ssa)) || overwrite) &&
          length(utils::unzip(paths2[i], exdir = exdir)) == 0) {
        message(paste('Invalid zipfile:', paths2[i]))
      }
    }

    if (remove_zip) {
      file.remove(paths2)
    }
  }

  invisible(paths2)
}
#' Create a SQLite database or GeoPackage from one or more SSURGO Exports
#'
#' @param filename Output file name (e.g. `'db.sqlite'` or `'db.gpkg'`)
#' @param exdir Path containing SSURGO spatial (.shp) and tabular (.txt) files.
#' @param pattern Character. Optional regular expression to use to filter subdirectories of `exdir`. Default: `NULL` will search all subdirectories for SSURGO export files.
#' @param include_spatial Logical. Include spatial data layers in database? Default: `TRUE`.
#' @param overwrite Logical. Overwrite existing layers? Default `FALSE` will append to existing tables/layers.
#' @param header Logical. Passed to `read.delim()` for reading pipe-delimited (`|`) text files containing tabular data.
#' @param quiet Logical. Suppress messages and other output from database read/write operations?
#' @param ... Additional arguments passed to `write_sf()` for writing spatial layers.
#'
#' @return Character. Vector of layer/table names in `filename`.
#' @export
#' @examples
#' \dontrun{
#'  downloadSSURGO("areasymbol IN ('CA067', 'CA077', 'CA632')", destdir = "SSURGO_test")
#'  createSSURGO("test.gpkg", "SSURGO_test")
#' }
createSSURGO <- function(filename,
                         exdir,
                         pattern = NULL,
                         include_spatial = TRUE,
                         overwrite = FALSE,
                         header = FALSE,
                         quiet = TRUE,
                         ...) {

  if (missing(filename) || length(filename) == 0) {
    stop("`filename` should be a path to a .gpkg or .sqlite file to create or append to.")
  }

  # GeoPackage output requires maintenance of the gpkg_contents table below
  IS_GPKG <- grepl("\\.gpkg$", filename, ignore.case = TRUE)[1]

  f <- list.files(exdir, recursive = TRUE, pattern = pattern, full.names = TRUE)

  if (!requireNamespace("sf"))
    stop("package `sf` is required to write spatial datasets to SSURGO SQLite databases", call. = FALSE)

  if (!requireNamespace("RSQLite"))
    stop("package `RSQLite` is required to write tabular datasets to SSURGO SQLite databases", call. = FALSE)

  if (isTRUE(overwrite) && file.exists(filename)) {
    file.remove(filename)
  }

  # create and add combined vector datasets:
  #  "soilmu_a", "soilmu_l", "soilmu_p", "soilsa_a", "soilsf_l", "soilsf_p"
  f.shp <- f[grepl(".*\\.shp$", f)]
  shp.grp <- do.call('rbind', strsplit(gsub(".*soil([musfa]{2})_([apl])_([a-z]{2}\\d{3}|[a-z]{2})\\.shp", "\\1;\\2;\\3", f.shp), ";", fixed = TRUE))

  # map SSURGO shapefile prefix/geometry codes to standard layer names
  layer_names <- c(`mu_a` = "mupolygon", `mu_l` = "muline", `mu_p` = "mupoint",
                   `sa_a` = "sapolygon", `sf_l` = "featline", `sf_p` = "featpoint")

  if (nrow(shp.grp) >= 1 && ncol(shp.grp) == 3 && include_spatial) {
    f.shp.grp <- split(f.shp, list(feature = shp.grp[, 1], geom = shp.grp[, 2]))
    lapply(seq_along(f.shp.grp), function(i) {
      lapply(seq_along(f.shp.grp[[i]]), function(j) {
        lnm <- layer_names[match(gsub(".*soil([musfa]{2}_[apl])_.*", "\\1", f.shp.grp[[i]][j]),
                                 names(layer_names))]
        # first file of a group replaces the layer when overwrite=TRUE;
        # all subsequent files append
        if (overwrite && j == 1) {
          sf::write_sf(sf::read_sf(f.shp.grp[[i]][j]), filename, layer = lnm, overwrite = TRUE, ...)
        } else sf::write_sf(sf::read_sf(f.shp.grp[[i]][j]), filename, layer = lnm, append = TRUE, ...)
        NULL
      })
    })
  }

  # create and add combined tabular datasets
  f.txt <- f[grepl(".*\\.txt$", f)]
  txt.grp <- gsub("\\.txt", "", basename(f.txt))

  # explicit handling special feature descriptions -> "featdesc" table
  txt.grp[grepl("soilsf_t_", txt.grp)] <- "featdesc"

  f.txt.grp <- split(f.txt, txt.grp)

  # safely select the first group of metadata files matching any of `nms`;
  # BUG FIX: the original `f.txt.grp[[which(...)[1]]]` errors with an NA
  # subscript when none of the candidate names are present
  .first_group <- function(grp, nms) {
    idx <- which(names(grp) %in% nms)
    if (length(idx) == 0) {
      return(character(0))
    }
    grp[[idx[1]]]
  }

  # get table, column, index lookup tables
  mstabn <- .first_group(f.txt.grp, c("mstab", "mdstattabs", "MetadataTable"))
  mstabcn <- .first_group(f.txt.grp, c("mstabcol", "mdstattabcols", "MetadataColumnLookup"))
  msidxdn <- .first_group(f.txt.grp, c("msidxdet", "mdstatidxdet", "MetadataIndexDetail"))

  # BUG FIX: initialize so the is.null() checks below are safe when the
  # metadata TXT files are missing (otherwise: "object not found" errors)
  mstab <- NULL
  mstabcol <- NULL
  msidxdet <- NULL

  if (length(mstabn) >= 1) {
    mstab <- read.delim(mstabn[1], sep = "|", stringsAsFactors = FALSE, header = header)
    # lookup: TXT file base name (column 5) -> physical table name (column 1)
    mstab_lut <- mstab[[1]]
    names(mstab_lut) <- mstab[[5]]
  } else {
    # no metadata available: use the TXT group names verbatim as table names
    mstab_lut <- names(f.txt.grp)
    names(mstab_lut) <- names(f.txt.grp)
  }

  if (length(mstabcn) >= 1) {
    mstabcol <- read.delim(mstabcn[1], sep = "|", stringsAsFactors = FALSE, header = header)
  }

  if (length(msidxdn) >= 1) {
    msidxdet <- read.delim(msidxdn[1], sep = "|", stringsAsFactors = FALSE, header = header)
  }

  con <- RSQLite::dbConnect(RSQLite::SQLite(), filename, loadable.extensions = TRUE)
  on.exit(RSQLite::dbDisconnect(con))

  lapply(names(f.txt.grp), function(x) {

    # per-table metadata; NULL defaults keep the checks below safe when the
    # corresponding metadata table could not be read
    newnames <- NULL
    indexPK <- NULL
    indexDI <- NULL

    if (!is.null(mstabcol)) {
      # column names (column 3) for this physical table (column 1)
      newnames <- mstabcol[[3]][mstabcol[[1]] == mstab_lut[x]]
    }

    if (!is.null(msidxdet)) {
      # primary key ("PK_") and other ("DI_") index column names (column 4)
      indexPK <- na.omit(msidxdet[[4]][msidxdet[[1]] == mstab_lut[x] & grepl("PK_", msidxdet[[2]])])
      indexDI <- na.omit(msidxdet[[4]][msidxdet[[1]] == mstab_lut[x] & grepl("DI_", msidxdet[[2]])])
    }

    # combine all TXT files for this table across survey areas
    d <- try(as.data.frame(data.table::rbindlist(lapply(seq_along(f.txt.grp[[x]]), function(i) {
      y <- read.delim(f.txt.grp[[x]][i], sep = "|", stringsAsFactors = FALSE, header = header)
      if (length(y) == 1) {
        y <- data.frame(content = y)
      } else {
        # preserve headers if present; only apply metadata column names when
        # they are available and the file had no header row
        if (!is.null(mstab) && !is.null(newnames) && !header) {
          colnames(y) <- newnames
        }
      }
      y
    }))), silent = quiet)

    # tables not present in the metadata lookup keep their group name
    if (length(mstab_lut[x]) && is.na(mstab_lut[x])) {
      mstab_lut[x] <- x
    }

    if (length(mstab_lut[x]) && !is.na(mstab_lut[x]) && inherits(d, 'data.frame') && nrow(d) > 0) {

      # remove repeated records/metadata
      if (ncol(d) > 1) {
        d <- unique(d)
      }

      # write tabular data to file
      try({
        if (overwrite) {
          RSQLite::dbWriteTable(con, mstab_lut[x], d, overwrite = TRUE)
        } else RSQLite::dbWriteTable(con, mstab_lut[x], d, append = TRUE)
      }, silent = quiet)

      # create pkey indices
      if (!is.null(indexPK) && length(indexPK) > 0) {
        try({
          RSQLite::dbExecute(con, sprintf("CREATE UNIQUE INDEX IF NOT EXISTS %s ON %s (%s)",
                                          paste0('PK_', mstab_lut[x]), mstab_lut[x], paste0(indexPK, collapse = ",")))
        }, silent = quiet)
      }

      # create key indices
      if (!is.null(indexDI) && length(indexDI) > 0) {
        for (i in seq_along(indexDI)) {
          try({
            RSQLite::dbExecute(con, sprintf("CREATE INDEX IF NOT EXISTS %s ON %s (%s)",
                                            paste0('DI_', mstab_lut[x]), mstab_lut[x], indexDI[i]))
          }, silent = quiet)
        }
      }

      # for GPKG output, add gpkg_contents (metadata for features and attributes)
      if (IS_GPKG) {
        if (!.gpkg_has_contents(con)) {
          # if no spatial data inserted, there will be no gpkg_contents table initially
          try(.gpkg_create_contents(con))
        }
        # update gpkg_contents table entry
        try(.gpkg_delete_contents(con, mstab_lut[x]))
        try(.gpkg_add_contents(con, mstab_lut[x]))
      }

      # TODO: other foreign keys/relationships? ALTER TABLE/ADD CONSTRAINT not available in SQLite
      #       the only way to add a foreign key is via CREATE TABLE which means refactoring above two
      #       steps into a single SQL statement (create table with primary and foreign keys)
    }
  })

  res <- RSQLite::dbListTables(con)
  invisible(res)
}
## From https://github.com/brownag/gpkg -----

#' Add, Remove, Update and Create `gpkg_contents` table and records
#' @description `.gpkg_add_contents()`: Add a record to `gpkg_contents`
#' @param con A _geopackage_
#' @param table_name Name of table to add or remove record for in _gpkg_contents_
#' @param description Default `""`
#' @param template Default `NULL` uses global EPSG:4326 with bounds -180,-90:180,90
#' @return Result of `RSQLite::dbExecute()`
#' @noRd
#' @keywords internal
.gpkg_add_contents <- function(con, table_name, description = "", template = NULL) {
  stopifnot(requireNamespace("RSQLite"))

  # a template list with "ext" (xmin, ymin, xmax, ymax) and "srsid" elements
  # overrides the default global EPSG:4326 extent
  if (!missing(template) &&
      !is.null(template) &&
      is.list(template) &&
      all(c("ext", "srsid") %in% names(template))) {
    ex <- template$ext
    cr <- as.integer(template$srsid)
  } else {
    ex <- c(-180, -90, 180, 90)
    cr <- 4326
  }

  # append to gpkg_contents; use a parameterized statement rather than pasted
  # SQL so table names/descriptions containing quotes cannot break the query
  RSQLite::dbExecute(con,
    "INSERT INTO gpkg_contents (table_name, data_type, identifier,
                                description, last_change,
                                min_x, min_y, max_x, max_y, srs_id)
     VALUES (:table_name, 'attributes', :identifier, :description,
             :last_change, :min_x, :min_y, :max_x, :max_y, :srs_id);",
    params = list(
      table_name = table_name,
      identifier = table_name,
      description = description,
      last_change = strftime(Sys.time(), '%Y-%m-%dT%H:%M:%OS3Z'),
      min_x = ex[1],
      min_y = ex[2],
      max_x = ex[3],
      max_y = ex[4],
      srs_id = cr
    )
  )
}
#' @description `.gpkg_delete_contents()`: Delete a record from `gpkg_contents` based on table name
#' @return Result of `RSQLite::dbExecute()` (number of rows deleted)
#' @noRd
#' @keywords internal
.gpkg_delete_contents <- function(con, table_name) {
  stopifnot(requireNamespace("RSQLite"))
  # parameterized statement rather than pasted SQL so table names containing
  # quotes cannot break the query
  RSQLite::dbExecute(con,
                     "DELETE FROM gpkg_contents WHERE table_name = :table_name",
                     params = list(table_name = table_name))
}
#' @description `.gpkg_has_contents()`: Determine if a database has table named `"gpkg_contents"`
#' @return `TRUE` if the `gpkg_contents` table exists, otherwise `FALSE`
#' @noRd
#' @keywords internal
.gpkg_has_contents <- function(con) {
  stopifnot(requireNamespace("RSQLite"))
  existing_tables <- RSQLite::dbListTables(con)
  isTRUE("gpkg_contents" %in% existing_tables)
}
#' @description `.gpkg_create_contents()`: Create a minimal `gpkg_contents` table if one does not already exist
#' @return Result of `RSQLite::dbExecute()` on success; `1` if `gpkg_contents` already exists
#' @noRd
#' @keywords internal
.gpkg_create_contents <- function(con) {
stopifnot(requireNamespace("RSQLite"))
# minimal gpkg_contents schema per the OGC GeoPackage specification
q <- "CREATE TABLE gpkg_contents (
table_name TEXT NOT NULL PRIMARY KEY,
data_type TEXT NOT NULL,
identifier TEXT UNIQUE,
description TEXT DEFAULT '',
last_change DATETIME NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now')),
min_x DOUBLE,
min_y DOUBLE,
max_x DOUBLE,
max_y DOUBLE,
srs_id INTEGER,
CONSTRAINT fk_gc_r_srs_id FOREIGN KEY (srs_id) REFERENCES gpkg_spatial_ref_sys(srs_id)
)"
# only create the table when absent; return 1 (no-op marker) otherwise
if (!.gpkg_has_contents(con)) {
RSQLite::dbExecute(con, q)
} else return(1)
}