-
Notifications
You must be signed in to change notification settings - Fork 0
/
wildfire_cleaning2.R
192 lines (143 loc) · 5.46 KB
/
wildfire_cleaning2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
###### Bulk Cleaning and Dataframe Tweaking - Script 3
###### Nicholas Archambault
###### Final Project, S&DS 425, 7 Dec. 2020
# This script reads in the 'total' CSV, the dataframe containing all information
# on fires, their associated stations, and the weather measurements made at the
# time of their burns. It pares this large dataframe down to its most essential
# elements and creates new lagged time series variables for specific parameters.
# Read in packages and data
library(RANN)
library(zoo)
library(data.table)
library(sp)
library(geosphere)
total <- read.csv("total.csv", as.is = TRUE, stringsAsFactors = FALSE)
# Drop the columns that are not needed downstream, then discard every row that
# still has a missing value (NAs would break the later modeling steps)
unneeded_cols <- c(1, 4, 18, 24:25)
total <- na.omit(total[, -unneeded_cols])
#### DATA CLEANING
# Normalise the names of the weather columns (positions 16 to 42): strip the
# "Value.Day" prefix and anything after the first dot, turn camelCase into
# snake_case, lower-case everything, and shorten two recurring stems
weather_idx <- 16:42
weather_names <- colnames(total)[weather_idx]
weather_names <- gsub("^Value.Day", "", weather_names)
weather_names <- gsub("\\..*", "", weather_names)
weather_names <- gsub("([[:lower:]])([[:upper:]])", "\\1_\\2", weather_names)
weather_names <- tolower(weather_names)
weather_names <- gsub("precip", "prcp", weather_names)
weather_names <- gsub("vap_pres", "vp", weather_names)
colnames(total)[weather_idx] <- weather_names
# Remove exact duplicate rows
total <- unique(total)
# Keep only rows with plausible measurements:
#   - soil temperature of 0 or more (negative values are treated as mistakes)
#   - precipitation different from the value 6999
#   - non-negative minimum relative humidity (it is a percentage)
# No NA can appear in these comparisons because of the na.omit() above
plausible <- total$soil_tmp_avg >= 0 &
  total$prcp != 6999 &
  total$rel_hum_min >= 0
total <- total[plausible, ]
# ETo should not be negative but sometimes is. Shift the column so that its
# smallest remaining value becomes 0.
# NOTE(review): this moves every ETo value by the same amount, not only the
# negative ones -- confirm that a shift (rather than clamping at 0) is intended
total$eto <- total$eto - min(total$eto)
#### INCORPORATING CITIES
# Read in dataframe of geocoding list of cities in California
cities <- read.csv("geocode.csv")
# Keep only the name, state, coordinate and county columns and rename them
cities <- cities[, c(2:5, 14)]
colnames(cities) <- c("city", "state", "latitude", "longitude", "county")
# Eliminate mistakes listing lat and long values as 0
cities <- cities[(cities$latitude != 0) & (cities$longitude != 0), ]
# Clean county names
cities$county <- gsub(" County$", "", cities$county)
# Trim tail end of dataframe: everything from the first blank city name onwards
# is discarded. If there is no blank name, all rows are kept (indexing with
# 1:NA would otherwise stop the script); seq_len() also copes with a blank
# first row, where 1:0 would wrongly select row 1.
first_blank <- which(cities$city == "")[1]
if (!is.na(first_blank)) {
  cities <- cities[seq_len(first_blank - 1), ]
}
if (nrow(cities) == 0) {
  stop("geocode.csv contains no usable city rows", call. = FALSE)
}
# Create pairs of city coordinates, as well as pairs of fire coordinates.
# Both are ordered (longitude, latitude), as 'geosphere' expects.
c_coords <- cities[, c("longitude", "latitude")]
f_coords <- total[, c("longitude", "latitude")]
# Use the 'geosphere' package to calculate the distances between all fires
# (rows of 'd') and all California cities (columns of 'd'). Identify the
# closest city to each fire, and the distance between the two points.
# which.min() returns the first minimum, like order(x)[1], without sorting
# each row.
d <- distm(f_coords, c_coords, distCosine)
min.d.ind <- apply(d, 1, which.min)
min.d <- apply(d, 1, min)
# Add the city name, the distance between city and fire, and the county name
# as new columns of 'total' (in this column order)
total$closest_city <- cities[min.d.ind, "city"]
total$remoteness <- min.d
total$county <- cities[min.d.ind, "county"]
# Eliminate few mistakes in data
total <- total[total$closest_city != "", ]
# Order dataframe by station number and start date in preparation to create
# lagged columns.
# NOTE(review): if 'start_date' is stored as text in a non-ISO format this
# sorts alphabetically rather than chronologically -- confirm the format
total <- total[order(total$station_number, total$start_date), ]
#### CREATE LAGGED VARIABLES
# Define variables that will be lagged. The first two columns are identifiers
# (one of them is 'station_number', used for grouping below); the remaining
# ten are the weather variables that get averaged.
lagged_cols <- total[, c(1, 2, 16, 19:22, 25, 28, 30, 37:38)]
# Split dataframe by station. 'total' is already ordered by station, so
# binding the pieces back together reproduces the row order of 'total'.
s <- split(lagged_cols, factor(lagged_cols$station_number))

# Replace the value columns of every per-station dataframe with their rolling
# mean over the current row and the (window - 1) rows before it, then stack
# the stations back into one dataframe.
#   groups: list of per-station dataframes (left unmodified)
#   window: number of rows in the rolling window
#   cols:   positions of the columns to average
# The first (window - 1) rows of each station come back as NA. Note that the
# window counts rows of the dataframe, not calendar days.
rolling_station_means <- function(groups, window, cols = 3:12) {
  averaged <- lapply(groups, function(station_rows) {
    station_rows[cols] <- frollmean(station_rows[cols], n = window)
    station_rows
  })
  # unname() keeps the original row names instead of "<station>.<row>"
  do.call(rbind, unname(averaged))
}

# Week- and month-lagged versions of the weather variables. Both are computed
# from the raw values in 's'; previously the 7-row means were written back
# into 's', so the 30-row means were averages of already-smoothed values.
lagged7 <- rolling_station_means(s, window = 7)
lagged30 <- rolling_station_means(s, window = 30)

# cbind() below pairs rows purely by position, so make sure they line up
stopifnot(
  identical(lagged7$station_number, total$station_number),
  identical(lagged30$station_number, total$station_number)
)

# Bind lagged columns with 'total' dataframe, marking them with a suffix and
# dropping the two identifier columns that 'total' already has
colnames(lagged7) <- paste0(colnames(lagged7), "_7")
colnames(lagged30) <- paste0(colnames(lagged30), "_30")
lagged7 <- lagged7[, -c(1:2)]
lagged30 <- lagged30[, -c(1:2)]
total <- cbind(total, lagged7, lagged30)
# Write CSV
# Drop every row that still contains an NA -- this includes the leading rows
# of each station for which the rolling means above are not yet defined --
# and save the result. write.csv() also writes the row names as a first,
# unnamed column (its default behaviour).
total <- na.omit(total)
write.csv(total, "total_all.csv")
#### DEFINE UNITS OF NUMERICAL COLUMNS
# size [acres]
# air_tmp_avg [C]
# air_tmp_max [C]
# air_tmp_min [C]
# dew_pnt [C]
# eto [mm]
# prcp [mm]
# rel_hum_avg [%]
# rel_hum_max [%]
# rel_hum_min [%]
# soil_tmp_avg [C]
# soil_tmp_max [C]
# soil_tmp_min [C]
# sol_rad [W/m^2]
# sol_rad_net [W/m^2]
# vp_avg [kPa]
# vp_max [kPa]
# vp_min [kPa]
# wind_ene [m/s]
# wind_ese [m/s]
# wind_nne [m/s]
# wind_nnw [m/s]
# wind_ssw [m/s]
# wind_sse [m/s]
# wind_wnw [m/s]
# wind_wsw [m/s]
# wind_spd_avg [m/s]
# wind_run [m/s]