In [2]:
library(readr)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(sf)
library(sp)
library(geosphere)
library(doMC)
library(USAboundaries)
# Register the doMC parallel backend with 24 workers; the %dopar% loops
# further down in this notebook run on it.
registerDoMC(cores = 24)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Linking to GEOS 3.5.2, GDAL 2.4.2, PROJ 4.8.0

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [3]:
### Reading in Data
# Every processed input file lives under this directory prefix.
fp <- '/pool001/mfzhao/PROCESSED_DATA/'

# Station table; readr guesses the column types.
stations <- read_csv(paste0(fp, 'stations.csv'))

# Weather table; the four columns named below are typed explicitly.
weather <- read_csv(
    paste0(fp, 'weather.csv'),
    col_types = cols(
        STAID = col_character(),
        DATE  = col_date(format = ""),
        PRCP  = col_double(),
        TMAX  = col_double()
    )
)

# Mobility table; readr guesses the column types.
eu_mobility <- read_csv(paste0(fp, 'eu_mobility.csv'))

Parsed with column specification:
cols(
  STAID = [31mcol_character()[39m,
  STANAME = [31mcol_character()[39m,
  CN = [31mcol_character()[39m,
  lat = [32mcol_double()[39m,
  lng = [32mcol_double()[39m
)

Parsed with column specification:
cols(
  key = [31mcol_character()[39m,
  ds = [34mcol_date(format = "")[39m,
  btvrc = [32mcol_double()[39m,
  rstu = [32mcol_double()[39m,
  n = [32mcol_double()[39m
)



In [4]:
# Keep only observations dated from 2020-01-01 up to (not including) 2020-07-01.
weather <- weather %>%
    filter(DATE >= as.Date('2020-01-01'), DATE < as.Date('2020-07-01'))

# Keep only the stations that still appear in the filtered weather table.
stations <- stations %>%
    inner_join(distinct(select(weather, STAID)))

Joining, by = "STAID"



In [5]:
# European region geometries, read from a serialized R object.
eu_shapefiles <- read_rds('/pool001/mfzhao/geo_data/europe_LRmap.RDS')

# US county geometries; `key` is statefp and countyfp pasted together.
us_shapefiles <- us_counties(resolution = 'high') %>%
    unite('key', statefp, countyfp, sep = '') %>%
    mutate(country = 'USA') %>%
    select(country, key, name, geometry)

# Find the country codes that have at least one region present in
# eu_mobility, pull in every region of those countries, then stack the
# US counties underneath with the same four columns.
shapefiles_df <- as.data.frame(eu_shapefiles) %>%
    select(key = id, CNTR_CODE) %>%
    inner_join(eu_mobility) %>%
    distinct(CNTR_CODE) %>%
    left_join(eu_shapefiles) %>%
    select(key = id, name = NUTS_NAME, CN = CNTR_CODE, geometry) %>%
    rbind(
        as.data.frame(us_shapefiles) %>%
            mutate(CN = 'US') %>%
            select(key, name, CN, geometry)
    )

Joining, by = "key"

Joining, by = "CNTR_CODE"



In [6]:
# Promote the combined region table to an sf object.
shapefiles <- st_as_sf(shapefiles_df)

# Turn the stations into points, tagged with the regions' CRS.
# NOTE(review): assumes lng/lat are already expressed in that CRS - confirm.
stations_sf <- stations %>%
    st_as_sf(coords = c("lng", "lat"), crs = st_crs(shapefiles))

# One entry per region, holding the row indices of the stations it intersects;
# entries are named by region key so they can be looked up by key later.
stations_in_region <- shapefiles %>%
    st_intersects(stations_sf)
names(stations_in_region) <- shapefiles_df$key

In [7]:
# Return the rows of `stations` that fall inside one region.
# `row` must expose a `key` field; that key is used as a name to index
# `stations_in_region`, whose entry holds row indices into `stations`.
mapper <- function(row) {
  stations[stations_in_region[[row$key]], ]
}

# Build the region-key -> station lookup table by calling mapper() once per key
keys_stations <- shapefiles_df %>%
    select(key) %>%
    group_by(key) %>%
    do(mapper(.))

In [8]:
# Getting county centroids
# `temp` holds the centroid points as an sf object.
temp <- st_centroid(shapefiles)

# Build the centroid table: every non-geometry column of `shapefiles` plus
# numeric `lng` / `lat` columns.
# The coordinates are read straight from the point geometries with
# st_coordinates() (columns X and Y) rather than printing each point as WKT
# text and parsing the numbers back out of the string, which is limited to the
# printed digits and breaks on empty points.
centroids <- as.data.frame(shapefiles) %>%
    select(-geometry) %>%
    mutate(lng = as.numeric(st_coordinates(temp)[, 'X']),
           lat = as.numeric(st_coordinates(temp)[, 'Y']))

“st_centroid assumes attributes are constant over geometries of x”


In [9]:
# Regions with no station inside their polygon, joined to their centroid columns.
missing <- shapefiles_df %>%
    select(key) %>%
    anti_join(distinct(select(keys_stations, key))) %>%
    left_join(centroids)

Joining, by = "key"

Joining, by = "key"



In [10]:
# Distance from the centroid of every region in `missing` to every weather
# station, divided by 1000 (the 500 km cutoff used later relies on this unit).
geoDist <- distm(
    matrix(c(missing$lng, missing$lat), ncol = 2),
    matrix(c(stations$lng, stations$lat), ncol = 2)
) / 1000
colnames(geoDist) <- stations$STAID

# Reshape the distance matrix into a long key / STAID / dist data frame
geoDist <- as.data.frame(geoDist) %>%
  mutate(key = missing$key) %>%
  gather(key = 'STAID', 'dist', -key)

# For each region, keep its 3 closest weather stations and attach station info
temp <- geoDist %>%
  group_by(key) %>%
  arrange(key, dist) %>%
  filter(row_number() <= 3) %>%
  left_join(stations) %>%
  select(-dist)

# Append these to the region -> station table, drop coordinates and duplicates
keys_stations <- keys_stations %>%
  bind_rows(temp) %>%
  select(-lng, -lat) %>%
  distinct()

# Freeing up memory
rm(temp)

Joining, by = "STAID"



In [11]:
# Helper function to compute average PRCP and TMAX per county and date,
# removing missing values.
#
# Args:
#   keyname: a single region key, compared against keys_stations$key.
# Returns:
#   One row per (key, DATE) with PRCP and TMAX averaged over the stations
#   mapped to that key. When every value for a date is NA the mean is NaN,
#   which is.na() also reports as missing.
mapper2 <- function(keyname) {
  keys_stations %>%
    filter(key == keyname) %>%
    left_join(weather) %>%
    group_by(key, DATE) %>%
    # TRUE, not T: T is an ordinary variable that can be reassigned.
    summarize(PRCP = mean(PRCP, na.rm = TRUE),
              TMAX = mean(TMAX, na.rm = TRUE))
}

# Run mapper2 for every region key on the parallel backend and stack the
# per-key results row-wise.
out <- foreach(keyname = shapefiles_df$key, .combine = rbind) %dopar% {
  mapper2(keyname)
}

In [12]:
# Going wide on DATE and back to long yields a row for every key-date
# combination, with NA wherever a key had no TMAX for that date.
TMAX <- out %>%
    group_by(key) %>%
    select(-PRCP) %>%
    spread(key = DATE, value = TMAX) %>%
    gather(key = 'DATE', value = 'TMAX', -key)

# Same round trip for PRCP.
PRCP <- out %>%
    group_by(key) %>%
    select(-TMAX) %>%
    spread(key = DATE, value = PRCP) %>%
    gather(key = 'DATE', value = 'PRCP', -key)

# Recombine both completed measures into a single table.
out <- PRCP %>%
    full_join(TMAX)

# Key-dates lacking TMAX, with the region centroid attached; DATE came back
# from gather() as text, so it is converted to Date again.
missTMAX <- TMAX %>%
    filter(is.na(TMAX)) %>%
    select(key, DATE) %>%
    left_join(centroids) %>%
    mutate(DATE = as.Date(DATE))

# Key-dates lacking PRCP, built the same way.
missPRCP <- PRCP %>%
    filter(is.na(PRCP)) %>%
    select(key, DATE) %>%
    left_join(centroids) %>%
    mutate(DATE = as.Date(DATE))

rm(TMAX, PRCP)

Joining, by = c("key", "DATE")

Joining, by = "key"

Joining, by = "key"



In [13]:
# Distinct regions (with centroid coordinates) that miss TMAX on some date
mtc <- missTMAX %>%
    select(key, lng, lat) %>%
    distinct()

# Distinct regions (with centroid coordinates) that miss PRCP on some date
mpc <- missPRCP %>%
    select(key, lng, lat) %>%
    distinct()

# Union of the two sets, without duplicates
missing <- bind_rows(mtc, mpc) %>%
    distinct()

rm(mtc, mpc)

# Distance from each of those centroids to every weather station,
# divided by 1000 so the 500 cutoff below is in km
geoDist <- distm(
    matrix(c(missing$lng, missing$lat), ncol = 2),
    matrix(c(stations$lng, stations$lat), ncol = 2)
) / 1000
colnames(geoDist) <- stations$STAID

# Long key / STAID / dist table, sorted by distance, keeping only stations
# no farther than 500 away
geoDist <- as.data.frame(geoDist) %>%
    mutate(key = missing$key) %>%
    gather(key = 'STAID', 'dist', -key) %>%
    group_by(key) %>%
    arrange(dist) %>%
    filter(dist <= 500)

In [14]:
# Weather rows with a non-missing TMAX, with the PRCP column dropped
TMAX <- weather %>%
    select(-PRCP) %>%
    filter(!is.na(TMAX))

# Helper function to fill in missing TMAX for one region.
# For each missing date of region `keyname`, averages the TMAX readings of
# the 4 (or fewer) closest stations that are listed in geoDist (i.e. within
# 500 km) and have a TMAX reading on that date.
# Returns one row per (key, DATE) with the average in column rTMAX.
fillTMAX <- function(keyname) {
    # Pair every missing date with the region's candidate stations, then keep
    # only the pairs for which an observation exists.
    candidates <- missTMAX %>%
        filter(key == keyname) %>%
        left_join(geoDist) %>%
        inner_join(TMAX)

    # Nearest first, at most 4 per date, then average.
    candidates %>%
        group_by(key, DATE) %>%
        arrange(dist) %>%
        filter(row_number() <= 4) %>%
        summarize(rTMAX = mean(TMAX))
}

# Distinct region keys that have at least one date without TMAX.
missTMAX_keys <- distinct(select(missTMAX, key))

# Fill each of those regions on the parallel backend and stack the results.
rTMAX <- foreach(keyname = missTMAX_keys$key, .combine = 'rbind') %dopar% {
    fillTMAX(keyname)
}

In [15]:
# Weather rows with a non-missing PRCP, with the TMAX column dropped
PRCP <- weather %>%
  select(-TMAX) %>%
  filter(!is.na(PRCP))

# Helper function to fill in missing PRCP for one region.
# For each missing date of region `keyname`, averages the PRCP readings of
# the 4 (or fewer) closest stations that are listed in geoDist and have a
# PRCP reading on that date.
# Returns one row per (key, DATE) with the average in column rPRCP.
fillPRCP <- function(keyname) {
    # Pair every missing date with the region's candidate stations, then keep
    # only the pairs for which an observation exists.
    candidates <- missPRCP %>%
        filter(key == keyname) %>%
        left_join(geoDist) %>%
        inner_join(PRCP)

    # Nearest first, at most 4 per date, then average.
    candidates %>%
        group_by(key, DATE) %>%
        arrange(dist) %>%
        filter(row_number() <= 4) %>%
        summarize(rPRCP = mean(PRCP))
}

# Distinct region keys that have at least one date without PRCP.
missPRCP_keys <- distinct(select(missPRCP, key))

# Fill each of those regions on the parallel backend and stack the results.
rPRCP <- foreach(keyname = missPRCP_keys$key, .combine = 'rbind') %dopar% {
    fillPRCP(keyname)
}

In [16]:
# Attach the nearby-station averages and use them only where the in-region
# value is missing; then keep just the four output columns.
out <- out %>%
    mutate(DATE = as.Date(DATE)) %>%
    left_join(rTMAX) %>%
    left_join(rPRCP) %>%
    mutate(TMAX = ifelse(!is.na(TMAX), TMAX, rTMAX),
           PRCP = ifelse(!is.na(PRCP), PRCP, rPRCP)) %>%
    select(key, DATE, PRCP, TMAX)

Joining, by = c("key", "DATE")

Joining, by = c("key", "DATE")



In [18]:
# Save the final region/date weather table under the processed-data directory.
write_csv(out, paste0(fp, 'region_weather_2020.csv'))

In [17]:
# Display the final table (columns key, DATE, PRCP, TMAX).
out

key,DATE,PRCP,TMAX
<chr>,<date>,<dbl>,<dbl>
01001,2020-01-01,0,14.55000
01003,2020-01-01,0,16.76000
01005,2020-01-01,0,17.80000
01007,2020-01-01,0,14.40000
01009,2020-01-01,0,11.70000
01011,2020-01-01,0,15.00000
01013,2020-01-01,0,16.15000
01015,2020-01-01,0,12.80000
01017,2020-01-01,0,13.90000
01019,2020-01-01,0,12.20000
