In [None]:
# Read in necessary libraries
# Define geo data location
# Define processed data location
library(dplyr)
library(stringr)
library(foreach)
library(jsonlite)
library(purrr)
library(tidyr)
library(readr)
library(stringr)
library(lubridate)
# Root directory for the project's processed data; the final output file is
# written here.
processed_data_directory <- "/nfs/sloanlab004/projects/covid_mobility_proj/data/PROCESSED_DATA/"

# Directory holding the 2020 SafeGraph geo files that this script reads.
geo_data_directory <- "/nfs/sloanlab004/projects/covid_mobility_proj/data/PROCESSED_DATA/safe_graph_geos/2020/"

In [None]:
# Read in set of months (the entries found in the geo data directory; each is
# expected to be a two-digit month folder such as "01")
months <- list.files(geo_data_directory)

# Number of daily files to read for each month folder. Only these four months
# are supported, and April is read through day 19 only.
days_in_sample <- c("01" = 31L, "02" = 29L, "03" = 31L, "04" = 19L)

# Stop with a clear message if a month folder has no entry in the table above;
# without an entry there is no day count to iterate over.
unknown_months <- setdiff(months, names(days_in_sample))
if (length(unknown_months) > 0) {
  stop("No day count defined for month folder(s): ",
       str_c(unknown_months, collapse = ", "),
       call. = FALSE)
}

# Read one daily county graph file.
#
# directory:    month folder containing the daily files
# month_string: two-digit month, e.g. "03"
# day:          day of the month as a number
#
# Returns a tibble with columns origin_county, destination_county,
# num_devices (all integer) and ds (the Date the file refers to).
read_county_graph_day <- function(directory, month_string, day) {
  # Construct string for file for each day
  day_string <- str_pad(as.character(day), 2, "0", side = "left")
  file <- str_c(directory, "/2020", month_string, day_string,
                "_county_graph.txt")
  file_date <- as.Date(str_c("2020-", month_string, "-", day_string))

  # cols_only() reads just the three named columns, so the leading unnamed
  # column (not used by this script) is skipped regardless of the name readr
  # would assign to it. Columns are then renamed by name, not by position.
  read_tsv(file,
           col_types = cols_only(
             `origin-county` = col_integer(),
             `dest-county` = col_integer(),
             `num-devices` = col_integer()
           )) %>%
    dplyr::rename(origin_county = `origin-county`,
                  destination_county = `dest-county`,
                  num_devices = `num-devices`) %>%
    # Add day to data
    mutate(ds = file_date)
}

# Iterate through months and, within each month, through its days; the daily
# tables are stacked in month/day order.
data_wrapped <- map(months, function(month_string) {
  directory <- str_c(geo_data_directory, month_string)

  seq_len(days_in_sample[[month_string]]) %>%
    map(function(day) read_county_graph_day(directory, month_string, day)) %>%
    bind_rows()
}) %>%
  bind_rows()

In [None]:
# Keep only county pairs that are worth modelling:
#   * both county codes are at most 57000 (per the original note, this drops
#     pairs where one county is in a US territory),
#   * origin and destination differ,
#   * the pair appears in all 4 calendar months of the sample,
#   * the pair has at least 30 rows (days with travel) in the sample.
# The per-pair row count `n` stays in the result as a column.
data_wrapped_good_pairs <- data_wrapped %>%
  filter(origin_county <= 57000,
         destination_county <= 57000,
         origin_county != destination_county) %>%
  group_by(origin_county, destination_county) %>%
  mutate(n = n()) %>%
  # Evaluated per pair because the data is still grouped here
  filter(n_distinct(month(ds)) == 4, n >= 30) %>%
  ungroup()

In [None]:
# Build the full grid of dates and county pairs: every date present in the
# filtered data is combined with every origin/destination pair that occurs in
# it (nesting() restricts the pairs to those actually observed).
county_dest_ds_expanded <- data_wrapped_good_pairs %>%
  tidyr::expand(ds, nesting(origin_county, destination_county))

In [None]:
# Join the observations onto the full set of dates and county pairs.
# The join keys are spelled out so the match does not depend on whichever
# columns the two tables happen to share. Date/pair combinations with no
# observation get num_devices = 0; any other column carried over from
# data_wrapped_good_pairs (such as `n`) remains NA on those rows.
mobility_df <- county_dest_ds_expanded %>%
  dplyr::left_join(data_wrapped_good_pairs,
                   by = c("ds", "origin_county", "destination_county")) %>%
  mutate(num_devices = replace_na(num_devices, 0))

In [None]:
# Write to file
# The output path is the processed data directory (which already ends in "/")
# followed by the file name. paste0() takes no `sep` argument, so the two
# pieces are simply concatenated.
save(mobility_df,
     file = paste0(processed_data_directory, "safegraph_dyad_mobility.Rdata"))