In [17]:
# Load the required libraries.
# Specify the top-level directory where the SafeGraph weekly-patterns data lives,
# and the directory that processed output is written to.
# Read in the crosswalk file that maps ZIP codes to census tracts.
library(dplyr)
library(stringr)
library(foreach)
library(purrr)
library(tidyr)
library(readr)
# Directory holding the weekly-patterns input files. File names are appended
# directly to this string later on, so the trailing slash matters.
data_directory_2020 <- "/nfs/sloanlab004/projects/covid_mobility_proj/data/safegraph_data/weekly-patterns//v1//main-file/"

# Directory that processed output files are written to (trailing slash matters
# here too, for the same reason).
processed_data_directory <- "/nfs/sloanlab004/projects/covid_mobility_proj/data/PROCESSED_DATA/"

# ZIP -> census tract crosswalk. All six columns are read as character rather
# than parsed as numbers (presumably to keep leading zeros in the codes).
zip_census_tract <- read.csv(
    file = "/nfs/sloanlab004/projects/covid_mobility_proj/data/zip_census_mapping/ZIP_TRACT_122019.csv",
    colClasses = rep("character", 6)
)

In [18]:
# Aggregate every weekly-patterns file to one row per
# (week start date, county FIPS, 2-digit NAICS) and stack the results.
# TODO: parallelize this so it is faster

# Collapse the ZIP -> tract crosswalk to distinct ZIP -> county pairs *before*
# joining. A ZIP can map to several tracts, so joining on the tract-level table
# repeats each place row once per tract; after collapsing to county, the sums
# below would then count the same visits several times within one county.
# Dividing the tract code by 1e6 drops its last six digits, leaving the county.
# NOTE(review): a ZIP that spans several counties is still attributed in full
# to each of them. If the crosswalk carries allocation ratios, weighting by
# them would be more accurate -- confirm against the crosswalk's columns.
zip_county <- zip_census_tract %>%
    transmute(ZIP, county_fips = floor(as.numeric(TRACT) / 1000000)) %>%
    distinct()

# Cycle through all of the data that we have
data_aggregate <- foreach(i = list.files(data_directory_2020), .combine = 'rbind') %do% {
    # Construct the full file name (the directory string ends in a slash)
    filename <- paste0(data_directory_2020, i)
    # Read in the data
    weekly_patterns <- read_csv(filename)

    # The value of this pipeline is what foreach collects for this file
    weekly_patterns %>%
        # Transform some variables
        mutate(postal_code = as.character(postal_code),
               ds = as.Date(date_range_start)) %>%
        # Join in the county for each postal code; unmatched postal codes
        # keep their rows with county_fips = NA
        left_join(zip_county, by = c('postal_code' = 'ZIP')) %>%
        # Collapse NAICS codes to 2-digit codes
        mutate(two_digit_naics = floor(as.numeric(naics_code) / 10000)) %>%
        # Group by county, day, and NAICS
        group_by(ds, county_fips, two_digit_naics) %>%
        # Count visitors and visits (an NA count makes its group's total NA)
        summarise(raw_visit_count = sum(raw_visit_counts),
                  raw_visitor_count = sum(raw_visitor_counts)) %>%
        ungroup()
}

Parsed with column specification:
cols(
  .default = col_character(),
  naics_code = col_integer(),
  date_range_start = col_datetime(format = ""),
  date_range_end = col_datetime(format = ""),
  raw_visit_counts = col_integer(),
  raw_visitor_counts = col_integer(),
  distance_from_home = col_integer(),
  median_dwell = col_double()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_character(),
  naics_code = col_integer(),
  date_range_start = col_datetime(format = ""),
  date_range_end = col_datetime(format = ""),
  raw_visit_counts = col_integer(),
  raw_visitor_counts = col_integer(),
  distance_from_home = col_integer(),
  median_dwell = col_double()
)
See spec(...) for full column specifications.
Parsed with column specification:
cols(
  .default = col_character(),
  naics_code = col_integer(),
  date_range_start = col_datetime(format = ""),
  date_range_end = col_datetime(format = ""),
  raw_visit_counts = col_integer(),
  

In [19]:
# Write the county x week x 2-digit-NAICS aggregate to the processed-data
# directory as CSV, without row names.
# paste0() has no `sep` argument (anything passed as `sep` is just pasted as
# another string), so the output path is built by plain concatenation; the
# directory string already ends in a slash.
write.csv(
    data_aggregate,
    file = paste0(processed_data_directory,
                  'safegraph_weekly_patterns_2_digit_naics_aggregate_county.csv'),
    row.names = FALSE
)