data/scripts/build_geojson

#!/bin/bash
# Mark SHELLOPTS for export so that any `set -o` options enabled in this shell
# are inherited by child bash processes.
export SHELLOPTS

# The three directory arguments are mandatory. The usage text is a diagnostic,
# so it goes to stderr and the script exits non-zero.
if (( $# < 3 )); then
  cat >&2 <<'EOF'
Script must be passed at least 3 arguments:
    1) directory with PUMS data organized by year
    2) directory with shapefile for all states and for each PUMA
    3) output dir for processed GeoJSON
EOF
  exit 1
fi

# Positional arguments: input PUMS directory, shapefile directory, output directory.
processed_pums_dir=$1
shapefile_dir=$2
processed_geojson_dir=$3

# Load definitions from the `fips` file in the current working directory.
# The path is relative, so the script must be run from the directory that
# contains `fips`.
# NOTE(review): presumably this file defines the `fips_codes` array and the
# `validate_single_state` function used below; confirm against ./fips.
source ./fips

# Optional 4th argument: restrict processing to a single state by replacing
# the list of codes with just that one. The value is quoted so that an empty
# or whitespace-containing argument reaches the validator unchanged.
if (( $# == 4 )); then
  single_state=$4
  validate_single_state "$single_state"
  fips_codes=("$single_state")
fi

# Prepare lookup files for PUMS data for all years where fields are year-prefixed.
#
# Every subdirectory of $processed_pums_dir is treated as one year; its name is
# used both as the output subdirectory name and as the prefix of the generated
# property keys ("<year>-<language>", "<year>-total"). Each year directory is
# read for `languages.json` and `all.json`.
#
# The directory is walked with a glob instead of parsing `ls` output, and all
# paths are quoted, so names containing whitespace or glob characters are safe.
# The year is handed to jq with --arg rather than spliced into the program text.
for subdir in "${processed_pums_dir}"/*; do
  # Skip plain files, and the unexpanded pattern when the directory is empty.
  [ -d "$subdir" ] || continue

  year=${subdir##*/}
  echo "Processing geometries for year $year..."

  # Create directory where we can save our processed GeoJSON files for this year
  geojson_year_dir=$processed_geojson_dir/$year
  mkdir -p "$geojson_year_dir"

  pums_languages_file=$subdir/languages.json
  pums_all_file=$subdir/all.json

  # Get language counts for each PUMA, identified by geoid (STATE + PUMA).
  # Result shape: { "<state><puma>": { "<year>-<language>": count, ... }, ... }
  echo "Creating PUMA language file..."
  puma_languages_year_file=$geojson_year_dir/puma_languages.json
  jq --arg year "$year" \
    'map({ (.state + .puma): {($year + "-" + .language): (.count)}})
     | reduce .[] as $x ({}; . * $x)' \
    "$pums_languages_file" > "$puma_languages_year_file"
  echo "PUMA language file saved to $puma_languages_year_file"

  # Get speaker counts for each PUMA across all languages.
  # Result shape: { "<state><puma>": { "<year>-total": count }, ... }
  echo "Creating PUMA file for all speakers..."
  puma_all_year_file=$geojson_year_dir/puma_all.json
  jq --arg year "$year" \
    'map({ (.state + .puma): {($year + "-total"): (.count)}})
     | reduce .[] as $x ({}; . * $x)' \
    "$pums_all_file" > "$puma_all_year_file"
  echo "PUMA language file saved to $puma_all_year_file"

  # Get language counts for each state: rows are grouped by state and then by
  # language, and the counts within each group are summed.
  echo "Creating states language file..."
  state_languages_year_file=$geojson_year_dir/state_languages.json
  jq --arg year "$year" \
    '[group_by(.state)[] | group_by(.language)[] | reduce .[] as $r ({}; $r + (.count += $r.count))]
     | map({(.state): {($year + "-" + .language): (.count)}})
     | reduce .[] as $x ({}; . * $x)' \
    "$pums_languages_file" > "$state_languages_year_file"
  echo "States language file saved to $state_languages_year_file"

  # Get speaker counts for each state across all languages (counts summed per state).
  echo "Creating states language file for all speakers..."
  state_all_year_file=$geojson_year_dir/state_all.json
  jq --arg year "$year" \
    '[group_by(.state)[] | reduce .[] as $r ({}; $r + (.count += $r.count))]
     | map({(.state): {($year + "-total"): (.count)}})
     | reduce .[] as $x ({}; . * $x)' \
    "$pums_all_file" > "$state_all_year_file"
  echo "States language file saved to $state_all_year_file"
done

# Join all years of PUMS data together so there is one lookup file to use to update GeoJSON
# Fields are already year-prefixed to differentiate between data for different years
#
# jq_merge FILE...
#   Slurps every JSON document found in the given files and folds them into a
#   single object with jq's recursive object merge (`*`), writing the result
#   to stdout. Objects are merged in argument order.
#
#   "$@" is quoted so that each file name is passed to jq as exactly one
#   argument, even when it contains whitespace or glob characters.
jq_merge() {
  jq -s 'reduce .[] as $item ({}; . *= $item)' "$@"
}

# Merge the per-year lookup files into one lookup file per geography/measure,
# written at the top level of the output directory.
#
# The inputs live exactly one level down (one subdirectory per year), so the
# pattern is an explicit `*/`. The previous `**/` only behaved this way while
# the `globstar` shell option was off; with it on, `**/` would also match the
# top-level output file left behind by an earlier run. The directory part is
# quoted so paths with whitespace work; the glob itself stays unquoted.
state_languages_file=$processed_geojson_dir/state_languages.json
jq_merge "$processed_geojson_dir"/*/state_languages.json > "$state_languages_file"
state_all_file=$processed_geojson_dir/state_all.json
jq_merge "$processed_geojson_dir"/*/state_all.json > "$state_all_file"

puma_languages_file=$processed_geojson_dir/puma_languages.json
jq_merge "$processed_geojson_dir"/*/puma_languages.json > "$puma_languages_file"
puma_all_file=$processed_geojson_dir/puma_all.json
jq_merge "$processed_geojson_dir"/*/puma_all.json > "$puma_all_file"

# Process states: convert the states shapefile to GeoJSON (reprojected to
# CRS:84), keep only the states listed in fips_codes, then replace each
# feature's properties with geoid/name plus the merged per-year counts.
states_filename=tl_2020_us_state
states_in_file=$shapefile_dir/$states_filename.shp
states_out_file=$processed_geojson_dir/$states_filename.geojson
ogr2ogr -f GeoJSON -t_srs crs:84 "$states_out_file" "$states_in_file"
echo "States GeoJSON saved to $states_out_file"

# NOTE: the states shapefile is a single file which includes the U.S. Virgin
# Islands for which we don't have PUMS data, so we need to filter out that
# feature here
echo "Filtering states by FIPS code..."
# Turn the bash array into a compact JSON array of strings, e.g. ["01","02"].
fips_json_array=$(jq --compact-output --null-input '$ARGS.positional' --args -- "${fips_codes[@]}")
states_filtered_out_file=$processed_geojson_dir/${states_filename}_filtered.geojson
# The JSON array is quoted: it starts with `[`, so unquoted it would be subject
# to pathname expansion. The filter uses `map(select(...))` instead of
# `.features[] |= select(...)`, because update-assignment with an empty result
# skips elements on jq releases before 1.7.
jq --argjson fips "$fips_json_array" \
  '.features |= map(select(.properties.GEOID | IN($fips[])))' \
  "$states_out_file" > "$states_filtered_out_file"
echo "Filtered states GeoJSON file saved to $states_filtered_out_file"

echo "Updating states GeoJSON properties..."
states_updated_out_file=$processed_geojson_dir/${states_filename}_updated.geojson
# Emits one feature per output value (a stream of features, not a
# FeatureCollection). Each feature gets a numeric id derived from GEOID and
# its properties are replaced by {geoid, name} plus the lookup entries for
# that GEOID. --slurpfile binds an array of the file's JSON values, hence the
# [0]; it replaces --argfile, which the jq manual says not to use.
jq --slurpfile states "$state_languages_file" --slurpfile total "$state_all_file" \
  '.features[]
   | .id = (.properties.GEOID | tonumber)
   | .properties |= {geoid: (.GEOID), name: (.NAME)} + $total[0][(.GEOID)] + $states[0][(.GEOID)]' \
  "$states_filtered_out_file" > "$states_updated_out_file"
echo "Updated states GeoJSON file saved to $states_updated_out_file"

# Census decade since PUMAs are redefined every 10 years
# PUMAs referenced in PUMS data must correspond with geometries for metadata to be correct
decade=10 # 2010

# Property names in the PUMA shapefiles carry the decade as a suffix.
puma_geoid_field="GEOID${decade}"
puma_name_field="NAMELSAD${decade}"

# Process PUMAs: for every code in fips_codes, convert that state's PUMA
# shapefile to GeoJSON (reprojected to CRS:84) and replace each feature's
# properties with geoid/name plus the merged per-year counts for that PUMA.
for code in "${fips_codes[@]}"; do
  echo "Processing PUMA for state $code..."
  puma_filename=tl_2020_${code}_puma${decade}
  puma_in_file=$shapefile_dir/$puma_filename.shp
  puma_out_file=$processed_geojson_dir/$puma_filename.geojson
  ogr2ogr -f GeoJSON -t_srs crs:84 "$puma_out_file" "$puma_in_file"
  echo "PUMA GeoJSON saved to $puma_out_file"

  echo "Update PUMA GeoJSON properties..."
  puma_updated_out_file=$processed_geojson_dir/${puma_filename}_updated.geojson
  # Emits one feature per output value (a stream of features). The field names
  # are passed with --arg instead of being interpolated into the program text.
  # --slurpfile binds an array of the file's JSON values, hence the [0]; it
  # replaces --argfile, which the jq manual says not to use.
  jq --slurpfile pumas "$puma_languages_file" --slurpfile total "$puma_all_file" \
    --arg geoid_key "$puma_geoid_field" --arg name_key "$puma_name_field" \
    '.features[]
     | .id = (.properties[$geoid_key] | tonumber)
     | .properties |= {geoid: (.[$geoid_key]), name: (.[$name_key])} + $total[0][(.[$geoid_key])] + $pumas[0][(.[$geoid_key])]' \
    "$puma_out_file" > "$puma_updated_out_file"
done