-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_geojson
executable file
·119 lines (99 loc) · 5.61 KB
/
build_geojson
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
#
# Builds GeoJSON files enriched with PUMS language counts.
#
# Positional arguments:
#   $1 - directory with PUMS data organized by year
#   $2 - directory with shapefile for all states and for each PUMA
#   $3 - output dir for processed GeoJSON
#   $4 - (optional) a single state code; when given, only that state is
#        processed
#
# NOTE(review): `fips_codes` and `validate_single_state` are not defined in
# this file; they are presumably provided by the sourced ./fips — confirm.
export SHELLOPTS

if [ $# -lt 3 ]; then
  echo "Script must be passed at least 3 arguments:
1) directory with PUMS data organized by year
2) directory with shapefile for all states and for each PUMA
3) output dir for processed GeoJSON"
  exit 1
fi

processed_pums_dir=$1
shapefile_dir=$2
processed_geojson_dir=$3

# Resolved relative to the current working directory, not the script location.
source ./fips

if [ $# -eq 4 ]; then
  single_state=$4
  # Quoted so an empty or whitespace-containing argument reaches the validator
  # as exactly one word instead of being dropped or split.
  validate_single_state "$single_state"
  # Restrict all later per-state processing to the requested state.
  fips_codes=("$single_state")
fi
# Prepare lookup files for PUMS data for all years where fields are year-prefixed.
# Every immediate subdirectory of $processed_pums_dir is treated as one year:
# its name becomes the "<year>-" prefix of each generated field, and four lookup
# files are written to $processed_geojson_dir/<year>/.
# The directories are enumerated with a glob (never by parsing `ls`) so names
# containing whitespace or glob characters are handled as a single path.
for subdir in "$processed_pums_dir"/*/; do
  # An unmatched glob is left as the literal pattern; skip anything that is
  # not an existing directory.
  [[ -d "$subdir" ]] || continue
  subdir=${subdir%/}
  year=${subdir##*/}
  echo "Processing geometries for year $year..."

  # Create directory where we can save our processed GeoJSON files for this year
  geojson_year_dir="$processed_geojson_dir/$year"
  mkdir -p "$geojson_year_dir"

  pums_languages_file="$subdir/languages.json"
  pums_all_file="$subdir/all.json"

  # The year is handed to jq with --arg rather than spliced into the program
  # text, so an unusual directory name cannot alter the filter.

  # Get language counts for each PUMA, identified by geoid (STATE + PUMA)
  echo "Creating PUMA language file..."
  puma_languages_year_file="$geojson_year_dir/puma_languages.json"
  jq --arg year "$year" '
    map({ (.state + .puma): {($year + "-" + (.language)): (.count)}})
    | reduce .[] as $x ({}; . * $x)
  ' "$pums_languages_file" > "$puma_languages_year_file"
  echo "PUMA language file saved to $puma_languages_year_file"

  # Get speaker counts for each PUMA across all languages
  echo "Creating PUMA file for all speakers..."
  puma_all_year_file="$geojson_year_dir/puma_all.json"
  jq --arg year "$year" '
    map({ (.state + .puma): {($year + "-total"): (.count)}})
    | reduce .[] as $x ({}; . * $x)
  ' "$pums_all_file" > "$puma_all_year_file"
  echo "PUMA language file saved to $puma_all_year_file"

  # Get language counts for each state: rows are grouped by state, then by
  # language, and the counts inside each group are summed.
  echo "Creating states language file..."
  state_languages_year_file="$geojson_year_dir/state_languages.json"
  jq --arg year "$year" '
    [group_by(.state)[] | group_by(.language)[] | reduce .[] as $r ({}; $r + (.count += $r.count))]
    | map({(.state): {($year + "-" + (.language)): (.count)}})
    | reduce .[] as $x ({}; . * $x)
  ' "$pums_languages_file" > "$state_languages_year_file"
  echo "States language file saved to $state_languages_year_file"

  # Get speaker counts for each state across all languages
  echo "Creating states language file for all speakers..."
  state_all_year_file="$geojson_year_dir/state_all.json"
  jq --arg year "$year" '
    [group_by(.state)[] | reduce .[] as $r ({}; $r + (.count += $r.count))]
    | map({(.state): {($year + "-total"): (.count)}})
    | reduce .[] as $x ({}; . * $x)
  ' "$pums_all_file" > "$state_all_year_file"
  echo "States language file saved to $state_all_year_file"
done
# Join all years of PUMS data together so there is one lookup file to use to update GeoJSON
# Fields are already year-prefixed to differentiate between data for different years
#######################################
# Deep-merge several JSON object files into one object.
# Arguments: paths of the JSON files to merge, in merge order (on a key
#            conflict the later file wins, per jq's recursive `*` operator)
# Outputs:   the merged object on stdout
#######################################
jq_merge() {
  # "$@" keeps each path as a single word even when it contains whitespace
  # or glob characters.
  jq -s 'reduce .[] as $item ({}; . *= $item)' "$@"
}
# The per-year lookup files live exactly one directory below
# $processed_geojson_dir (one subdirectory per year). The glob uses an explicit
# single-level `*/` so that it can never match the merged output file itself,
# which sits directly in $processed_geojson_dir and is truncated by the
# redirection before jq reads its inputs.
state_languages_file="$processed_geojson_dir/state_languages.json"
jq_merge "$processed_geojson_dir"/*/state_languages.json > "$state_languages_file"

state_all_file="$processed_geojson_dir/state_all.json"
jq_merge "$processed_geojson_dir"/*/state_all.json > "$state_all_file"

puma_languages_file="$processed_geojson_dir/puma_languages.json"
jq_merge "$processed_geojson_dir"/*/puma_languages.json > "$puma_languages_file"

puma_all_file="$processed_geojson_dir/puma_all.json"
jq_merge "$processed_geojson_dir"/*/puma_all.json > "$puma_all_file"
# Process states
# Converts the states shapefile to GeoJSON, keeps only the states listed in
# fips_codes, then replaces each feature's properties with geoid/name plus the
# merged PUMS lookup fields for that state.
states_filename=tl_2020_us_state
states_in_file="$shapefile_dir/$states_filename.shp"
states_out_file="$processed_geojson_dir/$states_filename.geojson"
ogr2ogr -f GeoJSON -t_srs crs:84 "$states_out_file" "$states_in_file"
echo "States GeoJSON saved to $states_out_file"

# NOTE: the states shapefile is a single file which includes the U.S. Virgin
# Islands for which we don't have PUMS data, so we need to filter out that
# feature here
echo "Filtering states by FIPS code..."
fips_json_array=$(jq --compact-output --null-input '$ARGS.positional' --args -- "${fips_codes[@]}")
states_filtered_out_file="$processed_geojson_dir/${states_filename}_filtered.geojson"
# map(select(...)) rebuilds the features array in one pass. The previous
# `.features[] |= select(...)` form relies on `|= empty` deleting elements,
# which skips the element following each deletion on jq 1.6.
jq --argjson fips "$fips_json_array" \
  '.features |= map(select(.properties.GEOID | IN($fips[])))' \
  "$states_out_file" > "$states_filtered_out_file"
echo "Filtered states GeoJSON file saved to $states_filtered_out_file"

echo "Updating states GeoJSON properties..."
states_updated_out_file="$processed_geojson_dir/${states_filename}_updated.geojson"
# --slurpfile binds an array of the JSON values found in the file; each lookup
# file holds a single object, hence the [0]. The output is a stream of
# individual features (one JSON value per feature), not a FeatureCollection.
jq --slurpfile states "$state_languages_file" --slurpfile total "$state_all_file" '
  .features[]
  | .id = (.properties.GEOID | tonumber)
  | .properties |= {geoid: (.GEOID), name: (.NAME)} + $total[0][(.GEOID)] + $states[0][(.GEOID)]
' "$states_filtered_out_file" > "$states_updated_out_file"
echo "Updated states GeoJSON file saved to $states_updated_out_file"
# Census decade since PUMAs are redefined every 10 years
# PUMAs referenced in PUMS data must correspond with geometries for metadata to be correct
decade=10 # 2010

# Process PUMAs
# For every state code, converts that state's PUMA shapefile to GeoJSON and
# then emits one feature per PUMA whose properties are geoid/name plus the
# merged PUMS lookup fields for that PUMA.
for code in "${fips_codes[@]}"; do
  echo "Processing PUMA for state $code..."
  puma_filename="tl_2020_${code}_puma${decade}"
  puma_in_file="$shapefile_dir/$puma_filename.shp"
  puma_out_file="$processed_geojson_dir/$puma_filename.geojson"
  ogr2ogr -f GeoJSON -t_srs crs:84 "$puma_out_file" "$puma_in_file"
  echo "PUMA GeoJSON saved to $puma_out_file"

  echo "Update PUMA GeoJSON properties..."
  puma_updated_out_file="$processed_geojson_dir/${puma_filename}_updated.geojson"
  # The decade-suffixed property names (e.g. GEOID10, NAMELSAD10) are passed as
  # jq variables instead of being spliced into the filter text.
  # --slurpfile binds an array of the file's JSON values; each lookup file
  # holds a single object, hence the [0].
  jq --slurpfile pumas "$puma_languages_file" --slurpfile total "$puma_all_file" \
    --arg geoid_key "GEOID${decade}" --arg name_key "NAMELSAD${decade}" '
    .features[]
    | .id = (.properties[$geoid_key] | tonumber)
    | .properties |= {geoid: (.[$geoid_key]), name: (.[$name_key])} + $total[0][(.[$geoid_key])] + $pumas[0][(.[$geoid_key])]
  ' "$puma_out_file" > "$puma_updated_out_file"
done