In [92]:
import geopandas as gpd
import pandas as pd
import pickle
from zipfile import ZipFile
import tempfile
from collections import defaultdict
pd.set_option('display.max_columns',None)

### Purpose
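Given the mapping from FAF5 zones to counties and each county's population, disaggregate the FAF5 zone-to-zone truck tonnage into a county-level origin-destination (OD) matrix, allocating tons in proportion to population: (state_orig, county_orig, state_dest, county_dest) => tons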

In [107]:
# Name of the FAF tonnage column that is summed per OD pair and then split across counties.
TARGET_FIELD = 'tons_2022'
# Numerical slack used by the sanity-check assertions in this notebook.
TOLERANCE = 1e-6

In [70]:
# County population table. STATE and COUNTY are read as strings so that codes
# such as '000' keep their leading zeros.
county_pop_dtypes = {'STATE': str, 'COUNTY': str}
county_pop_pdf = pd.read_csv('../data/raw/us_county_pop.csv', dtype=county_pop_dtypes)

In [71]:
# Load the precomputed mapping faf_id -> {(state_id, county_id): pct_county_in_faf}
# (shape inferred from how the object is iterated further down in this notebook).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# this file if it was produced by a trusted step of this project.
with open('../data/transformed/faf_zone_id_to_state_county_id.pickle','rb') as fp:
    faf_id_to_county_id_map = pickle.load(fp)

In [72]:
# (STATE, COUNTY) -> 2022 population estimate.
# Rows with COUNTY == '000' are state totals, not counties, so they are excluded.
county_level_rows = county_pop_pdf.loc[county_pop_pdf['COUNTY'] != '000']
actual_state_county_to_pop_map = {
    (record.STATE, record.COUNTY): record.POPESTIMATE2022
    for record in county_level_rows.itertuples(index=False)
}

In [73]:
# faf_id -> {(state_id, county_id): population the county contributes to that FAF zone},
# i.e. the county population scaled by the county's share recorded for the zone.
faf_id_to_county_id_population_map = defaultdict(dict)

for faf_id, county_shares in faf_id_to_county_id_map.items():
    for county_key, share_in_faf in county_shares.items():
        state_id, county_id = county_key
        county_population = actual_state_county_to_pop_map[(state_id, county_id)]
        faf_id_to_county_id_population_map[faf_id][(state_id, county_id)] = county_population * share_in_faf

In [74]:
# Total (share-weighted) population of every FAF zone.
faf_total_population_map = {
    faf_id: sum(county_populations.values())
    for faf_id, county_populations in faf_id_to_county_id_population_map.items()
}

# Sanity check: allocating counties to FAF zones must conserve the total population.
# The per-zone values are products of populations and shares (which may be fractional),
# so the two sums are compared with a relative tolerance instead of exact float equality.
county_population_total = sum(actual_state_county_to_pop_map.values())
faf_population_total = sum(faf_total_population_map.values())
assert abs(county_population_total - faf_population_total) <= TOLERANCE * max(1.0, abs(county_population_total)), (
    f'population not conserved: counties={county_population_total}, FAF zones={faf_population_total}'
)

In [108]:
# Share of each FAF zone's demand that is assigned to each of its counties.
# The allocation metric here is population: county population inside the zone
# divided by the zone's total population.
faf_id_to_county_percent_map = defaultdict(dict)
for faf_id, county_populations in faf_id_to_county_id_population_map.items():
    for county_key, county_population in county_populations.items():
        share_of_zone = county_population / faf_total_population_map[faf_id]
        faf_id_to_county_percent_map[faf_id][county_key] = share_of_zone

# every FAF zone must be fully allocated: its county shares add up to 1 (within TOLERANCE)
assert all(
    abs(sum(county_shares.values()) - 1) < TOLERANCE
    for county_shares in faf_id_to_county_percent_map.values()
)

In [35]:
# Read the FAF5 demand table directly from the zip archive; the member is streamed
# from the archive, so nothing needs to be extracted to disk.
# dms_orig/dms_dest are parsed as strings (presumably to match the FAF zone ids used
# as keys of the zone-to-county mapping; TODO confirm).
target_path = '../data/raw/faf5_demand.zip'
with ZipFile(target_path) as faf_archive:
    target_csvs = [member for member in faf_archive.namelist() if member.endswith('csv')]
    if not target_csvs:
        raise FileNotFoundError(f'no csv file found inside {target_path}')
    # only the first csv in the archive is loaded
    with faf_archive.open(target_csvs[0]) as faf_csv:
        faf_pdf = pd.read_csv(faf_csv, dtype={'dms_orig': str, 'dms_dest': str})

In [111]:
# Reference only (not executed): meaning of the codes in the `dms_mode` column
# faf_mode_map = {
#     1: 'Truck',
#     2: 'Rail',
#     3: 'Water',
#     4: 'Air (include truck-air)',
#     5: 'Multiple modes & mail',
#     6: 'Pipeline',
#     7: 'Other and unknown',
#     8: 'No domestic mode',
# }

In [100]:
# Keep only the records moved by truck (dms_mode code 1), then total the
# TARGET_FIELD tons for every FAF origin-destination pair.
is_by_truck = faf_pdf['dms_mode'].eq(1)
faf_truck_pdf = faf_pdf[is_by_truck]
od_key_columns = ['dms_orig', 'dms_dest']
total_road_tons_od_pdf = faf_truck_pdf.groupby(od_key_columns, as_index=False)[[TARGET_FIELD]].sum()

In [110]:
%%time
# create a (state_orig, county_orig, state_dest, county_dest) => tons map
# Every FAF zone-to-zone flow is split over all (origin county, destination county)
# pairs in proportion to the product of the two counties' shares of their zones.
county_od = defaultdict(float)
unmapped_faf_ids = set()  # FAF ids seen in the demand table that have no county shares
for row in total_road_tons_od_pdf.itertuples(index=False):
    # .get() rather than [] so an unknown zone id does not insert an empty entry into
    # the defaultdict; such ids are collected and reported by the check below.
    constituent_orig_counties_map = faf_id_to_county_percent_map.get(row.dms_orig, {})
    constituent_dest_counties_map = faf_id_to_county_percent_map.get(row.dms_dest, {})
    if not constituent_orig_counties_map:
        unmapped_faf_ids.add(row.dms_orig)
    if not constituent_dest_counties_map:
        unmapped_faf_ids.add(row.dms_dest)

    od_tons = getattr(row, TARGET_FIELD)
    for (state_orig, county_orig), pct_in_county_orig in constituent_orig_counties_map.items():
        # hoisted out of the inner loop; same left-to-right product as tons*pct_orig*pct_dest
        tons_from_orig_county = od_tons * pct_in_county_orig
        for (state_dest, county_dest), pct_in_county_dest in constituent_dest_counties_map.items():
            county_od[(state_orig, county_orig, state_dest, county_dest)] += tons_from_orig_county * pct_in_county_dest

# Conservation check: the county-level tons must add up to the FAF-level tons.
# The tolerance is relative to the total so the check does not depend on the
# magnitude of the tonnage or on the number of terms being summed.
total_faf_tons = total_road_tons_od_pdf[TARGET_FIELD].sum()
total_county_tons = sum(county_od.values())
assert abs(total_faf_tons - total_county_tons) <= TOLERANCE * max(1.0, abs(total_faf_tons)), (
    f'tons not conserved: FAF total={total_faf_tons}, county total={total_county_tons}, '
    f'FAF ids without county shares={unmapped_faf_ids}'
)

CPU times: total: 8.44 s
Wall time: 8.45 s


In [120]:
# Number of distinct (state_orig, county_orig, state_dest, county_dest) keys in the OD map.
len(county_od)

9835245

In [116]:
# Flatten {(state_orig, county_orig, state_dest, county_dest): tons} into 5-field rows.
county_od_tuples = [od_key + (tons,) for od_key, tons in county_od.items()]

In [118]:
# Tabular form of the county OD matrix: one row per origin/destination county pair.
county_od_columns = ['state_orig', 'county_orig', 'state_dest', 'county_dest', 'tons']
county_od_pdf = pd.DataFrame(data=county_od_tuples, columns=county_od_columns)

In [119]:
# Persist the county OD table; parquet is used to keep the file size down.
county_od_output_path = '../data/transformed/county_od.parquet'
county_od_pdf.to_parquet(county_od_output_path)