# Land use

Create the land use data based on the synthetic population generated by `materialize_simulated_population.py`.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gp
import shapely.geometry
import libpysal.weights
import scipy.stats

# Connection URI handed to pd.read_sql / gp.read_postgis for the tract and land use tables
DB_URI = 'postgresql://matthewc@localhost/matthewc'

# Scenario whose ../model_inputs/<SCENARIO>/ directory is read from and written to
SCENARIO = 'npv_low_opcost'

In [None]:
# Synthetic households and persons for the selected scenario
hh = pd.read_csv(f'../model_inputs/{SCENARIO}/households.csv')
per = pd.read_csv(f'../model_inputs/{SCENARIO}/persons.csv')

In [None]:
# Household count and household population per TAZ.
# NOTE(review): only TAZs that appear in `hh` end up in `lu`; a TAZ with no
# households gets no row here - confirm that is intended.
hh_by_taz = hh.groupby('TAZ')
lu = pd.DataFrame({
    'TOTHH': hh_by_taz.size(),
    # TODO groupquarters - just assume constant GQ pop
    'TOTPOP': hh_by_taz.hhsize.sum(),
}).reset_index()
lu.head()

## Add geoids, needed for merging with other data

In [None]:
# Lookup table used below to map a TAZ (via its `idx` column) to a tract geoid
taz_geoids = pd.read_parquet('../la_abm/data/skim_tracts.parquet')

In [None]:
# Attach the geoid for each TAZ, then discard the redundant join key
lu = lu.merge(taz_geoids, how='left', left_on='TAZ', right_on='idx', validate='m:1')
lu = lu.drop(columns=['idx'])

In [None]:
# Preview the table after the geoid merge
lu.head()

## Tract areas

These come from Postgres.

In [None]:
# Land area (aland) per tract geoid; converted from square meters to acres below
total_tract_area = pd.read_sql('SELECT geoid, aland FROM diss.ca_tracts', DB_URI)

In [None]:
# Attach land area by geoid; validate='m:1' requires geoid to be unique in the area table
lu = lu.merge(total_tract_area, on='geoid', how='left', validate='m:1')

In [None]:
# Every row must have found a land area
assert not lu.aland.isnull().any()

In [None]:
# square meters to acres
sq_m_per_acre = 4046.86
lu['TOTACRE'] = lu.aland / sq_m_per_acre

In [None]:
# Per-tract acreage from diss.gp16, split into residential (RESACRE) and
# commercial/industrial (CIACRE) totals. Each membership test is cast to int4 so
# that a parcel contributes its full acreage when its lu16 code is listed and 0
# otherwise. Parcels with Building_PropertyLandUseStndCode = 'VL101' also count as
# residential (presumably vacant residential land - TODO confirm the code's meaning).
res_comm_area = pd.read_sql('''
    SELECT tract, SUM(acres * (lu16 IN (
            '1110', -- Single Family Residential
            '1111', -- High Density Single Family Residential (9 or more DUs/ac)
            '1112', -- Medium Density Single Family Residential (3-8 DUs/ac)
            '1113', -- Low Density Single Family Residential (2 or less DUs/ac)
            '1120', -- Multi-Family Residential
            '1121', -- Mixed Multi-Family Residential
            '1122', -- Duplexes, Triplexes and 2- or 3-Unit Condominiums and Townhouses
            '1123', -- Low-Rise Apartments, Condominiums, and Townhouses
            '1124', -- Medium-Rise Apartments and Condominiums
            '1125', -- High-Rise Apartments and Condominiums
            '1130', -- Mobile Homes and Trailer Parks
            '1131', -- Trailer Parks and Mobile Home Courts, High-Density
            '1132', -- Mobile Home Courts and Subdivisions, Low-Density
            '1140', -- Mixed Residential
            '1100', -- Residential
            '1150', -- Rural Residential
            '1610'  -- Residential-Oriented Residential/Commercial Mixed Use
        ) OR (
            Building_PropertyLandUseStndCode = 'VL101'
        ))::int4) AS "RESACRE",
        
        SUM(acres * (lu16 IN (
            '1210', -- General Office Use
            '1211', -- Low- and Medium-Rise Major Office Use
            '1212', -- High-Rise Major Office Use
            '1213', -- Skyscrapers
            '1200', -- Commercial and Services
            '1220', -- Retail Stores and Commercial Services
            '1221', -- Regional Shopping Center
            '1222', -- Retail Centers (Non-Strip With Contiguous Interconnected Off-Street Parking)
            '1223', -- Retail Strip Development
            '1230', -- Other Commercial
            '1231', -- Commercial Storage
            '1232', -- Commercial Recreation
            '1233', -- Hotels and Motels
            '1300', -- Industrial
            '1310', -- Light Industrial
            '1311', -- Manufacturing, Assembly, and Industrial Services
            '1312', -- Motion Picture and Television Studio Lots
            '1313', -- Packing Houses and Grain Elevators
            '1314', -- Research and Development
            '1320', -- Heavy Industrial
            '1321', -- Manufacturing
            '1322', -- Petroleum Refining and Processing
            '1323', -- Open Storage
            '1324', -- Major Metal Processing
            '1325', -- Chemical Processing
            '1330', -- Extraction
            '1331', -- Mineral Extraction - Other Than Oil and Gas
            '1332', -- Mineral Extraction - Oil and Gas
            '1340', -- Wholesaling and Warehousing
            -- some but not all transport categories
            '1411', -- Airports
            '1415', -- Bus Terminals and Yards
            '1416', -- Truck Terminals
            '1417', -- Harbor Facilities
            '1418', -- Navigation Aids
            '1420', -- Communication Facilities
            '1430', -- Utility Facilities
            '1431', -- Electrical Power Facilities
            '1432', -- Solid Waste Disposal Facilities
            '1433', -- Liquid Waste Disposal Facilities
            '1435', -- Natural Gas and Petroleum Facilities
            '1438', -- Mixed Utilities
            '1440', -- Maintenance Yards
            '1441', -- Bus Yards
            '1442', -- Rail Yards
            '1500', -- Mixed Commercial and Industrial
            '1600', -- Mixed Residential and Commercial
            '1620'  -- Commercial-Oriented Residential/Commercial Mixed Use
        ))::int4) AS "CIACRE"
        FROM diss.gp16
        GROUP BY tract
''', DB_URI)

In [None]:
# Attach residential and commercial/industrial acreage by tract geoid
lu = lu.merge(res_comm_area, left_on='geoid', right_on='tract', how='left', validate='m:1')
# NOTE(review): this selects the zero-land-area rows but discards the result, so it
# has no effect here - looks like a leftover inspection line
lu.loc[lu.aland == 0, ]
# Every tract must have matched an acreage row
assert not lu.RESACRE.isnull().any()
assert not lu.CIACRE.isnull().any()
lu.head()

In [None]:
# Count repeated household ids (the m:1 merge below requires them to be unique)
hh.household_id.duplicated().sum()

In [None]:
# Attach household attributes to each person; validate='m:1' enforces unique household_id in hh.
# NOTE(review): the TAZ grouping below presumably relies on TAZ arriving from hh here -
# confirm persons.csv has no TAZ column of its own (it would be suffixed _x/_y)
per = per.merge(hh, on='household_id', how='left', validate='m:1')

In [None]:
# Number of persons aged 5 through 19 (inclusive) in each TAZ
school_age = per.age.between(5, 19)
age0519 = per[school_age].groupby('TAZ', as_index=False).size()

In [None]:
# Attach the school-age population; TAZs with nobody aged 5-19 come back as NaN
age0519_by_taz = age0519.rename(columns={'size': 'AGE0519'})
lu = lu.merge(age0519_by_taz, on='TAZ', how='left', validate='1:1')

# tiny Census tract with no kids
childless_tract = (lu.geoid == '06037980021') & lu.AGE0519.isnull()
lu.loc[childless_tract, 'AGE0519'] = 0

# any other missing value is unexpected
assert not lu.AGE0519.isnull().any()

## Employment

In [None]:
# Workplace Area Characteristics; geocodes are read as strings so they can be sliced
wac = pd.read_csv('../data/ca_wac_S000_JT00_2017.csv', dtype={'w_geocode': 'str'})
# the tract id is the first 11 characters of the geocode
wac['tract'] = wac.w_geocode.str[:11]
wac.head()

In [None]:
# Sum every remaining WAC count column by tract
trwac = wac.drop(columns=['w_geocode', 'createdate']).groupby('tract').sum()

In [None]:
# Collapse the WAC sector counts into the employment categories of the land use file
trwac['TOTEMP'] = trwac['C000']

# retail
trwac['RETEMPN'] = trwac['CNS07']

# finance and professional services
#   CNS10: finance (NAICS 52)
#   CNS12: professional, scientific, and technical services (NAICS 54)
# TODO other things that could fit under finance/professional are real estate...
trwac['FPSEMPN'] = trwac['CNS10'] + trwac['CNS12']

# health, education, recreation
#   CNS15: Educational Services (NAICS 61)
#   CNS16: NAICS sector 62 (Health Care and Social Assistance)
#   CNS17: NAICS sector 71 (Arts, Entertainment, and Recreation)
trwac['HEREMPN'] = trwac['CNS15'] + trwac['CNS16'] + trwac['CNS17']

# agriculture and natural resources
#   CNS01: NAICS sector 11 (Agriculture, Forestry, Fishing and Hunting)
#   CNS02: NAICS sector 21 (Mining, Quarrying, and Oil and Gas Extraction)
trwac['AGREMPN'] = trwac['CNS01'] + trwac['CNS02']

# manufacturing and wholesale trade
#   CNS05: NAICS sector 31-33 (Manufacturing)
#   CNS06: NAICS sector 42 (Wholesale Trade)
trwac['MWTEMPN'] = trwac['CNS05'] + trwac['CNS06']

# everything not assigned to a category above
trwac['OTHEMPN'] = (
    trwac['TOTEMP']
    - trwac['RETEMPN']
    - trwac['FPSEMPN']
    - trwac['HEREMPN']
    - trwac['AGREMPN']
    - trwac['MWTEMPN']
)

In [None]:
# Attach the employment categories by tract geoid (trwac is indexed by tract)
emp_by_tract = trwac[['TOTEMP', 'RETEMPN', 'FPSEMPN', 'HEREMPN', 'AGREMPN', 'MWTEMPN', 'OTHEMPN']]
lu = lu.merge(emp_by_tract, how='left', left_on='geoid', right_index=True, validate='1:1')

In [None]:
# Some tracts legitimately have no employment, so individual nulls are fine; only
# fail when a column is null everywhere, which would mean the join matched nothing
for emp_col in ['TOTEMP', 'RETEMPN', 'FPSEMPN', 'HEREMPN', 'AGREMPN', 'MWTEMPN', 'OTHEMPN']:
    assert not lu[emp_col].isnull().all()

In [None]:
# Tracts absent from the WAC data have no jobs
emp_cols = ['TOTEMP', 'RETEMPN', 'FPSEMPN', 'HEREMPN', 'AGREMPN', 'MWTEMPN', 'OTHEMPN']
lu[emp_cols] = lu[emp_cols].fillna(0)

In [None]:
# Both parking cost columns are zero everywhere
lu['PRKCST'] = 0
lu['OPRKCST'] = 0

## Account for people living in nonresidential areas

I'm looking at you, Vernon.

In [None]:
# How many rows have households but no residential acreage?
np.sum((lu.TOTHH > 0) & (lu.RESACRE <= 0))

In [None]:
# Unweighted mean of households per residential acre, over rows with residential acreage
has_resacre = lu.RESACRE > 0
mean_hh_dens = np.mean(lu.TOTHH[has_resacre] / lu.RESACRE[has_resacre])

In [None]:
# Where households exist without residential acreage, impute the acreage those
# households would occupy at the mean household density
needs_resacre = (lu.TOTHH > 0) & (lu.RESACRE <= 0)
lu.loc[needs_resacre, 'RESACRE'] = lu.loc[needs_resacre, 'TOTHH'] / mean_hh_dens

## And working in noncommercial areas

In [None]:
# How many rows have jobs but no commercial/industrial acreage?
np.sum((lu.TOTEMP > 0) & (lu.CIACRE <= 0))

In [None]:
# Unweighted mean of jobs per commercial/industrial acre, over rows that have CI
# acreage. This must be based on TOTEMP (jobs), not TOTHH (households): it is used
# to impute CI acreage from a tract's job count.
mean_emp_dens = np.mean((lu.TOTEMP / lu.CIACRE)[lu.CIACRE > 0])
mean_emp_dens

In [None]:
# Where jobs exist without commercial/industrial acreage, impute the acreage those
# jobs would occupy at the mean employment density. This has to run BEFORE the
# epsilon floor below; once CIACRE has been floored, no row satisfies CIACRE <= 0
# and the imputation would silently do nothing.
needs_ciacre = (lu.TOTEMP > 0) & (lu.CIACRE <= 0)
lu.loc[needs_ciacre, 'CIACRE'] = lu.loc[needs_ciacre, 'TOTEMP'] / mean_emp_dens

# doesn't really matter what we set these to, we want density to be zero not 0/0
lu.loc[lu.CIACRE <= 0, 'CIACRE'] = 1e-6
lu.loc[lu.RESACRE <= 0, 'RESACRE'] = 1e-6

## High school enrollment

Use [California Dept of Education school enrollment for 2016-2017](https://www.cde.ca.gov/ds/sd/sd/filesenr.asp) and the California School Campus Database.

In [None]:
# CDE enrollment file (tab-separated); CDS_CODE is kept as a string
# (presumably to preserve leading zeros - TODO confirm)
enr = pd.read_csv('../data/2016_2017_cde_school_enrollment.tsv', sep='\t', dtype={'CDS_CODE': 'str'})

In [None]:
# Sanity check: total enrollment over all rows of the file
enr.ENR_TOTAL.sum()

In [None]:
# High school enrollment per school code: grades 9-12 plus ungraded secondary
grpd = enr.groupby('CDS_CODE')
hs_columns = ['GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC']
hsenr = sum(grpd[col].sum() for col in hs_columns)

# keep only schools that actually enroll high school students
hsenr = hsenr[hsenr > 0].copy()

# carry along the first county / school / district name recorded for each school
school_info = grpd[['COUNTY', 'SCHOOL', 'DISTRICT']].first().reindex(hsenr.index)
hsenr = pd.DataFrame({
    'HSENROLL': hsenr,
    'enr_county': school_info['COUNTY'],
    'enr_school': school_info['SCHOOL'],
    'enr_district': school_info['DISTRICT'],
})
hsenr

In [None]:
# load CSCD school centroids (the 2016 and 2021 releases), reprojected to EPSG:4326
cscd16 = gp.read_file('../data/cscd/School_Centroids.shp').to_crs(epsg=4326)
cscd21 = gp.read_file('../data/cscd/CSCD_2021_School_Centroids.shp').to_crs(epsg=4326)

In [None]:
# Schools opened and closed between the two CSCD releases, so combine them. A school
# present in both is taken from the 2016 release, which is closer in time to the
# rest of the data; only schools missing from 2016 are pulled from 2021.
already_in_2016 = cscd21.CDSCode.isin(cscd16.CDSCode)
new_schools_21 = cscd21[~already_in_2016]
len(new_schools_21)

In [None]:
# Stack the 2016 schools with the 2021-only additions under a fresh index
cscd = pd.concat([cscd16, new_schools_21], ignore_index=True)

In [None]:
# load tracts, casting the geography column to geometry in the query
tracts = gp.read_postgis('SELECT geoid, geog::geometry as geom FROM diss.ca_tracts', DB_URI)

In [None]:
# Tag each school centroid with the tract that contains it; the left join keeps
# schools that fall in no tract (checked by the assert that follows).
# NOTE(review): newer GeoPandas releases rename `op` to `predicate` - confirm
# against the installed version before upgrading
cscd_with_tracts = gp.sjoin(cscd, tracts, how='left', op='within')

In [None]:
# Every school centroid must have landed inside a tract
assert not cscd_with_tracts.geoid.isnull().any()

In [None]:
# merge with enrollment. The right join keeps every school that reports high school
# enrollment, whether or not a campus location was found for it; m:1 because some
# schools have multiple campuses.
cscd_with_tracts = cscd_with_tracts.merge(
    hsenr, how='right', left_on='CDSCode', right_index=True, validate='m:1'
)

In [None]:
# List SCAG-region schools with enrollment but a null County, i.e. no campus row
# joined (assumes County comes from the campus data - TODO confirm)
scag_counties = [
    'Los Angeles',
    'San Bernardino',
    'Imperial',
    'Riverside',
    'Orange',
    'Ventura'
]
unlocated = cscd_with_tracts.County.isnull()
in_scag = cscd_with_tracts.enr_county.isin(scag_counties)
cscd_with_tracts.loc[unlocated & in_scag, ['CDSCode', 'HSENROLL', 'enr_district', 'enr_school', 'enr_county']]

### Still unmatched schools

We're missing ~9300 students across all of the SCAG region, out of 1.9 million high school students. That's probably okay.

In [None]:
# High school enrollment at SCAG-region schools that have a null County
scag_counties = [
    'Los Angeles',
    'San Bernardino',
    'Imperial',
    'Riverside',
    'Orange',
    'Ventura'
]
unlocated = cscd_with_tracts.County.isnull()
in_scag = cscd_with_tracts.enr_county.isin(scag_counties)
cscd_with_tracts.loc[unlocated & in_scag, 'HSENROLL'].sum()

In [None]:
# Total high school enrollment over all SCAG-region rows, for comparison
scag_counties = [
    'Los Angeles',
    'San Bernardino',
    'Imperial',
    'Riverside',
    'Orange',
    'Ventura'
]
in_scag = cscd_with_tracts.enr_county.isin(scag_counties)
cscd_with_tracts.loc[in_scag, 'HSENROLL'].sum()

In [None]:
# Share of the null-County SCAG-region schools whose code appears in the 2021 CSCD
scag_counties = [
    'Los Angeles',
    'San Bernardino',
    'Imperial',
    'Riverside',
    'Orange',
    'Ventura'
]
unlocated = cscd_with_tracts.County.isnull()
in_scag = cscd_with_tracts.enr_county.isin(scag_counties)
unlocated_codes = cscd_with_tracts.loc[unlocated & in_scag, 'CDSCode']
unlocated_codes.isin(cscd21.CDSCode).mean()

### Multisite schools

It's fairly common for charter schools to have multiple locations, but only report aggregate numbers to the CSCD. We just assume each location has the same number of students. Other schools may own multiple properties in the same tract, which is harmless.

In [None]:
# Number of rows (campus locations) that share each school code
cscd_with_tracts['n_campuses'] = cscd_with_tracts.groupby('CDSCode').HSENROLL.transform(len)

In [None]:
# Distribution of campus counts per school
cscd_with_tracts.n_campuses.describe()

In [None]:
# Split each school's enrollment evenly across its campus rows
cscd_with_tracts['HSENROLL'] /= cscd_with_tracts.n_campuses

In [None]:
# Total high school enrollment per tract
tract_hsenroll = cscd_with_tracts.groupby('geoid', as_index=False).HSENROLL.sum()

In [None]:
# Attach enrollment to the land use table; rows without a matching tract get NaN
lu = lu.merge(tract_hsenroll, on='geoid', how='left', validate='1:1')

In [None]:
# No matched school means zero high school enrollment
lu['HSENROLL'] = lu.HSENROLL.fillna(0)

In [None]:
# Guard against a join that matched nothing at all
assert lu.HSENROLL.sum() > 0

In [None]:
# Total high school enrollment assigned to the land use table
lu.HSENROLL.sum()

## College enrollment

From the [IPEDS](https://nces.ed.gov/ipeds/datacenter/DataFiles.aspx?goToReportId=7) Age category/gender 2017 fall enrollment statistics, updated 2019.

In [None]:
# IPEDS fall enrollment by age category
ipeds_enr = pd.read_csv('../data/ipeds/ef2017b.csv')
# EFBAGE, LSTUDY==1: all age/level of study categories sum
is_total_row = (ipeds_enr.EFBAGE == 1) & (ipeds_enr.LSTUDY == 1)
ipeds_enr = ipeds_enr[is_total_row].copy()

# institution directory, joined on UNITID
ipeds_dir = pd.read_csv('../data/ipeds/hd2017.csv', dtype={'COUNTYCD': 'str'})
ipeds = ipeds_enr.merge(ipeds_dir, how='left', on='UNITID', validate='m:1')

# every enrollment row must have found its institution
assert not ipeds.INSTNM.isnull().any()

In [None]:
# Turn each institution's longitude/latitude into a point in EPSG:4326
campus_points = [
    shapely.geometry.Point(lon, lat)
    for lon, lat in zip(ipeds.LONGITUD, ipeds.LATITUDE)
]
ipeds = gp.GeoDataFrame(ipeds, geometry=campus_points, crs='epsg:4326')

In [None]:
# Default (inner) spatial join against the California tracts.
# NOTE(review): newer GeoPandas releases rename `op` to `predicate` - confirm
# against the installed version before upgrading
ipeds = gp.sjoin(ipeds, tracts, op='within')  # will drop non-California schools

In [None]:
# Spot check a single institution's rows
ipeds.loc[ipeds.INSTNM == 'University of Southern California', ['EFBAGE', 'LSTUDY', 'EFAGE05']]

In [None]:
# no need to convert long to wide, this will catch all age categories and all institutions
enrollment_cols = ['geoid', 'EFBAGE', 'LSTUDY', 'EFAGE05', 'EFAGE06']
colenrl = ipeds[enrollment_cols].groupby('geoid').sum()
colenrl = colenrl.rename(columns={'EFAGE05': 'COLLFTE', 'EFAGE06': 'COLLPTE'})
colenrl

In [None]:
# Attach college enrollment by tract geoid (colenrl is indexed by geoid)
lu = lu.merge(colenrl, left_on='geoid', right_index=True, how='left', validate='1:1')

In [None]:
# Tracts with no matched institution have zero college enrollment
for college_col in ('COLLFTE', 'COLLPTE'):
    lu[college_col] = lu[college_col].fillna(0)

In [None]:
# Guard against a join that matched nothing at all
for college_col in ('COLLFTE', 'COLLPTE'):
    assert lu[college_col].sum() > 0

In [None]:
# Total of the COLLFTE column
lu.COLLFTE.sum()

In [None]:
# Total of the COLLPTE column
lu.COLLPTE.sum()

## Area type

This seems to be documented [on the MTC modeling wiki](http://analytics.mtc.ca.gov/foswiki/bin/view/Main/MasterNetworkLookupTables), but the density cutoffs they describe don't match the data in the example model file (and in fact the categories overlap). I'm guessing they smooth the data somehow. I don't know quite how, so I just smooth by replacing every value with the modal value of itself and all its neighbors.

In [None]:
# Density measure: (population + 2.5 * jobs) per residential + commercial/industrial acre
weighted_activity = lu.TOTPOP + 2.5 * lu.TOTEMP
developed_acres = lu.RESACRE + lu.CIACRE
lu['mtcdens'] = weighted_activity / developed_acres

# Bin the density into area type codes: the densest bin gets 0, the sparsest gets 5
density_breaks = [0, 6, 30, 55, 100, 300, 1e6]
area_type_codes = [5, 4, 3, 2, 1, 0]
lu['raw_area_type'] = pd.cut(lu.mtcdens, density_breaks, labels=area_type_codes)

In [None]:
# Attach tract geometries; how='right' keeps exactly the rows of lu
lugeo = tracts.merge(lu, on='geoid', how='right', validate='1:1')

In [None]:
# Queen contiguity weights between tracts, used for the neighbor smoothing below.
# Disconnected components are expected (Catalina etc.)
w = libpysal.weights.Queen.from_dataframe(lugeo)

In [None]:
def _smoothed_area_type(i):
    """Return the most common raw area type among tract ``i`` and its neighbors.

    Uses ``pandas.Series.mode`` rather than ``scipy.stats.mode(...)[0][0]``: newer
    SciPy releases return scalars for 1-D input, so the double index fails there.
    ``Series.mode`` returns the modes in ascending order, so taking the first one
    breaks ties toward the smallest code, as ``scipy.stats.mode`` does. Missing
    values are ignored; NaN is returned only when every value is missing.
    """
    neighborhood = [
        lugeo.loc[i, 'raw_area_type'],
        *lugeo.loc[w.neighbors[i], 'raw_area_type'],
    ]
    modes = pd.Series(neighborhood).mode()
    return modes.iloc[0] if len(modes) else np.nan


# Smooth: replace each tract's area type with the modal value of itself and its neighbors
lugeo['area_type'] = [_smoothed_area_type(i) for i in lugeo.index]

In [None]:
# Map the smoothed area types as a visual sanity check
f, ax = plt.subplots(figsize=(16, 14))
lugeo.plot(column='area_type', ax=ax, legend=True)

In [None]:
# don't put differentiation in this variable, for now anyhow
# (every zone gets the same constant value)
lugeo['TERMINAL'] = 3

## Topography

`tract_topography.gpkg` was calculated by downloading 1/3 arc-second DEMs from the USGS 3DEP (I think; the download link was not recorded here) and making them into a mosaic using `gdalbuildvrt`. Slope was then calculated using `gdaldem slope -p socal_elevation.vrt -s 111120 socal_slope.tif`, and zonal statistics in QGIS were used to get the median slope in each tract, which is cut into topographical bins below. The use of the median and these cutpoints correctly classifies 91% of TAZs in the example MTC dataset.

In [None]:
# Per-tract slope statistics; the `_median` column is binned below
topo = gp.read_file('../data/tract_topography.gpkg')

In [None]:
# sic - called topology in activitysim inputs even though it's actually topography
# documentation seems to be wrong - 3 is flat and 1 is steep.
# Bins are closed on the right: (-inf, 1.68] -> 3, (1.68, 2.77] -> 2, (2.77, inf) -> 1
slope_breaks = [-np.inf, 1.68, 2.77, np.inf]
topo['TOPOLOGY'] = pd.cut(topo._median, slope_breaks, right=True, labels=[3, 2, 1])

In [None]:
# Attach the topography class by tract geoid
lugeo = lugeo.merge(topo[['geoid', 'TOPOLOGY']], on='geoid', how='left', validate='1:1')

In [None]:
# Columns written to the land use file, in output order
output_columns = [
    'TAZ', 'TOTHH', 'TOTPOP', 'TOTACRE',
    'RESACRE', 'CIACRE', 'TOTEMP', 'AGE0519', 'RETEMPN', 'FPSEMPN',
    'HEREMPN', 'OTHEMPN', 'AGREMPN', 'MWTEMPN', 'PRKCST', 'OPRKCST',
    'area_type', 'HSENROLL', 'COLLFTE', 'COLLPTE', 'TERMINAL', 'TOPOLOGY',
]
out_lu = lugeo[output_columns]

# the output must not contain any missing values
assert not out_lu.isnull().any().any()

In [None]:
# Write the land use table into the scenario's model input directory
out_lu.to_csv(f'../model_inputs/{SCENARIO}/land_use.csv', index=False)