Minimum requirements:
 * cores
 * memory

# Data preprocessing

In [171]:
import os
import subprocess
import sys
import time
import pathlib
import json
import pandas as pd

from IPython.display import Image

# Directory passed to the GRASS session as its database, and the directory
# with the input data, which becomes the working directory of the notebook
# (relative file names used in later cells resolve against it).
grassdata = pathlib.Path("/data/grassdata")
inputs = pathlib.Path("/data/FUTURES/nlcd")
os.chdir(inputs)

In [None]:
# The GRASS Python packages are not on the default import path, so ask the
# grass8 launcher where they live and make that directory importable.
grass_python_path = subprocess.check_output(
    ["grass8", "--config", "python_path"], text=True
).strip()
sys.path.append(grass_python_path)

# These imports only resolve after the path above has been appended.
import grass.script as gs
import grass.jupyter as gj

# Start the GRASS session (database, "FUTURES", "PERMANENT").
session = gj.init(grassdata, "FUTURES", "PERMANENT")

In [None]:
# Install the GRASS addons that the following cells call.
for addon in ("r.futures", "r.mapcalc.tiled", "r.sample.category"):
    gs.run_command("g.extension", extension=addon)

## Preprocessing

The dataset contains the following raster layers:

* nlcd_2001
* nlcd_2004
* nlcd_2006
* nlcd_2008
* nlcd_2011
* nlcd_2013
* nlcd_2016
* nlcd_2019
* DEM

And vector layers:
* tl_2021_us_county
* protected

### Process county boundaries

In [None]:
%%bash
# Keep only the counties whose STATEFP is one of the six listed codes.
v.extract input=tl_2021_us_county output=SE_counties where="STATEFP in ('01', '12', '13', '37', '45', '47')" --quiet

# Add integer copies of the state and county identifiers.
v.db.addcolumn map=SE_counties columns="state integer" --quiet
v.db.addcolumn map=SE_counties columns="county integer" --quiet
v.db.update map=SE_counties column=state query_column="CAST(STATEFP AS integer)" --quiet
v.db.update map=SE_counties column=county query_column="CAST(GEOID AS integer)" --quiet

# Dissolve county boundaries on the state code to get SE_states.
v.dissolve input=SE_counties column=state output=SE_states --quiet

In [None]:
from grass.pygrass.modules.grid import GridModule

# Rasterize SE_states with GridModule: the current region is cut into
# full-width horizontal tiles, one per process.
nprocs = 8
region = gs.region()
width = region["cols"]
# +1 so that nprocs tiles of this height are enough to cover every row.
height = int(region["rows"] / nprocs) + 1
grid = GridModule(
    "v.to.rast",
    input="SE_states",
    output="SE_states",
    type="area",
    use="cat",
    width=width,
    height=height,
    processes=nprocs,
)
grid.run()

Split states for further parallelization:

In [None]:
# Integer state codes stored in the "state" column of SE_counties.
states = [1, 12, 13, 37, 45, 47]

# For every state: extract its counties, set the region to the state's extent
# (aligned to the nlcd_2019 grid), and rasterize the counties using the
# "county" attribute as the cell value.
gs.use_temp_region()
try:
    for state in states:
        state_map = f"state_{state}"
        # "state" is an integer column, so compare against an unquoted number
        # with the standard SQL "=" operator.
        gs.run_command("v.extract", input="SE_counties", where=f"state = {state}", output=state_map)
        gs.run_command("g.region", vector=state_map, align="nlcd_2019")
        gs.run_command("v.to.rast", input=state_map, output=state_map, use="attr", attribute_column="county")
finally:
    # Always restore the original region, even when a command above fails.
    gs.del_temp_region()

### Protected land

In [None]:
# Rasterize the protected areas with a single v.to.rast call and show how
# many seconds it took (the last expression is the cell output).
start = time.time()
gs.run_command(
    "v.to.rast",
    input="protected",
    output="protected",
    type="area",
    use="val",
)
time.time() - start

In [None]:
# Same rasterization as a tiled GridModule run, timed for comparison: the
# region is cut into full-width horizontal tiles, one per process.
start = time.time()

nprocs = 8
region = gs.region()
width = region["cols"]
height = int(region["rows"] / nprocs) + 1
grid = GridModule(
    "v.to.rast",
    input="protected",
    output="protected",
    type="area",
    use="val",
    width=width,
    height=height,
    processes=nprocs,
)
grid.run()
time.time() - start

In [None]:
# masking = 1 where a cell is null in both "protected" and "water" and
# nlcd_2019 is not 0; null everywhere else.
# NOTE(review): the "water" raster is created by the r.reclass calls in the
# "Process NLCD data" cell further down - confirm that cell runs first.
masking_expression = "masking = if((isnull(protected) &&  isnull(water) && (nlcd_2019 != 0)), 1, null())"
gs.run_command(
    "r.mapcalc.tiled",
    expression=masking_expression,
    width=width,
    height=height,
    nprocs=nprocs,
)

### Process NLCD data

In [None]:
NLCD_years = [2001, 2004, 2006, 2008, 2011, 2013, 2016, 2019]
NLCD_descriptor_years = [2001, 2019]

# Single-class maps derived from the 2019 layers: (input, output, rule).
single_class_rules = [
    ("nlcd_2019", "water", "11 = 1"),
    ("nlcd_2019", "forest", "41 42 43 = 1"),
    ("nlcd_descriptor_2019", "roads", "20 21 22 23 = 1"),
]
for source, target, rule in single_class_rules:
    gs.write_command("r.reclass", input=source, output=target, rules="-", stdin=rule)

# Binary (1/0) urban map for every NLCD year.
urban_rules = "20 21 22 23 = 1\n* = 0"
for year in NLCD_years:
    gs.write_command("r.reclass", input=f"nlcd_{year}", output=f"urban_{year}", rules="-", stdin=urban_rules)

# Binary (1/0) urban_no_roads map from the descriptor layers.
urban_no_roads_rules = "24 25 26 = 1\n* = 0"
for year in NLCD_descriptor_years:
    gs.write_command(
        "r.reclass",
        input=f"nlcd_descriptor_{year}",
        output=f"urban_no_roads_{year}",
        rules="-",
        stdin=urban_no_roads_rules,
    )

In Bash, use background processes (`&`) to compute the distances to water, forest, and roads in parallel, since the three computations are independent of each other:

In [None]:
%%bash
# Start one background r.grow.distance job per feature, then wait for all.
for FEATURE in water forest roads
do
    r.grow.distance input=${FEATURE} distance=dist_to_${FEATURE} -m &
done
wait

We can use r.mapcalc.tiled to split computation into tiles:

In [None]:
# Tile layout: full-width horizontal tiles, one per process.
nprocs = 8
region = gs.region()
width = region["cols"]
height = int(region["rows"] / nprocs) + 1

# log(distance + 1) for each of the three distance maps.
for feature in ("water", "forest", "roads"):
    gs.run_command(
        "r.mapcalc.tiled",
        expression=f"log_dist_to_{feature} = log(dist_to_{feature} + 1)",
        width=width,
        height=height,
        nprocs=nprocs,
    )

### Development pressure

In [None]:
# Development pressure for the first and last year, with identical settings.
for year in (2001, 2019):
    gs.run_command(
        "r.futures.devpressure",
        input=f"urban_no_roads_{year}",
        output=f"devpressure_{year}",
        size=10,
        gamma=1.5,
        nprocs=8,
    )

### Process DEM

In [None]:
# Derive the "slope" raster from the DEM.
gs.run_command(
    "r.slope.aspect",
    elevation="DEM",
    slope="slope",
    flags="e",
    nprocs=8,
)

### Demand

In [None]:
%%writefile demand_for_state.py
import sys
import grass.script as gs

# Usage: python demand_for_state.py <state code>
state = sys.argv[1]
subregions = f"state_{state}"
development_years = [2001, 2004, 2006, 2008, 2011, 2013, 2016, 2019]

# Work on the state's extent and mask out cells where "roads" is not null.
gs.run_command("g.region", raster=subregions)
gs.mapcalc("MASK = if (isnull(roads), 1, null())")

# Writes demand_<state>.csv and demand_<state>.png.
gs.run_command(
    "r.futures.demand",
    subregions=subregions,
    development=[f"urban_{year}" for year in development_years],
    observed_population="observed_population_SE_counties_NLCD_2001-2019.csv",
    projected_population="Hauer_2020_2100_SE_counties_SSP2_projections_demand.csv",
    simulation_times=list(range(2019, 2101)),
    method="logarithmic",
    demand=f"demand_{state}.csv",
    plot=f"demand_{state}.png",
    overwrite=True,
)

In [None]:
%%bash
# Build a job list with one GRASS command per state (each in a temporary
# mapset), print it, and run the jobs six at a time.
STATES="1 12 13 37 45 47"
rm -f demand_jobs.sh
for S in ${STATES}
do
    echo "grass8 --tmp-mapset /data/grassdata/FUTURES --exec python demand_for_state.py ${S}" >> demand_jobs.sh
done
cat demand_jobs.sh
parallel -j 6 < demand_jobs.sh 2> log.txt

In [None]:
# Display the demand plot written for state 37 by demand_for_state.py.
Image("demand_37.png")

### Sampling

In [None]:
# Response variable for the potential model: 1 where a cell was not urban in
# 2001 but is urban in 2019 (newly developed), 0 otherwise. The condition was
# previously inverted (urban in 2001, not urban in 2019).
# NOTE(review): cells already urban in 2001 also end up in class 0 - confirm
# whether they should be set to null so they are not sampled.
gs.run_command(
    "r.mapcalc.tiled",
    expression="urban_change = if(urban_2019 == 1 && urban_2001 == 0, 1, 0)",
    width=width,
    height=height,
    nprocs=nprocs,
)

In [None]:
# Combine the per-state county rasters into one virtual raster "SE_counties".
state_rasters = [f"state_{s}" for s in states]
gs.run_command("r.buildvrt", input=state_rasters, output="SE_counties")

In [None]:
%%writefile sampling_for_state.py
import sys
import grass.script as gs

# Usage: python sampling_for_state.py <state code>
state = sys.argv[1]
state_raster = f"state_{state}"
sampled_maps = [
    "counties",
    "slope",
    "devpressure_2001",
    "log_dist_to_forest",
    "log_dist_to_water",
    "log_dist_to_roads",
]

# Restrict both the region and the mask to this state.
gs.run_command("g.region", raster=state_raster)
gs.run_command("r.mask", raster=state_raster)

# An identity reclass gives the county raster the same name ("counties") in
# every state, which later simplifies patching of the attribute tables.
gs.write_command("r.reclass", input=state_raster, output="counties", rules="-", stdin="* = *")

gs.run_command(
    "r.sample.category",
    input="urban_change",
    output=f"sample_{state}",
    sampled=sampled_maps,
    npoints=[10000, 5000],
    random_seed=1,
    overwrite=True,
)

# Remove the mask again.
gs.run_command("r.mask", flags="r")

In [None]:
%%bash
# One sampling job per state, each in a freshly created sampling_<state>
# mapset; run the jobs six at a time.
STATES="1 12 13 37 45 47"
rm -f sampling_jobs.sh
for S in ${STATES}
do
    MAPSET=/data/grassdata/FUTURES/sampling_${S}
    rm -rf ${MAPSET}
    echo "grass8 -c ${MAPSET} --exec python sampling_for_state.py ${S}" >> sampling_jobs.sh
done
parallel -j 6 < sampling_jobs.sh 2> log.txt

In [None]:
# Merge the per-state sample points (one map per sampling_<state> mapset)
# into a single vector map "samples".
sample_maps = [f"sample_{state}@sampling_{state}" for state in states]
gs.run_command("v.patch", input=sample_maps, output="samples", flags="e")

### Potential

In [None]:
# Fit the potential model on the sampled points. Writes best_model.csv and
# all_models.csv (the dredge output).
predictor_columns = [
    "slope",
    "devpressure_2001",
    "log_dist_to_forest",
    "log_dist_to_water",
    "log_dist_to_roads",
]
gs.run_command(
    "r.futures.potential",
    input="samples",
    output="best_model.csv",
    columns=predictor_columns,
    developed_column="urban_change",
    subregions_column="counties",
    random_column="devpressure_2001",
    min_variables=3,
    nprocs=8,
    flags="d",
    dredge_output="all_models.csv",
)

In [None]:
# Show the dredge output written by r.futures.potential (first column as index).
pd.read_csv("all_models.csv", index_col=0)

In [None]:
# Show the model file written by r.futures.potential (first column as index).
pd.read_csv("best_model.csv", index_col=0)

### Calibration

In [None]:
%%writefile calibration_for_state.py
import sys
import grass.script as gs

# Usage: python calibration_for_state.py <state code>
state = sys.argv[1]
subregions = f"state_{state}"

# Calibrate on the state's extent; writes patch_sizes_<state>.csv.
gs.run_command("g.region", raster=subregions)
gs.run_command(
    "r.futures.calib",
    flags="sl",
    development_start="urban_2001",
    development_end="urban_2019",
    subregions=subregions,
    patch_threshold=1800,
    patch_sizes=f"patch_sizes_{state}.csv",
    nprocs=1,
)

In [None]:
# Print the script written by the previous cell to check its content.
! cat calibration_for_state.py

In [170]:
%%bash
# One calibration job per state, each in a temporary mapset; run the jobs
# six at a time and report the total wall time.
STATES="1 12 13 37 45 47"
rm -f calibration_jobs.sh
for S in ${STATES}
do
    echo "grass8 --tmp-mapset /data/grassdata/FUTURES/ --exec python calibration_for_state.py ${S}" >> calibration_jobs.sh
done
time parallel -j 6 < calibration_jobs.sh 2> log.txt


real	30m3.346s
user	108m47.952s
sys	1m30.163s


### FUTURES PGA

In [None]:
%%writefile simulation_for_state.py
import sys
import grass.script as gs

# Usage: python simulation_for_state.py <state code> <random seed>
state, seed = sys.argv[1:3]
subregions = f"state_{state}"

# Options for r.futures.pga; input and output names depend on state and seed.
pga_options = dict(
    developed="urban_2019",
    development_pressure="devpressure_2019",
    compactness_mean=0.5,
    compactness_range=0.1,
    discount_factor=1,
    predictors=["log_dist_to_forest", "log_dist_to_roads", "log_dist_to_water", "slope"],
    n_dev_neighbourhood=10,
    devpot_params="best_model.csv",
    num_neighbors=4,
    seed_search="probability",
    development_pressure_approach="gravity",
    gamma=1.5,
    scaling_factor=1,
    subregions=subregions,
    demand=f"demand_{state}.csv",
    output=f"out_state_{state}_seed_{seed}",
    patch_sizes=f"patch_sizes_{state}.csv",
    memory=12,
    random_seed=seed,
)

# Simulate on the state's extent with the "masking" raster as mask, then
# remove the mask again.
gs.run_command("g.region", raster=subregions)
gs.run_command("r.mask", raster="masking")
gs.run_command("r.futures.pga", **pga_options)
gs.run_command("r.mask", flags="r")

In [None]:
# Trial run of one simulation (state 1, seed 1) in a fresh pga_1 mapset.
# The output is shown in the notebook; it was previously appended to
# pga_jobs.sh, which is meant to hold only the job list.
!rm -rf /data/grassdata/FUTURES/pga_1
!grass8 -c /data/grassdata/FUTURES/pga_1 --exec python simulation_for_state.py 1 1

In [None]:
%%bash
# One simulation job per (seed, state) pair, each in a freshly created
# pga_<state>_<seed> mapset; run the jobs six at a time.
STATES="1 12 13 37 45 47"
rm -f pga_jobs.sh
for SEED in $(seq 1 10)
do
    for STATE in ${STATES}
    do
        MAPSET=/data/grassdata/FUTURES/pga_${STATE}_${SEED}
        rm -rf ${MAPSET}
        echo "grass8 -c ${MAPSET} --exec python simulation_for_state.py ${STATE} ${SEED}" >> pga_jobs.sh
    done
done
time parallel -j 6 < pga_jobs.sh 2> log.txt

In [None]:
%%bash
# Remove the job list from a previous run. This used to delete
# "patch_jobs.h", so patch_jobs.sh kept growing on every re-run.
rm -f patch_jobs.sh
for SEED in {1..10}
do
    # Comma-separated, fully qualified names of all state outputs for this seed.
    # Uses grass8, the launcher used everywhere else in this notebook.
    MAPS=$(grass8 --tmp-mapset /data/grassdata/FUTURES --exec \
           g.list type=raster pattern="out_state_*_seed_${SEED}" mapset="*" -m separator=comma)
    # Patch the state outputs into one raster in a fresh results_<seed> mapset.
    rm -rf /data/grassdata/FUTURES/results_${SEED}
    echo grass8 -c /data/grassdata/FUTURES/results_${SEED} --exec r.patch input=${MAPS} output="out_seed_${SEED}" nprocs=4 >> patch_jobs.sh
done
time parallel -j 5 < patch_jobs.sh 2> log.txt

In [None]:
%%writefile reclass.txt
-1 0 = 0
1 thru 81 = 1
* = 0

In [None]:
# Seeds of the stochastic runs patched into the results_<seed> mapsets.
seeds = range(1, 11)

# Reclassify every run with reclass.txt. The loop variable must be `seed`
# because the map names are built from it (it used to be `i`, which left
# `seed` undefined and raised NameError).
for seed in seeds:
    gs.run_command(
        "r.reclass",
        input=f"out_seed_{seed}@results_{seed}",
        output=f"out_seed_{seed}",
        rules="reclass.txt",
    )

# Per-cell sum over all runs, then divide by the number of runs.
gs.run_command("r.series", input=[f"out_seed_{seed}" for seed in seeds], output="sum", method="sum")
gs.run_command(
    "r.mapcalc.tiled",
    expression=f"probability = float(sum) / {len(seeds)}",
    width=width,
    height=height,
    nprocs=nprocs,
)


### Validation

In [None]:
import os
import sys
import json
from multiprocessing import Pool


def compute(params):
    """Run r.futures.validation for one region and return its parsed output.

    Args:
        params: Mapping of keyword arguments for ``r.futures.validation``
            plus a ``"region"`` entry, itself a mapping of keyword arguments
            for ``gs.region_env``. The mapping is not modified.

    Returns:
        The decoded JSON output of the module, with two added keys: ``"n"``
        (midpoint of the region's north and south bounds) and ``"e"``
        (midpoint of its east and west bounds).

    Raises:
        KeyError: If ``params`` has no ``"region"`` entry.
    """
    # Work on a copy so the caller's dict keeps its "region" entry and can be
    # reused for another call.
    module_args = dict(params)
    region = module_args.pop("region")

    # The region is passed through the environment of this call only.
    env = os.environ.copy()
    env["GRASS_REGION"] = gs.region_env(**region)

    output = gs.read_command(
        "r.futures.validation", format="json", env=env, quiet=True, **module_args
    )
    results = json.loads(output)

    # Tag the results with the centre of the region they were computed for.
    reg = gs.region(env=env)
    results["n"] = (reg["n"] + reg["s"]) / 2
    results["e"] = (reg["e"] + reg["w"]) / 2
    return results


# Evaluate compute() for every entry of `params` in a pool of nprocs worker
# processes; results are returned in input order.
# NOTE(review): `params` is not defined in the visible part of the notebook -
# presumably an iterable of dicts, each with a "region" entry; confirm.
with Pool(processes=nprocs) as pool:
    results = pool.map(compute, params)