# Implementing the BYM2 for Disconnected Graphs

## Notebook Setup

Import all libraries, load the NYC study data.

In [None]:
# import all libraries used in this notebook
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import libpysal as sa
import matplotlib
import splot as splt
from splot.libpysal import plot_spatial_weights 
import plotnine as p9
import arviz as az
%matplotlib inline

from cmdstanpy import CmdStanModel, cmdstan_path, cmdstan_version, write_stan_json

# suppress plotnine warnings
import warnings
warnings.filterwarnings('ignore')

# Install the notebook-wide plotnine default: the grey theme with explicit
# font sizes for body text, plot title, axis titles and axis tick labels.
p9.theme_set(
    p9.theme_grey()
    + p9.theme(
        text=p9.element_text(size=10),
        plot_title=p9.element_text(size=14),
        axis_title_x=p9.element_text(size=12),
        axis_title_y=p9.element_text(size=12),
        axis_text_x=p9.element_text(size=8),
        axis_text_y=p9.element_text(size=8),
    )
)

# Theme add-on for maps: 7x6 figure with axis tick labels and tick marks blanked.
map_theme = p9.theme(
    figure_size=(7, 6),
    axis_text_x=p9.element_blank(),
    axis_ticks_x=p9.element_blank(),
    axis_text_y=p9.element_blank(),
    axis_ticks_y=p9.element_blank(),
)

# Theme add-on that rotates the x-axis tick labels by 90 degrees.
xlabels_90 = p9.theme(axis_text_x=p9.element_text(angle=90, hjust=1))

In [None]:
# Read the study data from data/nyc_study.geojson with geopandas, then show
# the available column names as the cell output.
nyc_geodata = gpd.read_file(os.path.join('data', 'nyc_study.geojson'))
nyc_geodata.columns

In [None]:
# Build libpysal Rook weights from the region geometries, report the graph
# diagnostics used later (component count, islands, neighbor statistics),
# and draw the neighbor graph over the regions.
nyc_nbs = sa.weights.Rook.from_dataframe(nyc_geodata, geom_col='geometry')
print(f'number of components: {nyc_nbs.n_components}')
print(f'islands? {nyc_nbs.islands}')
print(f'max number of neighbors per node: {nyc_nbs.max_neighbors}')
print(f'mean number of neighbors per node: {nyc_nbs.mean_neighbors}')
plot_spatial_weights(nyc_nbs, nyc_geodata)

In [None]:
# Relative path of the Stan program used throughout the rest of the notebook.
bym2_islands_model_file = os.path.join('stan', 'bym2_islands.stan')

In [None]:
# Load the Stan program text; the file handle is only needed for the read,
# so the listing is printed after the handle has been closed.
with open(bym2_islands_model_file, 'r') as file:
    contents = file.read()
print(contents)

## Data Prep

### Get edgeset

- Compute this automatically from `nyc_geodata` spatial geometry component
  + Python package `libpysal`
  + R package `spdep`

In [None]:
# Convert the weights object to an adjacency list; its 'focal' and 'neighbor'
# columns are read in the next cell.
# remove_symmetric=True presumably keeps each undirected edge once — confirm
# against the libpysal docs.
nyc_nbs_adj =  nyc_nbs.to_adjlist(remove_symmetric=True)

In [None]:
# Stack the two adjacency-list columns into a 2 x N_edges array
# (row 0 = focal node, row 1 = neighbor node), incrementing every index by 1.
# Assumes the adjlist ids are 0-based integer positions and that the Stan
# program expects 1-based indices — TODO confirm.
j1 = nyc_nbs_adj['focal'] + 1
j2 = nyc_nbs_adj['neighbor'] + 1
edge_pairs = np.vstack([j1, j2])
# Binds the object returned by nyc_nbs.islands directly; no copy is made here.
singleton_ids = nyc_nbs.islands

In [None]:
# Shift the island (singleton) ids by 1, matching the +1 applied to the edge
# indices. A new list is built from nyc_nbs.islands instead of incrementing
# in place: the in-place version modified the list object returned by
# nyc_nbs.islands and shifted the ids again every time the cell was re-run.
singleton_ids = [island_id + 1 for island_id in nyc_nbs.islands]

### Compute scaling factor `tau`

The scaling factor for a singleton is 1; the scaling factor for each multi-node component is computed as before.

* Compute cardinality of each component

In [None]:
# Component label of every region, followed by the distinct labels and the
# number of regions carrying each one; the pair is shown as the cell output.
comp_ids = nyc_nbs.component_labels
comp_id, counts = np.unique(comp_ids, return_counts=True)
(comp_id, counts)

* Get subset of regions for multi-node components

In [None]:
# Attach each region's component label as a column so regions can be
# filtered by component.
nyc_geodata['comp_id'] = comp_ids

In [None]:
# One scaling factor per component, in the order of `comp_id`; every entry
# starts at 1, which is the value singletons keep.
taus = np.ones(len(counts))
for idx, (label, size) in enumerate(zip(comp_id, counts)):
    if size < 2:
        # Singleton component: scaling factor stays 1, nothing to compute.
        continue
    # Restrict to the regions of this component and rebuild its own weights.
    comp = nyc_geodata[nyc_geodata['comp_id'] == label]
    comp_nbs = sa.weights.Rook.from_dataframe(comp, geom_col='geometry')
    # NOTE(review): get_scaling_factor is not defined in the visible part of
    # this notebook; it must be defined or imported before this cell can run.
    taus[idx] = get_scaling_factor(comp_nbs)


* Compute scaling_factor  (or do this in R)

* Scale each region according to scaling factor for the component they belong to.

In [None]:
# Hard-coded scaling factors, one entry per component label (8 entries).
# NOTE(review): the origin of 0.6 and 0.8 is not shown in this notebook —
# presumably computed externally; confirm they match the current data.
tau = np.array([0.6, 1, 0.8, 1, 1, 1, 1, 1])
# Expand to one value per region by indexing with each region's component
# label. This rebinds `taus`, replacing any value assigned in an earlier cell.
taus = tau[nyc_nbs.component_labels]

#### Assemble the input data 

In [None]:
# Names of the covariate columns, in the order they appear in the design matrix.
design_vars = np.array(['pct_pubtransit','med_hh_inc', 'traffic', 'frag_index'])

# copy=True guarantees an owned, writable array: the in-place log transform
# below can then neither write through to the GeoDataFrame's data nor fail on
# a read-only view (which pandas returns under Copy-on-Write).
design_mat = nyc_geodata[design_vars].to_numpy(copy=True)

# Log-transform columns 1 and 2 ('med_hh_inc' and 'traffic').
design_mat[:, 1:3] = np.log(design_mat[:, 1:3])

# Summary statistics of the transformed design matrix, shown as cell output.
pd.DataFrame(data=design_mat).describe()

In [None]:
# Input data passed to the Stan model. The keys presumably match the names
# declared in the data block of stan/bym2_islands.stan — verify against the file.
bym2_islands_data = {
    "N":nyc_geodata.shape[0],
    "y":nyc_geodata['count'].astype('int'),
    "E":nyc_geodata['kid_pop'].astype('int'),
    # Derived from the design matrix so K cannot drift from the width of xs
    # when the covariate list changes.
    "K":design_mat.shape[1],
    "xs":design_mat,
    # edge_pairs has one column per edge.
    "N_edges": edge_pairs.shape[1],
    "neighbors": edge_pairs,
    "taus": taus,
    "N_singletons" : len(singleton_ids),
    "singletons":singleton_ids
}

## Fitting the BYM2 Model on the NYC data

#### Model is compiled (as needed) on instantiation 

In [None]:
# Instantiate the CmdStanPy model from the Stan program file.
bym2_islands_mod = CmdStanModel(stan_file=bym2_islands_model_file)

#### Run Pathfinder to get initial parameter values

See the CmdStanPy notebook: [Using Variational Estimates to Initialize the NUTS-HMC Sampler](https://mc-stan.org/cmdstanpy/users-guide/examples/VI%20as%20Sampler%20Inits.html#Using-Variational-Estimates-to-Initialize-the-NUTS-HMC-Sampler)


In [None]:
# Run Pathfinder on the model with the assembled data; the result is used to
# derive initial parameter values in the next cell.
bym2_islands_pathfinder = bym2_islands_mod.pathfinder(data=bym2_islands_data)

In [None]:
# Derive initial parameter values from the Pathfinder result.
param_inits = bym2_islands_pathfinder.create_inits()

In [None]:
# Fit with NUTS-HMC, starting from the Pathfinder-derived initial values.
# Previously `param_inits` was computed but never passed to the sampler, so
# this run silently used default inits and the comparison with the
# default-inits fit further down compared like with like.
bym2_islands_fit = bym2_islands_mod.sample(
    data=bym2_islands_data,
    inits=param_inits,
    iter_warmup=5000,
    save_warmup=True,
    output_dir='tmp/bad',
)


In [None]:
# Summary table of the fit, rounded to 2 decimals and restricted to the
# rows for the regression coefficients plus sigma and rho.
bym2_islands_fit.summary().round(2).loc[
  ['beta_intercept', 'beta0', 'betas[1]', 'betas[2]', 'betas[3]', 'betas[4]', 'sigma', 'rho']]

#### Does pathfinder help?

In [None]:
# Comparison fit: same model and data, no `inits` argument (sampler default
# initialization), 2000 warmup iterations.
bym2_islands_fit_default_inits = bym2_islands_mod.sample(
    data=bym2_islands_data, iter_warmup=2000)

In [None]:
# Same summary rows as for the first fit, for side-by-side comparison.
bym2_islands_fit_default_inits.summary().round(2).loc[
  ['beta_intercept', 'beta0', 'betas[1]', 'betas[2]', 'betas[3]', 'betas[4]', 'sigma', 'rho']]

### Visualize fit

In [None]:
# Convert the first fit to ArviZ InferenceData: `y_rep` is registered as the
# posterior predictive variable, `betas` gets a named "covariates" dimension
# labelled with the design variable names, and the observed counts are
# attached as observed data.
idata_bym2_islands = az.from_cmdstanpy(
    bym2_islands_fit,
    posterior_predictive="y_rep",
    dims={"betas": ["covariates"]},
    coords={"covariates": design_vars},
    observed_data={"y": bym2_islands_data['y']}
)
# NOTE(review): this bare expression is not the last one in the cell, so under
# IPython's default settings it is not displayed — confirm whether intended.
idata_bym2_islands

# Posterior predictive check: observed `y` against replicated `y_rep`.
az.plot_ppc(idata_bym2_islands, data_pairs={"y":"y_rep"})