In [19]:
import os

# Root of the repository checkout, supplied through the REPO_DIR environment variable.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    # Fail early with a readable message instead of a TypeError inside os.path.join.
    raise EnvironmentError(
        "The REPO_DIR environment variable must be set to the repository root."
    )
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")

# Work from the code directory (presumably so that the local `prediction_utils`
# import further down resolves -- confirm).
os.chdir(code_dir)

import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import pickle
import sklearn 
import sys
import pandas as pd
from importlib import reload

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import seaborn as sns

from scipy.stats import spearmanr, mode

import geopandas as gpd
import rasterio
import zarr

import gc

import warnings

from mosaiks.utils.imports import *

from mosaiks.utils.io import weighted_groupby
from affine import Affine

import cartopy.crs as ccrs
# Key prediction functions are here
from prediction_utils import (X_matrix_to_demeaned_X,df_to_demeaned_y_vars, flatten_raster,rasterize_df,
make_train_pred_scatterplot as make_scatterplot, cv_solve, solver_kwargs, get_truth_preds_from_kfold_results,
                             predict_y_from_kfold_dict, generalized_demean)

# Predicting grid level HDI

In this notebook, we generate HDI at the grid level. This notebook is here for reference, but executing it would require the MOSAIKS features at the native tile level (.01 x .01 degrees) for the globe (3TB). We cannot make data of this size easily downloadable.

### First, we need nightlight values at the .01 x .01 degree (tile) level

This file is created in `code/analysis/NL_feature_creation_and_other_NL_processing/nighlight_features_for_hdi_labels_DMSP.ipynb`.

In [3]:
# Location of the DMSP nightlights GeoTIFF read in the next cell.
path = (
    data_dir
    + "int/DMSP_NL/"
    + "F182013.v4c_web.stable_lights.avg_vis_converted_to_np-float.tif"
)

In [4]:
# Read band 1 of the nightlights raster and flatten it into one row per pixel.
# The context manager closes the dataset as soon as the pixels are in memory.
with rasterio.open(path) as src:
    arr = src.read(1)
    x, y, vals = flatten_raster(arr, src.transform)

nl = pd.DataFrame({"lon": x, "lat": y, "nl": vals})

In [None]:
nl.head()

In [None]:
## Snap the ~.00833 degree pixel coordinates onto the .01 degree tile grid.
## Each coordinate is rounded to 2 decimals after a half-cell shift, then shifted
## back, so the result always ends in ...5 (the centre of a .01 degree cell).
## NOTE(review): lon is shifted by -.005/+.005 while lat uses +.005/-.005; the two
## forms only differ for values sitting exactly on a cell edge. The GHS-POP
## snapping later in this notebook uses +.005/-.005 for both axes -- confirm the
## asymmetry here is intended.
nl["lon"] = np.round(np.round(nl["lon"] - .005,2) + .005,3)
nl["lat"] = np.round(np.round(nl["lat"] + .005,2) - .005,3)

In [None]:
nl.head()

In [None]:
# Average all pixels that were snapped onto the same (lon, lat) tile.
nl = nl.groupby(["lon", "lat"], as_index=False).mean()

In [None]:
# Sanity check: coordinate bounds of the snapped nightlights grid.
lon_lo, lon_hi = nl["lon"].min(), nl["lon"].max()
lat_lo, lat_hi = nl["lat"].min(), nl["lat"].max()
print(lon_lo, lon_hi, lat_lo, lat_hi)

In [None]:
# Histogram edges for the nightlight features: a leading 0 followed by 20 evenly
# spaced edges on [0, 63]. The first two edges are both 0, so the first of the
# 20 bins has zero width.
# NOTE(review): presumably this mirrors the binning used when the training
# nightlight features were built -- confirm before changing.
bins = np.concatenate(([0], np.linspace(0.0, 63, 20)))

In [None]:
def binned(a, bin_edges=None):
    """Count how many values of ``a`` fall into each histogram bin.

    Parameters
    ----------
    a : scalar or array-like
        Value(s) to histogram. A scalar yields a one-hot style count vector.
    bin_edges : array-like, optional
        Monotonically non-decreasing bin edges. Defaults to the notebook-level
        ``bins`` array, so existing ``binned(a)`` calls behave as before.

    Returns
    -------
    numpy.ndarray
        Integer counts per bin (``len(bin_edges) - 1`` entries). These are raw
        counts, not percentages, because ``density=False``.
    """
    if bin_edges is None:
        bin_edges = bins
    counts, _ = np.histogram(a, bins=bin_edges, density=False, weights=None)
    return counts

### Now we also need population weights
This population weights file is created in the `population_weights_for_features_and_grid_preds.ipynb` notebook.

In [6]:
# Population weights on the dense grid; merged onto the tiles by (lon, lat) below.
pop = pd.read_pickle(
    data_dir
    + "int/GPW_pop/"
    + "/population_density_global_dense_grid.p"
)

### We need the country dense grid tiles for each country

In [None]:
# Tile coordinates used for featurization.
df = pd.read_pickle(
    data_dir + "features/prepared_labels/GDL_HDI_polygon_coords_for_featurization.p"
)

# Sub-national HDI labels, kept as a single-column frame for the merge on GDLCODE.
hdi = pd.read_pickle(
    data_dir + "int/GDL_HDI/HDI_indicators_and_indices_clean.p"
).loc[:, ["Sub-national HDI"]]

In [None]:
# Attach, in order: the HDI label (by GDLCODE), the nightlight value and the
# population weight (both by tile coordinate). All joins are left joins, so
# every tile in `df` is kept. The frame ends up indexed by (lon, lat).
df = (
    df.merge(hdi, how="left", left_on="GDLCODE", right_index=True)
    .set_index(["lon", "lat"])
    .merge(nl, how="left", left_index=True, right_on=["lon", "lat"])
    .merge(pop, how="left", on=["lon", "lat"])
    .set_index(["lon", "lat"])
)

In [None]:
# Preview only the first rows rather than rendering the whole tile table.
df.head()

### We also need the country level features that we used to demean in model training

In [None]:
# ADM1-level MOSAIKS features (the GDLCODE column is not a feature, so drop it).
adm1_X = pd.read_pickle(
    data_dir
    + "features/mosaiks_features/"
    + "GDL_ADM1_polygon_X_creation_pop_weight=True.p"
)
adm1_X = adm1_X.drop(columns="GDLCODE")

# ADM1-level nightlight histogram features, aligned to the rows of adm1_X.
filepath = (
    data_dir
    + "features/nl_features/GDL_HDI_polygons/"
    + "dmsp_nightlight_features_for_hdi_polygons_20_bins_GPW_pop_weighted.p"
)
adm1_nl = pd.read_pickle(filepath)
adm1_nl = adm1_nl.loc[adm1_X.index]

In [None]:
# Vectors subtracted from the tile-level features before prediction; both frames
# are looked up per country with `.loc[country]` in the prediction loop below.
# NOTE(review): presumably return_mean_frame=True makes the helper return the
# per-country feature means rather than the demeaned matrix -- confirm in
# prediction_utils.
X_demean_vectors = X_matrix_to_demeaned_X(adm1_X, return_mean_frame=True )
X_demean_vectors_nl = X_matrix_to_demeaned_X(adm1_nl, return_mean_frame=True )

In [None]:
X_demean_vectors_nl

### And finally, we need the primary model used for preds

In [None]:
path = (data_dir + "/model_data/" +
           "within_country_rcf_and_nl_demeaned_solve_all_outcomes_country_fold"
           "_DENSE_pop_weight=True_dmsp_hist_bins_GPW_pop_weighted.pkl")

# NOTE: unpickling executes arbitrary code, so only load model files you trust.
# The context manager makes sure the file handle is closed after loading.
with open(path, "rb") as model_file:
    nl_and_rcf_demeaned_kfold_dict = pickle.load(model_file)

### Now we cycle through all the dense grid chunks and produce predictions for those associated with a country

The source files in `z_directory` and the intermediate output files in `int_dir` are not publicly available. The source MOSAIKS features at the 0.01 x 0.01 degree level total about 3 TB, which makes them impractical to share.

In [None]:
# This data directory is outside the GitHub repo and is not publicly accessible.
# It is scanned for `.zarr` feature chunks in the prediction loop.
z_directory = "/shares/maps100/data/features/global_dense_grid/complete/concat/replace_2022/"
# Per-chunk prediction pickles are cached here so finished chunks are not recomputed.
int_dir = data_dir + "/preds/int_grid/"

In [None]:
# Predict raw HDI for every dense-grid feature chunk that overlaps a labelled
# country tile. Each finished chunk is cached as a pickle in `int_dir`, so an
# interrupted run can resume without recomputing finished chunks.

# The cache directory must exist before the first `to_pickle` call.
os.makedirs(int_dir, exist_ok=True)

# Labels for the 4000 feature columns; identical for every chunk, so built once.
rcf_cols = "X_" + np.arange(4000).astype(str).astype(object)

grid_slices = []

# Sorted so that chunk order (and the row order of the final concat) is reproducible.
for file in sorted(os.listdir(z_directory)):
    if not file.endswith(".zarr"):
        continue
    print(file)

    int_path = int_dir + file.split(".")[0]+"_int_hdi_grid_preds.p"

    # Re-use cached predictions for chunks that were already processed.
    if os.path.exists(int_path):
        grid_slices.append(pd.read_pickle(int_path))
        continue

    # First two columns of a chunk are lon/lat, the remaining ones are features.
    z = zarr.load(z_directory + file)
    z = pd.DataFrame(z)
    z = z.rename(columns = {0:"lon",1:"lat"})
    # Round so the float coordinates match the (lon, lat) index of `df` exactly.
    z["lon"], z["lat"] = z["lon"].round(3), z["lat"].round(3)

    z = z.set_index(["lon","lat"])
    z.columns = rcf_cols

    # Keep only tiles that belong to a labelled polygon.
    subset = df.merge(z, how="inner", left_index=True, right_index=True)
    del z  # the raw chunk is no longer needed; release it before predicting

    if subset.empty:
        # No overlap with any country: nothing to predict. Without this guard
        # `pd.concat([])` below raises ValueError. No cache file is written for
        # such chunks.
        continue

    countries_in_slice = subset["iso_code"].unique()

    file_data = []
    for country in countries_in_slice:
        print(country)

        country_subset = subset[subset["iso_code"] == country]

        ## Demeaned X1 for country (feature columns are everything from "X_0" on)
        rcf_X_country = country_subset.loc[:,"X_0":] - X_demean_vectors.loc[country]

        ## Demeaned nightlight histogram features: one binned row per tile
        X_country_nl = pd.DataFrame(
            np.vstack(country_subset["nl"].apply(binned))
            - X_demean_vectors_nl.loc[country].to_numpy(),
            index=country_subset.index,
        )

        country_preds_clipped = predict_y_from_kfold_dict(rcf_X_country,
                           nl_and_rcf_demeaned_kfold_dict,
                           "Sub-national HDI",
                           X_country_nl,
                          clip_preds=True)

        country_preds_not_clipped = predict_y_from_kfold_dict(rcf_X_country,
                           nl_and_rcf_demeaned_kfold_dict,
                           "Sub-national HDI",
                           X_country_nl,
                          clip_preds=False)

        # `drop` returns a new frame, so the assignments below do not write into
        # a slice of `subset` (which would trigger SettingWithCopyWarning).
        country_out = country_subset.drop(columns=rcf_cols)
        country_out["raw_pred_hdi"] = country_preds_clipped
        country_out["raw_pred_hdi_not_clipped"] = country_preds_not_clipped
        file_data.append(country_out)
        gc.collect()

    int_output = pd.concat(file_data)

    int_output.to_pickle(int_path)

    grid_slices.append(int_output)

    gc.collect()
    

In [None]:
# Stack the per-chunk prediction frames into one table of raw grid predictions.
data = pd.concat(grid_slices)

In [None]:
#data.to_pickle(data_dir + "preds/raw_hdi_preds_at_grid.p")

# Now we are going to re-center and up-sample the raw grid estimates

In [28]:
# Reload the saved raw grid predictions; reset_index turns the saved index into
# ordinary columns (the later merge uses "lon" and "lat" as columns).
data = pd.read_pickle(data_dir + "preds/raw_hdi_preds_at_grid.p").reset_index()

### Use finer resolution human settlement population. 
We do not want to release predictions for locations where people do not live. For these we will use the Global Human Settlement population data layer available for download at https://ghsl.jrc.ec.europa.eu/download.php?ds=pop

Specifically this is the following data product:

**GHS population grid (R2022)**

Product: GHS-POP, epoch: 2020, resolution: 1 km, coordinate system: Mollweide

#### First we need to re-project from Mollweide to WGS84 

In [16]:
# Imported explicitly so this cell does not rely on these names being provided
# by the star import at the top of the notebook.
from rasterio.warp import Resampling, calculate_default_transform, reproject

directory = data_dir + "GHS_pop/"
file = "GHS_POP_E2020_GLOBE_R2022A_54009_1000_V1_0.tif"
dst_crs = 'EPSG:4326'

# Write the re-projected raster to the location the following cells read it
# from (data/int/GHS_pop/). Previously it was written next to the raw input.
reprojected_dir = data_dir + "int/GHS_pop/"
reprojected_path = reprojected_dir + "GHS_POP_E2020_GLOBE_R2022A_54009_1000_V1_0_re-project.tif"
os.makedirs(reprojected_dir, exist_ok=True)

with rasterio.open(directory+file) as src:
    # Output grid (transform and size) that covers the source bounds in WGS84.
    transform, width, height = calculate_default_transform(
        src.crs, dst_crs, src.width, src.height, *src.bounds)
    kwargs = src.meta.copy()
    kwargs.update({
        'crs': dst_crs,
        'transform': transform,
        'width': width,
        'height': height
    })

    with rasterio.open(reprojected_path, 'w', **kwargs) as dst:
        # Re-project band by band using nearest-neighbour resampling.
        for i in range(1, src.count + 1):
            reproject(
                source=rasterio.band(src, i),
                destination=rasterio.band(dst, i),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                resampling=Resampling.nearest)

#### Now we can continue using the GHS Pop product to create a population mask

In [14]:
# Open the re-projected GHS-POP raster. The dataset handle is deliberately kept
# open because the next cell reads from it (`hsdl.read`, `hsdl.transform`).
# NOTE(review): it is never closed afterwards -- consider `hsdl.close()` once
# the array has been read.
hsdl = rasterio.open(data_dir + "int/GHS_pop/" +
                      "GHS_POP_E2020_GLOBE_R2022A_54009_1000_V1_0_re-project.tif")

In [None]:
hsdl_arr = hsdl.read(1)

# flatten_raster yields (lon, lat, value) sequences; name them as columns.
hsdl_df = pd.DataFrame(
    dict(zip(("lon", "lat", "hsdl_pop"), flatten_raster(hsdl_arr, hsdl.transform)))
)

# 1 where the settlement layer reports any population, 0 otherwise.
hsdl_df["pop_binary"] = hsdl_df["hsdl_pop"].gt(0).astype(int)

In [None]:
# Snap every pixel onto the .01 degree tile grid (cell centres ending in ...5).
for _coord in ("lon", "lat"):
    hsdl_df[_coord + "01"] = np.round(np.round(hsdl_df[_coord] + .005, 2) - .005, 3)

# A tile counts as populated when at least one of its pixels is populated.
hsdl_df_grp = (
    hsdl_df.groupby(["lon01", "lat01"])["pop_binary"]
    .agg(np.nansum)
    .reset_index()
)
hsdl_df_grp["pop_binary"] = hsdl_df_grp["pop_binary"].gt(0).astype(int)

In [None]:
## Restrict the population mask to the latitude range covered by the predictions
## (both bounds inclusive), so the two rasters end up with identical shapes.
in_pred_lat_range = hsdl_df_grp["lat01"].between(data["lat"].min(), data["lat"].max())
hsdl_df_grp = hsdl_df_grp[in_pred_lat_range]

In [21]:
## Save intermediate output
## The write is commented out; the cell currently only reloads the previously
## saved tile-level population mask, replacing whatever the cells above computed.
#hsdl_df_grp.to_pickle(data_dir + "int/GHS_pop/hsdl_pop_at_01_grid.p")
hsdl_df_grp = pd.read_pickle(data_dir + "int/GHS_pop/hsdl_pop_at_01_grid.p")

In [None]:
# Left join keeps every prediction tile; tiles missing from the population mask
# get NaN in `pop_binary`.
data = data.merge(
    hsdl_df_grp,
    how="left",
    left_on=["lon", "lat"],
    right_on=["lon01", "lat01"],
)

In [26]:
## Save a second intermediate output
## The write is commented out; the cell reloads the previously saved merge of
## predictions and population mask, replacing the in-memory `data`.
## NOTE(review): reset_index() here turns the saved index into a column --
## confirm that extra column is wanted downstream.
#data.to_pickle(data_dir + "preds/raw_hdi_preds_at_grid_with_hsdl.p")
data = pd.read_pickle(data_dir + "preds/raw_hdi_preds_at_grid_with_hsdl.p").reset_index()

In [None]:
# Keep only tiles flagged as populated in the HSDL mask. Tiles with zero
# population, or with no mask value at all, are dropped.
data = data.loc[data["pop_binary"].eq(1)].copy()

In [None]:
# Where the pop density weight is missing, fall back to the smallest weight
# observed among the remaining tiles.
data["population"] = data["population"].fillna(data["population"].min())

In [None]:
# The merge keys from the population mask duplicate lon/lat; remove them.
data = data.drop(columns=["lon01", "lat01"])
data.head()

## Re-center preds on the known ADM1 Value

In [None]:
# Population-weighted mean of the raw predictions within each ADM1 region.
# Uses the explicitly imported `weighted_groupby` (as the rest of the notebook
# does) instead of reaching it through `io`, which is only in scope via the
# star import.
grouped = weighted_groupby(
    data,
    "GDLCODE",
    weights_col_name="population",
    cols_to_agg=["raw_pred_hdi"],
).rename(columns={"raw_pred_hdi": "weighted_avg_raw"})

In [None]:
# Broadcast each region's weighted average back onto its tiles. This is an inner
# join (the pandas default), so tiles whose GDLCODE is absent from `grouped` are dropped.
data = data.merge(grouped, left_on="GDLCODE", right_index=True)

In [None]:
# Additive offset between the known ADM1 HDI and the weighted mean of the raw predictions.
data["adj_factor"] = data["Sub-national HDI"] - data["weighted_avg_raw"] 

In [None]:
# Shift every tile prediction by its region's offset.
data["centered_pred"] = data["raw_pred_hdi"] + data["adj_factor"]

In [None]:
## Code to check that groupby worked
#weighted_groupby(data.dropna(), "GDLCODE", weights_col_name="population", cols_to_agg=["centered_pred","Sub-national HDI"] )


## Rasterize and upsample

In [None]:
# Assign each .01 degree tile to the centre of its .1 degree output cell.
for _coord in ("lat", "lon"):
    data[_coord + "10"] = np.round(np.round(data[_coord] + .05, 1) - .05, 2)

In [None]:
def _modal_value(values):
    """Return the most frequent non-null value of a Series, or NaN if there is none.

    Ties resolve to the smallest value (pandas returns modes sorted). Unlike
    `scipy.stats.mode`, this also works for string columns such as GDLCODE,
    which recent SciPy releases no longer accept.
    """
    modes = values.mode(dropna=True)
    return modes.iloc[0] if len(modes) else np.nan


# Aggregate the .01 degree tiles to .1 degree cells. `centered_pred` is not
# aggregated here; it gets a population-weighted mean in the next cell.
pre_raster = data.groupby(["lon10","lat10"])[["population","Sub-national HDI","GDLCODE"]].agg(
    {
        "population": "sum",               # Sum the weights (NaNs are skipped)
        "Sub-national HDI": _modal_value,  # For this col, keep the modal HDI
        "GDLCODE": _modal_value,           # For this col, keep the modal parent ADM1 code
    }) #ignore NaNs for all

In [None]:
### HDI itself is aggregated as a weighted average of the cells,
# using the same GPW pop density weights that are used throughout.
pre_raster = pd.concat(
    [
        pre_raster,
        weighted_groupby(
            data,
            ["lon10", "lat10"],
            "population",
            cols_to_agg=["centered_pred"],
        ),
    ],
    axis=1,
)
pre_raster = pre_raster.reset_index()

In [None]:
# Report the share of cells whose re-centered prediction exceeds 1.
n_above_one = (pre_raster["centered_pred"] > 1).sum()
pct_above_one = round(n_above_one / len(pre_raster) * 100, 3)
print(pct_above_one, "% of pixels have values higher than 1. We will clip these")

## HDI should not exceed 1, so clip to [0, 1].
# This is an imperfect solution, but acceptable since the number of pixels is so small.
pre_raster["clipped"] = pre_raster["centered_pred"].clip(0, 1)

In [None]:
# The index was already reset when the aggregated frames were concatenated.
# drop=True avoids adding a spurious "index" column to the saved flat file and
# keeps this cell safe to re-run.
pre_raster = pre_raster.reset_index(drop=True)

In [None]:
# Save the flat (one row per .1 degree cell) version of the grid predictions.
pre_raster.to_pickle(data_dir + "preds/"
           "hdi_grid_predictions_flat_file.p")

In [None]:
# Check that weighted aggregation matches
# NOTE that this will not be perfect because country borders are smoothed over
#weighted_groupby(pre_raster.dropna(), "GDLCODE", weights_col_name="population", cols_to_agg=["centered_pred","Sub-national HDI"] )

In [None]:
# Turn the flat table into a 2-D array of the clipped predictions on a .1 degree
# grid. The custom extent is given as (min lon, max lon, min lat, max lat).
raster, extent = rasterize_df(pre_raster, 
                              data_colname = "clipped", 
                              grid_delta=.1, 
                              lon_col="lon10", 
                              lat_col="lat10",
                             custom_extent = (-180,180,-56,74)
                             )

In [None]:
### Checks on the output

In [None]:
raster.shape

In [None]:
extent

In [None]:
plt.imshow(raster, interpolation="nearest", extent=extent)

####  Write grid data product as a raster

In [None]:
raster.shape

In [None]:
# Take the raster size from the array itself so the metadata cannot drift out
# of sync with the rasterization settings (3600 x 1300 for the extent used above).
n_rows, n_cols = raster.shape

meta = {'driver': 'GTiff',
        'dtype': 'float64',
        'nodata': np.nan,
        'width': n_cols,
        'height': n_rows,
        'count': 1,
        'crs': "EPSG:4326",
        # North-up transform: .1 degree pixels anchored at the top-left corner
        # (extent[0] is the western edge, extent[3] the northern edge).
        'transform': Affine(0.1, 0.0, extent[0],
                            0.0, -0.1, extent[3])
       }

raster_outpath = (data_dir + "preds/"
           "hdi_raster_predictions.tif")

with rasterio.open(raster_outpath , "w", **meta) as dest:
    # Write the 2-D array as band 1.
    dest.write(raster, 1)