# Data Acquisition and Formatting

This notebook provides the workflow for acquiring and formatting the data needed to run the PoPS Global Model. To run this notebook, the following are assumed:
- Data are saved in a project folder (e.g., Google Drive root H:/Project Folder/)
- Cloned the PoPS Global GitHub repository (git clone https://github.com/ncsu-landscape-dynamics/PoPS-Global.git)
- Notebook was launched from the notebook folder of the cloned repo
- Already have the following data available or downloaded:
    - Koppen-Geiger Climate Classification raster (e.g., Beck_KG_V1_present_0p083.tif from http://koeppen-geiger.vu-wien.ac.at/data)
    - Phytosanitary capacity (data frame with country name and ISO3 code, estimate/index of phytosanitary capacity)
    - Binary host map raster
    - File with the following environmental variables:
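        - DATA_PATH (file path to data folder that will contain original data, formatted model input data, and model output data)
        - INPUT_PATH (file path to the folder where formatted model inputs are saved)
        - OUTPUT_PATH (file path to the folder where outputs and the case study configuration file are saved)
        - SIM_NAME (simulation name, used to name the configuration file)
        - COMTRADE_AUTH_KEY (API key to query and download data from the UN Comtrade Database)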

## Imports

In [None]:
import os
import glob
import json
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
import dotenv
from functools import reduce

In [None]:
# If notebook was launched from notebook folder of the cloned GitHub
# repository, then set working directory to level above
# (e.g., '..' to navigate to /PoPS-Global)

# This should be the path where the .env file is saved
repo_path = ".."
os.chdir(repo_path)
# Print the new working directory so it can be checked
print(os.getcwd())

In [None]:
from Data.Comtrade.get_comtrade import query_comtrade
from pandemic.generate_trade_forecasts import simple_trade_forecast
from pandemic.helpers import distance_between, convert_to_binary
from pandemic.ecological_calculations import (
    create_climate_similarities_matrix,
    create_climate_similarities_matrix_origins,
)
from Data.GBIF import get_GBIF_key, get_GBIF_records

## Set Environmental Variables and Paths

In [None]:
# Load variables and paths from .env
# (relative path, so it is resolved against the current working directory)
dotenv.load_dotenv(".env")

In [None]:
# Project locations and simulation name, read from the environment in one pass.
# A variable that is not set comes back as None.
#   DATA_PATH   -> case-study specific raw data inputs (host map,
#                  phytosanitary capacity dataset)
#   INPUT_PATH  -> formatted model inputs
#   OUTPUT_PATH -> saved outputs
#   SIM_NAME    -> simulation name
data_dir, input_dir, out_dir, sim_name = (
    os.environ.get(var_name)
    for var_name in ("DATA_PATH", "INPUT_PATH", "OUTPUT_PATH", "SIM_NAME")
)

## UN Comtrade Data

This step only needs to be run once per aggregation (e.g., monthly, annual, start year,
commodity code). 

It queries the UN Comtrade API to download data based on the first year
of interest, end year (inclusive), commodity codes, frequency (e.g., monthly, annual), 
and unit value (e.g., value in dollars or net weight). Data are saved as csvs by HS code
and time step. 

In [None]:
# Settings for the UN Comtrade download and the folders derived from them
comtrade_auth_key = os.getenv("COMTRADE_AUTH_KEY")
start_year, end_year = 2000, 2020
temporal_res = "M"  # "M" (monthly) or "A" (annual) are the values handled below
hs_list = ["6802", "6803"]
agg_commodities = True

# Folder suffix: summed commodities vs. individually adjusted commodities
dir_suffix = "agg" if agg_commodities else "adjusted"

# Commodity label: the single code, or all codes joined with "-"
code_str = "-".join(map(str, hs_list))

In [None]:
# Query the UN Comtrade API for every HS code in hs_list between start_year
# and end_year at the chosen temporal resolution.
# NOTE(review): per the notebook text, results are saved as CSVs by HS code
# and time step under model_inputs_dir — confirm against query_comtrade.
query_comtrade(
    model_inputs_dir=f"{input_dir}/comtrade",
    auth_code=comtrade_auth_key,
    hs_list=hs_list,
    start_year=start_year,
    end_year=end_year,
    temporal_res=temporal_res,
    crosswalk_path="Data/un_to_iso.csv",
)

### Adjust Trade Data for Inflation

Get the Consumer Price Index from the US Bureau of Labor Statistics

Series CUUR0000SA0L1E - All items less food and energy in U.S. city average, all urban consumers, not seasonally adjusted

Other CPI series are available. See more information here: https://www.bls.gov/cpi/

In [None]:
# Download CPI data
# API allows 10 years per request, divide year requests into lists of 10 or less
years = list(range(start_year, end_year + 1))
year_subsets = [years[x : x + 10] for x in range(0, len(years), 10)]
cpi_series = "CUUR0000SA0L1E"
bls_url = "https://api.bls.gov/publicAPI/v1/timeseries/data/"
headers = {"Content-type": "application/json"}

cpi_list = []
for subset in year_subsets:
    print(f"Downloading CPI for {subset[0]}-{subset[-1]}...")
    data = json.dumps(
        {
            "seriesid": [cpi_series],
            "startyear": str(subset[0]),
            "endyear": str(subset[-1]),
        }
    )
    # A timeout keeps the notebook from hanging on an unresponsive server
    p = requests.post(bls_url, data=data, headers=headers, timeout=60)
    # Fail on HTTP errors instead of trying to parse an error page
    p.raise_for_status()
    json_data = p.json()

    # The API reports request-level failures (e.g., query limits) in the
    # JSON body; stop with its message rather than an opaque KeyError below
    results = json_data.get("Results") or {}
    series_list = results.get("series") if isinstance(results, dict) else None
    if json_data.get("status") != "REQUEST_SUCCEEDED" or not series_list:
        raise RuntimeError(
            f"CPI request for {subset[0]}-{subset[-1]} failed "
            f"(status: {json_data.get('status')}): {json_data.get('message')}"
        )

    for ts in series_list[0]["data"]:
        cpi_list.append([ts["year"], ts["period"], ts["value"]])

# One row per time step, indexed by year + period with the leading "M"
# stripped from the period code
cpi_df = pd.DataFrame(cpi_list, columns=["year", "period", "cpi"])
cpi_df["period"] = cpi_df["period"].str.lstrip("M")
cpi_df["ts"] = cpi_df["year"] + cpi_df["period"]
cpi_df = cpi_df.set_index("ts")
print(f"CPI for {len(cpi_df)} timesteps downloaded.")

In [None]:
# Choose baseline year to convert current (nominal) $ to
# Must be a label in the cpi_df index (year followed by the period code
# without its leading "M")
base_ts = "201901"
cpi_base = cpi_df.loc[base_ts, "cpi"]

In [None]:
# Convert every monthly trade matrix from current (nominal) $ to base_ts $
# using the CPI ratio, and save the result under monthly_adjusted/<HS code>/
if temporal_res == "M":
    file_list = glob.glob(f"{input_dir}/comtrade/monthly/*/*.csv")
    if not file_list:
        print("No trade files found...")
    else:
        print(f"Converting current $ to {base_ts} $ for {len(file_list)} files...")
        for file in file_list:
            # os.path.basename handles the path separator of the current
            # platform; splitting on "\\" only worked on Windows
            file_name = os.path.basename(file)
            # Time step is the 6 characters before ".csv"
            ts = file_name[-10:-4]
            cpi_ts = cpi_df.loc[ts, "cpi"]
            # First 4 characters of the file name give the HS code folder
            adjusted_dir = f"{input_dir}/comtrade/monthly_adjusted/{file_name[:4]}"
            os.makedirs(adjusted_dir, exist_ok=True)
            trade = pd.read_csv(file, index_col=0)
            trade_adjusted = (trade * (float(cpi_base) / 100)) / (float(cpi_ts) / 100)
            trade_adjusted.to_csv(adjusted_dir + "/" + file_name)
        print(f"Adjusted trade values saved at {input_dir}/comtrade/monthly_adjusted/")

### Aggregate Multiple Commodities

This step only needs to be run once per download if the model will be run
using a sum of all commodities of interest as opposed to by each commodity individually. 

In [None]:
def _sum_trade_files(csv_paths):
    """Return the element-wise sum of the trade matrices stored in csv_paths.

    Returns None when csv_paths is empty so the caller can skip that time
    step instead of failing in reduce() on an empty sequence.
    """
    dfs = [
        pd.read_csv(f, sep=",", header=0, index_col=0, encoding="latin1")
        for f in csv_paths
    ]
    if not dfs:
        return None
    return reduce(pd.DataFrame.add, dfs)


if agg_commodities:
    print(code_str)

    if temporal_res == "M":
        # get all trade dates (taken from the files of the first HS code)
        file_list = glob.glob(
            f"{input_dir}/comtrade/monthly_adjusted/{str(hs_list[0])}/{str(hs_list[0])}*.csv"
        )
        date_list_unique = sorted(
            {os.path.splitext(fn)[0].split("_")[-1] for fn in file_list}
        )

        out_path = f"{input_dir}/comtrade/monthly_agg/{code_str}/"
        os.makedirs(out_path, exist_ok=True)

        for d in date_list_unique:
            # Collect exactly one set of files per requested HS code. Globbing
            # every HS folder and then adding hs_list[1:] again counted those
            # commodities twice.
            d_file_list = []
            for use_hs in hs_list:
                d_file_list += glob.glob(
                    input_dir + f"/comtrade/monthly_adjusted/{use_hs}/*{d}*.csv"
                )
            print(f"{d}: {len(d_file_list)}")
            all_com = _sum_trade_files(d_file_list)
            if all_com is None:
                print(f"\tNo trade files found for {d}; skipping.")
                continue
            all_com.to_csv(out_path + f"{code_str}_{d}.csv")

    # If trade data are annual
    if temporal_res == "A":
        out_path = input_dir + f"/comtrade/annual_agg/{code_str}/"
        os.makedirs(out_path, exist_ok=True)

        year_range = list(range(start_year, end_year + 1, 1))
        for d in year_range:
            # Only the requested HS codes belong in the code_str aggregate
            d_file_list = []
            for use_hs in hs_list:
                d_file_list += glob.glob(
                    input_dir + f"/comtrade/annual/{use_hs}/*{d}.csv"
                )
            print(f"{d}: {len(d_file_list)}")
            all_com = _sum_trade_files(d_file_list)
            if all_com is None:
                print(f"\tNo trade files found for {d}; skipping.")
                continue
            all_com.to_csv(out_path + f"{code_str}_{d}.csv")

    # If trade data are monthly, also sum the monthly aggregates into
    # annual aggregates
    if temporal_res == "M":
        out_path = input_dir + f"/comtrade/annual_agg/{code_str}/"
        os.makedirs(out_path, exist_ok=True)

        year_range = list(range(start_year, end_year + 1, 1))
        for d in year_range:
            d_file_list = glob.glob(
                input_dir + f"/comtrade/monthly_agg/{code_str}/*_{d}*.csv"
            )
            print(f"{d}: {len(d_file_list)}")
            all_com = _sum_trade_files(d_file_list)
            if all_com is None:
                print(f"\tNo trade files found for {d}; skipping.")
                continue
            all_com.to_csv(out_path + f"{code_str}_{d}.csv")

### Create Trade Forecast

This step only needs to be run once per aggregation. It is a simple sampling of
historical trade data to be used as predictions of future trade values.

TO DO: Add ability to include a percent change (e.g., 1% increase) by year or time
horizon. 

In [None]:
# Forecast settings shared by every commodity (or commodity aggregate)
start_forecast_year = (end_year + 1) * 100 + 1
number_historical_years = 5
number_forecast_years = 2
random_seed = None

# One forecast for the aggregate, or one per individual HS code
if agg_commodities:
    forecast_codes = ["-".join([str(hs_code) for hs_code in hs_list])]
else:
    forecast_codes = [str(code) for code in hs_list]

for code_str in forecast_codes:
    if agg_commodities:
        print(f"Creating forecast with aggregated commodities: {code_str}")
    else:
        print(f"Creating forecast for {code_str}")

    # Historical trade is read from, and forecasts written to, folders
    # named after the trade type and commodity code
    hist_trade_dir = f"{input_dir}/comtrade/monthly_{dir_suffix}/{code_str}"
    forecast_dir = (
        f"{input_dir}/comtrade/trade_forecast/monthly_{dir_suffix}/{code_str}"
    )
    simple_trade_forecast(
        data_dir=f"{input_dir}/comtrade",
        output_dir=forecast_dir,
        start_forecast_date=start_forecast_year,
        num_yrs_historical=number_historical_years,
        num_yrs_forecast=number_forecast_years,
        hist_data_dir=hist_trade_dir,
        random_seed=random_seed,
    )

In [None]:
# Creating annual trade forecast data from the monthly forecast
if temporal_res == "M":
    # Aggregated run: a single combined code read from monthly_agg.
    # Otherwise: each HS code on its own, read from monthly_adjusted.
    if agg_commodities:
        forecast_codes = ["-".join([str(hs_code) for hs_code in hs_list])]
        monthly_folder = "monthly_agg"
    else:
        forecast_codes = list(hs_list)
        monthly_folder = "monthly_adjusted"

    forecast_year = int(str(start_forecast_year)[:4])
    year_range = list(range(forecast_year, forecast_year + number_forecast_years, 1))

    for code_str in forecast_codes:
        print(f"{code_str}")
        # NOTE(review): annual output goes to annual_agg for both aggregated
        # and individual commodities (unchanged) — confirm this is intended.
        out_path = input_dir + f"/comtrade/trade_forecast/annual_agg/{code_str}/"
        os.makedirs(out_path, exist_ok=True)

        for d in year_range:
            d_file_list = glob.glob(
                input_dir
                + f"/comtrade/trade_forecast/{monthly_folder}/{code_str}/*_{d}*.csv"
            )
            print(f"{d}: {len(d_file_list)}")
            # reduce() raises on an empty sequence; skip years without files
            if not d_file_list:
                print(f"\tNo monthly forecast files found for {d}; skipping.")
                continue
            dfs = [
                pd.read_csv(f, sep=",", header=0, index_col=0, encoding="latin1")
                for f in d_file_list
            ]
            all_com = reduce(pd.DataFrame.add, dfs)
            all_com.to_csv(out_path + f"{code_str}_{d}.csv")

## Country Data

In [None]:
# Country boundaries shapefile (relative to the current working directory)
countries_geo_path = "Data/TM_WORLD_BORDERS-0.3/TM_WORLD_BORDERS-0.3.shp"

In [None]:
# Load the country polygons, then overwrite selected country names.
# Keys are row positions; the value is written to the column at position 4.
countries_gdf = gpd.read_file(countries_geo_path)
name_overrides = {
    136: "Macao",
    169: "Réunion",
    17: "Myanmar",
    245: "Saint Barthelemy",
}
for row_pos, new_name in name_overrides.items():
    countries_gdf.iloc[row_pos, 4] = new_name

## Koppen-Geiger Climate Classification by Country
This step only needs to be run once. 

It creates a data frame consisting of countries as rows, climate
classification codes as columns, and percent area in each country
as values. 

In [None]:
# Choose mask to limit host and climate calculations to potential commodity destination areas
# Options: "hii" for Human Influence Index (e.g., urban areas),
# "cl" for cropland areas (e.g., agricultural input),
# or "none" to not mask out any areas

mask = "hii"

In [None]:
# Koppen-Geiger climate classification raster
koppen_path = "Data/Beck_KG_V1_present_0p083.tif"
# Lookup table of Koppen classes; its "let" column is used later to label
# the climate columns
koppen_codes = pd.read_csv("Data/KGcodes.csv")

In [None]:
# Read Koppen raster
koppen_rast = rasterio.open(koppen_path)
# First band as an array
koppen_arr = koppen_rast.read(1)
# Display the array dimensions
koppen_arr.shape

In [None]:
# Build the masked climate array `koppen` and set `threshold_val`, which is
# part of the output file names. Every supported mask sets both values.
if mask == "cl":
    # If using cropland layer to mask host and climate areas
    # Read cropland binary
    cl = rasterio.open("Data/cropland_binary.tif")
    cl_arr = cl.read(1)

    # No threshold applies to the binary cropland layer
    threshold_val = ""

    # Mask selected areas, cropping the cropland array to the extent of the
    # climate array (2160 x 4320 for the 0.083 degree Koppen raster)
    n_rows, n_cols = koppen_arr.shape
    koppen = cl_arr[:n_rows, :n_cols] * koppen_arr

elif mask == "hii":
    # Read Human Influence Index layer (already resampled to match Koppen raster)
    hii = rasterio.open("Data/hii_v2_resamp.tif")
    hii_arr = hii.read(1)
    hii_arr = hii_arr.astype("float64")

    # Create mask to exclude areas with values below the threshold
    # from the % area calculations of climate similarity and
    # host availability
    threshold_val = 16

    # Keep (1) cells at or above the threshold; everything else, including
    # water (coded 255), becomes NaN so it drops out of the calculations
    hii_arr = np.where(
        (hii_arr >= threshold_val) & (hii_arr != 255), 1.0, np.nan
    )

    # Mask selected areas
    koppen = hii_arr * koppen_arr

elif mask == "none":
    # No mask and no threshold; threshold_val must still exist because the
    # output file names below include it
    threshold_val = ""
    koppen = koppen_arr

else:
    raise ValueError(f"Unknown mask '{mask}': expected 'hii', 'cl', or 'none'")

In [None]:
# Calculate zonal statistics of koppen classes for each country
# (categorical=True: one dict of raster value -> pixel count per country)
affine = koppen_rast.transform
stats = zonal_stats(countries_gdf, koppen, categorical=True, affine=affine)

In [None]:
# Add % of each climate classification to countries geodataframe
koppen_df = countries_gdf.loc[:, ["ISO3", "NAME"]]
koppen_df["koppen_stats"] = stats
# Expand each country's stats dict into one column per raster value
koppen_df = pd.concat(
    [koppen_df, koppen_df["koppen_stats"].apply(pd.Series)], axis=1
).fillna(0)
# Total pixel count per country. numeric_only=True limits the row sum to the
# count columns; summing the text/dict columns too raises a TypeError on
# newer pandas versions (older versions silently skipped them).
pix_ct = pd.DataFrame(koppen_df.sum(axis=1, numeric_only=True))
# Share of each country's pixels in every class
cat_pct = koppen_df.iloc[:, 3:].div(pix_ct[0], axis=0, fill_value=None)
# Drop the first class column
# NOTE(review): presumably the no-data / masked class — confirm
cat_pct = cat_pct.iloc[:, 1:]
# Look up the class label for each raster value (value - 1 is used as the
# row position in koppen_codes)
keep_cat = [int(x - 1) for x in cat_pct.columns]
cat_pct.columns = list(koppen_codes["let"].iloc[keep_cat])
koppen_df = pd.concat([koppen_df.iloc[:, 0:2], cat_pct], axis=1)

koppen_df = koppen_df.fillna(0)
koppen_df.drop(["NAME"], axis=1, inplace=True)
koppen_df = koppen_df.set_index("ISO3")

In [None]:
# Save the climate class shares; the file name records the mask and threshold
koppen_df.to_csv(f"{input_dir}/koppen_{mask}Mask{threshold_val}.csv", sep=",")

In [None]:
# If loading from file:
# (first CSV column is used as the index)
koppen_df = pd.read_csv(
    f"{input_dir}/koppen_{mask}Mask{threshold_val}.csv", index_col=0
)

In [None]:
# Preview the first rows
koppen_df.head()

## Host
This step only needs to be run once. 

Using a binary host map, calculate the percent area in each country with
probable presence of host.

In [None]:
# rasterio.warp is a submodule; import it explicitly so the reprojection
# helpers used below are guaranteed to be available
import rasterio.warp

# Host file name
host_file = "toh.global_ensemble.tif"

# Read binary host raster
# Copy host raster in Data folder or update path as appropriate
host_rast_path = f"{data_dir}/{host_file}"
host_rast = rasterio.open(host_rast_path)

# Check raster matches projection and resolution of other inputs
# (resolution is compared through the first transform coefficient)
if (host_rast.crs != "EPSG:4326") or (
    host_rast.transform[0] != koppen_rast.transform[0]
):
    print("Resampling host raster to match climate raster...")
    prj_dst = f"{os.path.splitext(host_rast_path)[0]}_prj.tif"
    dst_crs = koppen_rast.crs

    # The first handle is not needed any more: the source is re-opened in a
    # context manager below and host_rast is re-pointed at the new file
    host_rast.close()

    with rasterio.open(host_rast_path) as src:
        transform, width, height = rasterio.warp.calculate_default_transform(
            src.crs,
            dst_crs,
            src.width,
            src.height,
            *src.bounds,
            resolution=koppen_rast.res,
        )
        kwargs = src.meta.copy()
        kwargs.update(
            {"crs": dst_crs, "transform": transform, "width": width, "height": height}
        )

        # The context manager closes dst; no explicit close() is required
        with rasterio.open(prj_dst, "w", **kwargs) as dst:
            rasterio.warp.reproject(
                source=rasterio.band(src, 1),
                destination=rasterio.band(dst, 1),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                resampling=rasterio.warp.Resampling.nearest,
            )
    host_rast = rasterio.open(prj_dst)
    host_arr = host_rast.read(1)
    print("\tdone")
else:
    host_arr = host_rast.read(1)

In [None]:
# Set large negative values to 0
# (every value below 0.0001 is set to 0, not only the negative ones)
host_arr[host_arr < 0.0001] = 0

In [None]:
# Check values of original host raster
# (tuple of unique values and their counts)
host_arr_val_counts = np.unique(host_arr, return_counts=True)
# Display the first five unique values
host_arr_val_counts[0][0:5]

In [None]:
# Decide whether the host raster still has to be converted to 0/1
host_values = host_arr_val_counts[0]

if len(host_values) != 2:
    # More (or fewer) than two distinct values: convert using a threshold
    print("Converting provided host raster to a binary output")
    threshold = 0.2  # e.g., above 20% if in percent, or >=1 if area harvested
    host_arr = convert_to_binary(host_arr, threshold)
elif (host_values[0]) == 0 and (host_values[1] == 1):
    # Exactly two values and they are already 0 and 1
    print("Host raster is already in the required binary form")
else:
    # Exactly two values, but not 0 and 1: show them so they can be checked
    print("Host raster needs to be converted to a binary output")
    print("with 0 = no host present, 1 = host present")
    print(host_values)

# Confirm converted to 0 and 1
print(np.unique(host_arr, return_counts=True))

In [None]:
# Apply the chosen destination mask to the binary host array
if mask in ("cl", "hii"):
    mask_arr = cl_arr if mask == "cl" else hii_arr
    # Pad the host array with rows of zeros at the bottom so it has as many
    # rows as the mask array
    missing_rows = mask_arr.shape[0] - host_arr.shape[0]
    host_arr = np.concatenate(
        (host_arr, np.zeros([missing_rows, host_arr.shape[1]])), axis=0
    )
    # This might not work if they are very different shapes
    host = host_arr * mask_arr
elif mask == "none":
    host = host_arr

In [None]:
# Calculate zonal statistics
# (categorical=True: one dict of raster value -> pixel count per country)
# Note: this reassigns `affine` to the host raster transform
affine = host_rast.transform
stats = zonal_stats(countries_gdf, host, categorical=True, affine=affine)

In [None]:
# Build the host table: country identifiers, the zonal stats expanded into
# one column per raster value, and the share of counted pixels with host
host_df = countries_gdf.loc[:, ["ISO3", "NAME"]]
host_df["host_stats"] = stats
host_counts = host_df["host_stats"].apply(pd.Series)
host_df = pd.concat([host_df, host_counts], axis=1).fillna(0)

pixels_with_host = host_df[1.0]
pixels_without_host = host_df[0.0]
host_df["Host Percent Area"] = (
    pixels_with_host / (pixels_without_host + pixels_with_host)
).fillna(0)

# Overwrite selected country names (row position -> name, column position 1)
for row_pos, new_name in (
    (136, "Macao"),
    (169, "Réunion"),
    (17, "Myanmar"),
    (245, "Saint Barthelemy"),
):
    host_df.iloc[row_pos, 1] = new_name
host_df.head()

In [None]:
# Save the host table; the file name records the mask and threshold
host_df.to_csv(f"{input_dir}/host_{mask}Mask{threshold_val}.csv", sep=",")

In [None]:
# If loading from file:
host_df = pd.read_csv(f"{input_dir}/host_{mask}Mask{threshold_val}.csv", sep=",")
# Drop the saved index column, the raw stats column, and the pixel-count columns
host_df.drop(["Unnamed: 0", "host_stats", "0.0", "1.0"], axis=1, inplace=True)
host_df.head()

## Phytosanitary Capacity Data
Incorporates an estimate of phytosanitary capacity for each country. Currently the model 
is using the proactive value from:

Early, R., Bradley, B., Dukes, J. et al. Global threats from invasive alien species in the twenty-first century and national response capacities. Nat Commun 7, 12485 (2016). https://doi.org/10.1038/ncomms12485

In [None]:
# Read the phytosanitary capacity table (first CSV column is used as the index)
phyto_path = f"{data_dir}/phytosanitary_capacity_iso3.csv"  # Original file found in: Q:\Shared drives\APHIS  Projects\Pandemic\Data\phytosanitary_capacity
phyto_df = pd.read_csv(phyto_path, index_col=0)

In [None]:
# Keep the proactive capacity estimate and the country identifiers, and give
# the estimate the column name used in the rest of the notebook
phyto_df = phyto_df[["proactive", "ISO3", "UN"]].rename(
    columns={"proactive": "Phytosanitary Capacity"}
)

In [None]:
# Display the table dimensions
phyto_df.shape

## Create final countries dataframe
Merge country attributes with host percent area, climate classification percent area, 
and phytosanitary capacity estimates. Filter and order dataframe to match countries 
with trade data available. 

In [None]:
# Attach the climate class shares, host percent area, and phytosanitary
# capacity to the countries geodataframe, matching on ISO3. Host and
# phytosanitary values are left-joined; duplicate column names coming from
# the phytosanitary table get a "_y" suffix.
countries_gdf = (
    countries_gdf.merge(koppen_df, on="ISO3")
    .merge(host_df[["ISO3", "Host Percent Area"]], how="left", on="ISO3")
    .merge(phyto_df, how="left", on="ISO3", suffixes=[None, "_y"])
)
countries_gdf.head()

In [None]:
# Get unique values of phytosanitary capacity for rescaling
# Sort only the known values and put the missing value (NaN) last: sorting a
# list that contains NaN gives an unreliable order, and the rescaling step
# uses each value's position in this list as its rank
raw_keys = list(countries_gdf["Phytosanitary Capacity"].unique())
unique_keys = sorted(k for k in raw_keys if not pd.isna(k))
unique_keys += [k for k in raw_keys if pd.isna(k)]

In [None]:
# Map every original phytosanitary capacity value to a rescaled value between
# the chosen minimum and maximum, stepping by the value's position in
# unique_keys. Missing values (NaN) map to 0.

# Minimum phytosanitary capacity value when rescaled
scaled_min = 0.3

# Maximum phytosanitary capacity value when rescaled
scaled_max = 0.8

scale_diff = scaled_max - scaled_min

# A NaN entry in the list is not counted as an increment
if np.isnan(unique_keys).any():
    increments = len(unique_keys) - 1
else:
    increments = len(unique_keys)

phyto_dict = {}
for position, key in enumerate(unique_keys):
    if np.isnan(key):
        phyto_dict[key] = 0
        continue
    phyto_dict[key] = round((scale_diff / increments) * position, 2) + scaled_min

phyto_dict

In [None]:
# Replace each original capacity value with its rescaled value
countries_gdf["Phytosanitary Capacity"] = countries_gdf[
    "Phytosanitary Capacity"
].replace(phyto_dict)

In [None]:
# Use the ISO3 code as the row index from here on
countries_gdf.set_index("ISO3", inplace=True)
countries_gdf.head()

In [None]:
# Load one trade matrix (first time step of the first year) and collect the
# ISO3 codes on each side, so countries present in the geodataframe but
# absent from the trade data can be identified
if temporal_res == "M":
    temporal_folder, month_str = "monthly", "01"
if temporal_res == "A":
    temporal_folder, month_str = "annual", ""

example_trade_path = (
    f"{input_dir}/comtrade/{temporal_folder}_{dir_suffix}/{code_str}/"
    f"{code_str}_{start_year}{month_str}.csv"
)
example_trade = pd.read_csv(
    example_trade_path, header=0, index_col=0, encoding="latin-1"
)

trade_set = set(example_trade.index.values)
country_set = set(countries_gdf.index.values)

In [None]:
# Codes found on only one side of the comparison
miss_country = trade_set - country_set
miss_trade = country_set - trade_set

# Summary counts
print("# ISO3 codes in countries geopackage:\t", len(country_set))
print("# ISO3 code matches:\t\t\t", len(trade_set.intersection(country_set)))

print("Which countries are in the TRADE data but NOT the COUNTRIES geopackage")
if not miss_country:
    print("No missing countries.")
else:
    print("\n", miss_country)

print("Which countries are in the COUNTRIES geopackage but NOT the TRADE data:")
if not miss_trade:
    print("No missing countries.")
else:
    print("\n", miss_trade)

IMN: Isle of Man
PRI: Puerto Rico
ALA: Aland Islands
LIE: Liechtenstein
MTQ: Martinique
GUF: French Guiana
MAF: Saint-Martin (French part)
TWN: Taiwan
JEY: Jersey
MCO: Monaco
GGY: Guernsey
GLP: Guadeloupe
REU: Réunion
VIR: US Virgin Islands
BVT: Bouvet Island
SJM: Svalbard and Jan Mayen Islands

In [None]:
# Remove countries from the geodataframe that do not have trade data
# (miss_trade holds the ISO3 index labels to drop)
countries_filtered = countries_gdf.drop(miss_trade, axis=0)

In [None]:
# Reorder the countries geodataframe rows to match the
# trade index order
# (every label in the trade index is looked up in countries_filtered)
index_list = list(example_trade.index.values)
countries_filtered_reindex = countries_filtered.loc[index_list, :]

In [None]:
# Move ISO3 from the index back to a regular column
countries_filtered_reindex.reset_index(inplace=True)
countries_filtered_reindex.head()

## Create Distance Matrix
Calculate the distance between each origin-destination country pair. Save 
as matrix array. 

In [None]:
# Distance for each pair of countries, using the trade matrix and the
# filtered, reordered countries table as inputs
distances = distance_between(
    example_trade, countries_filtered_reindex
)  # great circle dist in km
# Display the matrix dimensions
distances.shape

In [None]:
# Save the distance matrix as a NumPy array file
np.save(f"{input_dir}/distance_matrix.npy", distances)

## Create Climate Similarities Matrix
Calculate the similarity matrix. Options for matching between each origin-destination country pair
or matching to the origins at time step 1 only. Save as matrix array.

In [None]:
# Selects which of the two climate similarity cells below does the work
climate_match_type = "origins"  # "origins" or "pairs"

In [None]:
# Countries in the native range, by name as spelled in the NAME column
native_countries_list = ["China", "Viet Nam"]

# Index label (ISO3) of the first row whose NAME matches each native country
native_iso_list = [
    countries_gdf[countries_gdf["NAME"] == country].index[0]
    for country in native_countries_list
]

In [None]:
# Pairwise matching: only runs when climate_match_type is "pairs"
if climate_match_type == "pairs":
    # Create an n x n array of climate similarity calculations
    climate_similarities = create_climate_similarities_matrix(
        array_template=example_trade, countries=countries_filtered_reindex
    )

In [None]:
if climate_match_type == "origins":
    # If matching to origins at time step 1, set list of origins

    # Specify if only areas with host in origins should be considered
    host_only = False

    if host_only:
        # Mask to include only areas with host in calculations
        koppen = koppen_arr * host_arr
    else:
        koppen = koppen_arr

    # Calculate koppen categories for each country (must recompute here, without commodity destination mask)
    # Use the climate raster's own transform: the shared `affine` variable was
    # reassigned to the host raster transform in the host section
    stats = zonal_stats(
        countries_gdf, koppen, categorical=True, affine=koppen_rast.transform
    )
    # Add % of each climate classification to countries geodataframe
    koppen_df = countries_gdf.loc[:, ["NAME"]]
    koppen_df["koppen_stats"] = stats
    koppen_df = pd.concat(
        [koppen_df, koppen_df["koppen_stats"].apply(pd.Series)], axis=1
    ).fillna(0)
    # Total pixel count per country. numeric_only=True limits the row sum to
    # the count columns; summing the text/dict columns too raises a TypeError
    # on newer pandas versions (older versions silently skipped them).
    pix_ct = pd.DataFrame(koppen_df.sum(axis=1, numeric_only=True))
    # Columns from position 3 onward: NAME, koppen_stats, and the first class
    # column are left out
    # NOTE(review): the first class column is presumably the no-data class — confirm
    cat_pct = koppen_df.iloc[:, 3:].div(pix_ct[0], axis=0, fill_value=None)
    keep_cat = [int(x - 1) for x in cat_pct.columns]
    cat_pct.columns = list(koppen_codes["let"].iloc[keep_cat])
    koppen_df = pd.concat([koppen_df.iloc[:, 0:2], cat_pct], axis=1)

    koppen_df = koppen_df.fillna(0)
    koppen_df.drop(["NAME", "koppen_stats"], axis=1, inplace=True)

    # Index labels (ISO3) of the native countries
    native_iso_list = []
    for country in native_countries_list:
        native_iso_list.append(countries_gdf[countries_gdf["NAME"] == country].index[0])

    # Climate classes that occur (share > 0) in at least one native country
    native_koppen = koppen_df.loc[native_iso_list]
    origin_climates = native_koppen[native_koppen > 0].dropna(axis=1, thresh=1).columns

    # Create an n x 1 array of climate similarity calculations
    climate_similarities = create_climate_similarities_matrix_origins(
        countries=countries_filtered_reindex, origins_climate_list=origin_climates
    )

    countries_filtered_reindex["Climate Similarity"] = climate_similarities

In [None]:
# Save the climate similarities as a NumPy array file; the file name records
# the mask and threshold
np.save(
    f"{input_dir}/climate_similarities_{mask}Mask{threshold_val}.npy",
    climate_similarities,
)

In [None]:
# Save filtered and reindexed countries data with climate, host %s, and phyto values
# (written as a GeoPackage; the file name records the mask and threshold)
countries_path = f"{input_dir}/countries_{mask}Mask{threshold_val}.gpkg"
countries_filtered_reindex.to_file(countries_path, driver="GPKG")

In [None]:
# Save full country data frame path to .env file
# (stored under the COUNTRIES_PATH key; the return value of set_key is printed)
print(dotenv.set_key(".env", "COUNTRIES_PATH", countries_path))

## Create validation and native countries dataframes

Native countries: A list of the countries in the species' native range (provided above), as determined from the literature.

Validation: Get country-year first observations from GBIF. These data can be supplemented with dates from literature and other sources. If no observations are available, you will need to create the dataframe in the format provided below.

In [None]:
# Create native countries list

# os.path.join adds the path separator that plain string concatenation
# left out (input_dir + "origin_locations.csv" wrote next to, not inside,
# the input folder unless input_dir ended with a separator)
pd.Series(native_iso_list, name="ISO3").to_csv(
    os.path.join(input_dir, "origin_locations.csv"), index=False
)

In [None]:
# Create validation data

# Scientific name used for the GBIF lookup
species_sci_name = "Lycorma delicatula"
method = "GBIF"  # "GBIF" or "manual"

# Years to search: same range as the trade data
years = list(range(start_year, end_year + 1))

# Or, set manual years:
# years = list(range(1980,2022))

In [None]:
# Get GBIF usageKey (unique identifier)
# (only looked up when method is "GBIF")
if method == "GBIF":
    gbif_key = get_GBIF_key(species_sci_name)

# If the species is not found, you will need to create the observations dataframe below.

In [None]:
# Extract first records (year-country)
# (queried with the GBIF key and the list of years set above)

if method == "GBIF":
    first_records = get_GBIF_records(gbif_key, years)

# If records are not found, you will need to create the observations dataframe below.

In [None]:
# Remove native country records

# Only GBIF records exist at this point; with method == "manual" the
# dataframe is created in the next cell, so filtering unconditionally
# raised a NameError
if method == "GBIF":
    first_records = first_records.loc[
        ~first_records["ISO3"].isin(native_iso_list)
    ].reset_index(drop=True)

In [None]:
# Alternative: manually create dataframe:
if method == "manual":
    # ISO3 codes and the matching year of first observed introduction,
    # listed in the same order
    countries_ISO3 = ["KOR", "JPN", "USA"]
    first_intro_year = [2009, 2010, 2014]

    first_records = pd.DataFrame(
        {"ISO3": countries_ISO3, "ObsFirstIntro": first_intro_year}
    )

In [None]:
# Check that it looks correct
# (displays the validation dataframe)

first_records

In [None]:
# Write to .csv

# os.path.join adds the path separator that plain string concatenation
# left out
first_records.to_csv(
    os.path.join(input_dir, "first_records_validation.csv"), index=False
)

## Save key data parameters to the case study configuration file
Further modified in 2. Create Model Config

In [None]:
# Collect the settings chosen in this notebook for the configuration file
args = {}

# Trade data related

args["timestep"] = temporal_folder
args["trade_type"] = dir_suffix

# Aggregated runs use the single combined code; otherwise each HS code
if agg_commodities:
    args["commodity_list"] = [code_str]
else:
    args["commodity_list"] = hs_list

# For accessing input file names

args["climate_match"] = climate_match_type
# host_only is only set by the "origins" climate matching cell; referencing
# it for "pairs" raised a NameError, so record None in that case
args["climate_match_host_only"] = (
    host_only if climate_match_type == "origins" else None
)
args["mask"] = mask
args["threshold_val"] = threshold_val

# Native countries

args["native_countries_list"] = native_iso_list

# Phytosanitary capacity scales

args["scaled_min"] = scaled_min
args["scaled_max"] = scaled_max

In [None]:
config_json_path = f"{out_dir}/config_{sim_name}.json"

# Start from the existing configuration file when there is one, so its other
# keys are kept; the values collected in args overwrite matching keys
merged_config = {}
if os.path.isfile(config_json_path):
    with open(config_json_path) as existing_file:
        merged_config = json.load(existing_file)
merged_config.update(args)

with open(config_json_path, mode="w") as config_file:
    config_file.write(json.dumps(merged_config, indent=4))

print("\tSaved ", config_json_path)

## Next: Evaluate case study and data