Copyright (c) 2022, salesforce.com, inc and MILA.  
All rights reserved.  
SPDX-License-Identifier: BSD-3-Clause  
For full license text, see the LICENSE file in the repo root  
or https://opensource.org/licenses/BSD-3-Clause  

Copyright (c) 2022, salesforce.com, inc and MILA.  
All rights reserved.  
SPDX-License-Identifier: BSD-3-Clause  
For full license text, see the LICENSE file in the repo root  
or https://opensource.org/licenses/BSD-3-Clause  

# This is the notebook to create the datasets and yaml file
Dependency: wbgapi, pandas, numpy

In [1]:
# pip install wbgapi
import wbgapi as wb
import numpy as np
import pandas as pd
from opt_helper import *
import warnings
warnings.filterwarnings('ignore')

# Data Loading

In [2]:
# Get convergence population predicted by United Nation
lasdf = pd.read_csv("csv_asset/UN-pop-pred.csv")
lasdf = lasdf[lasdf["Year"]==2100].reset_index(drop=True)

# Get the country classes information from the world bank classification. Class 0 includes countries all over the world. Class 1 to 20 are divided by region X income level.
cc = pd.read_csv("csv_asset/CountryClass_3.csv")
countryclass = {i:list(cc[cc["RIG"]==i]["Code"]) for i in range(1,4)}
countryclass[0] = sum([countryclass[i] for i in range(1,4)] ,[])

# Get the env protection proportion contribution from the IMF data
envdf = pd.read_csv("csv_asset/Environmental_Protection_Expenditures_Geo_Avg_Recent_Years_Sum.csv")

# Include the env protection expenditure for the countries
import csv
env_pay = {}
with open("csv_asset/Environmental_Protection_Expenditures_Geo_Avg_Recent_Years_Sum.csv", 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] != "ISO3":
            env_pay[row[0]] = float(row[1])/100

# include tax rate
taxdf = pd.read_csv("csv_asset/tax_rate.csv")

# The series (token) what we are interested.
# the list of series that we want to query
series_list = ["NY.GDP.MKTP.CD","CM.MKT.LCAP.CD", "SP.POP.TOTL","EN.ATM.CO2E.KT", "NE.CON.TOTL.ZS"]
df = wb.data.DataFrame(series_list, time=range(1960, 2022, 1), labels=True)
df = df.reset_index()



# Preprocess the data
Determine which years are interested and what countries should be excluded (because of lacking in data)

In [3]:
# retrive country codes stuff and make sure the data are float rather than str
economy_list = list(df["economy"])[:len(set(df["economy"]))]
country_list = list(df["Country"])[:len(set(df["economy"]))]
economy_region_list = []
for x in economy_list:
    if x=="WLD": break
    else: economy_region_list.append(x)

In [4]:
# add to the exec_code list because no GDP data available
noY=[]
for x in economy_region_list:
    tmp = []
    for y in range(2003, 2021):
        if pd.isnull(get_data_list(df, x, "Y")[1]["YR"+str(y)]):
            tmp.append(0)
        else:
            tmp.append(1)
    if sum(tmp)!=len(range(2003, 2021)):
        noY.append(x)

In [5]:
noY

['YEM',
 'VEN',
 'MAF',
 'SSD',
 'SOM',
 'SXM',
 'NRU',
 'XKX',
 'PRK',
 'GIB',
 'ERI',
 'CUW',
 'CHI',
 'CYM',
 'VGB']

In [6]:
# exc_code = noY
exc_code = ['YEM','VIR','VEN', 'TKM','SYR', 'MAF', 'SSD', 'SOM', 'SXM', 'SMR', 'MNP', 'NCL', 'NRU', 'LIE', 'XKX', 'PRK', 'IMN', 
             'GRL', 'GIB', 'PYF', 'FRO', 'ERI', 'CUW', 'CHI', 'CYM', 'VGB', 'ABW', 'AND', "TWN"] # exclude codes becasue we don't even have GDP data ever
dict_country = {}
for i in range(len(economy_list)):
    dict_country[economy_list[i]]=country_list[i]
years = list(df.columns)[4:]
df[years] = df[years].apply(pd.to_numeric)

Introducing the carbon intensity

In [7]:
# include the carbon intensity
for i in range(len(country_list)):
    s = df[df["economy"]==economy_list[i]][df["series"]=="EN.ATM.CO2E.KT"][years].reset_index(drop=True)*1_000_000/df[df["economy"]==economy_list[i]][df["series"]=="NY.GDP.MKTP.CD"][years].reset_index(drop=True)
    s.at[0,"economy"] = economy_list[i]
    s.at[0,"Country"] = country_list[i]
    s.at[0,"series"] = "EN.ATM.CO2E.KD.CD"
    s.at[0, "Series"] = "sigma"
    df = df.append(s)
    df = df.reset_index(drop=True)

In [8]:
# check those regions who does not have sigma data
count = 0
noco2 = {}
hasco2 = {}
for i in range(len(country_list)):
    if df[df["series"]=="EN.ATM.CO2E.KD.CD"][df["economy"]==economy_list[i]]["YR2018"].isnull().values[0]:
        noco2[economy_list[i]]=country_list[i]
        continue
    else:
        hasco2[economy_list[i]]=country_list[i]
        count+=1

In [9]:
# get the country list without co2 data
noco2_region_dict = {}
for k,v in noco2.items():
    if k=="WLD": break
    else: noco2_region_dict[k]=v

In [10]:
# since there is no co2 data for the key countries, 
# we manually find a replacement data for them from other countires
borrowdict = {'PSE': 'EGY',
 'VIR': 'USA',
 'VEN': 'MEX',
 'TCA': 'GBR',
 'MAF': 'FRA',
 'SSD': 'EGY',
 'SXM': 'NLD',
 'SMR': 'ITA',
 'PRI': 'USA',
 'MNP': 'USA',
 'NCL': 'FRA',
 'MCO': 'FRA',
 'MAC': 'CHN',
 'XKX': 'TUR',
 'PRK': "RUS",
 'IMN': 'GBR',
 'HKG': 'CHN',
 'GUM': 'USA',
 'GRL': 'DNK',
 'GIB': 'GBR',
 'PYF': 'FRA',
 'FRO': 'DNK',
 'ERI': 'EGY',
 'CUW': 'NLD',
 'CHI': 'GBR',
 'CYM': 'GBR',
 'VGB': 'GBR',
 'BMU': 'CAN',
 'ABW': 'MEX',
 'ASM': 'USA',}

In [11]:
# find the country code to index mapping for noco2data available countries
code2idx = {}
for k in noco2_region_dict.keys():
    code2idx[k] = df[df["series"]=="EN.ATM.CO2E.KD.CD"][df["economy"]==k].index.values[0]

In [12]:
# update the co2 data for noco2 recorded countries by borrowdict
for k in noco2_region_dict.keys(): # borrowdict
    try:
        df.loc[code2idx[k],years] = df[df["series"]=="EN.ATM.CO2E.KD.CD"][df["economy"]==borrowdict[k]][years].iloc[0]
    except:
        print(k)

INX


Fillna for countries without capital data

In [13]:
# Prepare the (Y,L) data and predict K data by KNN
def get_Y_K_L_pairs(codes, year=None, exc_code=[]):
    if year is None:
        year = "YR2020"
    else:
        year = "YR"+str(year)
    train_data = []
    train_label = []
    train_codes = []
    test_data = []
    test_codes = []
    codes_ = []
    for code in codes:
        if code in exc_code:
            continue
        else:
            label = get_data_list(df, code, "K")[1][year]
            if pd.isnull(label):
                test_data.append([get_data_list(df, code, "Y")[1][year], get_data_list(df, code, "L")[1][year]])
                test_codes.append(code)
            else:
                train_data.append([get_data_list(df, code, "Y")[1][year], get_data_list(df, code, "L")[1][year]])
                train_label.append(label)
                train_codes.append(code)
    return np.array(train_data), np.array(train_label), np.array(test_data), dict(zip(train_codes, train_data)), dict(zip(test_codes, test_data))

In [14]:
# use KNN to predict those who has Y, L data but don't have K data
for y in range(2003, 2021):
    train_data, train_label, test_data, train_dict, test_dict = get_Y_K_L_pairs(economy_region_list, y, exc_code)
    from sklearn.neighbors import KNeighborsRegressor
    neigh = KNeighborsRegressor(n_neighbors=5)
    neigh.fit(train_data, train_label)
    test_K = neigh.predict(test_data)
    test_region = list(test_dict.keys())
    for i in range(len(test_region)):
        idx = df[df["economy"]==test_region[i]][df["series"]=="CM.MKT.LCAP.CD"]["YR2003"].index.values[0]
        df.loc[idx,"YR"+str(y)]=test_K[i]

Use KNN to fill consumption dataï¼Œ prepare the (Y,L) data and predict C data by KNN

In [15]:
def get_Y_C_L_pairs(codes, year=None, exc_code=[]):
    if year is None:
        year = "YR2020"
    else:
        year = "YR"+str(year)
    train_data = []
    train_label = []
    train_codes = []
    test_data = []
    test_codes = []
    codes_ = []
    for code in codes:
        if code in exc_code:
            continue
        else:
            label = get_data_list(df, code, "C")[1][year]
            if pd.isnull(label):
                test_data.append([get_data_list(df, code, "Y")[1][year], get_data_list(df, code, "L")[1][year]])
                test_codes.append(code)
            else:
                train_data.append([get_data_list(df, code, "Y")[1][year], get_data_list(df, code, "L")[1][year]])
                train_label.append(label)
                train_codes.append(code)
    return np.array(train_data), np.array(train_label), np.array(test_data), dict(zip(train_codes, train_data)), dict(zip(test_codes, test_data))

In [16]:
# use KNN to predict those who has Y, L data but don't have K data
for y in range(2003, 2021):
    train_data, train_label, test_data, train_dict, test_dict = get_Y_C_L_pairs(economy_region_list, y, exc_code)
    from sklearn.neighbors import KNeighborsRegressor
    neigh = KNeighborsRegressor(n_neighbors=5)
    neigh.fit(train_data, train_label)
    test_C = neigh.predict(test_data)
    test_region = list(test_dict.keys())
    for i in range(len(test_region)):
        idx = df[df["economy"]==test_region[i]][df["series"]=="NE.CON.TOTL.ZS"]["YR2003"].index.values[0]
        df.loc[idx,"YR"+str(y)]=test_C[i]

In [17]:
for code in economy_region_list:
    if code in exc_code:
        continue
    else:
        if len(get_data_list(df, code, "C")[0]) == 0:
            print(code)

introduce the tech factor (ATFP) based on Y,K,L

In [18]:
for i in range(len(economy_list)):
    if economy_list[i] in exc_code: continue
    else:
        s = df[df["economy"]==economy_list[i]][df["series"]=="NY.GDP.MKTP.CD"][years].reset_index(drop=True)/(1000_000_000_000*((df[df["economy"]==economy_list[i]][df["series"]=="CM.MKT.LCAP.CD"][years].reset_index(drop=True)/1000_000_000_000)**0.3*(df[df["economy"]==economy_list[i]][df["series"]=="SP.POP.TOTL"][years].reset_index(drop=True)/1000_000_000)**0.7))
        s.at[0,"economy"] = economy_list[i]
        s.at[0,"Country"] = country_list[i]
        s.at[0,"series"] = "ATFP"
        s.at[0, "Series"] = "A"
        df = df.append(s)
        df = df.reset_index(drop=True)

Gather the results of the time series accross different regions

In [19]:
raw_results = {}
for x in economy_region_list:
    if x in exc_code:
        continue
    else:
        raw_results[x]={}
        raw_results[x]["TS_Y"] = get_data_list(df, x, "Y")
        raw_results[x]["TS_A"] = get_data_list(df, x, "A")
        raw_results[x]["TS_K"] = get_data_list(df, x, "K")
        raw_results[x]["TS_L"] = get_data_list(df, x, "L")
        raw_results[x]["TS_sigma"] = get_data_list(df, x, "sigma")
        raw_results[x]["TS_C"] = get_data_list(df, x, "C")
        raw_results[x]["La"] = list(lasdf[lasdf["Code"]==x]["Population (future projections)"])[0]
        raw_results[x]["mitigation"] = get_env_data_list(envdf, x)
        raw_results[x]["saving"] = 1 - int(raw_results[x]["TS_C"][0][-1])/100
        try:
            raw_results[x]["tax"] = get_tax_data_list(taxdf, x)
        except:
            raw_results[x]["tax"] = 0

In [20]:
raw_results["CHN"]["TS_Y"][2]

{'YR1960': 59716249310.9742,
 'YR1961': 50056685957.359,
 'YR1962': 47209186415.3555,
 'YR1963': 50706614526.1472,
 'YR1964': 59708125203.8643,
 'YR1965': 70436008642.4251,
 'YR1966': 76720005491.8964,
 'YR1967': 72881364882.4909,
 'YR1968': 70846276051.4727,
 'YR1969': 79705614854.7674,
 'YR1970': 92602634891.6589,
 'YR1971': 99800593790.9886,
 'YR1972': 113689308020.343,
 'YR1973': 138543170458.064,
 'YR1974': 144188970821.072,
 'YR1975': 163429530659.638,
 'YR1976': 153939265947.775,
 'YR1977': 174935933078.663,
 'YR1978': 218502169137.562,
 'YR1979': 263711728825.005,
 'YR1980': 306165314855.846,
 'YR1981': 289576581830.449,
 'YR1982': 283928672988.111,
 'YR1983': 304748904221.289,
 'YR1984': 313728547706.897,
 'YR1985': 309835803013.587,
 'YR1986': 300514204520.969,
 'YR1987': 327089403146.073,
 'YR1988': 407844670393.058,
 'YR1989': 456289122063.159,
 'YR1990': 394565747349.055,
 'YR1991': 413375445354.473,
 'YR1992': 493136961883.002,
 'YR1993': 619111946511.628,
 'YR1994': 5643

In [23]:
import json
for k in raw_results:
    with open(f"/home/work/climate-cooperation-competition/other_yamls/back_up_all/{k}.yaml", 'w') as f:
        json.dump(raw_results[k], f)

FileNotFoundError: [Errno 2] No such file or directory: '/home/work/climate-cooperation-competition/other_yamls/back_up_all/ZWE.yaml'

Merge regions from data from small regions

Align the time series length of A, K and L

An output template

In [21]:
default = {"_RICE_CONSTANT":
  {"xgamma": 0.3, # in CAP Eq 5 the capital elasticty

  # A rice data
  "xA_0": 0,
  "xg_A": 0,
  "xdelta_A": 0.0214976314392836,
  # L
  "xL_0": 1397.715000, # in POP population at the staring point
  "xL_a": 1297.666000, # in POP the expected population at convergence
  "xl_g": 0.04047275402855734, # in POP control the rate to converge
  # K
  "xK_0": 93.338152,
  "xa_1": 0,
  "xa_2": 0.00236 ,
  "xa_3": 2,

  # xsigma_0: 0.5201338309755572
  "xsigma_0": 0.215}}

Get the parameters of the dynamics from the timeseries gather the above

In [22]:

para_result = {k:{} for k in countryclass.keys()}
for i in countryclass.keys():
    print(countryclass[i])
    a = merge_region_dict(packup_regions(raw_results, i, countryclass, exc_code, raw_results))
    para_result[i]["xl_g"] = get_pop_lg(a["Ls"], a["Las"])
    para_result[i]["xL_a"] = a["Las"]/1_000_000
    para_result[i]["xL_0"] = a["Ls"][-1]/1_000_000
    para_result[i]["xg_A"],para_result[i]["xdelta_A"] = get_gA_deltaA(a["As"])
    para_result[i]["xA_0"] = a["As"][-1]
    para_result[i]["xK_0"] = a["Ks"][-1]/1_000_000_000_000
    para_result[i]["xsigma_0"] = a["sigmas"]/(1-0.05)
    para_result[i]["xtax"] = a["taxs"]
    para_result[i]["xmitigation_0"] = a["mitigations"]
    para_result[i]["xsaving_0"] = a["savings"]

['BWA', 'GAB', 'GNQ', 'MUS', 'NAM', 'ZAF', 'AGO', 'BEN', 'CIV', 'CMR', 'COG', 'COM', 'CPV', 'GHA', 'KEN', 'LSO', 'MRT', 'NGA', 'SEN', 'STP', 'SWZ', 'TZA', 'ZMB', 'ZWE', 'BDI', 'BFA', 'CAF', 'COD', 'ERI', 'ETH', 'GIN', 'GMB', 'GNB', 'LBR', 'MDG', 'MLI', 'MOZ', 'MWI', 'NER', 'RWA', 'SDN', 'SLE', 'SOM', 'SSD', 'TCD', 'TGO', 'UGA', 'SYC', 'MDV', 'BGD', 'BTN', 'IND', 'LKA', 'NPL', 'PAK', 'AFG', 'IRQ', 'JOR', 'LBN', 'LBY', 'DJI', 'DZA', 'EGY', 'IRN', 'MAR', 'PSE', 'TUN', 'SYR', 'YEM', 'ARE', 'BHR', 'ISR', 'KWT', 'MLT', 'OMN', 'QAT', 'SAU', 'ASM', 'CHN', 'FJI', 'MHL', 'MYS', 'THA', 'TON', 'TUV', 'FSM', 'IDN', 'KHM', 'KIR', 'LAO', 'MMR', 'MNG', 'PHL', 'PNG', 'SLB', 'TLS', 'VNM', 'VUT', 'WSM', 'AUS', 'BRN', 'GUM', 'HKG', 'JPN', 'KOR', 'MAC', 'MNP', 'NCL', 'NRU', 'NZL', 'PLW', 'PYF', 'SGP', 'TWN']
Optimization terminated successfully.
         Current function value: 0.218649
         Iterations: 91
         Function evaluations: 166
['ALB', 'ARM', 'AZE', 'BGR', 'BIH', 'BLR', 'GEO', 'KAZ', 'MDA'

Add the "rest of the world" region and used the worldwide parameters for it

In [23]:
# Calculate the number of population which is not covered
popworldwide = list(df[df["economy"]=="WLD"][df["series"]=="SP.POP.TOTL"]["YR2020"])[0]
popcovvered = sum([raw_results[x]["TS_L"][0][-1] for x in economy_region_list if x not in exc_code])
popnotcovered = popworldwide - popcovvered
# calculate the estimate worldwide convergence population
covered_convergence_pop = sum([raw_results[x]["La"] for x in economy_region_list if x not in exc_code])
not_covvered_convergence_pop = popnotcovered*covered_convergence_pop/popcovvered

In [24]:
# check the worldwide properties
i=0
wld = {}
a = merge_region_dict(packup_regions(raw_results, i, countryclass, exc_code, raw_results))
wld["xl_g"] = get_pop_lg(a["Ls"], a["Las"])
wld["xL_a"] = a["Las"]/1_000_000
wld["xL_0"] = a["Ls"][-1]/1_000_000
wld["xg_A"],wld["xdelta_A"] = get_gA_deltaA(a["As"])
wld["xA_0"] = a["As"][-1]
wld["xK_0"] = a["Ks"][-1]/1_000_000_000_000
wld["xsigma_0"] = a["sigmas"]/(1-0.05)
wld["xtax"] = a["taxs"]
wld["xmitigation_0"] = a["mitigations"]
wld["xsaving_0"] = a["savings"]

Optimization terminated successfully.
         Current function value: 0.158644
         Iterations: 87
         Function evaluations: 164


In [25]:
# get the rest of the world params
rest_region = wld.copy()
rest_region["xL_a"] = not_covvered_convergence_pop/1_000_000
rest_region["xL_0"] = popnotcovered/1_000_000
rest_region["xK_0"] = popnotcovered*wld["xK_0"]/popcovvered
para_result[4] = rest_region

In [26]:
write_yaml_files(para_result, "/home/work/climate-cooperation-competition/other_yamls/3_regions")

In [27]:
import json
import yaml

# Load the JSON file
with open("csv_asset/3_import_2016.json", "r") as json_file:
    import_data_dict = json.load(json_file)
with open("csv_asset/3_export_2016.json", "r") as json_file:
    export_data_dict = json.load(json_file)


# Function to update a single YAML file with new data
def update_yaml_file(file_path, data_im, data_ex):
    try:
        with open(file_path, "r") as yaml_file:
            current_data = yaml.safe_load(yaml_file) or {}
    except FileNotFoundError:
        current_data = {}

    current_data["_RICE_CONSTANT"]["ximport"] = data_im
    current_data["_RICE_CONSTANT"]["xexport"] = data_ex
    with open(file_path, "w") as yaml_file:
        yaml.dump(current_data, yaml_file, sort_keys=False)
    return True


In [28]:
# update the yaml files with import and export data
[
    update_yaml_file(
        f"/home/work/climate-cooperation-competition/other_yamls/3_regions/{i}.yml",
        import_data_dict[str(i)],
        export_data_dict[str(i)],
    )
    for i in range(1, 4)
]

[True, True, True]