The objective of this notebook is to join the input data of multiple countries into a single dataset.

In [1]:
import os
import re

import h3.api.numpy_int as h3
import pandas as pd
from tqdm import tqdm

import stc_unicef_cpi.utils.constants as c
import stc_unicef_cpi.utils.clean_text as ct

# Register tqdm's pandas integration so `progress_apply` (used further down
# for the per-hexagon computations) is available and shows a progress bar.
tqdm.pandas()




# Join countries 

In [None]:
# Directory holding the per-country processed hexagon CSVs; the joined file is
# written to the same directory at the end of the notebook.
# The default is the original author's local path. Set the
# STC_UNICEF_CPI_DATA_DIR environment variable to point somewhere else without
# editing the notebook. Kept as a plain string because later cells build file
# names with `read_path + '/...'`.
read_path = os.environ.get(
    'STC_UNICEF_CPI_DATA_DIR',
    '/mnt/c/Users/vicin/Desktop/DSSG/Project/stc_continuing/data/processed',
)

In [None]:
# Column dtypes enforced when reading each country CSV.
dtypes = {
    'hex_code': int,
    'name_commuting': str,
    'geometry': str,  # original author's note: not sure how geometry should be stored
    'built': str,
}

# Source column name -> harmonised column name used in the joined table.
dic_rename = {
    'gdp_ppp_1990': 'GDP_PPP_1990',
    'gdp_ppp_2000': 'GDP_PPP_2000',
    'gdp_ppp_2015': 'GDP_PPP_2015',
    'NDVI_mean': 'ndvi',
    'NDWI_mean': 'ndwi',
    'precipitation_mean': 'precimean',
    'precipitation_stdDev': 'precistd',
    'pr_mean': 'precipiacc',
    'aet_mean_x': 'evapotrans',
}


In [None]:
# Load every country's hexagon table, harmonise it, and stack all of them
# into the single `joined` DataFrame used by the rest of the notebook.
country_frames = []
for country in c.countries_ssf:
    # Get country code
    country_code = ct.get_alpha3_code(country)
    print(country_code)

    # Load the input data for that country
    data = pd.read_csv(
        read_path + f'/hexes_{country_code.upper()}_res7_thres30_all.csv',
        dtype=dtypes,
    )

    # Rename columns to the harmonised names
    data = data.rename(columns=dic_rename)
    # Add country and country code column
    data['country_code'] = country_code
    data['country'] = country

    country_frames.append(data)

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# previously loaded rows on every iteration. The per-file row index is kept
# as-is, exactly as the incremental concat did.
# With no countries configured, fall back to an empty frame as before.
joined = pd.concat(country_frames) if country_frames else pd.DataFrame()

# Release the per-country frames so only `joined` stays in memory.
del country_frames


# Compute features
Add child population, hexagon geometry, global human settlement indicators, etc.

In [None]:
# Child population per hexagon: the male/female counts of the 0, 1, 5 and 10
# columns are summed in full, and the 15 columns are added with a 0.6 weight.
# NOTE(review): the 0.6 weight presumably keeps only the part of the "15" age
# band that still counts as children — confirm against the population data spec.
full_weight_cols = ['M_0', 'M_1', 'M_5', 'M_10', 'F_0', 'F_1', 'F_5', 'F_10']
partial_weight_cols = ['M_15', 'F_15']

younger_total = joined[full_weight_cols].sum(axis=1)
older_total = joined[partial_weight_cols].sum(axis=1)
joined['child_pop'] = younger_total + 0.6 * older_total

In [None]:
# Keep only hexagons whose estimated child population is strictly positive.
# `.copy()` detaches the result from the unfiltered frame so the column
# assignments in later cells do not act on a view.
has_children = joined['child_pop'] > 0
joined = joined.loc[has_children].copy()
print(joined.shape)

In [None]:
# Columns derived from the integer H3 index of each hexagon.
hex_ids = joined['hex_code']

# hex centroid
joined['hex_centroid'] = hex_ids.progress_apply(h3.h3_to_geo)
# string form of the H3 index
joined['hex_code_str'] = hex_ids.progress_apply(h3.h3_to_string)
# hexagon boundary requested in GeoJSON form; this overwrites the `geometry`
# column that was read from the CSVs
joined['geometry'] = hex_ids.progress_apply(
    lambda hex_id: h3.h3_to_geo_boundary(hex_id, geo_json=True)
)


In [None]:
# Global human settlement indicators.
# Each indicator is 1 when its class digit occurs anywhere in the string form
# of `built`, and 0 otherwise.
built_class_flags = {
    'water_surface': '1',      # water surface
    'no_built': '2',           # land, no built-up in any epoch
    'build_2000_2014': '3',    # built-up from 2000 to 2014 epochs
    'build_1990_2000': '4',    # built-up from 1990 to 2000 epochs
    'build_1975_1990': '5',    # built-up from 1975 to 1990 epochs
    'build_prior_1975': '6',   # built-up up to 1975 epoch
}

for flag_column, class_digit in built_class_flags.items():
    # `digit=class_digit` binds the current digit to this iteration's lambda.
    joined[flag_column] = joined['built'].apply(
        lambda value, digit=class_digit: int(bool(re.search(digit, str(value))))
    )

In [None]:
# Remove columns that are not wanted in the joined output; `built` has already
# been expanded into the indicator columns above.
# 'NR' and 'CDMA' are currently NOT dropped.
unused_columns = ['copland', 'built', 'aet_mean_y', 'lat', 'long']
joined = joined.drop(columns=unused_columns)

In [None]:
# Sanity check on the size of the final joined table.
print(joined.shape)
# Output recorded by the original author: (4493068, 154)

In [None]:
# Write the joined table next to the input files, without the row index.
output_file = read_path + '/20221021_hexes_ssf.csv'
joined.to_csv(output_file, index=False)