In [1]:
from datetime import datetime

# Record the current date and time when this cell runs, then print it
current_datetime = datetime.now()

print(current_datetime)

2024-11-22 12:46:09.819452


# Spatio Temporal Logistic
This notebook is a first test of a spatio-temporal logistic model that links the **built-up surface** of a given region to its quality of life (**GDP/cap**) and to its **population density**.

In [3]:
import numpy as np
from Region import Region
from Df import Df
import pandas as pd
import geopandas as gpd
import os

In [None]:
from concurrent.futures import ThreadPoolExecutor
import itertools
from colorama import Fore, Style# init
from termcolor import colored

In [5]:
# Number of worker threads given to the ThreadPoolExecutor instances used below
max_workers = 8

## PART 1 : Initialisation
Initialisation of the analysis parameters.

- **regions_names** : (*list of strings*) the countries to study, identified by their ISO3 codes
- **years** : (*string*) years to study
- **raster_str** : (*string*) name of the **GHSL** product to use (e.g. `Built_S`; other products: S_NRES, POP, ...)
- **lvl** : (*int*) the level of our administrative data (GDP and population)
- **subregion_borders** : (*string*) the path to administrative border shapefile to cut the subregions 
- **identifier** : (*string*) the column name used to match the region names between the administrative data and the GIS data

Each region is associated with a **DataFrame** (*oecd_DF_merged*) holding the administrative observables (GDP, population, etc.) of its subregions.

In [None]:
# Administrative level of the subregions: used to build the "GADM_<lvl>" folder
# names and the "GID_<lvl>" column names further down
lvl = 1

# GHSL type: used both as the GHSL sub-folder name and as the prefix of the
# gis names ("<raster_str>_<year>")
raster_str = "Built_S"

In [7]:
# Root folder of all the input / output data
data_folder = "/data/mineralogie/hautervo/data/"

# Output folders of the per-subregion CSV files, one per data source,
# all specific to the administrative level `lvl`
folder_GHSL_S = f"{data_folder}Outputs/GHSL/Built_S/GADM_{lvl}/"
folder_GHSL_POP = f"{data_folder}Outputs/GHSL/POP/GADM_{lvl}/"
folder_DOSE = f"{data_folder}Outputs/DOSE/GADM_{lvl}/"
folder_OSM_building = f"{data_folder}Outputs/OSM/building/GADM_{lvl}/"

In [8]:
# DOSE table (version 2.9 according to the file name), queried later by
# "GID_1" and "year" in get_DOSE_values
dose_df = pd.read_csv(data_folder+"DOSE/V2/DOSE_V2.9.csv")

In [9]:
# Administrative units used to cut the regions into subregions.
# The OECD units are kept below for reference; the GADM units are the ones in use.

# oecd_admin_units = data_folder + "OECD/admin_units/TL" + str(lvl) + "/OECD_TL" + str(lvl) + "_2020_ESRI54009.shp"
# gpd_oecd_admin_units = gpd.read_file(oecd_admin_units)

gadm_admin_units = f"{data_folder}GADM/ESRI_54009/GADM_{lvl}_ESRI54009.shp"
gpd_gadm_admin_units = gpd.read_file(gadm_admin_units)

In [10]:

# GADM identifier columns: one for the subregions (level `lvl`) and one for
# their parent regions (level `lvl - 1`)
subregion_col = f"GID_{lvl}"
parent_col = f"GID_{lvl - 1}"

In [11]:
# Countries to ignore from our study (not enough data), given as GID_0 codes.
# Previous, longer exclusion list kept for reference:
# country_to_pop = ["SRB", "CRI", "ISR", "CYP", "ISL", "ALB", "LIE","MNE","MKD"]
country_to_pop = ["ATA"]

In [21]:
# Every country (GID_0 code) present in the GADM layer, minus the ones listed in
# country_to_pop. Filtering by membership replaces the former pop()/bare-except
# loop, which silently swallowed any error.
regions_names = [
    gid_0 for gid_0 in gpd_gadm_admin_units["GID_0"].unique()
    if gid_0 not in country_to_pop
]

# Manual selections used while testing: the uncommented one overrides the full
# list computed above. Comment them all out to process every country.
# regions_names = ["FRA", "DEU", "GBR", "BEL", "ITA", "LUX", "ESP", "USA", "JPN", "CAN", "AUS"]
# regions_names = ["FRA", "DEU", "GBR", "BEL", "ITA", "LUX", "ESP", "JPN", "CAN", "AUS", "CHN", "IND", "IDN"]
# regions_names = ["FRA", "DEU", "GBR", "BEL", "ITA", "LUX", "ESP"]
# regions_names = ["FRA", "CHN"]
regions_names = ["USA"]

# Years to study (as strings: they are used to build file paths and gis names)
years = ["1975", "1990", "2000", "2010", "2020"] 
# years = ["2000", "2010", "2020"]
# years = ["2020"]

regions = []

for name in regions_names:
    # One Region per country, created at the parent level (lvl - 1)
    new_region = Region(name, lvl-1)
    # new_region.parent_name = name
    # new_region.subregions.append(new_region)
    regions.append(new_region)

    for y in years:
        # Attach the country raster of that year, named "<raster_str>_<year>"
        new_region.add_gis(data_folder + "GHSL/"+ raster_str + "/E" + y + "_100m_Global/subregions/" + name + ".tif", raster_str + "_" + y, str(y), lvl-1) 
        # new_region.add_gis(data_folder + "GHSL/Built_POP/E" + y + "_100m_Global/subregions/" + name + ".tif", "POP_" + y, str(y), lvl-1) 
        # new_region.add_gis(data_folder + "GHSL/Built_POP/E" + y + "_100m_Global/subregions/" + name + ".tif", "POP_" + y, str(y), lvl-1) 



## PART 2 : Computation of the observables
Now that we have defined all the parameters of our study, we cut the regions into their respective subregions (*make_subregions*). 

We then compute, for each subregion's GIS layers, the geographical observables, which we store in its oecd_DF_merged.

In [22]:
def get_GHSL_values(subregion, folder: str, type: str, overwrite):
    """Compute (or reload) the per-year GHSL observables of a subregion.

    The result is cached in
    ``<folder>/<subregion.parent_name>/<subregion.name>/<years joined by "_">.csv``.
    When that file exists and ``overwrite`` is falsy, it is read back instead of
    being recomputed. In both cases the table is wrapped in a ``Df`` tagged with
    ``type`` and appended to ``subregion.output_df_list``.

    Relies on the notebook-level ``years`` and ``raster_str`` variables.

    Parameters
    ----------
    subregion : object exposing ``name``, ``parent_name``, ``gis_list`` and
        ``output_df_list``.
    folder : root folder of the cached CSV files.
    type : ``"S"`` (built-up surface, read from the ``<raster_str>_<year>`` gis)
        or ``"POP"`` (population, read from the ``POP_<year>`` gis). Any other
        value only prints a message when there is no cached CSV to read.
    overwrite : recompute and rewrite the CSV even if it already exists.
    """
    csv_path = os.path.join(folder, subregion.parent_name, subregion.name, '_'.join(years))+".csv"

    # Use a precomputed csv when allowed
    if os.path.isfile(csv_path) and not overwrite:
        print("Reading ", csv_path)
        subregion.output_df_list.append(Df(pd.read_csv(csv_path), type))
        return

    if type == "S":
        gis_prefix = raster_str + "_"
        value_columns = ["Built up surface GHSL", "Total surface", "Built up surface fraction"]
    elif type == "POP":
        gis_prefix = "POP_"
        value_columns = ["Population"]
    else:
        # If we fall here, something wrong happened
        print(f"Type of GHSL data to compute : {type} not understood.")
        return

    # One row per year; values stay None for the years whose gis is missing
    output_df = pd.DataFrame({"year": years, **{column: None for column in value_columns}})

    for y in years:
        gis_name = gis_prefix + y
        gis = next((candidate for candidate in subregion.gis_list if candidate.name == gis_name), None)
        if gis is None:
            print(Fore.RED, f"{subregion.name} ", gis_name, " not found.", Style.RESET_ALL)
            continue

        year_row = output_df["year"] == y
        pixel_sum = int(gis.get_total_sum_pixel_values())

        if type == "S":
            # 1e4 per pixel: presumably the area in m² of a 100 m x 100 m pixel — TODO confirm
            total_surface = int(1e4*gis.get_total_number_pixels())
            output_df.loc[year_row, "Built up surface GHSL"] = pixel_sum
            output_df.loc[year_row, "Total surface"] = total_surface
            # Leave the fraction empty for a raster without any pixel instead of
            # raising ZeroDivisionError (which would also prevent saving the CSV)
            if total_surface > 0:
                output_df.loc[year_row, "Built up surface fraction"] = pixel_sum / total_surface
        else:
            output_df.loc[year_row, "Population"] = pixel_sum

    # save the new df
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    output_df.to_csv(csv_path, index=False)
    subregion.output_df_list.append(Df(output_df, type))
    print(colored(f"Saving {csv_path}", "green"))

In [23]:
def get_DOSE_values(subregion, folder: str, overwrite=False):
    """Extract (or reload) the per-year DOSE observables of a subregion.

    For every year of the notebook-level ``years`` list, the row of ``dose_df``
    whose ``GID_1`` equals ``subregion.name`` is looked up and the economic
    columns are copied into a table that is cached in
    ``<folder>/<subregion.parent_name>/<subregion.name>/<years joined by "_">.csv``.
    When that file exists and ``overwrite`` is falsy, it is read back instead.
    In both cases the table is wrapped in a ``Df`` and appended to
    ``subregion.output_df_list``.

    The ``Df`` is tagged ``"DOSE"``. The previous code passed the builtin
    ``type`` here, because this function has no ``type`` parameter.
    NOTE(review): confirm that ``"DOSE"`` is the label expected downstream.

    Parameters
    ----------
    subregion : object exposing ``name``, ``parent_name`` and ``output_df_list``.
    folder : root folder of the cached CSV files.
    overwrite : recompute and rewrite the CSV even if it already exists.
    """
    df_label = "DOSE"
    dose_columns = ["grp_pc_usd_2015", "pop", "ag_grp_pc_usd_2015", "man_grp_pc_usd_2015", "serv_grp_pc_usd_2015", "PPP"]
    csv_path = os.path.join(folder, subregion.parent_name, subregion.name, '_'.join(years))+".csv"

    # Use a precomputed csv when allowed
    if os.path.isfile(csv_path) and not overwrite:
        print("Reading ", csv_path)
        subregion.output_df_list.append(Df(pd.read_csv(csv_path), df_label))
        return

    # One row per year; values stay None when DOSE has no matching record
    output_df = pd.DataFrame({"year": years, **{column: None for column in dose_columns}})

    # Select the rows of this subregion once instead of once per column and year
    subregion_rows = dose_df.loc[dose_df["GID_1"] == subregion.name]

    for y in years:
        int_y = int(y)
        if y == "2020":
            # 2020 is replaced by 2018 — presumably the latest year available in DOSE; TODO confirm
            int_y = 2018
            print("Using 2018 values")

        year_rows = subregion_rows.loc[subregion_rows["year"] == int_y]
        if year_rows.empty:
            print(f"No DOSE record for {subregion.name} in {int_y}.")
            continue

        for column in dose_columns:
            if column not in year_rows.columns:
                print(f"Column {column} not found in the DOSE data.")
                continue
            # As before, the first matching record is used
            output_df.loc[output_df["year"]==y, column] = year_rows[column].values[0]

    # save the new df
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    output_df.to_csv(csv_path, index=False)
    subregion.output_df_list.append(Df(output_df, df_label))
    print(colored(f"Saving {csv_path}", "green"))


In [24]:
def get_OSM_areas(subregion, folder: str, overwrite=False):
    """Compute (or reload) the summed area of the OSM building geometries of a subregion.

    The single-row result (column ``OSM_building_area``) is cached in
    ``<folder>/<subregion.parent_name>/<subregion.name>/<subregion.name>.csv``.
    When that file exists and ``overwrite`` is falsy it is read back; otherwise
    the shapefile of the gis named ``OSM_building`` is loaded and the areas of
    its geometries are summed. On success the table is wrapped in a ``Df`` and
    appended to ``subregion.output_df_list``. Any error raised during the
    computation is printed and not re-raised.
    """
    gis_name = "OSM_building"
    csv_path = os.path.join(folder, subregion.parent_name, subregion.name, subregion.name + ".csv")

    # Cached result available and no recomputation requested
    if os.path.isfile(csv_path) and not overwrite:
        print("Reading ", csv_path)
        subregion.output_df_list.append(Df(pd.read_csv(csv_path), gis_name))
        return

    # Look for the OSM building layer among the gis attached to the subregion
    osm_layer = None
    for candidate in subregion.gis_list:
        if candidate.name == gis_name:
            osm_layer = candidate
            break

    if osm_layer is None:
        print(f"OSM shp for {subregion.name} not found.")
        return

    try:
        buildings = gpd.read_file(osm_layer.file)
        print("Starting area OSM ", osm_layer.file)
        total_area = buildings["geometry"].area.sum()
        area_df = pd.DataFrame({gis_name + "_area": total_area}, index=[0])

        # Write the cache file before registering the result
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        area_df.to_csv(csv_path, index=False)
        print(colored(f"Saving {csv_path}", "green"))
        subregion.output_df_list.append(Df(area_df, gis_name))
        print("Ending area OSM ", osm_layer.file)
    except Exception as e:
        print(e)
    

### Preprocess with the desired computational methods

In [25]:
def _run_on_subregions(func, subregions, *shared_args):
    """Call ``func(subregion, *shared_args)`` for every subregion in a thread pool.

    Every result is retrieved, so that an exception raised in a worker is
    reported (in red, with the subregion name) instead of being silently lost.
    A failure on one subregion does not prevent the others from being processed.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        submitted = [(subregion, executor.submit(func, subregion, *shared_args)) for subregion in subregions]
        for subregion, future in submitted:
            try:
                future.result()
            except Exception as e:
                print(Fore.RED + f"{func.__name__} failed for {subregion.name}: {e}" + Style.RESET_ALL)


def preprocess():
    """Cut every region into subregions, then compute the observables of each subregion.

    Step 1 calls ``make_subregions`` on every element of ``regions``; an error
    there is raised, since the next step depends on the subregions.
    Step 2 runs, for every subregion, the GHSL built-up surface, GHSL population
    and DOSE extractions, each in its own thread pool of ``max_workers`` threads.
    """
    # Step 1 : make subregions
    overwrite = False

    print(Fore.GREEN + "Starting make_subregions()" + Style.RESET_ALL)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        list(executor.map(lambda region: region.make_subregions(gpd_gadm_admin_units, subregion_col, parent_col, overwrite=overwrite), regions))

    # Step 2.1 : Computation
    overwrite = False

    # Flat list of all the subregions, so that they can be processed in parallel
    subregions_list_parallel = [subregion for region in regions for subregion in region.subregions]

    # Disabled: attach the OSM building shapefile to each subregion
    # for subregion in subregions_list_parallel:
    #     try:
    #         file = os.path.join(folder_OSM_building, subregion.parent_name, subregion.name, subregion.name +".shp")
    #         if os.path.isfile(file):
    #             subregion.add_gis(file, "OSM_building", "", "")
    #         else:
    #             print(f"File {file} not found.")
    #     except Exception as e:
    #         print(e)

    print(Fore.GREEN + "Starting GHSL_S" + Style.RESET_ALL)
    _run_on_subregions(get_GHSL_values, subregions_list_parallel, folder_GHSL_S, "S", overwrite)

    print(Fore.GREEN + "Starting GHSL_POP" + Style.RESET_ALL)
    _run_on_subregions(get_GHSL_values, subregions_list_parallel, folder_GHSL_POP, "POP", overwrite)

    print(Fore.GREEN + "Starting DOSE" + Style.RESET_ALL)
    _run_on_subregions(get_DOSE_values, subregions_list_parallel, folder_DOSE, overwrite)

    # Disabled: OSM building area computation
    # print(Fore.GREEN + "Starting OSM_area_computation()" + Style.RESET_ALL)
    # _run_on_subregions(get_OSM_areas, subregions_list_parallel, folder_OSM_building, overwrite)

In [26]:
# Run the whole pipeline: subregion cutting, then the GHSL and DOSE extractions
preprocess()   

[32mStarting make_subregions()[0m
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E1975_100m_Global/subregions/USA/subregions/USA.2_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E1990_100m_Global/subregions/USA/subregions/USA.2_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E2000_100m_Global/subregions/USA/subregions/USA.2_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E2010_100m_Global/subregions/USA/subregions/USA.2_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E2020_100m_Global/subregions/USA/subregions/USA.2_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E1975_100m_Global/subregions/USA/subregions/USA.3_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E1990_100m_Global/subregions/USA/subregions/USA.3_1.tif
Saved a new tif:
 /data/mineralogie/hautervo/data/GHSL/Built_S/E2000_100m_Global/subregions/USA/subregions/USA.3_1.tif
Saved a new 

In [27]:
from datetime import datetime

# Get the current date and time, to be printed after the completion message
current_datetime = datetime.now()
print(Fore.GREEN, "Ended normally.", Style.RESET_ALL)
print(current_datetime)

[32m Ended normally. [0m
2024-11-22 15:26:25.072562


# TEST ZONE

In [28]:
# pop_value = dose_df.loc[(dose_df["GID_1"] == "ALB.1_1") & (dose_df["year"] == 2014), "pop"].values[0]
