"¿Cómo influyen los indicadores económicos y demográficos en la tasa de crecimiento de la población a nivel global?"

In [97]:
# Importación de librerías necesarias
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination, BeliefPropagation 
from pgmpy.independencies.Independencies import IndependenceAssertion
from pgmpy.estimators import BayesianEstimator, MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

from pandas import read_csv, DataFrame
import numpy as np
import time

In [98]:


# Node-name constants for the Bayesian network.
# The trailing codes are presumably World Bank indicator identifiers -- TODO confirm against the data source.
LV_EDU = "Nivel educativo"                                  # (no indicator code recorded)
LV_URB = "Nivel urbanización"                               # SP.URB.TOTL.IN.ZS
RT_PARO = "Tasa de paro"                                    # SL.UEM.TOTL.ZS
PNB = "Ingreso nacional bruto"                              # NY.GNP.PCAP.CD
PIB = "Producto Interior Bruto"            #GDP             # NY.GDP.PCAP.KD.ZG
GASTO_EDUCATIVO = "Gasto educativo"                         # SE.XPD.TOTL.GD.ZS
ACCESO_SALUD = "Acceso a salud"                             # SH.XPD.CHEX.PC.CD
RT_FERTILIDAD = "Tasa de fertilidad"                        # SP.DYN.TFRT.IN
RT_MORTALIDAD = "Tasa de mortalidad"                        # (no indicator code recorded)
MIGRACION_NETA = "Migracion neta"                           # SM.POP.NETM
RT_CRECIMIENTO = "Tasa de crecimiento de la población"      # SP.POP.GROW

# Create the (initially empty) Bayesian network object.
model = BayesianNetwork()

# Register every node.
# NOTE: the order of this list matters -- later cells assign `nodes` positionally
# as the column names of the data frames, so it must match the row order of the CSV.
nodes = [
    # Currently excluded from the network: LV_EDU, MIGRACION_NETA
    #LV_EDU, MIGRACION_NETA,
    RT_CRECIMIENTO, PIB, RT_PARO, GASTO_EDUCATIVO, ACCESO_SALUD, PNB, LV_URB, RT_FERTILIDAD, RT_MORTALIDAD]

model.add_nodes_from(nodes)

# Build the directed edges as (source, target) pairs, grouped by target node.
edges_RT_PARO = [
    # (LV_EDU, RT_PARO), 
    (LV_URB, RT_PARO)]
edges_PIB = [(RT_PARO, PIB), (PNB, PIB)]
edges_RT_FERTILIDAD = [(GASTO_EDUCATIVO, RT_FERTILIDAD), (ACCESO_SALUD, RT_FERTILIDAD)]
edges_RT_CRECIMIENTO = [
    # NOTE(review): the disabled edge below is a self-loop (MIGRACION_NETA -> MIGRACION_NETA);
    # if migration is ever re-enabled the target should presumably be RT_CRECIMIENTO.
    # (MIGRACION_NETA, MIGRACION_NETA),
    (RT_FERTILIDAD, RT_CRECIMIENTO), 
    (RT_MORTALIDAD, RT_CRECIMIENTO)
    ]

# Attach the edges to the model; PIB -> GASTO_EDUCATIVO links the economic
# chain to the parents of the fertility rate.
model.add_edges_from(edges_RT_PARO)
model.add_edges_from(edges_PIB)
model.add_edge(PIB, GASTO_EDUCATIVO)
model.add_edges_from(edges_RT_FERTILIDAD)
model.add_edges_from(edges_RT_CRECIMIENTO)


In [99]:
# First and last year covered by the dataset; a later cell uses these bounds
# to build the integer row index of the discretized frame.
years = {"min" : 2014, "max" : 2022}
# Load the semicolon-delimited indicator table (path is relative to the notebook's working directory).
df_raw = read_csv("./csv/data.csv", delimiter=';')
        
# Wrap a full-row slice of the raw table in a separate DataFrame object used by the following cells.
df_raw_growth = DataFrame(df_raw[:])
# One row per indicator, so the row count is the number of indicators.
print("There are " + str(df_raw_growth.shape[0]) + " indicators in the dataframe.")
df_raw_growth.head()

There are 9 indicators in the dataframe.


Unnamed: 0,Indicator Name,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Population growth,-0.3,-0.1,0.1,0.2,0.4,0.7,0.5,0.1,0.8
1,GDP per capita growth,1.7,3.9,3.0,2.7,1.8,1.3,-11.6,6.3,5.0
2,Unemployment,24.4,22.1,19.6,17.2,15.3,14.1,15.5,14.8,12.9
3,Public spending on education,4.3,4.3,4.2,4.2,4.2,4.2,4.9,4.6,4.5
4,Current health spending per capita,2679.5,2349.1,2376.6,2524.7,2741.4,2716.8,2899.0,3234.3,3143.4


In [100]:
# Transpose so that each row is a year and each column an indicator, then drop the
# first transposed row (presumably the "Indicator Name" labels -- TODO confirm against the CSV).
df_growth = df_raw_growth.transpose().iloc[1:]
# Rename the columns positionally with the network node names.
# NOTE(review): this relies on the CSV rows being in exactly the same order as `nodes`,
# and on there being nine of them; nothing here verifies that.
df_growth.columns = nodes[0:9]
df_growth

Unnamed: 0,Tasa de crecimiento de la población,Producto Interior Bruto,Tasa de paro,Gasto educativo,Acceso a salud,Ingreso nacional bruto,Nivel urbanización,Tasa de fertilidad,Tasa de mortalidad
2014,-0.3,1.7,24.4,4.3,2679.5,29160.0,79.4,1.3,0.85
2015,-0.1,3.9,22.1,4.3,2349.1,28460.0,79.6,1.3,0.91
2016,0.1,3.0,19.6,4.2,2376.6,27570.0,79.8,1.3,0.88
2017,0.2,2.7,17.2,4.2,2524.7,27120.0,80.1,1.3,0.91
2018,0.4,1.8,15.3,4.2,2741.4,29330.0,80.3,1.3,0.91
2019,0.7,1.3,14.1,4.2,2716.8,30360.0,80.6,1.2,0.89
2020,0.5,-11.6,15.5,4.9,2899.0,27180.0,80.8,1.2,1.04
2021,0.1,6.3,14.8,4.6,3234.3,30090.0,81.1,1.2,0.95
2022,0.8,5.0,12.9,4.5,3143.4,32090.0,81.3,1.2,0.98


In [101]:
# Number of groups the sorted values of each indicator are split into
# (passed to np.array_split when the tier boundaries are computed).
TIERS_NUM = 3

def boundary_str(start, end, tier):
    """Build the text label of a tier, e.g. ``'A: -0.30 to +0.10'``.

    Args:
        start: Lower bound of the tier.
        end: Upper bound of the tier.
        tier: Tier identifier placed before the colon.

    Returns:
        A string with both bounds rendered with an explicit sign, thousands
        separators and two decimals.
    """
    lower = format(start, '+0,.2f')
    upper = format(end, '+0,.2f')
    return f'{tier}: {lower} to {upper}'

def relabel(v, boundaries):
    """Map a numeric value to the label of the first tier that contains it.

    Tiers are named with consecutive capital letters ('A', 'B', 'C', ...) in
    the order they appear in ``boundaries``. Any number of tiers is supported;
    for exactly three tiers the result is the same as the former hard-coded
    A/B/C chain.

    Args:
        v: Value to classify.
        boundaries: Sequence of ``(low, high)`` pairs, both ends inclusive.
            Adjacent tiers may share an endpoint; a value equal to a shared
            endpoint is assigned to the earlier tier.

    Returns:
        The tier label produced by ``boundary_str``, or ``np.nan`` when the
        value lies in no tier (which includes ``v`` being NaN, since every
        comparison with NaN is false).
    """
    for position, (low, high) in enumerate(boundaries):
        if low <= v <= high:
            # Tier letters follow the boundary order: 0 -> 'A', 1 -> 'B', ...
            return boundary_str(low, high, tier=chr(ord('A') + position))
    return np.nan

def get_boundaries(tiers):
    """Derive contiguous ``(low, high)`` intervals from consecutive groups of values.

    Args:
        tiers: Indexable sequence of one-dimensional arrays (each exposing
            ``.shape``). The first interval spans the first to the last
            element of the first array; every following interval starts at
            the previous array's last element and ends at its own last element.

    Returns:
        A list with one ``(low, high)`` tuple per array in ``tiers``.
    """
    def upper_edge(chunk):
        # Last element of the chunk, addressed through its length.
        return chunk[chunk.shape[0] - 1]

    first = tiers[0]
    edges = [(first[0], upper_edge(first))]
    lower = upper_edge(first)
    for chunk in tiers[1:]:
        upper = upper_edge(chunk)
        edges.append((lower, upper))
        lower = upper
    return edges

# Discretize every indicator: each numeric column of `df_growth` is replaced by
# the text label of the tier its value falls into.
new_columns = {}

for label, series in df_growth.items():
    # Sorted, non-missing observations of this indicator.
    values = np.sort(np.array([x for x in series.tolist() if not np.isnan(x)], dtype=float))
    if values.shape[0] < TIERS_NUM:
        # Fail here with the real cause: continuing with a partial set of columns
        # would only surface later as an unrelated length mismatch when the
        # column names are assigned.
        raise ValueError(f'Error: there are not enough data for label {label}')
    # Split the sorted values into TIERS_NUM groups and turn them into intervals.
    boundaries = get_boundaries(tiers=np.array_split(values, TIERS_NUM))
    new_columns[label] = [relabel(value, boundaries) for value in series.tolist()]

df = DataFrame(data=new_columns)
df.columns = nodes
# Index the rows by year (integers from years["min"] to years["max"], inclusive).
df.index = range(years["min"], years["max"] + 1)
df.head(10)

Unnamed: 0,Tasa de crecimiento de la población,Producto Interior Bruto,Tasa de paro,Gasto educativo,Acceso a salud,Ingreso nacional bruto,Nivel urbanización,Tasa de fertilidad,Tasa de mortalidad
2014,A: -0.30 to +0.10,A: -11.60 to +1.70,C: +17.20 to +24.40,B: +4.20 to +4.30,"B: +2,524.70 to +2,741.40","B: +27,570.00 to +29,330.00",A: +79.40 to +79.80,B: +1.20 to +1.30,A: +0.85 to +0.89
2015,A: -0.30 to +0.10,C: +3.00 to +6.30,C: +17.20 to +24.40,B: +4.20 to +4.30,"A: +2,349.10 to +2,524.70","B: +27,570.00 to +29,330.00",A: +79.40 to +79.80,B: +1.20 to +1.30,B: +0.89 to +0.91
2016,A: -0.30 to +0.10,B: +1.70 to +3.00,C: +17.20 to +24.40,A: +4.20 to +4.20,"A: +2,349.10 to +2,524.70","A: +27,120.00 to +27,570.00",A: +79.40 to +79.80,B: +1.20 to +1.30,A: +0.85 to +0.89
2017,B: +0.10 to +0.40,B: +1.70 to +3.00,B: +14.80 to +17.20,A: +4.20 to +4.20,"A: +2,349.10 to +2,524.70","A: +27,120.00 to +27,570.00",B: +79.80 to +80.60,B: +1.20 to +1.30,B: +0.89 to +0.91
2018,B: +0.10 to +0.40,B: +1.70 to +3.00,B: +14.80 to +17.20,A: +4.20 to +4.20,"B: +2,524.70 to +2,741.40","B: +27,570.00 to +29,330.00",B: +79.80 to +80.60,B: +1.20 to +1.30,B: +0.89 to +0.91
2019,C: +0.40 to +0.80,A: -11.60 to +1.70,A: +12.90 to +14.80,A: +4.20 to +4.20,"B: +2,524.70 to +2,741.40","C: +29,330.00 to +32,090.00",B: +79.80 to +80.60,A: +1.20 to +1.20,A: +0.85 to +0.89
2020,C: +0.40 to +0.80,A: -11.60 to +1.70,B: +14.80 to +17.20,C: +4.30 to +4.90,"C: +2,741.40 to +3,234.30","A: +27,120.00 to +27,570.00",C: +80.60 to +81.30,A: +1.20 to +1.20,C: +0.91 to +1.04
2021,A: -0.30 to +0.10,C: +3.00 to +6.30,A: +12.90 to +14.80,C: +4.30 to +4.90,"C: +2,741.40 to +3,234.30","C: +29,330.00 to +32,090.00",C: +80.60 to +81.30,A: +1.20 to +1.20,C: +0.91 to +1.04
2022,C: +0.40 to +0.80,C: +3.00 to +6.30,A: +12.90 to +14.80,C: +4.30 to +4.90,"C: +2,741.40 to +3,234.30","C: +29,330.00 to +32,090.00",C: +80.60 to +81.30,A: +1.20 to +1.20,C: +0.91 to +1.04


# APRENDIZAJE

In [102]:
# Clear the model's CPD list before fitting (presumably so that re-running this
# cell does not keep tables from a previous fit -- TODO confirm).
model.cpds = []
# Estimate every conditional probability table from the discretized data using a
# Bayesian estimator with a BDeu prior.
# NOTE(review): equivalent_sample_size=10 is larger than the nine yearly
# observations in `df`, so the prior is likely to weigh heavily on the learned
# tables -- confirm this is intended.
model.fit(data=df,
          estimator=BayesianEstimator,
          prior_type="BDeu",
          equivalent_sample_size=10)

print(f'Check model: {model.check_model()}\n')

# Export each learned table to ./csv/<node name>.csv and print it.
for cpd in model.get_cpds():
    cpd.to_csv(filename=f'./csv/{cpd.variable}.csv')
    print(f'CPT of {cpd.variable}:')
    print(cpd, '\n')

Check model: True

CPT of Tasa de crecimiento de la población:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Producto Interior Bruto:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Tasa de paro:
+-----------------------------------+-----+
| Nivel urbanización                | ... |
+-----------------------------------+-----+
| Tasa de paro(A: +12.90 to +14.80) | ... |
+-----------------------------------+-----+
| Tasa de paro(B: +14.80 to +17.20) | ... |
+-----------------------------------+-----+
| Tasa de paro(C: +17.20 to +24.40) | ... |
+-----------------------------------+-----+ 

CPT of Gasto educativo:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Acceso a salud:
+-------------------------------------------+----------+
| Acceso a salud(A: +2,349.10 to +2,524.70) | 0.333333 |
+-------------------------------------------+----------+
| Acceso a sal

# INFERENCIA

In [103]:
def query_report(infer, variables, evidence=None, elimination_order="MinFill", show_progress=False, desc=""):
    """Run a probability query, print its result and the time it took.

    Args:
        infer: Inference object exposing a ``query`` method that accepts the
            keyword arguments ``variables``, ``evidence``, ``elimination_order``
            and ``show_progress``.
        variables: Variables to query.
        evidence: Optional evidence forwarded unchanged to ``infer.query``.
        elimination_order: Heuristic name or explicit ordering forwarded
            unchanged to ``infer.query``.
        show_progress: Forwarded unchanged to ``infer.query``.
        desc: Optional heading printed before the result; skipped when empty.

    Returns:
        None. Everything is reported through ``print``.
    """
    if desc:
        print(desc)
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump or be coarse.
    start_time = time.perf_counter()
    print(infer.query(variables=variables,
                      evidence=evidence,
                      elimination_order=elimination_order,
                      show_progress=show_progress))
    print(f'--- Query executed in {time.perf_counter() - start_time:0,.4f} seconds ---\n')

def get_ordering(infer, variables, evidence=None, elimination_order="MinFill", show_progress=False, desc=""):
    """Compute the variable elimination order for a query and optionally report it.

    Args:
        infer: Inference object exposing ``_get_elimination_order``.
        variables: Query variables.
        evidence: Optional evidence forwarded unchanged.
        elimination_order: Heuristic name (or explicit ordering) forwarded unchanged.
        show_progress: Forwarded unchanged.
        desc: Optional heading. When non-empty, the heading, the ordering and
            the elapsed time are printed; when empty nothing is printed.

    Returns:
        Whatever ``infer._get_elimination_order`` returns.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump or be coarse.
    start_time = time.perf_counter()
    # NOTE(review): this calls a private (underscore-prefixed) method of the
    # inference object, which may change between library versions.
    ordering = infer._get_elimination_order(variables=variables,
                                            evidence=evidence,
                                            elimination_order=elimination_order,
                                            show_progress=show_progress)
    if desc:
        print(desc, ordering, sep='\n')
        print(f'--- Ordering found in {time.perf_counter() - start_time:0,.4f} seconds ---\n')
    return ordering

def padding(heuristic):
    """Return ``heuristic`` followed by a colon, left-aligned in a field of 16 characters.

    Labels longer than the field are returned unpadded and untruncated.
    """
    return "{:<16}".format(heuristic + ":")

def compare_all_ordering(infer, variables, evidence=None, show_progress=False):
    """Print how the four elimination-order heuristics compare for one query.

    The ordering is computed with MinFill, MinNeighbors, MinWeight and
    WeightedMinFill (in that order). If all four agree, the common ordering is
    printed once; otherwise each heuristic's ordering is printed on its own line.

    Args:
        infer: Inference object passed through to ``get_ordering``.
        variables: Query variables.
        evidence: Optional evidence; also decides the wording of the message.
        show_progress: Passed through to ``get_ordering``.
    """
    heuristics = ("MinFill", "MinNeighbors", "MinWeight", "WeightedMinFill")
    ord_dict = {
        name: get_ordering(infer, variables, evidence, name, show_progress)
        for name in heuristics
    }

    if evidence:
        pre = f'elimination order found for probability query of {variables} with evidence {evidence}:'
    else:
        pre = f'elimination order found for probability query of {variables} with no evidence:'

    # Every ordering is compared against the MinFill one.
    reference = ord_dict["MinFill"]
    if all(candidate == reference for candidate in ord_dict.values()):
        print(f'All heuristics find the same {pre}.\n{ord_dict["MinFill"]}\n')
        return

    print(f'Different {pre}')
    for heuristic, order in ord_dict.items():
        print(f'{padding(heuristic)} {order}')
    print()

# Exact inference over the fitted network via variable elimination.
infer = VariableElimination(model)

# Marginal query on the population growth rate with no evidence.
var = [RT_CRECIMIENTO]
heuristic = "MinNeighbors"
# Precompute the elimination order once so it can be reused (and reversed) below.
ordering = get_ordering(infer, variables=var, elimination_order=heuristic,
                        desc=f'Elimination order for {var} with no evidence computed through {heuristic} heuristic:')
query_report(infer, variables=var, elimination_order=ordering,
             desc=f'Probability query of {var} with no evidence through precomputed elimination order:')
# Same query with the precomputed order reversed, to compare timings against it.
query_report(infer, variables=var, elimination_order=list(reversed(ordering)),
             desc=f'Probability query of {var} with no evidence through dummy elimination order:')
# compare_all_ordering(infer, variables=var)

# NOTE(review): the disabled example below refers to 'CO2' and 'EC', which are not
# among the node names defined in this notebook; it would need to be adapted
# before being re-enabled.
# var = ['CO2']
# ev = {'EC': 'A: -7.05 to -0.12'}
# heuristic = "MinFill"
# query_report(infer, variables=var, evidence=ev, elimination_order=heuristic,
#              desc=f'Probability query of {var} with evidence {ev} computed through {heuristic} heuristic:')
# compare_all_ordering(infer, variables=var, evidence=ev)
# heuristic = "MinNeighbors"
# query_report(infer, variables=var, evidence=ev, elimination_order=heuristic,
#              desc=f'Probability query of {var} with evidence {ev} computed through {heuristic} heuristic:')

Elimination order for ['Tasa de crecimiento de la población'] with no evidence computed through MinNeighbors heuristic:
['Nivel urbanización', 'Tasa de mortalidad', 'Ingreso nacional bruto', 'Tasa de paro', 'Producto Interior Bruto', 'Acceso a salud', 'Gasto educativo', 'Tasa de fertilidad']
--- Ordering found in 0.0020 seconds ---

Probability query of ['Tasa de crecimiento de la población'] with no evidence through precomputed elimination order:
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población                    |   phi(Tasa de crecimiento de la población) |
| Tasa de crecimiento de la población(A: -0.30 to +0.10) |                                     0.3807 |
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población(B: +0.10 to +0.40) |                                     0.2880 |
+---------------------------