"¿Cómo influyen los indicadores económicos y demográficos en la tasa de crecimiento de la población a nivel global?"

In [46]:
# Importación de librerías necesarias
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination, ApproxInference, BeliefPropagation
from pgmpy.estimators import BayesianEstimator
from pgmpy.sampling import BayesianModelSampling

from pandas import read_csv, DataFrame
import numpy as np
import time

In [47]:


# Definir constantes para los nodos de la red bayesiana
LV_EDU = "% de personas mayores de 25 años con al menos la ESO"  # SE.SEC.CUAT.UP.ZS
LV_URB = "Porcentaje de población urbana"                        # SP.URB.TOTL.IN.ZS
RT_PARO = "Tasa de paro"                                         # SL.UEM.TOTL.ZS
PNB = "Ingreso nacional bruto"                                   # NY.GNP.PCAP.CD
PIB = "Producto Interior Bruto"            #GDP                  # NY.GDP.PCAP.KD.ZG
GASTO_EDUCATIVO = "Gasto educativo"                              # SE.XPD.TOTL.GD.ZS
GASTO_SALUD = "Gastos en salud"                                  # SH.XPD.CHEX.PC.CD
RT_FERTILIDAD = "Tasa de fertilidad"                             # SP.DYN.TFRT.IN
RT_MORTALIDAD = "Tasa de mortalidad"                         
MIGRACION_NETA = "Migracion neta"                                # SM.POP.NETM
RT_CRECIMIENTO = "Tasa de crecimiento de la población"           # SP.POP.GROW

# Se crea el objeto para el model 
model = BayesianNetwork()

# Añadimos todos los nodos
nodes = [
    RT_CRECIMIENTO, PIB, RT_PARO, GASTO_EDUCATIVO, GASTO_SALUD, PNB, LV_URB, RT_FERTILIDAD, RT_MORTALIDAD,
    MIGRACION_NETA, LV_EDU]

model.add_nodes_from(nodes)

#Creamos y añadimos los caminos (origen, destino) del grafo dirigido
edges_RT_PARO = [
    (LV_EDU, RT_PARO), 
    (LV_URB, RT_PARO)
    ]
edges_PIB = [
    (RT_PARO, PIB), 
    (PNB, PIB)
    ]
edges_RT_FERTILIDAD = [
    (GASTO_EDUCATIVO, RT_FERTILIDAD), 
    (GASTO_SALUD, RT_FERTILIDAD)
    ]
edges_RT_CRECIMIENTO = [
    (MIGRACION_NETA, RT_CRECIMIENTO),
    (RT_FERTILIDAD, RT_CRECIMIENTO), 
    (RT_MORTALIDAD, RT_CRECIMIENTO)
    ]

model.add_edges_from(edges_RT_PARO)
model.add_edges_from(edges_PIB)
model.add_edge(PIB, GASTO_EDUCATIVO)
model.add_edge(PIB, GASTO_SALUD)
model.add_edges_from(edges_RT_FERTILIDAD)
model.add_edge(GASTO_SALUD, RT_MORTALIDAD)
model.add_edges_from(edges_RT_CRECIMIENTO)


In [48]:
years = {"min" : 1975, "max" : 2022}
df_raw = read_csv("./csv/data.csv", delimiter=';')
        
df_raw_growth = DataFrame(df_raw[:])
print("There are " + str(df_raw_growth.shape[0]) + " indicators in the dataframe.")
df_raw_growth.head()

There are 11 indicators in the dataframe.


Unnamed: 0,Year,1975,1976,1977,1978,1979,1980,1981,1982,1983,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Population growth (annual %),1.081294,1.056852,1.029132,0.962877,0.881585,0.802964,0.710878,0.60039,0.487183,...,-0.327669,-0.298951,-0.077589,0.08443,0.234588,0.437983,0.717716,0.488502,0.105799,0.761702
1,GDP per capita growth (annual %),-0.539097,2.217773,1.785657,0.490728,-0.836528,1.391315,-0.839887,0.640409,1.275515,...,-1.079741,1.699353,3.919117,2.950816,2.734475,1.83746,1.254632,-11.600189,6.290659,4.968051
2,"Unemployment, total (% of total labor force) (...",,,,,,,,,,...,26.09,24.44,22.06,19.64,17.22,15.25,14.1,15.53,14.78,12.92
3,"Educational attainment, at least completed upp...",,,,,,,12.83375,,,...,47.237251,47.863049,48.649059,49.865326,50.416988,52.195572,53.256748,54.987942,55.349281,
4,Current health expenditure per capita (current...,,,,,,,,,,...,2629.466797,2679.476563,2349.09375,2376.601074,2524.684814,2741.382568,2716.833496,2899.018799,3234.292969,


In [49]:
df_growth = df_raw_growth.transpose().iloc[1:]
df_growth.columns = nodes
df_growth

Unnamed: 0,Tasa de crecimiento de la población,Producto Interior Bruto,Tasa de paro,Gasto educativo,Gastos en salud,Ingreso nacional bruto,Porcentaje de población urbana,Tasa de fertilidad,Tasa de mortalidad,Migracion neta,% de personas mayores de 25 años con al menos la ESO
1975,1.081294,-0.539097,,,,3240.0,69.57,2.77,298192.0,1177.0,
1976,1.056852,2.217773,,,,3430.0,70.231,2.77,299007.0,4854.0,1.89939
1977,1.029132,1.785657,,,,3640.0,70.883,2.65,294324.0,195.0,2.07717
1978,0.962877,0.490728,,,,4040.0,71.527,2.54,296781.0,-1893.0,2.23073
1979,0.881585,-0.836528,,,,5010.0,72.162,2.37,291213.0,-6124.0,2.33805
1980,0.802964,1.391315,,,,6210.0,72.789,2.22,289344.0,15108.0,
1981,0.710878,-0.839887,,12.83375,,6310.0,73.279,2.04,293386.0,-6044.0,
1982,0.60039,0.640409,,,,5680.0,73.515,1.94,286655.0,-16439.0,
1983,0.487183,1.275515,,,,4860.0,73.749,1.8,302569.0,-21369.0,
1984,0.415256,1.362897,,,,4580.0,73.982,1.73,299409.0,-18090.0,


In [50]:
TIERS_NUM = 3

def boundary_str(start, end, tier):
    return f'{tier}: {start:+0,.2f} to {end:+0,.2f}'

def relabel(v, boundaries):
    if v >= boundaries[0][0] and v <= boundaries[0][1]:
        return boundary_str(boundaries[0][0], boundaries[0][1], tier='A')
    elif v >= boundaries[1][0] and v <= boundaries[1][1]:
        return boundary_str(boundaries[1][0], boundaries[1][1], tier='B')
    elif v >= boundaries[2][0] and v <= boundaries[2][1]:
        return boundary_str(boundaries[2][0], boundaries[2][1], tier='C')
    else:
        return np.nan

def get_boundaries(tiers):
    prev_tier = tiers[0]
    boundaries = [(prev_tier[0], prev_tier[prev_tier.shape[0] - 1])]
    for index, tier in enumerate(tiers):
        if index != 0:
            boundaries.append((prev_tier[prev_tier.shape[0] - 1], tier[tier.shape[0] - 1]))
            prev_tier = tier
    return boundaries

new_columns = {}

for i, content in enumerate(df_growth.items()):
    (label, series) = content
    values = np.array([x for x in series.tolist() if not np.isnan(float(x))] , dtype=float)
    values = np.sort(values)
    if values.shape[0] < TIERS_NUM:
        print(f'Error: there are not enough data for label {label}')
        break
    boundaries = get_boundaries(tiers=np.array_split(values, TIERS_NUM))
    new_columns[label] = [relabel(float(value), boundaries) for value in series.tolist()]

df = DataFrame(data=new_columns)
df.columns = nodes
df.index = range(years["min"], years["max"] + 1)
df.head(10)

Unnamed: 0,Tasa de crecimiento de la población,Producto Interior Bruto,Tasa de paro,Gasto educativo,Gastos en salud,Ingreso nacional bruto,Porcentaje de población urbana,Tasa de fertilidad,Tasa de mortalidad,Migracion neta,% de personas mayores de 25 años con al menos la ESO
1975,C: +0.71 to +1.85,A: -11.60 to +1.25,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","B: -871.00 to +166,516.00",
1976,C: +0.71 to +1.85,B: +1.25 to +2.30,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","B: -871.00 to +166,516.00",A: +1.90 to +4.14
1977,C: +0.71 to +1.85,B: +1.25 to +2.30,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","B: -871.00 to +166,516.00",A: +1.90 to +4.14
1978,C: +0.71 to +1.85,A: -11.60 to +1.25,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","A: -254,292.00 to -871.00",A: +1.90 to +4.14
1979,C: +0.71 to +1.85,A: -11.60 to +1.25,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","A: -254,292.00 to -871.00",A: +1.90 to +4.14
1980,C: +0.71 to +1.85,B: +1.25 to +2.30,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","B: -871.00 to +166,516.00",
1981,B: +0.41 to +0.71,A: -11.60 to +1.25,,A: +12.83 to +42.83,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","A: -254,292.00 to -871.00",
1982,B: +0.41 to +0.71,A: -11.60 to +1.25,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","A: -254,292.00 to -871.00",
1983,B: +0.41 to +0.71,B: +1.25 to +2.30,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","A: -254,292.00 to -871.00",
1984,B: +0.41 to +0.71,B: +1.25 to +2.30,,,,"A: +3,240.00 to +12,210.00",A: +69.57 to +75.35,C: +1.38 to +2.77,"A: +286,655.00 to +331,515.00","A: -254,292.00 to -871.00",


# APRENDIZAJE

In [51]:
model.cpds = []
model.fit(data=df,
          estimator=BayesianEstimator,
          prior_type="BDeu",
          equivalent_sample_size=10)

print(f'Check model: {model.check_model()}\n')

for cpd in model.get_cpds():
    cpd.to_csv(filename=f'./csv/cpds/{cpd.variable}.csv')
    print(f'CPT of {cpd.variable}:')
    print(cpd, '\n')

Check model: True

CPT of Tasa de crecimiento de la población:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Producto Interior Bruto:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Tasa de paro:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Gasto educativo:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Gastos en salud:
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+
| ... |
+-----+ 

CPT of Ingreso nacional bruto:
+-----------------------------------------------------+----------+
| Ingreso nacional bruto(A: +3,240.00 to +12,210.00)  | 0.333333 |
+-----------------------------------------------------+----------+
| Ingreso nacional bruto(B: +12,210.00 to +27,120.00) | 0.333333 |
+-----------------------------------------------------+----------+
| Ingreso nacional bruto(C: +27,120.

# INFERENCIA

## INFERENCIA EXACTA

In [52]:
def query_report(infer, variables, evidence=None, elimination_order="MinFill", show_progress=False, desc=""):
    if desc:
        print(desc)
    start_time = time.time()
    print(infer.query(variables=variables,
                      evidence=evidence,
                      elimination_order=elimination_order,
                      show_progress=show_progress))
    print(f'--- Query executed in {time.time() - start_time:0,.4f} seconds ---\n')

def get_ordering(infer, variables, evidence=None, elimination_order="MinFill", show_progress=False, desc=""):
    start_time = time.time()
    ordering = infer._get_elimination_order(variables=variables,
                                        evidence=evidence,
                                        elimination_order=elimination_order,
                                        show_progress=show_progress)
    if desc:
        print(desc, ordering, sep='\n')
        print(f'--- Ordering found in {time.time() - start_time:0,.4f} seconds ---\n')
    return ordering

def padding(heuristic):
    return (heuristic + ":").ljust(16)

def compare_all_ordering(infer, variables, evidence=None, show_progress=False):
    ord_dict = {
        "MinFill": get_ordering(infer, variables, evidence, "MinFill", show_progress),
        "MinNeighbors": get_ordering(infer, variables, evidence, "MinNeighbors", show_progress),
        "MinWeight": get_ordering(infer, variables, evidence, "MinWeight", show_progress),
        "WeightedMinFill": get_ordering(infer, variables, evidence, "WeightedMinFill", show_progress)
    }
    if not evidence:
        pre = f'elimination order found for probability query of {variables} with no evidence:'
    else:
        pre = f'elimination order found for probability query of {variables} with evidence {evidence}:'
    if ord_dict["MinFill"] == ord_dict["MinNeighbors"] and ord_dict["MinFill"] == ord_dict["MinWeight"] and ord_dict["MinFill"] == ord_dict["WeightedMinFill"]:
        print(f'All heuristics find the same {pre}.\n{ord_dict["MinFill"]}\n')
    else:
        print(f'Different {pre}')
        for heuristic, order in ord_dict.items():
            print(f'{padding(heuristic)} {order}')
        print()

In [53]:
# Variable Elimination

inference_1 = VariableElimination(model)

var = [RT_CRECIMIENTO]
heuristic = "MinNeighbors"
ordering = get_ordering(inference_1, variables=var, elimination_order=heuristic,
                        desc=f'Elimination order for {var} with no evidence computed through {heuristic} heuristic:')
query_report(inference_1, variables=var, elimination_order=ordering,
             desc=f'Probability query of {var} with no evidence through precomputed elimination order:')
query_report(inference_1, variables=var, elimination_order=list(reversed(ordering)),
             desc=f'Probability query of {var} with no evidence through dummy elimination order:')

Elimination order for ['Tasa de crecimiento de la población'] with no evidence computed through MinNeighbors heuristic:
['Porcentaje de población urbana', '% de personas mayores de 25 años con al menos la ESO', 'Tasa de paro', 'Ingreso nacional bruto', 'Producto Interior Bruto', 'Gasto educativo', 'Gastos en salud', 'Tasa de fertilidad', 'Migracion neta', 'Tasa de mortalidad']
--- Ordering found in 0.0041 seconds ---

Probability query of ['Tasa de crecimiento de la población'] with no evidence through precomputed elimination order:
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población                    |   phi(Tasa de crecimiento de la población) |
| Tasa de crecimiento de la población(A: -0.33 to +0.41) |                                     0.4284 |
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población(B: +0.4

In [54]:
# Belief Propagation
belief_propagation = BeliefPropagation(model)
start_time = time.time() 
print(belief_propagation.query(variables=[RT_CRECIMIENTO]))
end_time = time.time() 
print(f'--- Query executed in {end_time - start_time:0,.4f} seconds ---\n')

+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población                    |   phi(Tasa de crecimiento de la población) |
| Tasa de crecimiento de la población(A: -0.33 to +0.41) |                                     0.4284 |
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población(B: +0.41 to +0.71) |                                     0.2211 |
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población(C: +0.71 to +1.85) |                                     0.3505 |
+--------------------------------------------------------+--------------------------------------------+
--- Query executed in 0.0204 seconds ---



## INFERENCIA APROXIMADA

In [62]:
# Bayesian Model Sampling

inference_3 = BayesianModelSampling(model)
start_time = time.time() 
result = inference_3.likelihood_weighted_sample(size=2)
end_time = time.time() 
print(result)
print(f'--- Query executed in {end_time - start_time:0,.4f} seconds ---\n')

Generating for node: Tasa de crecimiento de la población: 100%|██████████| 11/11 [00:00<00:00, 520.96it/s]

  Tasa de crecimiento de la población Producto Interior Bruto  \
0                   A: -0.33 to +0.41       B: +1.25 to +2.30   
1                   A: -0.33 to +0.41       C: +2.30 to +6.29   

          Tasa de paro      Gasto educativo            Gastos en salud  \
0   A: +8.23 to +14.10  C: +47.86 to +55.35  A: +1,005.28 to +2,349.09   
1  B: +14.10 to +19.86  B: +42.83 to +47.86  B: +2,349.09 to +2,716.83   

        Ingreso nacional bruto Porcentaje de población urbana  \
0   A: +3,240.00 to +12,210.00            B: +75.35 to +77.50   
1  C: +27,120.00 to +32,560.00            C: +77.50 to +81.30   

  Tasa de fertilidad             Tasa de mortalidad  \
0  B: +1.27 to +1.38  C: +384,828.00 to +493,776.00   
1  C: +1.38 to +2.77  C: +384,828.00 to +493,776.00   

              Migracion neta  \
0  A: -254,292.00 to -871.00   
1  B: -871.00 to +166,516.00   

  % de personas mayores de 25 años con al menos la ESO  _weight  
0                                  B: +4.14 to +4.29    


  df = pd.DataFrame.from_records(samples)


In [56]:
# Approximate Inference Using Sampling

inference_4 = ApproxInference(model)
# ev = {RT_FERTILIDAD: "A: +1.13 to +1.27",
#       RT_MORTALIDAD: "A: +286,655.00 to +331,515.00",
#       MIGRACION_NETA: "A: -254,292.00 to -871.00"}
start_time = time.time() 
print(inference_4.query(variables=[RT_CRECIMIENTO]))
end_time = time.time() 
print(f'--- Query executed in {end_time - start_time:0,.4f} seconds ---\n')

Generating for node: Tasa de crecimiento de la población: 100%|██████████| 11/11 [00:00<00:00, 174.12it/s]

+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población                    |   phi(Tasa de crecimiento de la población) |
| Tasa de crecimiento de la población(C: +0.71 to +1.85) |                                     0.3600 |
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población(A: -0.33 to +0.41) |                                     0.4173 |
+--------------------------------------------------------+--------------------------------------------+
| Tasa de crecimiento de la población(B: +0.41 to +0.71) |                                     0.2227 |
+--------------------------------------------------------+--------------------------------------------+
--- Query executed in 0.0654 seconds ---




  df = pd.DataFrame.from_records(samples)
