In [1]:
import pandas as pd
import numpy as np
import world_bank_data as wb
import polars as pl
from scipy.stats import gmean
import os 
# Switch the working directory to the parent folder; presumably so the relative
# 'data/...' paths used further down resolve from the project root — confirm.
# NOTE: not idempotent — every re-run of this cell moves one more level up.
os.chdir('..')

In [2]:
    def adjust(df):
        """
        Compute the Atkinson inequality adjustment for a set of index values.

        The Atkinson measure used here is one minus the ratio of the
        geometric mean to the arithmetic mean; the adjustment coefficient
        is its complement.

        Parameters
        ----------
        df : <pd.DataFrame>
            values of the index; must be accepted by ``scipy.stats.gmean``
            and expose a ``.mean()`` method

        Returns
        -------
        coef
            adjustment coefficient, ``1 - atkinson``
        amean
            arithmetic mean, exactly what ``df.mean()`` returns
        gemetric
            geometric mean, exactly what ``gmean(df)`` returns
        atkinson
            Atkinson measure, ``1 - geometric mean / arithmetic mean``

        Notes
        -----
        The results are not forced to plain floats: their container type
        follows whatever ``gmean`` and ``df.mean()`` produce for the input.
        """
        geo_mean = gmean(df)
        arith_mean = df.mean()
        inequality = 1 - geo_mean / arith_mean
        return 1 - inequality, arith_mean, geo_mean, inequality

    def to_category(value):
        """
        Translate a numeric category code into its mapped value.

        Presumably the codes are educational-attainment categories and the
        mapped values are years of schooling — confirm against the data
        dictionary of the survey this is applied to.

        Parameters
        ----------
        value : <int>
            category code

        Returns
        -------
        <int> or <float>
            mapped value for codes 4..21, 0 for any other code up to 21,
            and 18 for every code above 21
        """
        # Keys run consecutively from 4 to 21 and the values never decrease.
        # Two entries broke that pattern and are corrected here:
        #   * the key between 11 and 13 was written as 2 instead of 12
        #   * code 20 mapped to 24, above the cap of 18; 14 fits between
        #     its neighbours 13 and 16
        mapping = {
            4: 1, 5: 2, 6: 3, 7: 4, 8: 5,
            9: 6, 10: 7, 11: 8, 12: 9, 13: 10,
            14: 11, 15: 11, 16: 12, 17: 12,
            18: 12.5, 19: 13, 20: 14, 21: 16,
        }
        return mapping.get(value, 0) if value <= 21 else 18

In [None]:
# Fetch the 'SP.DYN.LE00.IN' World Bank series for country 'PR' and hand it to
# polars with the index turned into ordinary columns.
df = pl.from_pandas(
    pd.DataFrame(
        wb.get_series('SP.DYN.LE00.IN', country='PR', simplify_index=True)
    ).reset_index()
)
df

In [None]:
# Health index: life expectancy rescaled with 20 and 85 as lower/upper bounds.
# The adjusted variant scales it by (1 - 0.08); the same 0.08 is stored as a
# constant column.
# NOTE(review): the column name "arkinson" looks like a misspelling of
# "atkinson"; it is kept as-is because other code may read it.
health_expr = (pl.col("life_exp") - 20) / (85 - 20)

df = (
    df.rename({"SP.DYN.LE00.IN": "life_exp"})
    # gaps are filled with the previous available value
    .fill_null(strategy="forward")
    .with_columns(
        pl.col("Year").cast(pl.Int64),
        health_expr.alias("health_index"),
        (health_expr * (1 - 0.08)).alias("health_index_adjusted"),
        arkinson=0.08,
    )
    # change against the previous row, in percent
    .with_columns(
        (pl.col("health_index").pct_change() * 100).alias("health_index_pct_change"),
        (pl.col("health_index_adjusted").pct_change() * 100).alias("health_index_adjusted_pct_change"),
    )
)
df


# Income index

In [None]:
# Download the 'NY.GNP.PCAP.PP.CD' World Bank series for 'PR', store it under
# the column name "atlas", drop rows with missing values and cast both columns
# to 64-bit integers.
# NOTE(review): the World Bank catalogue lists NY.GNP.PCAP.PP.CD as a PPP
# series, not the Atlas-method one — confirm the name "atlas" is intended.
atlas_df = (
    pl.from_pandas(
        pd.DataFrame(
            wb.get_series('NY.GNP.PCAP.PP.CD', country='PR', simplify_index=True).reset_index()
        )
    )
    .rename({"NY.GNP.PCAP.PP.CD": "atlas"})
    .drop_nulls()
    .with_columns(
        pl.col("Year").cast(pl.Int64),
        pl.col("atlas").cast(pl.Int64),
    )
)
atlas_df

In [None]:
# Download the 'NY.GNP.PCAP.PP.KD' World Bank series for 'PR', store it under
# the column name "gni" and cast Year to a 64-bit integer so it can be joined
# with the other yearly tables.
gni_df = pl.from_pandas(pd.DataFrame(wb.get_series('NY.GNP.PCAP.PP.KD', country='PR', simplify_index=True).reset_index()))
gni_df = gni_df.rename({'NY.GNP.PCAP.PP.KD': 'gni'})
gni_df = gni_df.with_columns(pl.col("Year").cast(pl.Int64))
# show the frame built in this cell (the cell used to display atlas_df)
gni_df

In [None]:
# Inequality adjustment of the income index, one row per raw survey file.
# Start from an empty frame that only fixes the column names and dtypes.
adjusted_df = pl.DataFrame({
    "Year": [1],
    "coef": [1.1],
    "atkinson": [1.1],
}).clear()

# sorted() makes the row order deterministic; os.listdir gives no ordering guarantee
for file in sorted(os.listdir('data/raw/')):
    if not file.startswith('data_hpr'):
        continue

    # Read the file of the current iteration. The path used to be hard-coded
    # to the 2012 file, so every year received the 2012 coefficients.
    adjust_df = pl.read_csv(os.path.join('data/raw', file))
    adjust_df = adjust_df.select(pl.col("HINCP").drop_nulls())
    adjust_df = adjust_df.sort("HINCP")
    # keep strictly positive incomes only (the geometric mean needs them)
    adjust_df = adjust_df.filter(pl.col("HINCP") > 0)

    # replace bottom 0.5%: values below the 0.5% quantile are raised to it
    bottom_max = adjust_df.select(pl.col("HINCP").quantile(0.005)).item()
    adjust_df = adjust_df.select(
        pl.when(pl.col("HINCP") < bottom_max)
        .then(bottom_max)
        .otherwise(pl.col("HINCP")).alias("HINCP")
    )

    # drop top 0.5%
    adjust_df = adjust_df.filter(
        pl.col("HINCP") <= pl.col("HINCP").quantile(0.995))

    # get coefficient of adjustment
    coef, amean, gemetric, atkinson = adjust(adjust_df)
    tmp_df = pl.DataFrame({
        # file names look like data_hpr_<year>_...; the third token is the year
        "Year": int(file.split('_')[2]),
        "coef": coef[0][0],
        "atkinson": atkinson[0][0],
    })

    adjusted_df = pl.concat([adjusted_df, tmp_df], how="vertical")


In [None]:
# Combine both World Bank tables on Year (polars' default join type) and add
# the ratio of the "gni" column to the "atlas" column.
inc_df = (
    atlas_df
    .join(gni_df, on='Year')
    .with_columns((pl.col("gni") / pl.col("atlas")).alias("income_ratio"))
)


In [None]:
# Attach the values from pnb.csv to the income table, discard rows that have
# any missing value, then attach the per-year adjustment coefficients.
# Both joins are left joins on Year; the second one is applied after
# drop_nulls, so years without a coefficient stay in the table with nulls.
pnb = pl.read_csv('data/external/pnb.csv')
merge_df = (
    inc_df
    .join(pnb, on='Year', how='left')
    .drop_nulls()
    .join(adjusted_df, on='Year', how='left')
)


In [None]:
# Income index: log of "pnb" rescaled between log(100) and log(75000), then
# multiplied by the adjustment coefficient. Only Year and the two index columns
# are kept, and rows with a missing value are dropped.
# NOTE: merge_df is overwritten and loses the 'pnb' column here, so the cell
# that builds merge_df must be run again before re-running this one.
merge_df = (
    merge_df
    .with_columns(
        ((np.log(pl.col('pnb')) - np.log(100)) / (np.log(75000) - np.log(100))).alias('index')
    )
    .with_columns(
        (pl.col("index") * pl.col("coef")).alias("income_index_ajusted")
    )
    .select(pl.col("Year", "index", "income_index_ajusted"))
    .drop_nulls()
)
merge_df

In [4]:
# Income-index pipeline in a single cell: download the two World Bank series,
# compute the per-year inequality adjustment from the raw survey files, merge
# everything with pnb.csv and derive the plain and adjusted income index.

# get the 'NY.GNP.PCAP.PP.CD' series from WB (renamed to "atlas")
atlas_df = pl.from_pandas(pd.DataFrame(wb.get_series('NY.GNP.PCAP.PP.CD', country='PR', simplify_index=True).reset_index()))
atlas_df = atlas_df.rename({"NY.GNP.PCAP.PP.CD": "atlas"}).drop_nulls()
atlas_df = atlas_df.with_columns(
    pl.col("Year").cast(pl.Int64),
    pl.col("atlas").cast(pl.Int64))

# get gni constant df from WB
gni_df = pl.from_pandas(pd.DataFrame(wb.get_series('NY.GNP.PCAP.PP.KD', country='PR', simplify_index=True).reset_index()))
gni_df = gni_df.rename({'NY.GNP.PCAP.PP.KD': 'gni'})
gni_df = gni_df.with_columns(pl.col("Year").cast(pl.Int64))

# adjust the income index
# empty frame that only fixes the column names and dtypes
adjusted_df = pl.DataFrame({"Year": [1],"coef": [1.1],"atkinson": [1.1],}).clear()

# sorted() makes the row order deterministic; os.listdir gives no ordering guarantee
for file in sorted(os.listdir('data/raw/')):
    if file.startswith('data_hpr'):
        # Read the file of the current iteration. The path used to be
        # hard-coded to the 2012 file, so every year received the 2012
        # coefficients.
        adjust_df = pl.read_csv(os.path.join('data/raw', file))
        adjust_df = adjust_df.select(pl.col("HINCP").drop_nulls())
        adjust_df = adjust_df.sort("HINCP")
        # keep strictly positive incomes only (the geometric mean needs them)
        adjust_df = adjust_df.filter(pl.col("HINCP") > 0)

        # replace bottom 0.5%: values below the 0.5% quantile are raised to it
        bottom_max = adjust_df.select(pl.col("HINCP").quantile(0.005)).item()
        adjust_df = adjust_df.select(
            pl.when(pl.col("HINCP") < bottom_max)
            .then(bottom_max)
            .otherwise(pl.col("HINCP")).alias("HINCP"))

        # drop top 0.5%
        adjust_df = adjust_df.filter(
            pl.col("HINCP") <= pl.col("HINCP").quantile(0.995))

        # get coefficient of adjustment
        coef, amean, gemetric, atkinson = adjust(adjust_df)
        tmp_df = pl.DataFrame({
            # file names look like data_hpr_<year>_...; the third token is the year
            "Year": int(file.split('_')[2]),
            "coef": coef[0][0],
            "atkinson": atkinson[0][0]})

        adjusted_df = pl.concat([adjusted_df, tmp_df], how="vertical")

# merge the two dataframes
inc_df = atlas_df.join(gni_df, on='Year')
inc_df = inc_df.with_columns(
    (pl.col("gni") / pl.col("atlas")).alias("income_ratio"))

# merge the income index with the pnb.csv file
pnb = pl.read_csv('data/external/pnb.csv')
merge_df = inc_df.join(pnb, on='Year', how='left').drop_nulls()
merge_df = merge_df.join(adjusted_df, on='Year', how='left')

# calculate the index: log of pnb rescaled between log(100) and log(75000),
# then multiplied by the adjustment coefficient
merge_df =  merge_df.with_columns(
    ((np.log(pl.col('pnb')) - np.log(100)) / (np.log(75000)-np.log(100))).alias('index'))
merge_df = merge_df.with_columns(
    (pl.col("index") * pl.col("coef")).alias("income_index_ajusted"))
merge_df = merge_df.select(pl.col("Year", "index", "income_index_ajusted")).drop_nulls()