In [78]:
import pandas as pd
import numpy as np
from scipy.stats import gmean, describe
import world_bank_data as wb

In [79]:
def coef_and_stats(data):
    """Summarise a distribution for the inequality adjustment.

    Parameters
    ----------
    data : object exposing ``.mean()`` and accepted by ``scipy.stats.gmean``
        (the notebook passes a pandas Series of strictly positive values).

    Returns
    -------
    tuple
        ``(coef, amean, gemetric, atkinson)`` where ``atkinson`` is
        ``1 - geometric_mean / arithmetic_mean`` and ``coef`` is
        ``1 - atkinson``, the factor the notebook multiplies an index by.
    """
    arithmetic_mean = data.mean()
    geometric_mean = gmean(data)
    # Inequality measure: shortfall of the geometric mean relative to the arithmetic mean.
    inequality = 1 - geometric_mean/arithmetic_mean
    adjustment = 1 - inequality
    return adjustment, arithmetic_mean, geometric_mean, inequality

In [80]:
def to_category(x):
    """Translate an educational-attainment code into years of schooling.

    The notebook applies this to the ``SCHL`` column (presumably the ACS PUMS
    attainment code -- confirm against the data dictionary).

    Parameters
    ----------
    x : int or float
        Attainment code.

    Returns
    -------
    int or float
        * the mapped number of years for codes 4-21,
        * ``0`` for any other code that is <= 21 (e.g. 1-3),
        * ``18`` for codes above 21,
        * ``nan`` when the code itself is NaN (missing).
    """
    mapping = {4: 1, 5: 2, 6: 3, 7: 4, 8: 5,
               9: 6, 10: 7, 11: 8, 12: 9, 13: 10,
               14: 11, 15: 11, 16: 12, 17: 12,
               18: 12.5, 19: 13, 20: 14, 21: 16,
    }
    # NaN is the only value that is not equal to itself. Every comparison with
    # NaN is False, so without this guard a missing code failed ``x <= 21`` and
    # was silently scored as the top bracket (18 years).
    if x != x:
        return float('nan')
    return mapping.get(x, 0) if x <= 21 else 18

In [81]:
# Divisors used to normalise the two schooling components, and the first age
# counted towards expected schooling.
# NOTE(review): 15 and 18 look like the UNDP HDI education goalposts -- confirm.
MEAN_SCHOOLING_CAP = 15
EXPECTED_SCHOOLING_CAP = 18
FIRST_SCHOOL_AGE = 5

# NOTE(review): absolute, machine-specific path; the cell only runs where this
# volume is mounted.
df = pd.read_csv('/Volumes/Nergigante/Project_Anarchy/Coding/GitHub/IDH-index/data/raw/data_ppr_2021_raw.csv', low_memory=False)
df = df[['AGEP', 'SCH', 'SCHL']]

# --- Mean years of schooling: people aged 25 and over ---------------------
# reset_index() (without drop) keeps the original row label in an 'index'
# column; later cells display and export that column, so it is preserved.
edu_sch = df[df['AGEP'] >= 25].reset_index()
edu_sch['scholing'] = edu_sch['SCHL'].map(to_category)
edu_sch['enroled'] = np.where(edu_sch['scholing'] > 1, 1, 0)
mean_sch = edu_sch['scholing'].mean()

# --- Inequality adjustment coefficient ------------------------------------
# Shift by 1 so that zero years of schooling does not zero out the geometric mean.
edu_sch['no_zero_schooling'] = 1 + edu_sch['scholing']
coef, amean, gemetric, atkinson = coef_and_stats(edu_sch['no_zero_schooling'])

# --- Expected years of schooling: people under 25 -------------------------
edu_exp = df[df['AGEP'] < 25].copy()
# SCH > 1 is treated as "currently enrolled"; a missing SCH compares False and counts as 0.
edu_exp['enrolled'] = (edu_exp['SCH'] > 1).astype(int)
edu_age = (
    edu_exp.groupby('AGEP')['enrolled']
    .agg(count='count', enrolled='sum')
    .reset_index()
)
edu_age['enrollment_rate'] = edu_age['enrolled'] / edu_age['count']
# Filter on the age itself. The previous positional drop of rows 0-4 removed
# the five youngest ages *present*, which is only "ages 0-4" when none of
# those ages is missing from the data.
edu_age = edu_age[edu_age['AGEP'] >= FIRST_SCHOOL_AGE]
# Sum of per-age enrollment rates = expected years of schooling.
exp_sch = edu_age['enrollment_rate'].sum()

# --- Education index -------------------------------------------------------
edu_value = (mean_sch/MEAN_SCHOOLING_CAP + exp_sch/EXPECTED_SCHOOLING_CAP) / 2
edu_value_ajusted = coef * edu_value
print(f'index: {edu_value}\n', f"index adj: {edu_value_ajusted}")

index: 0.8716164146530334
 index adj: 0.798147282963029


In [82]:
# Scratch check: gap between the rounded indices printed above (0.798 adjusted,
# 0.8716 unadjusted) and two other figures.
# NOTE(review): 0.791 and 0.872 are not defined in any visible cell --
# presumably published reference values; confirm their source.
print(0.798 - 0.791)
print(0.872 - 0.8716)

0.007000000000000006
0.00039999999999995595


In [83]:
# Sanity check: arithmetic mean of the shifted (years of schooling + 1) column, via NumPy.
np.mean(edu_sch['no_zero_schooling'])

13.346702429788577

In [84]:
# Sanity check: geometric mean of the shifted (years of schooling + 1) column.
gmean(edu_sch['no_zero_schooling'])

12.221699937916302

In [85]:
# Sanity check: the same arithmetic mean computed with the pandas method.
edu_sch['no_zero_schooling'].mean()

13.346702429788577

In [86]:
# Recompute the adjustment statistics on the shifted schooling column and print
# them next to an adjusted version of a fixed index value.
# NOTE(review): 0.906 is hard-coded and differs from the education index
# computed earlier in this notebook (~0.8716) -- presumably an externally
# published figure; confirm its source.
coef, amean, gemetric, atkinson = coef_and_stats(edu_sch['no_zero_schooling'])
# print() separates its arguments with a space, so every line after the first
# starts with one extra space in the output.
print(f' Coef: {coef}\n', 
      f'Amean: {amean}\n', 
      f'Geometric: {gemetric}\n', 
      f'Atkinson: {atkinson}\n',
      f'Index: 0.906\n',
      f'Index adj: {0.906 * coef}')

 Coef: 0.9157093298670258
 Amean: 13.346702429788577
 Geometric: 12.221699937916302
 Atkinson: 0.0842906701329742
 Index: 0.906
 Index adj: 0.8296326528595254


In [87]:
# Scratch cell: literal copy of the 'Index adj' value (0.906 * coef) printed by the previous cell.
0.8296326528595254

0.8296326528595254

In [88]:
# Inspect the processed 25-and-over frame.
# NOTE(review): this renders the whole DataFrame; `edu_sch.head()` would keep the output small.
edu_sch

Unnamed: 0,index,AGEP,SCH,SCHL,scholing,enroled,no_zero_schooling
0,0,40,1.0,19.0,13.0,1,14.0
1,2,61,1.0,18.0,12.5,1,13.5
2,3,63,1.0,12.0,9.0,1,10.0
3,4,58,1.0,16.0,12.0,1,13.0
4,5,84,1.0,18.0,12.5,1,13.5
...,...,...,...,...,...,...,...
95065,124663,56,1.0,16.0,12.0,1,13.0
95066,124664,47,1.0,22.0,18.0,1,19.0
95067,124665,46,1.0,21.0,16.0,1,17.0
95068,124668,61,1.0,17.0,12.0,1,13.0


In [89]:
# List the distinct years-of-schooling values that to_category produced.
edu_sch['scholing'].unique()

array([13. , 12.5,  9. , 12. , 16. , 18. , 14. ,  5. ,  0. ,  8. ,  7. ,
       11. , 10. ,  6. ,  4. ,  2. ,  3. ,  1. ])

In [90]:
# Count rows whose SCH code equals 0; the recorded output shows there are none.
df[df['SCH'] == 0].count()

AGEP    0
SCH     0
SCHL    0
dtype: int64

In [91]:
# Count 25-and-over rows that to_category mapped to 0 years of schooling.
edu_sch[edu_sch['scholing'] == 0].count()

index                2363
AGEP                 2363
SCH                  2363
SCHL                 2363
scholing             2363
enroled              2363
no_zero_schooling    2363
dtype: int64

In [92]:
# Save the processed 25-and-over frame, helper columns included, to the
# interim data folder. index=False leaves the DataFrame's row index out of the file.
# NOTE(review): absolute, machine-specific path; only works where this volume is mounted.
edu_sch.to_csv('/Volumes/Nergigante/Project_Anarchy/Coding/GitHub/IDH-index/data/interim/dev.csv', index=False)

In [99]:
def _wb_series_frame(indicator, column):
    """Fetch one World Bank indicator for Puerto Rico as a frame with a float `column`."""
    frame = pd.DataFrame(wb.get_series(indicator, country='PR', simplify_index=True))
    frame = frame.reset_index().rename(columns={indicator: column})
    frame[column] = frame[column].astype(float)
    return frame


# NOTE(review): 'NY.GNP.PCAP.PP.CD' looks like the PPP current-international-$
# series rather than an Atlas-method one, despite the column name 'atlas' -- confirm.
atlas_df = _wb_series_frame('NY.GNP.PCAP.PP.CD', 'atlas')

# GNI per capita series in constant terms, from the World Bank.
gni_df = _wb_series_frame('NY.GNP.PCAP.PP.KD', 'gni')

# Placeholder for the per-year inequality adjustment. It is created empty, so
# the left merge at the end fills 'coef' and 'atkinson' with NaN and
# 'income_index_ajusted' is NaN for every year until real values are supplied.
ajusted_df = pd.DataFrame([], columns=['Year', 'coef', 'atkinson'])

# Join both series by year and derive the constant/current ratio.
inc_df = atlas_df.merge(gni_df, on='Year')
inc_df['income_ratio'] = (inc_df['gni'] / inc_df['atlas']).astype(float)
inc_df['Year'] = inc_df['Year'].astype(int)

# Attach the local 'pnb' figures and keep only the years where every column is present.
pnb = pd.read_csv('/Volumes/Nergigante/Project_Anarchy/Coding/GitHub/IDH-index/data/external/pnb.csv')
merge_df = (
    inc_df.merge(pnb, on='Year', how='left')
    .dropna()
    .reset_index(drop=True)
)

# Scale 'pnb' by the income ratio.
merge_df['index_temp'] = merge_df['income_ratio'] * merge_df['pnb']
# Override the 2021 value with a fixed figure (not zero).
# NOTE(review): the origin of 22342.18055 is not shown in this notebook -- confirm.
merge_df.loc[merge_df['Year'] == 2021, 'index_temp'] = 22342.18055

# Log-scaled income index between the two goalposts.
# NOTE(review): the upper goalpost is 70000 here; the UNDP HDI technical notes
# appear to use 75000 -- confirm this value is intentional.
log_floor = np.log(100)
log_ceiling = np.log(70000)
merge_df['index'] = (np.log(merge_df['index_temp']) - log_floor) / (log_ceiling - log_floor)
print(merge_df)

# Keep the final index, order by year and apply the (currently empty) adjustment.
merge_df = (
    merge_df[['Year', 'index']]
    .sort_values(by='Year', ascending=True)
    .merge(ajusted_df, on='Year', how='left')
)
merge_df['income_index_ajusted'] = merge_df['coef'] * merge_df['index']
merge_df = merge_df.drop(columns=['coef'])

    Year    atlas           gni  income_ratio          pnb    index_temp  \
0   2006  19860.0  24680.190939      1.242709  15173.00000  18855.616169   
1   2007  20720.0  25214.529716      1.216917  15984.00000  19451.208638   
2   2008  20610.0  24077.452730      1.168241  16623.00000  19419.674756   
3   2009  20150.0  22697.433083      1.126423  16960.00000  19104.142188   
4   2010  20210.0  21868.952510      1.082086  17233.00000  18647.583306   
5   2011  20840.0  21820.620880      1.047055  17762.00000  18597.786376   
6   2012  21840.0  23332.436631      1.068335  18618.00000  19890.261227   
7   2013  22320.0  23645.723875      1.059396  19188.26120  20327.971594   
8   2014  22620.0  23638.712903      1.045036  19462.50418  20339.016299   
9   2015  22900.0  23569.637816      1.029242  20039.54818  20625.541161   
10  2016  23040.0  23366.533774      1.014172  20543.56862  20834.721788   
11  2017  22940.0  22937.677124      0.999899  20764.98082  20762.878183   
12  2018  23

In [None]:
# Inspect the final income-index table (Year, index, atkinson, income_index_ajusted).
merge_df