In [40]:
import pandas as pd
import numpy as np
from scipy.stats import gmean, describe

In [41]:
def coef_and_stats(data):
    """Compute an inequality-adjustment coefficient and its ingredients.

    The inequality measure is ``1 - geometric_mean / arithmetic_mean`` and the
    adjustment coefficient is its complement (``1 - inequality``).

    Parameters
    ----------
    data : array-like
        Values accepted by ``scipy.stats.gmean`` that also expose ``.mean()``
        (e.g. a pandas Series or a NumPy array).

    Returns
    -------
    tuple
        ``(coef, arithmetic_mean, geometric_mean, atkinson)`` in that order.
    """
    arithmetic = data.mean()
    geometric = gmean(data)

    # Share "lost" to inequality: zero when every value is identical.
    inequality = 1 - geometric / arithmetic
    adjustment = 1 - inequality

    return adjustment, arithmetic, geometric, inequality

In [42]:
# Attainment code -> schooling value used by the index.
# NOTE(review): presumably these are ACS PUMS ``SCHL`` codes mapped to years of
# schooling — confirm against the data dictionary.
# Built once at definition time instead of on every call.
_SCHL_TO_YEARS = {
    4: 1, 5: 2, 6: 3, 7: 4, 8: 5,
    9: 6, 10: 7, 11: 8, 12: 9, 13: 10,
    14: 11, 15: 11, 16: 12, 17: 12,
    18: 12.5, 19: 13, 20: 14, 21: 16,
}


def to_category(x):
    """Translate an attainment code into its schooling value.

    Parameters
    ----------
    x : int or float
        Attainment code. Float codes such as ``19.0`` look up the same entry
        as their integer counterpart.

    Returns
    -------
    int or float
        * ``nan`` when ``x`` is NaN (a missing code stays missing),
        * ``18`` when ``x`` is greater than 21,
        * the table value for codes 4–21,
        * ``0`` for any other code at or below 21.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # missing code fails the ``<= 21`` comparison and would be scored as 18.
    if x != x:
        return float('nan')
    if x > 21:
        return 18
    return _SCHL_TO_YEARS.get(x, 0)

In [43]:
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
RAW_DATA_PATH = '/Volumes/Nergigante/Project_Anarchy/Coding/GitHub/IDH-index/data/raw/data_ppr_2021_raw.csv'
COLUMNS = ['AGEP', 'SCH', 'SCHL']

ADULT_AGE = 25           # mean years of schooling uses people aged 25 and over
SCHOOL_ENTRY_AGE = 5     # expected years of schooling sums enrollment from this age up
# Normalisation maxima for the two components.
# NOTE(review): presumably the HDI education goalposts — confirm the source.
MAX_MEAN_YEARS = 15
MAX_EXPECTED_YEARS = 18

# Parse only the columns the analysis needs; re-select to pin their order.
df = pd.read_csv(RAW_DATA_PATH, usecols=COLUMNS, low_memory=False)[COLUMNS]

# calculate the mean years of schooling
# reset_index() keeps the original row labels in an 'index' column.
edu_sch = df[df['AGEP'] >= ADULT_AGE].reset_index()
edu_sch['scholing'] = edu_sch['SCHL'].map(to_category)
edu_sch['enroled'] = np.where(edu_sch['scholing'] > 1, 1, 0)
mean_sch = edu_sch['scholing'].mean()

# get coefficient of adjustment
# Shift by 1 so that zero-schooling rows do not force the geometric mean to 0.
edu_sch['no_zero_schooling'] = 1 + edu_sch['scholing']
coef, amean, gemetric, atkinson = coef_and_stats(edu_sch['no_zero_schooling'])

# calculate the expected years of schooling
edu_exp = df[df['AGEP'] < ADULT_AGE].copy()
# SCH > 1 marks a person as enrolled; a missing SCH compares False and counts as 0.
edu_exp['enrolled'] = (edu_exp['SCH'] > 1).astype(int)

# Per-age head count and number enrolled.
edu_age = (
    edu_exp.groupby('AGEP')['enrolled']
    .agg(count='count', enrolled='sum')
    .reset_index()
)
edu_age['enrollment_rate'] = edu_age['enrolled'] / edu_age['count']

# Filter on the age value itself rather than dropping the first five rows by
# position, which only removes ages 0-4 if every one of them is present.
edu_age = edu_age[edu_age['AGEP'] >= SCHOOL_ENTRY_AGE]
exp_sch = edu_age['enrollment_rate'].sum()

# calculate index
edu_value = (mean_sch / MAX_MEAN_YEARS + exp_sch / MAX_EXPECTED_YEARS) / 2
edu_value_ajusted = coef * edu_value
print(f'index: {edu_value}\n', f"index adj: {edu_value_ajusted}")

index: 0.8716164146530334
 index adj: 0.798147282963029


In [56]:
# Scratch check: gaps between the rounded results printed by the index cell
# (0.798 adjusted, 0.8716 unadjusted) and two other figures (0.791, 0.872).
# NOTE(review): presumably 0.791 and 0.872 are published reference values —
# confirm the source. The two subtractions run in opposite directions.
print(0.798 - 0.791)
print(0.872 - 0.8716)

0.007000000000000006
0.00039999999999995595


In [44]:
# Sanity check: arithmetic mean of the shifted (1 + schooling) column.
np.mean(edu_sch['no_zero_schooling'])

13.346702429788577

In [45]:
# Sanity check: geometric mean of the shifted (1 + schooling) column.
gmean(edu_sch['no_zero_schooling'])

12.221699937916302

In [46]:
# Sanity check: arithmetic mean of the shifted (1 + schooling) column via the
# Series method, i.e. the same call coef_and_stats makes internally.
edu_sch['no_zero_schooling'].mean()

13.346702429788577

In [47]:
# Recompute the adjustment statistics on the shifted (1 + schooling) column and
# print them next to a hard-coded index of 0.906 and its adjusted value.
# NOTE(review): 0.906 is a literal, not a value computed in this notebook —
# presumably an external reference index; confirm where it comes from.
coef, amean, gemetric, atkinson = coef_and_stats(edu_sch['no_zero_schooling'])
print(f' Coef: {coef}\n', 
      f'Amean: {amean}\n', 
      f'Geometric: {gemetric}\n', 
      f'Atkinson: {atkinson}\n',
      f'Index: 0.906\n',
      f'Index adj: {0.906 * coef}')

 Coef: 0.9157093298670258
 Amean: 13.346702429788577
 Geometric: 12.221699937916302
 Atkinson: 0.0842906701329742
 Index: 0.906
 Index adj: 0.8296326528595254


In [48]:
# Bare literal echoed as the cell output; it equals 0.906 * coef ("Index adj")
# from the coefficient report cell.
0.8296326528595254

0.8296326528595254

In [49]:
# Display the adults (AGEP >= 25) frame with its derived schooling columns.
edu_sch

Unnamed: 0,index,AGEP,SCH,SCHL,scholing,enroled,no_zero_schooling
0,0,40,1.0,19.0,13.0,1,14.0
1,2,61,1.0,18.0,12.5,1,13.5
2,3,63,1.0,12.0,9.0,1,10.0
3,4,58,1.0,16.0,12.0,1,13.0
4,5,84,1.0,18.0,12.5,1,13.5
...,...,...,...,...,...,...,...
95065,124663,56,1.0,16.0,12.0,1,13.0
95066,124664,47,1.0,22.0,18.0,1,19.0
95067,124665,46,1.0,21.0,16.0,1,17.0
95068,124668,61,1.0,17.0,12.0,1,13.0


In [50]:
# List the distinct schooling values produced by to_category.
edu_sch['scholing'].unique()

array([13. , 12.5,  9. , 12. , 16. , 18. , 14. ,  5. ,  0. ,  8. ,  7. ,
       11. , 10. ,  6. ,  4. ,  2. ,  3. ,  1. ])

In [51]:
# Per-column non-null counts for rows whose SCH code is exactly 0.
df[df['SCH'] == 0].count()

AGEP    0
SCH     0
SCHL    0
dtype: int64

In [52]:
# Per-column non-null counts for adults whose mapped schooling value is 0.
edu_sch[edu_sch['scholing'] == 0].count()

index                2363
AGEP                 2363
SCH                  2363
SCHL                 2363
scholing             2363
enroled              2363
no_zero_schooling    2363
dtype: int64

In [55]:
# Save the adults frame (including the derived columns) to CSV.
# index=False omits the DataFrame's row index; the 'index' column created
# earlier by reset_index is a regular column and is still written.
# NOTE(review): absolute, machine-specific output path.
edu_sch.to_csv('/Volumes/Nergigante/Project_Anarchy/Coding/GitHub/IDH-index/data/interim/dev.csv', index=False)