# Education Data Processing (WDI)
## Data Dictionary
| Code              | Indicator Name                                                              |
|-------------------|-----------------------------------------------------------------------------|
| SE.ENR.PRSC.FM.ZS | School enrollment, primary and secondary (gross), gender parity index (GPI) |
| SE.ADT.LITR.ZS    | Literacy rate, adult total (% of people ages 15 and above)                  |
| SE.COM.DURS       | Compulsory education, duration (years)                                      |
| SE.PRM.UNER.ZS    | Children out of school (% of primary school age)                            |
| SE.PRM.ENRL.TC.ZS | Pupil-teacher ratio, primary                                                |
| SE.SEC.ENRL.TC.ZS | Pupil-teacher ratio, secondary                                              |
| SE.SEC.NENR       | School enrollment, secondary (% net)                                        |

In [1]:
import re

import numpy as np
import pandas as pd
import pycountry

%matplotlib inline

def _format_three_decimals(value):
    """Render a number with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show every float in DataFrame/Series output with three decimal places.
pd.set_option('display.float_format', _format_three_decimals)

## Load The File

In [2]:
# World Development Indicators extract, addressed relative to this notebook.
wdi_extract_path = "../data/external/Education/WDI/Data_Extract_From_World_Development_Indicators.xlsx"
df = pd.read_excel(wdi_extract_path)

In [3]:
# Preview five randomly chosen rows of the loaded frame.
df.sample(n=5)

Unnamed: 0,Time,Time Code,Country Name,Country Code,"School enrollment, primary and secondary (gross), gender parity index (GPI) [SE.ENR.PRSC.FM.ZS]","Literacy rate, adult total (% of people ages 15 and above) [SE.ADT.LITR.ZS]","Compulsory education, duration (years) [SE.COM.DURS]",Children out of school (% of primary school age) [SE.PRM.UNER.ZS],"Pupil-teacher ratio, primary [SE.PRM.ENRL.TC.ZS]","Pupil-teacher ratio, secondary [SE.SEC.ENRL.TC.ZS]","School enrollment, secondary (% net) [SE.SEC.NENR]"
2359,2003,YR2003,Middle East & North Africa (IDA & IBRD countries),TMN,0.914,71.843,9,9.910,23.312,19.504,62.504
2941,2006,YR2006,Belize,BLZ,0.994,..,8,4.284,22.939,17.280,64.775
160,1995,YR1995,Paraguay,PRY,0.992,..,..,8.295,22.306,..,..
4721,2012,YR2012,IDA & IBRD total,IBT,0.973,82.460,9,9.658,26.306,18.224,60.644
1848,2002,YR2002,Argentina,ARG,1.024,..,10,..,17.596,17.041,79.362


## Standardize Country Codes

In [4]:
""" Only Select rows with valid country codes
"""
def _is_known_country(code):
    """Return True when `code` is a string that pycountry can resolve."""
    # Reject non-string values (e.g. NaN from an empty cell) up front rather
    # than depending on how pycountry reacts to them.
    if not isinstance(code, str):
        return False
    try:
        pycountry.countries.lookup(code)
    except LookupError:
        return False
    return True


# Check each distinct code once instead of once per row, then keep only the
# rows whose code was recognised.
valid_codes = {code for code in df['Country Code'].unique() if _is_known_country(code)}
# One boolean per row, in row order: True where the row belongs to a country.
country_locations = df['Country Code'].isin(valid_codes).tolist()
df = df[country_locations]

## Standardize Indexes

In [5]:
# The extract calls the year column "Time"; refer to it as "Year" from here on.
df = df.rename(columns={"Time": "Year"})

In [6]:
# Key every row by the (country code, year) pair.
index_columns = ["Country Code", "Year"]
df = df.set_index(index_columns)

## Clean Data

### Header

In [7]:
# Remove the two descriptive columns that are not carried forward.
unused_columns = ["Time Code", "Country Name"]
df = df.drop(columns=unused_columns)

In [8]:
# Reduce each column header to the indicator code inside its square brackets,
# e.g. "Compulsory education, duration (years) [SE.COM.DURS]" -> "SE.COM.DURS".
_indicator_code = re.compile(r"\[((?:\w+\.)+\w+)\]")


def _short_name(header):
    """Return the bracketed indicator code in `header`, or `header` unchanged."""
    found = _indicator_code.search(str(header))
    # A header without a bracketed code keeps its name instead of raising
    # TypeError from subscripting a failed match.
    return found.group(1) if found else header


c = [_short_name(d) for d in df.columns]

In [9]:
# Map each current column label to the indicator code extracted for it
# (both sequences are in column order).
c_names = dict(zip(df.columns, c))

In [10]:
# Swap the long descriptive headers for their indicator codes.
df = df.rename(columns=c_names)

### Data Types

In [11]:
""" Replace '..' with np.nan for better parsing
"""
# Use the lowercase np.nan spelling: the np.NaN alias was removed in NumPy 2.0,
# while np.nan is available on every NumPy release.
df = df.replace('..', np.nan)

In [12]:
# Cast every remaining column to 64-bit floats.
df = df.astype("float64")

In [13]:
# Preview five randomly chosen rows of the cleaned frame.
df.sample(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,SE.ENR.PRSC.FM.ZS,SE.ADT.LITR.ZS,SE.COM.DURS,SE.PRM.UNER.ZS,SE.PRM.ENRL.TC.ZS,SE.SEC.ENRL.TC.ZS,SE.SEC.NENR
Country Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NCL,2017,,,,,,,
KWT,2011,0.987,,9.0,0.9,8.609,,86.078
KNA,2010,,,12.0,,14.12,9.267,
MAR,2000,0.832,,9.0,23.722,28.763,16.973,
GEO,2010,,,9.0,,,,


## Save Data

In [14]:
# Persist the cleaned frame as a pickle, addressed relative to this notebook.
processed_pickle_path = "../data/processed/Education_WDI.pickle"
df.to_pickle(processed_pickle_path)