# Subject Dataset Cleaning and Imputation

In [1]:
import pandas as pd
from fancyimpute import IterativeImputer
import numpy as np

Using TensorFlow backend.


In [2]:
# Load the six processed subject tables, one DataFrame per subject area.
# The paths are relative to the notebook's working directory.
General, Labor, Urbanization, Education, Inequality, Economic = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/General.csv",
        "../data/processed/Labor.csv",
        "../data/processed/Urbanization.csv",
        "../data/processed/Education.csv",
        "../data/processed/Inequality.csv",
        "../data/processed/Economic.csv",
    )
)

## Remove Sparse Variables

In [3]:
# Drop, in place, every column that describe() reports with fewer than 2000
# non-null values, and print the dropped names for each table.
# NOTE(review): the original comment calls this the "<50% values" cut-off, but
# 2000 is hard-coded -- TODO confirm it really is half of the row count.
for frame in (General, Labor, Urbanization, Education, Inequality, Economic):
    non_null_counts = frame.describe().loc["count"]
    sparse_columns = non_null_counts[non_null_counts < 2000].index.tolist()
    print(sparse_columns)
    frame.drop(columns=sparse_columns, inplace=True)

['SH.STA.HYGN.ZS', 'SH.H2O.SMDW.ZS', 'ER.LND.PTLD.ZS', 'DT.DOD.PVLX.GN.ZS', 'FB.CBK.DPTR.P3', 'FB.CBK.BRWR.P3', 'SG.VAW.1549.ZS', 'SG.DMK.ALLD.FN.ZS', 'SG.DMK.SRCR.FN.ZS', 'SH.STA.SUIC.P5', 'SH.STA.WASH.P5', 'SH.STA.BRTW.ZS', 'SH.STA.ANVC.ZS', 'SH.STA.FGMS.ZS', 'SH.PRV.SMOK', 'SH.ALC.PCAP.LI', 'per_si_allsi.cov_pop_tot', 'HD.HCI.OVRL']
['SL.TLF.0714.SW.TM', 'SL.TLF.0714.WK.TM', 'SL.ISV.IFRM.ZS', 'per_lm_alllm.cov_pop_tot']
['EN.POP.SLUM.UR.ZS']
['SE.ADT.LITR.ZS', 'SE.SEC.NENR']
['SI.POV.GINI', 'SI.POV.RUGP', 'SI.POV.URGP', 'SI.POV.NAGP', 'SI.DST.10TH.10', 'SI.DST.FRST.10', 'id', 'gini_reported', 'q1', 'q2', 'q3', 'q4', 'q5', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10', 'bottom5', 'top5', 'mean', 'median', 'exchangerate', 'mean_usd', 'median_usd', 'gdp_ppp_pc_usd2011', 'population', 'quality_score']
['DT.DOD.PVLX.CD', 'SI.POV.NAHC', 'SI.POV.URHC', 'SI.POV.RUHC', 'SI.POV.NAGP', 'SI.POV.GINI']


In [4]:
# Print the number of columns currently held by each subject table.
for frame in (General, Labor, Urbanization, Education, Inequality, Economic):
    print(frame.shape[1])

29
23
18
19
14
20


## Impute Remaining NaNs

In [5]:
# Move the "Country Code" / "Year" columns into the index of every table.
# This is done in place so the existing names keep pointing at the re-indexed
# frames; as a result this cell cannot be re-run without reloading the data.
index_keys = ["Country Code", "Year"]
for frame in (General, Labor, Urbanization, Education, Inequality, Economic):
    frame.set_index(index_keys, inplace=True)

In [6]:
# Work on copies so the un-imputed frames stay untouched by the imputation.
General_imputed = General.copy()
Labor_imputed = Labor.copy()
Urbanization_imputed = Urbanization.copy()
Education_imputed = Education.copy()
Inequality_imputed = Inequality.copy()
Economic_imputed = Economic.copy()

# Number of imputation runs averaged per table. It is defined once, outside
# the loop, and also drives the progress message so the two cannot drift apart.
n_imputations = 5

for df in [General_imputed,
            Labor_imputed,
            Urbanization_imputed,
            Education_imputed,
            Inequality_imputed,
            Economic_imputed]:
    XY_incomplete = df.values
    XY_completed = []
    for i in range(n_imputations):
        print("Imputing set {} of {}".format(i + 1, n_imputations))
        # sample_posterior=True with a different random_state per run, so the
        # runs are not all seeded identically.
        imputer = IterativeImputer(n_iter=10, sample_posterior=True, random_state=i, initial_strategy="median")
        XY_completed.append(imputer.fit_transform(XY_incomplete))

    # Element-wise mean and standard deviation across the imputation runs.
    XY_completed_mean = np.mean(XY_completed, axis=0)
    # The std is not consumed in this block and is overwritten on every table;
    # it is kept only so the last table's spread can be inspected ad hoc.
    XY_completed_std = np.std(XY_completed, axis=0)

    # Write the averaged values back into the copy; slice assignment keeps the
    # frame's existing index and column labels.
    df[:] = XY_completed_mean

Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5
Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5
Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5
Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5
Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5
Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5


## Check the Integrity of the Imputed Data Frames

In [7]:
def dfDiff(pair: tuple):
    """
    Return the cells whose values differ between two identically-labelled
    pandas.DataFrames.

    Source: https://wellsr.com/python/pandas-compare-two-data-frames/

    Parameters
    ----------
    pair : tuple
        ``(oldFrame, newFrame)``. Both frames must carry the same index and
        column labels; the ``!=`` comparison raises otherwise.

    Returns
    -------
    pandas.DataFrame
        One row per changed cell, indexed by (row label, column label), with
        the columns "Old" and "New". A cell that is missing (NaN) in exactly
        one frame counts as changed and shows NaN on that side. A cell that is
        missing in both frames counts as unchanged.

    Example
    -------
    IN:
    (
    Index | Col
    ------|-----
    0     |0
    1     |1
    2     |0
    3     |1
    4     |1
    ,
    Index | Col
    ------|-----
    0     |1
    1     |1
    2     |1
    3     |1
    4     |0
    )

    OUT:
    Index | Column | Old | New
    ------|--------|-----|-----
    0     |Col     |0    |1
    2     |Col     |0    |1
    4     |Col     |1    |0
    """
    oldFrame, newFrame = pair
    # Compare first so mismatched labels still raise exactly as before.
    differs = oldFrame != newFrame
    # NaN != NaN evaluates to True, so a cell missing in BOTH frames would be
    # flagged as changed. Exclude those cells explicitly rather than relying on
    # stack() happening to drop NaN rows, which depends on the pandas version.
    bothMissing = oldFrame.isna() & newFrame.isna()
    dfBool = (differs & ~bothMissing).stack()
    diff = pd.concat([oldFrame.stack()[dfBool], newFrame.stack()[dfBool]], axis=1)
    diff.columns = ["Old", "New"]
    return diff

In [8]:
diffs = []  # This will hold the computed difference dataframes
# One (original, imputed) pair per subject table. dfDiff unpacks each pair as
# (oldFrame, newFrame), so the original frame must come first for the "Old"
# column to hold the pre-imputation value and "New" the imputed one.
combinations = [(General, General_imputed),
            (Labor, Labor_imputed),
            (Urbanization, Urbanization_imputed),
            (Education, Education_imputed),
            (Inequality, Inequality_imputed),
            (Economic, Economic_imputed)]

In [9]:
# Compute one difference table per pair and collect them in pair order.
diffs.extend(dfDiff(frame_pair) for frame_pair in combinations)

In [10]:
# Row count of each difference table, i.e. the number of cells reported as changed.
changed = list(map(len, diffs))

In [11]:
# Show the per-table change counts as a small frame (first 20 rows at most).
pd.DataFrame(changed).iloc[:20]

Unnamed: 0,0
0,33926
1,24037
2,18937
3,24801
4,14102
5,22065


## Save the Data Frames

In [12]:
# Persist the imputed tables. The original cell wrote the un-imputed frames
# (General, Labor, ...), so the imputation results were computed but never
# saved. The index is written too, because to_csv keeps it by default.
General_imputed.to_csv("../data/production/subject/General.csv")
Labor_imputed.to_csv("../data/production/subject/Labor.csv")
Urbanization_imputed.to_csv("../data/production/subject/Urbanization.csv")
Education_imputed.to_csv("../data/production/subject/Education.csv")
Inequality_imputed.to_csv("../data/production/subject/Inequality.csv")
Economic_imputed.to_csv("../data/production/subject/Economic.csv")