# Full Dataset Cleaning and Imputation

In [40]:
import pandas as pd
from fancyimpute import IterativeImputer
import numpy as np

In [41]:
# Load the processed dataset from its CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="../data/processed/Full.csv")

# Column labels as an array, kept around for listing the available variables.
dc = df.columns.values

## Remove Unneeded Variables

In [42]:
# List out the variables
# for i in dc:
#     print(i)

In [43]:
# Variables to remove, each with the reason it is not needed.
rm_vars = [
    "quality_score",  # meta
    "population",     # duplicate
    "id",             # meta
]

In [44]:
# Drop the unneeded variables from df in place (no new frame is created).
df.drop(columns=rm_vars, inplace=True)

## Remove Sparse Variables

In [45]:
# Summary statistics with one row per described column, so that each
# statistic (e.g. "count") can be read as a column of ds.
ds = df.describe().T

In [46]:
# Columns with fewer than 800 non-missing values are treated as sparse.
# The original note describes this as "<20% values"
# (presumably 800 is ~20% of the row count -- TODO confirm against len(df)).
is_sparse = ds["count"] < 800
sparse = ds.loc[is_sparse].index.tolist()

In [47]:
# Drop the sparse columns from df in place.
df.drop(columns=sparse, inplace=True)

## Impute Remaining NaNs

In [48]:
# Move the identifying columns into the index (in place) so that df.values,
# which is what gets imputed, no longer contains them.
df.set_index(keys=["Country Code", "Year"], inplace=True)

In [49]:
# Raw values of df (with its NaNs) as the array handed to the imputer.
XY_incomplete = df.values

# Number of completed datasets to generate; each run gets its own random_state.
n_imputations = 5
XY_completed = []
for i in range(n_imputations):
    # Report progress against n_imputations instead of a hard-coded total,
    # so the message stays correct when the number of imputations changes.
    print("Imputing set {} of {}".format(i+1, n_imputations))
    imputer = IterativeImputer(n_iter=10, sample_posterior=True, random_state=i, initial_strategy="median")
    XY_completed.append(imputer.fit_transform(XY_incomplete))

# Element-wise mean and standard deviation across the completed arrays (axis 0
# runs over the imputations).
XY_completed_mean = np.mean(XY_completed, 0)
XY_completed_std = np.std(XY_completed, 0)

Imputing set 1 of 5
Imputing set 2 of 5
Imputing set 3 of 5
Imputing set 4 of 5
Imputing set 5 of 5


In [50]:
# Start every imputed frame as a copy of df so it keeps df's index and columns.
df1, df2, df3, df4, df5 = (df.copy() for _ in range(5))

# Overwrite all cells of the k-th copy with the k-th completed array.
for k, imputed_frame in enumerate((df1, df2, df3, df4, df5)):
    imputed_frame[:] = XY_completed[k]

## Check The Integrity of Imputed Dataframes

In [51]:
def dfDiff(pair: tuple):
    """
    Takes a tuple of two pandas.DataFrames and returns a dataframe with changed values 
    
    Source: https://wellsr.com/python/pandas-compare-two-data-frames/
    
    A cell is reported when the two frames hold different values in it.
    Cells that are missing (NaN) in BOTH frames are not reported: NaN != NaN
    evaluates to True, so they are excluded explicitly rather than relying on
    stack() dropping NaN entries. A cell that is missing in only one frame is
    reported, with NaN on the missing side.
    
    Both frames are compared with `!=`, so they are expected to carry the same
    index and column labels (single-level columns).
    
    Args:
        pair: (oldFrame, newFrame), the two dataframes to compare.
    
    Returns:
        A dataframe with columns "Old" and "New", holding one row per changed
        cell. Its index is the stacked (row label, column label) index of the
        changed cells, in row-by-row order.
    
    IN:
    (
    Index | Col 
    ------|-----
    0     |0  
    1     |1  
    2     |0  
    3     |1  
    4     |1  
    ,
    Index | Col 
    ------|-----
    0     |1  
    1     |1  
    2     |1  
    3     |1  
    4     |0  
    )
    
    OUT:
    Index | Col | Old | New
    ------|-----|-----|-----
    0     |num  |0    |1
    2     |num  |0    |1
    4     |num  |1    |0
    """
    oldFrame, newFrame = pair

    # True where the values differ, except where both sides are missing.
    bothMissing = oldFrame.isna() & newFrame.isna()
    changedCells = (oldFrame != newFrame) & ~bothMissing
    cellMask = changedCells.to_numpy()

    # The boolean frame contains no NaN, so stacking it always yields one entry
    # per cell in row-by-row order -- the same order as the flattened mask.
    changedIndex = changedCells.stack().index[cellMask.ravel()]

    # Pick the values positionally so that NaN on one side is kept as NaN
    # instead of depending on how stack() treats missing values.
    diff = pd.DataFrame(
        {
            "Old": oldFrame.to_numpy()[cellMask],
            "New": newFrame.to_numpy()[cellMask],
        },
        index=changedIndex,
    )
    return diff

def pairCombinations(l):
    """
    Takes a list (l) and returns a list of 2-tuples containing every possible pair.

    Each item is paired, in order, with every item that comes after it in l,
    so no item is paired with itself and each pair appears only once.

    IN:
    ['a', 'b', 'c']

    OUT:
    [('a', 'b'), ('a', 'c'), ('b', 'c')]

    A list with fewer than two items gives an empty list.
    """
    return [
        (first, second)
        for position, first in enumerate(l)
        for second in l[position + 1:]
    ]

In [52]:
# The frames to compare and their labels, listed in the same order so that the
# i-th label pair describes the i-th frame pair.
frame_objects = [df, df1, df2, df3, df4, df5]
frame_names = ['df', 'df1', 'df2', 'df3', 'df4', 'df5']

# All possible combinations of two dataframes, and the matching label pairs.
combinations = pairCombinations(frame_objects)
combination_labels = pairCombinations(frame_names)

# This will hold the computed difference dataframes.
diffs = []

In [53]:
# Rebuild the whole list instead of appending to it, so that re-running this
# cell does not leave a second copy of every diff in `diffs` (which would no
# longer line up with the label pairs).
diffs = [dfDiff(pair) for pair in combinations]

In [54]:
# Split the label pairs into one list per side of the comparison.
df_one = [first for first, _ in combination_labels]
df_two = [second for _, second in combination_labels]

# Number of rows in each diff frame, i.e. how many cells differ for that pair.
changed = list(map(len, diffs))

# Column name -> values for the summary table.
cols = {}
cols["DF ONE"] = df_one
cols["DF TWO"] = df_two
cols["Changed Values"] = changed

In [55]:
# Show the summary table: one row per compared pair (at most the first 20 rows).
pd.DataFrame(data=cols).head(n=20)

Unnamed: 0,DF ONE,DF TWO,Changed Values
0,df,df1,149842
1,df,df2,149842
2,df,df3,149842
3,df,df4,149842
4,df,df5,149842
5,df1,df2,149842
6,df1,df3,149842
7,df1,df4,149842
8,df1,df5,149842
9,df2,df3,149842


## Save the Data Frames

In [56]:
# Each frame paired with the CSV file it is written to: the cleaned frame
# first, then the five imputed frames.
outputs = [
    (df, "../data/production/Full_clean.csv"),
    (df1, "../data/production/Full_clean_imputed_1.csv"),
    (df2, "../data/production/Full_clean_imputed_2.csv"),
    (df3, "../data/production/Full_clean_imputed_3.csv"),
    (df4, "../data/production/Full_clean_imputed_4.csv"),
    (df5, "../data/production/Full_clean_imputed_5.csv"),
]

for frame, path in outputs:
    frame.to_csv(path)