In [87]:
from fancyimpute import SimpleFill, KNN, IterativeImputer, IterativeSVD
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [134]:
def construct_missing_X(missing_list, df):
    """Build the complete data matrix and a copy with masked cells set to NaN.

    Parameters
    ----------
    missing_list : array-like
        Flat mask holding ``nrow * ncol`` entries, reshaped row-major to the
        shape of ``df``. A truthy entry keeps the cell's value; a falsy entry
        blanks the cell out with NaN.
    df : pandas.DataFrame
        Table of shape ``(nrow, ncol)`` whose cells are convertible to float.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data_complete, data_incomplete)``: two float arrays shaped like
        ``df``. ``data_complete`` holds every value of ``df``;
        ``data_incomplete`` holds the same values where the mask is truthy
        and NaN elsewhere.
    """
    nrow, ncol = df.shape
    # np.array copies, so the returned matrix never aliases the frame's data.
    data_complete = np.array(df.values, dtype=float)
    keep = np.asarray(missing_list).reshape(nrow, ncol).astype(bool)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    data_incomplete = np.where(keep, data_complete, np.nan)
    return data_complete, data_incomplete

def impute_baselines(df, missing, knn_k=3, svd_rank=9):
    """Print the mean absolute error (MAE) of four baseline imputers.

    The table is masked with ``construct_missing_X``, each imputer fills the
    masked matrix, and the summed absolute difference to the complete matrix
    is divided by the number of blanked-out cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Complete table used as ground truth.
    missing : numpy.ndarray
        Flat mask passed on to ``construct_missing_X``; falsy entries are the
        cells that get blanked out and imputed.
    knn_k : int, optional
        ``k`` passed to ``KNN``. Defaults to 3.
    svd_rank : int, optional
        ``rank`` passed to ``IterativeSVD``. Defaults to 9.

    Returns
    -------
    None
        Results are printed, one line per method.
    """
    X, X_incomplete = construct_missing_X(missing, df)

    # Count blanked-out cells with the same truthiness rule the mask is
    # applied with in construct_missing_X.
    mask = np.asarray(missing)
    n_missing = mask.size - np.count_nonzero(mask)

    imputers = (
        ("Mean", SimpleFill()),
        ("KNN", KNN(k=knn_k, verbose=False)),
        ("SVD", IterativeSVD(rank=svd_rank, verbose=False)),
        ("MICE", IterativeImputer()),
    )

    # Fit every imputer before printing anything, so a failing method does
    # not leave a partial report behind.
    results = []
    for label, imputer in imputers:
        X_filled = imputer.fit_transform(X_incomplete)
        # The error is summed over all cells but averaged over the missing
        # ones only; this assumes the imputer returns observed cells
        # unchanged so they contribute zero — TODO confirm per imputer.
        mae = np.abs(X_filled - X).sum() / n_missing
        results.append((label, mae))

    print("Imputation methods and MAE")
    for label, mae in results:
        print(label + ": " + str(mae))

In [111]:
# Show the kernel's working directory; the relative "./..." paths in the
# loading cells are resolved against it.
pwd()

'/Users/daisyding/Desktop/MDI_GNN'

In [123]:
# Parkinson data: load the flat mask and the table, then print the baseline MAEs.
# NOTE(review): the recorded output of this cell has no "SVD" line although
# impute_baselines prints one — presumably it was produced by an earlier
# version of the function; re-run the cell to refresh it.
missing = np.load("./Data/uci/pks/len4290rate0.7seed0.npy")
df = pd.read_csv("./parkinson.csv")
impute_baselines(df, missing)

Imputation methods and MAE
Mean: 0.13405062374499693
KNN: 0.06454642243109601
MICE: 0.059375814452296324


In [125]:
# Sanity check: rows * columns must equal the mask length, because
# construct_missing_X reshapes the mask to this shape.
df.shape

(195, 22)

In [113]:
def process(df):
    """Drop the first three columns, zero out "?" cells and min-max scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. Apart from "?" placeholders, every remaining cell must be
        convertible to float.

    Returns
    -------
    pandas.DataFrame
        The scaled values in a new frame with default integer row and column
        labels. The input frame is not modified.
    """
    # Columns are dropped by position, not by name.
    features = df.drop(df.columns[[0, 1, 2]], axis=1)

    # Work on an object-typed copy so the "?" placeholders can be replaced in
    # one vectorized step instead of a cell-by-cell .iloc loop.
    values = features.values.astype(object)
    values[values == "?"] = 0

    # Explicit float conversion: numeric strings left in columns that held a
    # "?" become numbers before scaling.
    x = values.astype(float)

    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    return pd.DataFrame(x_scaled)

In [114]:
# Load the flat mask for the breast-cancer table; it is consumed by the
# impute_baselines call a few cells below.
missing = np.load("./Data/uci/cancer/len6336rate0.7seed0.npy")

In [115]:
# Download the WPBC file from the UCI repository (it has no header row) and
# run it through process(), which drops the first three columns, replaces
# "?" cells with 0 and min-max scales the rest.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data',
                header=None)
df = process(df)

In [116]:
# Print the baseline MAEs for the processed breast-cancer table, using the
# `df` and `missing` globals set in the two preceding cells.
impute_baselines(df, missing)

Imputation methods and MAE
Mean: 0.13044041684456995
KNN: 0.08768465393524684
SVD: 0.07604318196034111
MICE: 0.055874663004220185




In [117]:
# Housing data: load the flat mask and the table, then print the baseline MAEs.
# The CSV is used as read, without going through process().
missing = np.load("./Data/uci/housing/len6578rate0.7seed0.npy")
df = pd.read_csv("./housing.csv")
impute_baselines(df, missing)

Imputation methods and MAE
Mean: 0.1778604781298496
KNN: 0.09068781898034092
SVD: 0.17754722699166395
MICE: 0.12474540943543222




In [135]:
# Wine data: load the flat mask and the table, then print the baseline MAEs.
# The CSV is used as read, without going through process().
missing = np.load("./Data/uci/wine/len15990rate0.7seed0.npy")
df = pd.read_csv("./wine.csv")
impute_baselines(df, missing)

Imputation methods and MAE
Mean: 0.09321925529574956
KNN: 0.07938734915204612
SVD: 0.09825681075273587
MICE: 0.07703349438628111


