In [1]:
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np
import collections
import csv
import pandas as pd 
from scipy.stats import mode
from sklearn.neighbors import NearestNeighbors
import cvxpy #conda install -c cvxgrp cvxpy
import itertools
from sklearn.utils.extmath import randomized_svd
import math

# Imputation

Input: LongitudinalDataAnalysis_train.csv, LongitudinalDataAnalysis_test.csv

Output: ImputedMatrix_train.csv, ImputedMatrix_test.csv

In [2]:
def imputation_mean_mode(data_train, data_test):
    """Impute missing values with the training mean (numeric) or mode (binary).

    Every fill value is computed from the observed (non-NaN) training values
    of a column and is then applied to both the training and the test column,
    so the test set never contributes to its own imputation.

    Parameters:
    -----------
    data_train: training data as returned from longitudinal function
    data_test: test data as returned from longitudinal function

    Output:
    -------
    Saves csv matrix "ImputedMatrix_train.csv" and "ImputedMatrix_test.csv"

    Notes:
    ------
    * Both frames are modified in place.
    * If a column is completely empty in the training set, it is dropped from
      both frames.
    * A column is treated as categorical when all of its observed training
      values are 0 or 1; ties for the mode resolve to the smaller value.
    """

    # Snapshot the column list: columns may be deleted inside the loop.
    for column in list(data_train.columns):
        data_train[column] = pd.to_numeric(data_train[column])
        observed = data_train[column].dropna()

        # Nothing observed in training -> nothing to impute from: drop it.
        if observed.empty:
            del data_train[column]
            if column in data_test.columns:
                del data_test[column]
            continue

        # The binary check must ignore NaN, otherwise a column with missing
        # entries (the only kind that needs imputing) is never recognised.
        if observed.isin([0, 1]).all():
            fill_value = observed.mode().iloc[0]  # most frequent value
        else:
            fill_value = observed.mean()

        # Compute once, then apply the same training statistic to both sets.
        data_train[column] = data_train[column].fillna(fill_value)
        if column in data_test.columns:
            data_test[column] = data_test[column].fillna(fill_value)

    data_train.to_csv("ImputedMatrix_train.csv", index=False)
    data_test.to_csv("ImputedMatrix_test.csv", index=False)



# kNN Imputation

Iterate through the rows. If a row has missing data (say columns 1 and 4 are missing), find its nearest neighbors among the complete rows, comparing on every column except 1 and 4. To impute column 1, take up to 10 of the nearest neighbors whose column 1 value is non-NaN and use their mean or mode, depending on the type of the column. Imputation does not happen in place: the imputed values are written to a copy of the data.

In [3]:
def _knn_fill_missing(targets, donors, exclude_self, n_donors=10):
    """Return a copy of ``targets`` whose NaNs are filled from nearby ``donors`` rows.

    Parameters:
    -----------
    targets: 2-D float array holding the rows to impute
    donors: 2-D float array with the same column layout, used as neighbours
    exclude_self: True when ``targets`` and ``donors`` hold the same rows, so
        that row ``r`` is never chosen as its own neighbour
    n_donors: maximum number of non-NaN neighbour values averaged per cell

    Notes:
    ------
    * The last column is assumed to be the diagnosis label (as in the frames
      passed by the caller); it is never used for the distance computation.
    * Entries that cannot be imputed (no usable neighbour) are left as NaN.
    """
    n_cols = targets.shape[1]
    label_col = n_cols - 1
    donor_missing = np.isnan(donors)

    # A column is categorical when every observed donor value is 0 or 1.
    # Deciding this on the whole column is stable; deciding it on the handful
    # of neighbour values is not.
    binary_cols = set()
    for col in range(n_cols):
        seen = donors[~donor_missing[:, col], col]
        if seen.size and np.all((seen == 0) | (seen == 1)):
            binary_cols.add(col)

    filled = targets.copy()
    for r in range(targets.shape[0]):
        row_missing = np.isnan(targets[r])
        if not row_missing.any():
            continue

        # Columns this row actually has; distances use those minus the label.
        present_cols = np.flatnonzero(~row_missing)
        feature_cols = present_cols[present_cols != label_col]

        # Candidate neighbours must be complete on every column the row has.
        usable = ~donor_missing[:, present_cols].any(axis=1)
        if exclude_self:
            usable[r] = False
        candidate_rows = np.flatnonzero(usable)
        if feature_cols.size == 0 or candidate_rows.size == 0:
            continue

        # Rank *all* candidates by distance; the per-column loop below then
        # keeps the closest ones that have a value for that column.
        neigh = NearestNeighbors(n_neighbors=candidate_rows.size)
        neigh.fit(donors[candidate_rows][:, feature_cols])
        order = neigh.kneighbors([targets[r, feature_cols]])[1][0]
        ranked_rows = candidate_rows[order]

        for col in np.flatnonzero(row_missing):
            values = donors[ranked_rows, col]
            values = values[~np.isnan(values)][:n_donors]
            if values.size == 0:
                continue
            if int(col) in binary_cols:
                # Mode of 0/1 values; a tie resolves to 0.
                filled[r, col] = 1.0 if values.mean() > 0.5 else 0.0
            else:
                filled[r, col] = values.mean()

    return filled


def imputation_knn(data_train, data_test):
    """Function that provides imputation by kNN- takes the mean/mode of the values for 10 nearest neighbors (if possible) with valid data

    Training rows are imputed from the other training rows (using their
    original, un-imputed values); test rows are imputed from the completed
    training matrix.

    Parameters:
    -----------
    data_train: training data as returned from longitudinal function
    data_test: test data as returned from longitudinal function

    Output:
    -------
    Saves csv matrix "ImputedMatrix_train.csv" and "ImputedMatrix_test.csv"

    Notes:
    ------
    * If column is completely empty in the training set, drops it from both
      frames (in place).
    * Both frames must have the same columns in the same order, with the
      diagnosis label as the last column.

    Raises:
    -------
    ValueError: if the two frames end up with a different number of columns.
    """

    #deletes any columns that are empty in training set
    for column in list(data_train.columns):
        if pd.isnull(data_train[column]).all():
            del data_train[column]
            if column in data_test.columns:
                del data_test[column]

    # astype(float) also guarantees private copies of the frame data.
    matrix = data_train.values.astype(float)
    matrix_test = data_test.values.astype(float)
    if matrix.shape[1] != matrix_test.shape[1]:
        raise ValueError("data_train and data_test must have the same columns")

    complete_matrix = _knn_fill_missing(matrix, matrix, exclude_self=True)

    ################## Now impute test set #####################
    #complete matrix has the training data
    complete_matrix_test = _knn_fill_missing(matrix_test, complete_matrix,
                                             exclude_self=False)

    #convert Training data to DataFrame and Save
    data = pd.DataFrame(data=complete_matrix, columns=list(data_train))
    data.to_csv("ImputedMatrix_train.csv", index=False)

    #convert Testing data to DataFrame, Save
    data = pd.DataFrame(data=complete_matrix_test, columns=list(data_test))
    data.to_csv("ImputedMatrix_test.csv", index=False)

In [28]:
def generate_random_column_samples(column):
    """Draw one random placeholder value per NaN entry of ``column``.

    The draws are normally distributed with the mean and standard deviation
    of the observed (non-NaN) entries. When every entry is NaN there is
    nothing to estimate from, so an all-zero array shaped like ``column`` is
    returned instead.
    """
    nan_positions = np.isnan(column)
    missing_count = np.sum(nan_positions)

    if missing_count != len(column):
        center = np.nanmean(column)
        spread = np.nanstd(column)
        return np.random.randn(missing_count) * spread + center

    # Fully empty column: no statistics available.
    return np.zeros_like(column)


def _nuclear_norm_complete(frame, error_tolerance=0.0001):
    """Return a copy of ``frame`` with NaNs filled by nuclear-norm minimisation.

    Solves ``min ||S||_*`` subject to ``|S - X| <= error_tolerance`` on the
    observed entries, then copies the entries of ``S`` into the positions
    that were missing in ``frame``. Observed entries are returned untouched.

    Raises:
    -------
    RuntimeError: if the solver does not return a solution.
    """
    # Work on a private float copy: ``frame.values`` may be a view of the
    # frame's own data, and writing placeholder values into it would erase
    # the NaNs that the post-processing below relies on.
    X = np.array(frame.values, dtype=float)
    missing_mask = np.isnan(X)
    if not missing_mask.any():
        return frame.copy()  # nothing to impute, skip the solve

    # Any finite placeholder works: missing entries are multiplied by zero
    # through ``ok_mask`` before they reach the constraint.
    X[missing_mask] = 0.0
    m, n = X.shape

    # NOTE(review): cvxpy calls are kept exactly as the notebook wrote them
    # (``Variable(m, n)``, ``mul_elemwise``) -- confirm against the installed
    # cvxpy version.
    S = cvxpy.Variable(m, n, name="S")

    ok_mask = ~missing_mask
    masked_X = cvxpy.mul_elemwise(ok_mask, X)
    masked_S = cvxpy.mul_elemwise(ok_mask, S)

    abs_diff = cvxpy.abs(masked_S - masked_X)
    close_to_data = abs_diff <= error_tolerance
    constraints = [close_to_data]
    print(constraints)

    norm = cvxpy.norm(S, "nuc")
    objective = cvxpy.Minimize(norm)

    problem = cvxpy.Problem(objective, constraints)
    problem.solve(verbose=True, solver=cvxpy.SCS)

    if S.value is None:
        raise RuntimeError("nuclear-norm solver returned no solution "
                           "(status: %s)" % problem.status)

    completed = pd.DataFrame(data=np.asarray(S.value),
                             index=frame.index,
                             columns=list(frame))

    null_mask = frame.isnull()
    cols_missing = frame.columns[(null_mask.sum() > 0).values]

    # Heuristic carried over unchanged: columns whose distinct observed values
    # sum to less than 20 and whose smallest value prints in fewer than 6
    # characters are treated as integer-coded, and only the first 5 of them
    # get rounded. NOTE(review): the thresholds and the 5-column cap look
    # dataset-specific -- confirm they are intended.
    small_valued = [
        column for column in cols_missing
        if np.unique(frame.loc[~null_mask[column], column].values).sum() < 20
    ]
    integer_like = [
        column for column in small_valued
        if len(str(np.unique(frame[column].values)[0])) < 6
    ]
    integer_like = integer_like[0:5]

    for col in integer_like:
        rows = null_mask[col]
        values = completed.loc[rows, col].values.astype(float)
        # Same rule as before: first decimal digit < 5 rounds down, else up.
        first_decimal = ((values * 10) % 10).astype(int)
        completed.loc[rows, col] = np.where(first_decimal < 5,
                                            np.floor(values),
                                            np.ceil(values))

    # Single .loc assignment instead of chained indexing, which may write to
    # a temporary copy and silently leave the NaNs in place.
    result = frame.copy()
    for column in cols_missing:
        rows = null_mask[column]
        result.loc[rows, column] = completed.loc[rows, column]

    return result


def imputation_convexOptimization(data_train, data_test):
    """Function that provides imputation using exact matrix completion via convex optimization

    The training and the test matrix are completed independently of each
    other. The last column of each frame (the diagnosis label) is excluded
    from the model and from the saved files.

    Parameters:
    -----------
    data_train: training data as returned from longitudinal function
    data_test: test data as returned from longitudinal function

    Output:
    -------
    Saves csv matrix "ImputedMatrix_train.csv" and "ImputedMatrix_test.csv"

    Notes:
    ------
    If column is completely empty in the training set, drops it. The frames
    passed in are not modified.
    """
    #model inputs- ignore last Diagnoistics column
    data_train = data_train.iloc[:, :-1].copy()
    data_test = data_test.iloc[:, :-1].copy()

    #deletes any columns that are empty in training set
    for column in list(data_train.columns):
        if pd.isnull(data_train[column]).all():
            del data_train[column]
            if column in data_test.columns:
                del data_test[column]

    imputed_train = _nuclear_norm_complete(data_train)
    imputed_train.to_csv("ImputedMatrix_train.csv", index=False)

    ################## Now impute test set #####################
    imputed_test = _nuclear_norm_complete(data_test)
    imputed_test.to_csv("ImputedMatrix_test.csv", index=False)

In [29]:
def _soft_impute_matrix(X_incomplete, max_iters=100, convergence_threshold=0.001,
                        max_rank=None, n_power_iterations=1):
    """Complete a matrix with NaNs by iterative soft-thresholded SVD.

    Missing entries start from random draws matching each column's observed
    mean/std. Each iteration computes an SVD of the current matrix, shrinks
    the singular values by ``max singular value / 50`` and overwrites only
    the missing entries with the low-rank reconstruction. Iteration stops
    after ``max_iters`` rounds or once the relative change of the missing
    entries drops below ``convergence_threshold``.

    Returns a new float array; ``X_incomplete`` is not modified.
    """
    # Private copy: the caller may pass a view of a DataFrame's data.
    X_filled = np.array(X_incomplete, dtype=float)
    missing_mask = np.isnan(X_filled)
    if not missing_mask.any():
        return X_filled  # nothing to impute

    for col_idx in range(X_filled.shape[1]):
        missing_col = missing_mask[:, col_idx]
        if missing_col.sum() == 0:
            continue
        X_filled[missing_col, col_idx] = generate_random_column_samples(
            X_filled[:, col_idx])

    # Shrinkage is a fixed fraction of the (approximate) top singular value.
    _, s, _ = randomized_svd(X_filled, 1, n_iter=5)
    shrinkage_value = s[0] / 50.0

    for _ in range(max_iters):
        if max_rank:
            (U, s, V) = randomized_svd(
                X_filled,
                max_rank,
                n_iter=n_power_iterations)
        else:
            # full (economy-size) SVD
            (U, s, V) = np.linalg.svd(
                X_filled,
                full_matrices=False,
                compute_uv=True)

        s_thresh = np.maximum(s - shrinkage_value, 0)
        rank = (s_thresh > 0).sum()
        X_reconstruction = np.dot(U[:, :rank] * s_thresh[:rank], V[:rank, :])

        old_missing_values = X_filled[missing_mask]
        new_missing_values = X_reconstruction[missing_mask]

        ssd = np.sum((old_missing_values - new_missing_values) ** 2)
        old_norm = np.sqrt((old_missing_values ** 2).sum())

        X_filled[missing_mask] = new_missing_values

        # Stop once the imputed entries have stabilised. The explicit
        # old_norm check avoids a 0/0 when all imputed entries are zero.
        if ssd == 0 or (old_norm > 0 and
                        (np.sqrt(ssd) / old_norm) < convergence_threshold):
            break

    return X_filled


def soft_impute(data_train, data_test):
    """
    Implementation of the SoftImpute algorithm via spectral regularization algorithm

    The training and the test matrix are completed independently of each
    other. The last column of each frame (the diagnosis label) is excluded
    from the model and from the saved files.

    Parameters:
    -----------
    data_train: training data as returned from longitudinal function
    data_test: test data as returned from longitudinal function

    Output:
    -------
    Saves csv matrix "ImputedMatrix_train.csv" and "ImputedMatrix_test.csv"

    Notes:
    ------
    If column is completely empty in the training set, drops it. The frames
    passed in are not modified.
    """
    #model inputs- ignore last Diagnoistics column
    data_train = data_train.iloc[:, :-1].copy()
    data_test = data_test.iloc[:, :-1].copy()

    #deletes any columns that are empty in training set
    for column in list(data_train.columns):
        if pd.isnull(data_train[column]).all():
            del data_train[column]
            if column in data_test.columns:
                del data_test[column]

    complete_matrix = _soft_impute_matrix(data_train.values)

    ################## Now impute test set #####################
    complete_matrix_test = _soft_impute_matrix(data_test.values)

    #convert Training data to DataFrame and Save
    data = pd.DataFrame(data=complete_matrix, columns=list(data_train))
    data.to_csv("ImputedMatrix_train.csv", index=False)

    #convert Testing data to DataFrame, Save
    data = pd.DataFrame(data=complete_matrix_test, columns=list(data_test))
    data.to_csv("ImputedMatrix_test.csv", index=False)
    

In [30]:
def imputation(imputationType):
    """ Driver imputation function

    Reads "LongitudinalDataAnalysis_train.csv" and
    "LongitudinalDataAnalysis_test.csv" from the working directory and passes
    both frames to the selected imputation routine, which writes its result
    files itself.

    imputationType: how to impute; "meanmode", 'knn', 'nuclearnorm' or 'softimpute'

    Raises:
    -------
    ValueError: if imputationType is not one of the four supported names.
    """
    # Validate before touching the disk, so a typo fails fast instead of
    # silently producing no output.
    known_types = ("meanmode", "knn", "nuclearnorm", "softimpute")
    if imputationType not in known_types:
        raise ValueError("Unknown imputationType %r; expected one of: %s"
                         % (imputationType, ", ".join(known_types)))

    data_train = pd.read_csv("LongitudinalDataAnalysis_train.csv")
    data_test = pd.read_csv("LongitudinalDataAnalysis_test.csv")

    if imputationType == "meanmode":
        imputation_mean_mode(data_train, data_test)
    elif imputationType == 'knn':
        imputation_knn(data_train, data_test)
    elif imputationType == 'nuclearnorm':
        imputation_convexOptimization(data_train, data_test)
    else:  # 'softimpute'
        soft_impute(data_train, data_test)
    

# Run the driver with the nuclear-norm method; it reads the
# LongitudinalDataAnalysis_{train,test}.csv files and delegates to
# imputation_convexOptimization.
imputation('nuclearnorm')