In [52]:
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer

In [2]:
def LoadData(fileName):
    # LoadData loads the relevant files.
    # Input: fileName - the name of the file to load
    # Output: exprData - the imputed expression data matrix
    #         residuals - the residuals (dependent variable)
    #         geneTissue - the names of the gene-tissue names (feature names)

    with open(fileName, 'r') as fid:
        tissues = fid.readline().strip().split('\t')
        geneNames = fid.readline().strip().split('\t')
        geneTissue = [tissues[i] + geneNames[i] for i in range(1, len(tissues))]

    data = np.loadtxt(fileName, delimiter='\t', skiprows=2)
    residuals = data[:, 0]
    exprData = data[:, 1:]

    return exprData, residuals, geneTissue

In [3]:
aa_expr,aa_res,aa_gt=LoadData('AA_train.txt')
aa_expr_test,aa_res_test,_=LoadData('AA_validation.txt')
e_expr,e_res,e_gt=LoadData('EUR_train.txt')
e_expr_test,e_res_test,_=LoadData('EUR_validation.txt')
X = np.vstack((aa_expr,e_expr,aa_expr_test,e_expr_test))
y = np.concatenate((aa_res,e_res,aa_res_test,e_res_test))

In [4]:
def missing_masking(data, prob):
    missing_mask = np.random.binomial(1, 1 - prob, data.shape)
    missing_data = data * missing_mask
    return missing_mask, missing_data

In [5]:
data = X

In [6]:
# Create missing mask
missing_prob = 0.2  # Probability of missing values
missing_mask, missing_data = missing_masking(data, missing_prob)

In [39]:
missing_data[missing_mask==0]=np.nan

# Average

#### ROW Average Imputation

In [23]:
row_mean = np.nanmean(missing_data, axis=1)
matrix_row_filled = np.where(np.isnan(missing_data), np.tile(row_mean, (539, 1)).T, missing_data)

In [33]:
rmse_row = np.sqrt(mean_squared_error(data, matrix_row_filled))
mae_row = mean_absolute_error(data, matrix_row_filled)
r2_row = r2_score(data,matrix_row_filled)
print("Evaluation\n","MAE : ",mae_row," | RMSE : ",rmse_row," | R2 : ",r2_row) 

Evaluation
 MAE :  0.021897565501802545  | RMSE :  0.08713727210959035  | R2 :  -1.6331038030048143e+56


#### Column average imputation

In [36]:
col_mean = np.nanmean(missing_data, axis=0)
matrix_col_filled = np.where(np.isnan(missing_data), np.tile(col_mean, (741, 1)), missing_data)

In [37]:
rmse_col = np.sqrt(mean_squared_error(data, matrix_col_filled))
mae_col = mean_absolute_error(data, matrix_col_filled)
r2_col = r2_score(data,matrix_col_filled)
print("Evaluation\n","MAE : ",mae_col," | RMSE : ",rmse_col," | R2 : ",r2_col) 

Evaluation
 MAE :  0.020936393938202923  | RMSE :  0.08483969786628859  | R2 :  0.7286998773477955


# Median

#### Row median imputation

In [41]:
row_median = np.nanmedian(missing_data, axis=1)
row_median[np.isnan(row_median)] = 0
matrix_row_filled = np.where(np.isnan(missing_data), np.tile(row_median, (539, 1)).T, missing_data)

In [42]:
rmse_row = np.sqrt(mean_squared_error(data, matrix_row_filled))
mae_row = mean_absolute_error(data, matrix_row_filled)
r2_row = r2_score(data,matrix_row_filled)
print("Evaluation\n","MAE : ",mae_row," | RMSE : ",rmse_row," | R2 : ",r2_row)

Evaluation
 MAE :  0.020986604877105134  | RMSE :  0.08732684120066023  | R2 :  -5.330269851044054e+46


#### Column median imputation

In [46]:
col_median = np.nanmedian(missing_data, axis=0)
col_median[np.isnan(col_median)] = 0
matrix_col_filled = np.where(np.isnan(missing_data), np.tile(col_median, (741, 1)), missing_data)

In [47]:
rmse_col = np.sqrt(mean_squared_error(data, matrix_col_filled))
mae_col = mean_absolute_error(data, matrix_col_filled)
r2_col = r2_score(data,matrix_col_filled)
print("Evaluation\n","MAE : ",mae_col," | RMSE : ",rmse_col," | R2 : ",r2_col) 

Evaluation
 MAE :  0.020266310535232545  | RMSE :  0.0868692981605298  | R2 :  0.8200005874841568


# KNN

In [49]:
knn_imputer = KNNImputer(n_neighbors=5)
data_imputed = knn_imputer.fit_transform(missing_data)

In [50]:
rmse_knn = np.sqrt(mean_squared_error(data, data_imputed))
mae_knn = mean_absolute_error(data, data_imputed)
r2_knn = r2_score(data,data_imputed)
print("Evaluation\n","MAE : ",mae_knn," | RMSE : ",rmse_knn," | R2 : ",r2_knn) 

Evaluation
 MAE :  0.018292652814925835  | RMSE :  0.07693944181078974  | R2 :  0.8417765581298801


# MICE (Multiple Imputation by Chained Equations)


In [53]:
imputer = IterativeImputer()
data_imputed = imputer.fit_transform(missing_data)



In [54]:
rmse_mice = np.sqrt(mean_squared_error(data, data_imputed))
mae_mice = mean_absolute_error(data, data_imputed)
r2_mice = r2_score(data,data_imputed)
print("Evaluation\n","MAE : ",mae_mice," | RMSE : ",rmse_mice," | R2 : ",r2_mice) 

Evaluation
 MAE :  0.01541234813673493  | RMSE :  0.06569846958992737  | R2 :  0.8763551394590406
