In [1]:
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import KNNImputer
from fancyimpute import IterativeImputer
from sklearn.decomposition import NMF
from sklearn.preprocessing import MinMaxScaler

In [2]:
def LoadData(fileName):
    """Load a tab-delimited file with two header rows followed by numeric rows.

    The first header row and the second header row are concatenated
    column-wise (skipping column 0) to build the feature names. The numeric
    rows are split into the first column and the remaining columns.

    Input:  fileName   - the name of the file to load
    Output: exprData   - 2-D array holding every numeric column after the first
            residuals  - 1-D array holding the first numeric column
                         (the dependent variable)
            geneTissue - list of feature names, one per column of exprData

    Raises IndexError if the second header row has fewer fields than the first.
    """
    with open(fileName, 'r') as fid:
        # Strip only the line terminator. A plain strip() would also eat a
        # leading tab (i.e. an empty first header cell), which shifts every
        # name by one column and silently drops a feature.
        tissues = fid.readline().rstrip('\r\n').split('\t')
        geneNames = fid.readline().rstrip('\r\n').split('\t')

    # Column 0 belongs to the residuals, so names start at index 1.
    geneTissue = [tissues[i] + geneNames[i] for i in range(1, len(tissues))]

    # ndmin=2 keeps the result two-dimensional even when the file holds a
    # single data row, so the column slicing below always works.
    data = np.loadtxt(fileName, delimiter='\t', skiprows=2, ndmin=2)
    residuals = data[:, 0]
    exprData = data[:, 1:]

    return exprData, residuals, geneTissue

In [3]:
# Read the four input files; feature names are kept only for the two training files.
aa_expr, aa_res, aa_gt = LoadData('AA_train.txt')
aa_expr_test, aa_res_test, _ = LoadData('AA_validation.txt')
e_expr, e_res, e_gt = LoadData('EUR_train.txt')
e_expr_test, e_res_test, _ = LoadData('EUR_validation.txt')

# Stack everything row-wise, in the order: AA train, EUR train, AA validation, EUR validation.
X = np.concatenate([aa_expr, e_expr, aa_expr_test, e_expr_test], axis=0)
y = np.hstack([aa_res, e_res, aa_res_test, e_res_test])

In [4]:
def missing_masking(data, prob, random_state=None):
    """Randomly hide entries of ``data``.

    Parameters
    ----------
    data : array-like
        Values to mask.
    prob : float
        Probability that any single entry is hidden.
    random_state : int or numpy.random.Generator, optional
        Seed (or generator) used to draw the mask. When omitted, the global
        ``np.random`` state is used, exactly as before, so ``np.random.seed``
        still controls the result.

    Returns
    -------
    missing_mask : ndarray of 0/1, same shape as ``data``
        1 where the entry is kept, 0 where it is hidden.
    missing_data : ndarray
        ``data`` multiplied by the mask, i.e. hidden entries are set to 0.
    """
    data = np.asarray(data)
    if random_state is None:
        draw = np.random.binomial
    else:
        draw = np.random.default_rng(random_state).binomial

    # Each entry is kept with probability 1 - prob.
    missing_mask = draw(1, 1 - prob, data.shape)
    missing_data = data * missing_mask
    return missing_mask, missing_data

In [5]:
# Reference (fully observed) matrix for the imputation benchmark.
# This is an alias of X, not a copy.
data = X

In [6]:
# Create missing mask
missing_prob = 0.2  # Probability of missing values
# Seed the global NumPy RNG so the same entries are hidden on every run;
# without it, every metric reported below changes after a kernel restart.
MASK_SEED = 0
np.random.seed(MASK_SEED)
missing_mask, missing_data = missing_masking(data, missing_prob)

In [7]:
# Entries whose mask value is 0 were zeroed by missing_masking; mark them as NaN instead.
hidden_entries = missing_mask == 0
missing_data[hidden_entries] = np.nan

# Non-negative Matrix Factorization

In [11]:
data_with_missing = missing_data

# Preprocessing: rescale every feature with a min-max scaler fitted on the
# masked matrix, so the input handed to NMF is non-negative.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_with_missing)

# Imputation using NMF
n_components = 5  # Set the number of components (you can tune this parameter)

# Mask the missing values as 0 to ensure non-negativity during NMF.
# NOTE(review): these zeros are fitted as if they were observed values, which
# pulls the reconstruction of hidden entries towards each feature's minimum.
data_scaled[np.isnan(data_scaled)] = 0

# Perform NMF. A fixed random_state makes the factorization repeatable.
nmf_model = NMF(n_components=n_components, random_state=0)
sample_factors = nmf_model.fit_transform(data_scaled)

# Reconstruct the scaled matrix from its two factors, then undo the scaling
# (done once; the original cell repeated this call).
data_imputed_reconstructed = np.dot(sample_factors, nmf_model.components_)
data_imputed = scaler.inverse_transform(data_imputed_reconstructed)

# Compare only the entries that were hidden by the mask.
mis = data[missing_mask==0]
imp = data_imputed[missing_mask==0]



In [12]:
# Score the NMF reconstruction on the hidden entries (truth first, estimate second).
mae_nmf = mean_absolute_error(mis, imp)
r2_nmf = r2_score(mis, imp)
mse_nmf = mean_squared_error(mis, imp)
rmse_nmf = np.sqrt(mse_nmf)

report_nmf = ("Evaluation\n", "MAE : ", mae_nmf, " | RMSE : ", rmse_nmf, " | R2 : ", r2_nmf)
print(*report_nmf)

Evaluation
 MAE :  0.13668973146724298  | RMSE :  0.22396284990762078  | R2 :  -0.3273176738003447


# Average

#### Row average imputation

In [13]:
# Mean of the observed (non-NaN) entries in each row.
row_mean = np.nanmean(missing_data, axis=1)
# Same fallback the median cells use: a row with no observed entry is filled with 0.
row_mean[np.isnan(row_mean)] = 0
# Broadcast the per-row means as a column vector instead of tiling them with a
# hard-coded column count (539), so the cell works for any matrix width.
matrix_row_filled = np.where(np.isnan(missing_data), row_mean[:, np.newaxis], missing_data)

In [14]:
# Pull out the hidden entries: true values vs. row-mean estimates.
held_out = missing_mask == 0
mis, imp = data[held_out], matrix_row_filled[held_out]

In [15]:
# Score the row-mean fill on the hidden entries (truth first, estimate second).
mae_row = mean_absolute_error(mis, imp)
r2_row = r2_score(mis, imp)
mse_row = mean_squared_error(mis, imp)
rmse_row = np.sqrt(mse_row)

report_row = ("Evaluation\n", "MAE : ", mae_row, " | RMSE : ", rmse_row, " | R2 : ", r2_row)
print(*report_row)

Evaluation
 MAE :  0.10990698568268753  | RMSE :  0.19404783937271755  | R2 :  0.0035843737865243197


#### Column average imputation

In [16]:
# Mean of the observed (non-NaN) entries in each column.
col_mean = np.nanmean(missing_data, axis=0)
# Same fallback the median cells use: a column with no observed entry is filled with 0.
col_mean[np.isnan(col_mean)] = 0
# A 1-D per-column vector broadcasts across rows on its own, so there is no
# need to tile it with a hard-coded row count (741).
matrix_col_filled = np.where(np.isnan(missing_data), col_mean, missing_data)

In [17]:
# Pull out the hidden entries: true values vs. column-mean estimates.
held_out = missing_mask == 0
mis, imp = data[held_out], matrix_col_filled[held_out]

In [18]:
# Score the column-mean fill on the hidden entries (truth first, estimate second).
mae_col = mean_absolute_error(mis, imp)
r2_col = r2_score(mis, imp)
mse_col = mean_squared_error(mis, imp)
rmse_col = np.sqrt(mse_col)

report_col = ("Evaluation\n", "MAE : ", mae_col, " | RMSE : ", rmse_col, " | R2 : ", r2_col)
print(*report_col)

Evaluation
 MAE :  0.10506802388385234  | RMSE :  0.18862343073243357  | R2 :  0.058513306889333094


# Median

#### Row median imputation

In [19]:
# Median of the observed (non-NaN) entries in each row.
row_median = np.nanmedian(missing_data, axis=1)
# A row with no observed entry has a NaN median; fall back to 0 for it.
row_median[np.isnan(row_median)] = 0
# Broadcast the per-row medians as a column vector instead of tiling them with
# a hard-coded column count (539), so the cell works for any matrix width.
# Note: this overwrites matrix_row_filled from the row-average section.
matrix_row_filled = np.where(np.isnan(missing_data), row_median[:, np.newaxis], missing_data)

In [20]:
# Pull out the hidden entries: true values vs. row-median estimates.
held_out = missing_mask == 0
mis, imp = data[held_out], matrix_row_filled[held_out]

In [21]:
# Score the row-median fill on the hidden entries (truth first, estimate second).
# These names are the same ones the row-average section assigned, so its values are replaced here.
mae_row = mean_absolute_error(mis, imp)
r2_row = r2_score(mis, imp)
mse_row = mean_squared_error(mis, imp)
rmse_row = np.sqrt(mse_row)

report_row = ("Evaluation\n", "MAE : ", mae_row, " | RMSE : ", rmse_row, " | R2 : ", r2_row)
print(*report_row)

Evaluation
 MAE :  0.10522863634928194  | RMSE :  0.1943966390832384  | R2 :  -9.46638131882338e-07


#### Column median imputation

In [22]:
# Median of the observed (non-NaN) entries in each column.
col_median = np.nanmedian(missing_data, axis=0)
# A column with no observed entry has a NaN median; fall back to 0 for it.
col_median[np.isnan(col_median)] = 0
# A 1-D per-column vector broadcasts across rows on its own, so there is no
# need to tile it with a hard-coded row count (741).
# Note: this overwrites matrix_col_filled from the column-average section.
matrix_col_filled = np.where(np.isnan(missing_data), col_median, missing_data)

In [23]:
# Pull out the hidden entries: true values vs. column-median estimates.
held_out = missing_mask == 0
mis, imp = data[held_out], matrix_col_filled[held_out]

In [24]:
# Score the column-median fill on the hidden entries (truth first, estimate second).
# These names are the same ones the column-average section assigned, so its values are replaced here.
mae_col = mean_absolute_error(mis, imp)
r2_col = r2_score(mis, imp)
mse_col = mean_squared_error(mis, imp)
rmse_col = np.sqrt(mse_col)

report_col = ("Evaluation\n", "MAE : ", mae_col, " | RMSE : ", rmse_col, " | R2 : ", r2_col)
print(*report_col)

Evaluation
 MAE :  0.10142592327859619  | RMSE :  0.1929202638129641  | R2 :  0.015130697505526203


# KNN

In [27]:
# Fill each NaN using a KNNImputer configured with 5 neighbours.
knn_imputer = KNNImputer(n_neighbors=5)
data_imputed = knn_imputer.fit_transform(missing_data)

# Pull out the hidden entries: true values vs. KNN estimates.
held_out = missing_mask == 0
mis, imp = data[held_out], data_imputed[held_out]

In [28]:
# Pull out the hidden entries: true values vs. the current contents of data_imputed.
held_out = missing_mask == 0
mis, imp = data[held_out], data_imputed[held_out]

In [29]:
# Score the KNN fill on the hidden entries (truth first, estimate second).
mae_knn = mean_absolute_error(mis, imp)
r2_knn = r2_score(mis, imp)
mse_knn = mean_squared_error(mis, imp)
rmse_knn = np.sqrt(mse_knn)

report_knn = ("Evaluation\n", "MAE : ", mae_knn, " | RMSE : ", rmse_knn, " | R2 : ", r2_knn)
print(*report_knn)

Evaluation
 MAE :  0.09240432632521534  | RMSE :  0.17242449305494584  | R2 :  0.2132788561232336


# MICE (Multiple Imputation by Chained Equations)


In [30]:
# IterativeImputer (imported from fancyimpute at the top of the notebook) is
# used with its default settings; it is fitted on, and applied to, the same
# NaN-masked matrix in one call.
# Note: this overwrites data_imputed from the KNN section.
imputer = IterativeImputer()
data_imputed = imputer.fit_transform(missing_data)



In [31]:
# Pull out the hidden entries: true values vs. the current contents of data_imputed.
held_out = missing_mask == 0
mis, imp = data[held_out], data_imputed[held_out]

In [32]:
# Score the IterativeImputer fill on the hidden entries (truth first, estimate second).
mae_mice = mean_absolute_error(mis, imp)
r2_mice = r2_score(mis, imp)
mse_mice = mean_squared_error(mis, imp)
rmse_mice = np.sqrt(mse_mice)

report_mice = ("Evaluation\n", "MAE : ", mae_mice, " | RMSE : ", rmse_mice, " | R2 : ", r2_mice)
print(*report_mice)

Evaluation
 MAE :  0.07751760632134129  | RMSE :  0.14672789258795174  | R2 :  0.4302973144779638
