In [1]:
import numpy as np
import pandas as pd
import joblib
from pandas import DataFrame
import xlsxwriter
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
#Get base from IBIS Data to fit
data = pd.ExcelFile("KNN Imputation.xlsx") 

ibis_sa_sheet = data.sheet_names[0]

sa_data = data.parse(ibis_sa_sheet)

data_features_sa = sa_data.loc[:, sa_data.columns] 
data_features_sa = data_features_sa.drop(['ROI'], axis=1)
#Already dropped '1SA-11142','1SA-12142','2SA-11142','2SA-12142'
X_train, X_validation_truth = train_test_split(data_features_sa, test_size=0.10, random_state=42)

#Now add in 50% gilmore data for training
gilmore_sa_sheet = data.sheet_names[3]
gilmore_sa_data = data.parse(gilmore_sa_sheet)
gilmore_sa_data = gilmore_sa_data.loc[:, gilmore_sa_data.columns]
gilmore_sa_data = gilmore_sa_data.drop(['ROI'], axis=1)

X_train_Gilmore, X_test_Gilmore = train_test_split(gilmore_sa_data, test_size=0.50, random_state=42)

print(X_train.shape) #261
#print(gilmore_sa_data.shape) #129
print(X_train_Gilmore.shape) #64
#print(X_test_Gilmore.shape) #65

X_train_full = pd.concat([X_train, X_train_Gilmore], ignore_index=True)
print(X_train_full.shape) #354

(261, 296)
(64, 296)
(325, 296)


In [2]:
#Get the Gilmore data we want imputed to calculate MAE
#'1SA-11142','1SA-12142','2SA-11142','2SA-12142' Already dropped
gilmore_sa_sheet_1yToImpute2y = data.sheet_names[1]
gilmore_sa_sheet_2yToImpute1y = data.sheet_names[2]

gilmore_sa1y_data = data.parse(gilmore_sa_sheet_1yToImpute2y)
gilmore_data_features_sa1y = gilmore_sa1y_data.loc[:, gilmore_sa1y_data.columns]
gilmore_data_features_sa1y = gilmore_data_features_sa1y.drop(['ROI'], axis=1)

gilmore_sa2y_data = data.parse(gilmore_sa_sheet_2yToImpute1y)
gilmore_data_features_sa2y = gilmore_sa2y_data.loc[:, gilmore_sa2y_data.columns]
gilmore_data_features_sa2y = gilmore_data_features_sa2y.drop(['ROI'], axis=1)

print("Individual data feature shapes")
print(gilmore_data_features_sa1y.shape)
print(gilmore_data_features_sa2y.shape)

X_train_Gilmore_1y, X_test_Gilmore_2y = train_test_split(gilmore_data_features_sa1y, test_size=0.50, random_state=42)
X_train_Gilmore_2y, X_test_Gilmore_1y = train_test_split(gilmore_data_features_sa2y, test_size=0.50, random_state=42)

#X_train_full already has all of the training IBIS and Gilmore data we want
#X_train_Gilmore_1y has the training 1y data appearing in the set above -- Ignore
#X_train_Gilmore2y has the training 2y data appearing in the set above -- Ignore
#X_test_Gilmore1y has the testing 1y data that does not appear in the set above and is missing 2y data
#X_test_Gilmore2y has the testing 2y data that does not appear in the set above and is missing 1y data

print('X_validation_truth shape {}'.format(X_validation_truth.shape))
X_validation_1y = X_validation_truth.loc[:, X_validation_truth.columns]
X_validation_1y.drop(X_validation_1y.iloc[:, 0:148], axis=1, inplace=True) #Missing 1 yr
X_validation_2y = X_validation_truth.loc[:, X_validation_truth.columns]
X_validation_2y.drop(X_validation_2y.iloc[:, 148:296], axis=1, inplace=True) #Missing 2 yr
print(X_validation_1y.shape)
print(X_validation_2y.shape)
#Add back NaNs
for i in range(0,148):
    X_validation_2y.insert(i+148,i+148, np.nan) #Add nans for missing 2yr
    X_validation_1y.insert(i,i, np.nan) #Add nans for missing 1yr
print(X_validation_1y.shape)
print(X_validation_2y.shape)

Individual data feature shapes
(129, 296)
(129, 296)
X_validation_truth shape (29, 296)
(29, 148)
(29, 148)
(29, 296)
(29, 296)


In [3]:
sa_imputer = KNNImputer(n_neighbors=50)
#Fit to IBIS Data
sa_imputer.fit(X_train_full)

imputed_validation_2y = sa_imputer.transform(X_validation_2y) #Predict 2yr
imputed_validation_1y = sa_imputer.transform(X_validation_1y) #Predict 1yr

#MAE Calculation
from sklearn.metrics import mean_absolute_error
print('MAE 1y Validation KNN {} '.format(mean_absolute_error(imputed_validation_1y, X_validation_truth.to_numpy())))
print('MAE 2y Validation KNN {} '.format(mean_absolute_error(imputed_validation_2y, X_validation_truth.to_numpy())))

MAE 1y Validation KNN 0.050690232993067366 
MAE 2y Validation KNN 0.0527292552594669 


In [4]:
#Predict Gilmore Data for MAE
imputed_sa2y = sa_imputer.transform(X_test_Gilmore_2y)
imputed_sa1y = sa_imputer.transform(X_test_Gilmore_1y)
print("Imputed Individual Shapes")
print(imputed_sa2y.shape)
print(imputed_sa1y.shape)

#DFs for Individuals
df_sa_2y = pd.DataFrame(imputed_sa2y)
df_sa_1y = pd.DataFrame(imputed_sa1y)

df_sa_2y.to_excel("Imputed SA 2y KNN with Gilmore in Training.xlsx", index=False)
df_sa_1y.to_excel("Imputed SA 1y KNN with Gilmore in Training.xlsx", index=False)

#Ground Truth
#X_test_Gilmore_1y - Holds GT for 2y data
#X_test_Gilmore_2y - Holds GT for 1y data

#Imputed
df_imputed_sa_1y_xlsx = pd.ExcelFile("Imputed SA 1y KNN with Gilmore in Training.xlsx")
imputed_sa_sheet = df_imputed_sa_1y_xlsx.sheet_names[0]
imputed_sa_data = df_imputed_sa_1y_xlsx.parse(imputed_sa_sheet)
imputed_data_features_sa1y = imputed_sa_data.loc[:, imputed_sa_data.columns]
imputed_data_features_sa1y.drop(imputed_data_features_sa1y.iloc[:, 148:296], axis=1, inplace=True)
#print(imputed_data_features_sa1y)

df_imputed_sa_2y_xlsx = pd.ExcelFile("Imputed SA 2y KNN with Gilmore in Training.xlsx")
imputed_sa_sheet = df_imputed_sa_2y_xlsx.sheet_names[0]
imputed_sa_data = df_imputed_sa_2y_xlsx.parse(imputed_sa_sheet)
imputed_data_features_sa2y = imputed_sa_data.loc[:, imputed_sa_data.columns]
imputed_data_features_sa2y.drop(imputed_data_features_sa2y.iloc[:, 0:148], axis=1, inplace=True)
#print(imputed_data_features_sa2y)

X_test_Gilmore_1y_148 = X_test_Gilmore_1y.loc[:, X_test_Gilmore_1y.columns]
X_test_Gilmore_1y_148.drop(X_test_Gilmore_1y_148.iloc[:, 0:148], axis=1, inplace=True) #Missing 1 yr
X_test_Gilmore_2y_148 = X_test_Gilmore_2y.loc[:, X_test_Gilmore_2y.columns]
X_test_Gilmore_2y_148.drop(X_test_Gilmore_2y_148.iloc[:, 148:296], axis=1, inplace=True) #Missing 2 yr

from sklearn.metrics import mean_absolute_error
print('MAE 1y KNN {} '.format(mean_absolute_error(X_test_Gilmore_2y_148.to_numpy(), imputed_data_features_sa1y.to_numpy())))
print('MAE 2y KNN {} '.format(mean_absolute_error(X_test_Gilmore_1y_148.to_numpy(), imputed_data_features_sa2y.to_numpy())))

#Remove extra features for MAE calculation - Done
#Find Optimal number of neighbors - todo
#Create a validation set for KNN to see if the trend for AC having high MAE continues - todo
#N = 2
#MAE 1y KNN 0.16280582283255568 
#MAE 2y KNN 0.16321320699559033 
#N = 3
#MAE 1y KNN 0.1577243075443826 
#MAE 2y KNN 0.15720122874457262 
#N = 4
#MAE 1y KNN 0.15519481307686972 
#MAE 2y KNN 0.15487959605664114 
#N = 5
#MAE 1y KNN 0.15356291288903134 
#MAE 2y KNN 0.15294622209838285 
#N = 10
#MAE 1y KNN 0.15003403232716447 
#MAE 2y KNN 0.14948200320277555 
#N = 25
#MAE 1y KNN 0.14792755622564427 
#MAE 2y KNN 0.14750929111677524 
#N = 50
#MAE 1y KNN 0.14757113789488976 
#MAE 2y KNN 0.1475000914221722 
#N = 100
#MAE 1y KNN 0.14826055109796843 
#MAE 2y KNN 0.1483661596808016 

Imputed Individual Shapes
(65, 296)
(65, 296)
MAE 1y KNN 0.12542137299967474 
MAE 2y KNN 0.12618903043571436 


In [5]:
print(imputed_data_features_sa2y)

         148       149       150       151       152       153       154  \
0   0.396001  0.397246  0.343011  0.313636  0.332195  0.390709  0.360133   
1   0.389542  0.436375  0.474733  0.374017  0.391385  0.525810  0.507901   
2   0.309279  0.406053  0.479079  0.321853  0.369903  0.433040  0.416044   
3   0.297995  0.399774  0.468808  0.291760  0.360219  0.407497  0.395010   
4   0.299402  0.397265  0.499247  0.309854  0.344679  0.429097  0.439816   
..       ...       ...       ...       ...       ...       ...       ...   
60  0.344096  0.380394  0.451422  0.326936  0.343845  0.420056  0.392495   
61  0.315501  0.392317  0.445339  0.298543  0.341061  0.402917  0.407480   
62  0.321159  0.434382  0.473962  0.344127  0.373336  0.462657  0.479178   
63  0.349072  0.398529  0.401790  0.330062  0.358569  0.444004  0.430183   
64  0.332941  0.429262  0.451497  0.346123  0.376922  0.484302  0.491031   

         155       156       157  ...       286       287       288       289  \
0   0.

In [6]:
#Get the data we want imputed
interpolate_data = pd.ExcelFile("Data to be Interpolated.xlsx") 
sa1y = interpolate_data.sheet_names[4]
sa2y = interpolate_data.sheet_names[9]

sa1y_data = interpolate_data.parse(sa1y)
data_features_sa1y = sa1y_data.loc[:, sa1y_data.columns] 
data_features_sa1y = data_features_sa1y.drop(['ROI'], axis=1)

sa2y_data = interpolate_data.parse(sa2y)
data_features_sa2y = sa2y_data.loc[:, sa2y_data.columns] 
data_features_sa2y = data_features_sa2y.drop(['ROI'], axis=1)

print(data_features_sa1y.shape)
print(data_features_sa2y.shape)

(36, 296)
(20, 296)


In [7]:
#Predict the data we want to impute here for the downstream task
predicted_sa2y = sa_imputer.transform(data_features_sa1y)
df_predicted_sa2y = pd.DataFrame(predicted_sa2y)
df_predicted_sa2y.to_excel("Interpolated SA2y Downstream with Gilmore in Training.xlsx", index=False)

predicted_sa1y = sa_imputer.transform(data_features_sa2y)
df_predicted_sa1y = pd.DataFrame(predicted_sa1y)
df_predicted_sa1y.to_excel("Interpolated SA1y Downstream with Gilmore in Training.xlsx", index=False)