In [1]:
import numpy as np
import pandas as pd
import joblib
from pandas import DataFrame
import xlsxwriter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# IBIS base data used for fitting: sheet 0 holds the sa1y table, sheet 1 the sa2y table.
data = pd.ExcelFile("RF Imputation Scaled.xlsx")
ibis_sa1y_sheet, ibis_sa2y_sheet = data.sheet_names[0], data.sheet_names[1]

ibis_sa1y_data = data.parse(ibis_sa1y_sheet)
ibis_sa2y_data = data.parse(ibis_sa2y_sheet)

# Keep every column except 'ROI'. drop() returns a new frame, so the parsed
# sheets themselves are left untouched.
# Already dropped: '1SA-11142', '1SA-12142', '2SA-11142', '2SA-12142'
ibis_data_features_sa1y = ibis_sa1y_data.drop(columns=['ROI'])
ibis_data_features_sa2y = ibis_sa2y_data.drop(columns=['ROI'])

# Sanity check: both frames are expected to have the same shape.
print(ibis_data_features_sa1y.shape, ibis_data_features_sa2y.shape, sep="\n")

(290, 148)
(290, 148)


In [2]:
# Gilmore data used to measure imputation error (MAE).
# Sheet 2 is the 1y table (input for imputing 2y); sheet 3 is the 2y table
# (input for imputing 1y).
# Already dropped: '1SA-11142', '1SA-12142', '2SA-11142', '2SA-12142'
gilmore_sa_sheet_1yToImpute2y, gilmore_sa_sheet_2yToImpute1y = (
    data.sheet_names[2],
    data.sheet_names[3],
)

# Parse each sheet, then keep every column except 'ROI'.
gilmore_sa1y_data = data.parse(gilmore_sa_sheet_1yToImpute2y)
gilmore_data_features_sa1y = gilmore_sa1y_data.drop(columns=['ROI'])

gilmore_sa2y_data = data.parse(gilmore_sa_sheet_2yToImpute1y)
gilmore_data_features_sa2y = gilmore_sa2y_data.drop(columns=['ROI'])

print("Individual data feature shapes")
print(gilmore_data_features_sa1y.shape, gilmore_data_features_sa2y.shape, sep="\n")

Individual data feature shapes
(129, 148)
(129, 148)


In [3]:
# Split the Gilmore rows 50/50 (seeded). Both tables are passed to the same
# call so that a given row lands on the same side of the split in each.
(
    X_train_sa1y,
    X_test_sa1y,
    X_train_sa2y,
    X_test_sa2y,
) = train_test_split(
    gilmore_data_features_sa1y,
    gilmore_data_features_sa2y,
    test_size=0.50,
    random_state=20,
)
print(
    X_train_sa1y.shape,
    X_test_sa1y.shape,
    X_train_sa2y.shape,
    X_test_sa2y.shape,
    sep="\n",
)

# Training set = all IBIS rows followed by the Gilmore training half.
X_train_sa1y_full = pd.concat((ibis_data_features_sa1y, X_train_sa1y), axis=0)
X_train_sa2y_full = pd.concat((ibis_data_features_sa2y, X_train_sa2y), axis=0)

print("--------")
print(X_train_sa1y_full.shape, X_train_sa2y_full.shape, sep="\n")

(64, 148)
(65, 148)
(64, 148)
(65, 148)
--------
(354, 148)
(354, 148)


In [4]:
# One forest per direction: sa1y -> sa2y and sa2y -> sa1y. Each is fitted with
# a whole table as the target, i.e. as a multi-output regressor.
#
# random_state is pinned (same seed as the train/test split) because a random
# forest bootstraps rows and samples features randomly. Without a seed, every
# Restart & Run All yields different forests, different exported imputations
# and different MAE values.
impute_sa2y_from_sa1y = RandomForestRegressor(random_state=20)
impute_sa1y_from_sa2y = RandomForestRegressor(random_state=20)

#Fit to Data
impute_sa2y_from_sa1y.fit(X_train_sa1y_full, X_train_sa2y_full)
impute_sa1y_from_sa2y.fit(X_train_sa2y_full, X_train_sa1y_full)

RandomForestRegressor()

In [5]:
# Predict the held-out Gilmore rows in both directions; these predictions are
# what the MAE is computed on.
imputed_sa2y_gilmore = impute_sa2y_from_sa1y.predict(X_test_sa1y)
imputed_sa1y_gilmore = impute_sa1y_from_sa2y.predict(X_test_sa2y)
print("Imputed Individual Shapes")
print(imputed_sa2y_gilmore.shape, imputed_sa1y_gilmore.shape, sep="\n")

# Wrap each prediction array in a DataFrame and export it.
# NOTE(review): the frames are built from the bare arrays, so the exported
# sheets carry default integer column labels rather than the original names.
df_sa_2y = pd.DataFrame(data=imputed_sa2y_gilmore)
df_sa_1y = pd.DataFrame(data=imputed_sa1y_gilmore)

df_sa_2y.to_excel(
    "Imputed SA 2y RF with Gilmore in Training.xlsx",
    index=False,
)
df_sa_1y.to_excel(
    "Imputed SA 1y RF with Gilmore in Training.xlsx",
    index=False,
)

Imputed Individual Shapes
(65, 148)
(65, 148)


In [6]:
# Mean absolute error of each imputation against the held-out Gilmore values.
from sklearn.metrics import mean_absolute_error

mae_sa1y = mean_absolute_error(X_test_sa1y, imputed_sa1y_gilmore)
mae_sa2y = mean_absolute_error(X_test_sa2y, imputed_sa2y_gilmore)

print('MAE 1y RF {} '.format(mae_sa1y))
print('MAE 2y RF {} '.format(mae_sa2y))

MAE 1y RF 0.11326950472454522 
MAE 2y RF 0.12378693029060446 


In [7]:
# Workbook holding the data to impute for the downstream task.
# The sa1y table is read from sheet index 4 and the sa2y table from sheet index 9.
interpolate_data = pd.ExcelFile("Data to be Interpolated.xlsx")
sa1y, sa2y = interpolate_data.sheet_names[4], interpolate_data.sheet_names[9]

# Parse each sheet, then keep every column except 'ROI'.
sa1y_data = interpolate_data.parse(sa1y)
data_features_sa1y = sa1y_data.drop(columns=['ROI'])

sa2y_data = interpolate_data.parse(sa2y)
data_features_sa2y = sa2y_data.drop(columns=['ROI'])

In [8]:
# Downstream imputation: predict each table from its counterpart and export it.

# sa1y imputed from the sa2y table.
predicted_sa1y = impute_sa1y_from_sa2y.predict(data_features_sa2y)
df_predicted_sa1y = pd.DataFrame(data=predicted_sa1y)
df_predicted_sa1y.to_excel(
    "Interpolated sa1y Downstream with Gilmore in Training.xlsx",
    index=False,
)

# sa2y imputed from the sa1y table.
predicted_sa2y = impute_sa2y_from_sa1y.predict(data_features_sa1y)
df_predicted_sa2y = pd.DataFrame(data=predicted_sa2y)
df_predicted_sa2y.to_excel(
    "Interpolated sa2y Downstream with Gilmore in Training.xlsx",
    index=False,
)