A comparison of the end-to-end and patient similarity-based predictive approaches.      
For simplicity, we use ridge regression for modelling and K-Means for clustering.

In [1]:
import numpy as np
import sklearn as sk
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Build a synthetic regression problem: 1000 samples with 20 features,
# of which 15 are informative.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    random_state=1,
)
# Hold out 20% of the samples for testing (train:test = 800:200).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# Report the dataset dimensions; the second axis of X is the predictor count.
print('X_shape = {}, y_shape = {}'.format(X.shape, y.shape))
print('N_predictor = {}'.format(X.shape[1]))

X_shape = (1000, 20), y_shape = (1000,)
N_predictor = 20


In [2]:
from sklearn.cluster import KMeans
from sklearn.linear_model import Ridge

# End-to-end approach: a single ridge model fitted on the whole training set.
model_end = Ridge()
model_end.fit(X_train, y_train)
pred_end = model_end.predict(X_test)

# Patient similarity-based approach: cluster the training samples with
# K-Means, then fit a separate ridge model inside each cluster.
N_CLUSTER = 4
kmeans = KMeans(n_clusters=N_CLUSTER, random_state=1).fit(X_train)
# Cluster sizes can be inspected with
# np.unique(kmeans.labels_, return_counts=True).
cluster_label_train = kmeans.labels_
# Test samples are assigned to clusters by the K-Means model fitted on the
# training set only.
cluster_label_test = kmeans.predict(X_test)

# Predictions are written back at the original test-set positions so that
# pred_sim stays aligned with y_test.
pred_sim = np.zeros(X_test.shape[0])
for i in range(N_CLUSTER):
    idx_train = np.where(cluster_label_train == i)[0]
    idx_test = np.where(cluster_label_test == i)[0]
    # A cluster may receive no test samples; scikit-learn estimators reject
    # zero-row inputs in predict(), and there is nothing to fill in anyway.
    if idx_test.size == 0:
        continue

    model = Ridge()
    model.fit(X_train[idx_train, :], y_train[idx_train])
    pred_sim[idx_test] = model.predict(X_test[idx_test, :])

In [3]:
# Compare the two approaches by mean absolute error (MAE) against y_test.
from sklearn.metrics import mean_absolute_error

mae_end = mean_absolute_error(y_test, pred_end)
mae_sim = mean_absolute_error(y_test, pred_sim)
print('MAE_end = {:0.4f} \nMAE_sim = {:0.4f}'.format(mae_end, mae_sim))

MAE_end = 0.2380 
MAE_sim = 1.0738
