# In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

def RF_complete(data_t, data_s_list, feature_t, n_tree=50, n_feature=5, f_sample=0.3, n_best_tree=5):
    """Predict ``feature_t`` in ``data_t`` by pooling estimates from assisting datasets.

    For every assisting dataset whose columns match ``data_t``:

    1. ``RF_complete_1t1`` is run on the columns of ``data_t`` that contain no
       missing values, collecting its ``"true_err"`` and ``"pred_err"`` outputs.
    2. A straight line ``true_err ~ a * pred_err + b`` is fitted and ``c`` is
       the RMS residual of that fit.
    3. ``RF_complete_1t1`` is run on ``feature_t`` itself; its ``"mu"`` is kept
       as that dataset's estimate and its mean ``"pred_err"`` ``x`` is turned
       into an error ``max(x, a * x + b + c)``.

    The per-dataset estimates are then combined with weights ``1 / sigma**2``.

    Args:
        data_t: Target DataFrame containing ``feature_t``.
        data_s_list: Sequence of assisting DataFrames.
        feature_t: Column name of the feature to predict.
        n_tree: Forwarded to ``RF_complete_1t1``.
        n_feature: Forwarded to ``RF_complete_1t1``.
        f_sample: Forwarded to ``RF_complete_1t1``.
        n_best_tree: Forwarded to ``RF_complete_1t1`` for the final
            ``feature_t`` run only.

    Returns:
        ``{"predictions": A / B, "errors": 1 / sqrt(B)}`` where
        ``A = sum(mu_k / sigma_k**2)`` and ``B = sum(1 / sigma_k**2)``, or
        ``None`` when ``feature_t`` is missing from ``data_t`` or no assisting
        dataset produced a usable estimate.
    """
    K = len(data_s_list)
    print(f"{K} additional datasets used for prediction.")

    if feature_t not in data_t.columns:
        print("Feature-of-interest not found in data_t! Please check column names of input data.")
        return None
    print("Feature-of-interest located!")

    # mu and sigma are appended together, so they always stay aligned; they
    # may be shorter than K because unusable datasets are skipped.
    mu = []
    sigma = []

    for k, data_assist in enumerate(data_s_list, start=1):
        if feature_t not in data_assist.columns:
            print(f"feature_t not found in assisting data {k}!")
            continue

        if not data_t.columns.equals(data_assist.columns):
            print(f"Features not matched for assisting data {k}! Skipped to next data.")
            continue

        trans_true_err = []
        trans_pred_err = []

        # NOTE(review): the loop starts at 1, so column 0 is never used for
        # error calibration - confirm this is intended (e.g. an ID column)
        # and not a leftover from 1-based indexing.
        for j in range(1, data_assist.shape[1]):
            # Only fully observed target columns are used for calibration.
            if data_t.iloc[:, j].isna().any():
                continue

            rf_1t1 = RF_complete_1t1(data_assist, data_t, feature_t=data_assist.columns[j], n_tree=n_tree,
                                     n_feature=n_feature, f_sample=f_sample, k=k)
            if rf_1t1 is not None:
                trans_true_err.extend(rf_1t1["true_err"])
                trans_pred_err.extend(rf_1t1["pred_err"])

        if not trans_true_err:
            continue

        pred_err = np.asarray(trans_pred_err)
        true_err = np.asarray(trans_true_err)

        # Linear map from predicted error to true error, plus the RMS
        # residual of that fit as a safety margin.
        a, b = np.polyfit(pred_err, true_err, 1)
        c = np.sqrt(np.mean((a * pred_err + b - true_err) ** 2))

        print([f"a={round(a, 3)}", f"b={round(b, 3)}", f"c={round(c, 3)}"])

        rf_1t1 = RF_complete_1t1(data_assist, data_t, feature_t=feature_t, n_best_tree=n_best_tree, n_tree=n_tree,
                                 n_feature=n_feature, f_sample=f_sample, k=k)
        if rf_1t1 is None:
            # The helper could not model feature_t with this dataset; skip it
            # instead of failing on a None subscript.
            continue

        x = np.mean(rf_1t1["pred_err"])
        mu.append(rf_1t1["mu"])
        # Never report an error smaller than the raw predicted error.
        sigma.append(max(x, a * x + b + c))

    A = 0
    B = 0
    tt = 0
    # Iterate over the collected pairs rather than range(K): skipped datasets
    # have no entry in mu/sigma.
    for mu_k, sigma_k in zip(mu, sigma):
        # A non-finite or non-positive sigma cannot serve as a weight.
        if sigma_k is None or not np.isfinite(sigma_k) or sigma_k <= 0:
            continue
        tt += 1
        A += mu_k / sigma_k ** 2
        B += 1 / sigma_k ** 2

    print(f"{tt} assisting data used for prediction.")

    if tt == 0:
        # Nothing to combine; avoid dividing by zero.
        print("No assisting data could be used for prediction.")
        return None

    return {"predictions": A / B, "errors": 1 / np.sqrt(B)}

def RF_complete_1t1(data_assist, data_t, feature_t, n_tree=50, n_feature=5, f_sample=0.3, n_best_tree=5, k=1):
    if feature_t in data_assist.columns:
        f_t_ind = data_assist.columns.get_loc(feature_t)
    else:
        print(f"feature_t not found in assisting data {k}!")
        return None

    if (data_assist.apply(lambda x: x.count(), axis=0) / data_assist.shape[0] > 0.8).sum() > n_feature:
        f_ind = data_assist.columns[
            (data_assist.apply(lambda x: x.count(), axis=0) / data_assist.shape[0] > 0.8)].tolist()

        if feature_t in f_ind:
            f_ind.remove(feature_t)
    else:
        print(f"n_feature too large for assisting data {k}! Skipped to next data.")
        return None

    f_tmp_ind = [data_t.columns.get_loc(f) for f in f_ind if f in data_t.columns]
    f_feasible = [f for f in data_t.columns[f_tmp_ind] if data_t[f].count() > 2]

    if len(f_feasible) < 2:
        print(f"n_feature too large for assisting data {k}! Skipped to next data.")
        return None

    data_assist = data_assist.dropna(subset=[feature_t])

    RMSE = []
    f_sel_ind = []
    tree = []

    for i in range(n_tree):
        f_sel_ind.append(np.random.choice(f_ind, n_feature, replace=True))
        sample_sel = np.random.choice(data_assist.shape[0], int(data_assist.shape[0] * f_sample), replace=True)
        data_train = data_assist.iloc[sample_sel, f_sel_ind[i] + [f_t_ind]]

        colm_t = data_train.apply(lambda x: x.mean(), axis=1)
        data_train = data_train - np.outer(np.ones(data_train.shape[1]), colm_t)
        data_train.columns = f_sel_ind[i] + ["target"]

        tree.append(DecisionTreeRegressor(min_samples_split=5))
        tree[i].fit(data_train.iloc[:, :-1], data_train["target"])

        data_test = data_assist.iloc[~sample_sel, f_sel_ind[i] + [f_t_ind]]
        colm_t = data_test.apply(lambda x: x.mean(), axis=1)
        data_test = data_test - np.outer(np.ones(data_test.shape[1]), colm_t)
        pred_t = tree[i].predict(data_test.iloc[:, :-1])
        RMSE.append(np.sqrt(np.mean((pred_t - data_test["target"]) ** 2)))

    pred_list = np.zeros((data_t.shape[0], n_best_tree))

    for i in range(n_best_tree):
        j = np.argsort(RMSE)
        f_t_ind = data_t.columns.get_loc(feature_t)
        f_t_sel_ind = [data_t.columns.get_loc(f) for f in f_sel_ind[j[i]]]

        data_test = data_t.iloc[:, f_t_sel_ind + [f_t_ind]]
        colm_t = data_test.apply(lambda x: x.mean(), axis=1)
        data_test = data_test - np.outer(np.ones(data_test.shape[1]), colm_t)
        data_test.columns = f_sel_ind[j[i]] + ["target"]

        pred_t = tree[j[i]].predict(data_test.iloc[:, :-1])
        pred_t[np.where(data_test.iloc[:, :-1].isna().sum(axis=1) > 0)] = np.nan
        pred
