In [1]:
import keras
import tensorflow as tf
import numpy as np
import copy
from sklearn.tree import DecisionTreeRegressor

In [2]:
# coding: UTF-8
import numpy as np
import copy
from sklearn.tree import DecisionTreeRegressor

# =============================================================================
# Public estimators
# =============================================================================

def AdaBoost_R2_T(trans_S, response_S, test, weight,frozen_N, N = 20):
    """Boosting for Regression Transfer.

    Please feel free to open issues in the Github : https://github.com/Bin-Cao/TrAdaboost
    or 
    contact Bin Cao (bcao@shu.edu.cn)
    in case of any problems/comments/suggestions in using the code. 

    Parameters
    ----------
    trans_S : feature matrix 

    response_S : response of training data, real values 

    test : feature matrix of test data

    weights : initial data weights  

    frozen_N : int, the weights of first [frozen_N] instances in trans_S  are never modified 

    N : int, default=20, the number of weak estimators

    Examples
    --------
    import pandas as pd
    # training data
    tarin_data = pd.read_csv('Sdata.csv')
    # test data
    test_data = pd.read_csv('Tdata.csv')

    trans_S = tarin_data.iloc[:,:-1]
    response_S = tarin_data.iloc[:, -1]
    test = test_data.iloc[:,:-1]
    N = 10

    AdaBoost_R2_T(trans_S, response_S, test, weights, frozen_N, N)

    References
    ----------
    .. [1] Algorithm 3 
    Pardoe, D., & Stone, P. (2010, June). 
    Boosting for regression transfer. 
    In Proceedings of the 27th International Conference 
    on International Conference on Machine Learning (pp. 863-870).
    """

    trans_data =  copy.deepcopy(trans_S)
    trans_response =  copy.deepcopy(response_S)

    row_S = trans_S.shape[0]
    row_T = test.shape[0]

    test_data = np.concatenate((trans_data, test), axis=0)
    weights = copy.deepcopy(weight)
    # initilize data weights
    _weights = weights / sum(weights)

    # Save prediction responses and bata_t
    bata_T = np.zeros(N)
    result_response = np.ones([row_S + row_T, N])

    # Save the prediction responses of test data 
    predict = np.zeros(row_T)

    trans_data = np.asarray(trans_data, order='C')
    trans_response = np.asarray(trans_response, order='C')
    test_data = np.asarray(test_data, order='C')

    Total_S_weight = np.sum(weights[-frozen_N:])
    for i in range(N):
        _weights = calculate_P(_weights, frozen_N,Total_S_weight)
        result_response[:, i] = train_reg(trans_data, trans_response, test_data, _weights)
        error_rate = calculate_error_rate(response_S, result_response[0: row_S, i],_weights)
        if error_rate > 0.5 or error_rate <= 1e-10: break

        bata_T[i] = error_rate / (1 - error_rate)

        # Changing the data weights of unfrozen training data
        D_t = np.abs(result_response[frozen_N:row_S, i] - response_S[frozen_N:row_S]).max()
        for j in range(row_S - frozen_N):
            _weights[frozen_N + j] = _weights[frozen_N + j] * np.power(bata_T[i], (1-np.abs(result_response[frozen_N + j, i] - response_S[frozen_N+j])/D_t))
    
    
    Cal_res = result_response[row_S:,:]
    # Sort the predictions
    sorted_idx = np.argsort(Cal_res, axis=1)

    # Find index of median prediction for each sample
    weight_cdf = np.cumsum(bata_T[sorted_idx], axis=1)
    # return True - False
    median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
    median_idx = median_or_above.argmax(axis=1)

    median_estimators = sorted_idx[np.arange(row_T), median_idx]
    for j in range(row_T):
        predict[j] = Cal_res[j,median_estimators[j]]
    return predict

def calculate_P(weights,frozen_N,Total_S_weight):
    total = np.sum(weights[-frozen_N:])
    weights[-frozen_N:] / total * Total_S_weight
    return np.asarray(weights, order='C')

def train_reg(trans_data, trans_response, test_data, weights):
    # In order to ensure that the results are not random,
    # the weights are adjusted by the built-in method 
    reg = DecisionTreeRegressor(max_depth=2,splitter='random',max_features="log2",random_state=0)
    reg.fit(trans_data, trans_response,sample_weight = weights)
    return reg.predict(test_data)

def calculate_error_rate(response_R, response_H, weight):
    total = np.abs(response_R - response_H).max()
    return np.sum(weight[:] * np.abs(response_R - response_H) / total)

In [4]:
# coding: UTF-8
import copy
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.tree import DecisionTreeRegressor



def Two_stage_TrAdaboost_R2(trans_S, Multi_trans_A, response_S, Multi_response_A, test, steps_S, N):
    # prepare trans_A
    trans_A = list(Multi_trans_A.values())[0]
    if len(Multi_trans_A) == 1:
        pass
    else:
        for i in range(len(Multi_trans_A)-1):
            p = i + 1
            trans_A = np.concatenate((trans_A, list(Multi_trans_A.values())[p]), axis=0)
    # prepare response_A
    response_A = list(Multi_response_A.values())[0]
    if len(Multi_response_A) == 1:
        pass 
    else:
        for i in range(len(Multi_response_A)-1):
            p = i + 1
            response_A = np.concatenate((response_A, list(Multi_response_A.values())[p]), axis=0)
   
    trans_data = np.concatenate((trans_A, trans_S), axis=0)
    trans_response = np.concatenate((response_A, response_S), axis=0)

    row_A = trans_A.shape[0]
    row_S = trans_S.shape[0]

    # Initialize the weights
    weight  = np.ones(row_A+row_S)/(row_A+row_S)
    bata_T = np.zeros(steps_S)
    
    print ('params initial finished.')
    print('='*60)

    # generate a pool of AdaBoost_R2_T
    AdaBoost_pre = []
    model_error = []
    for i in range(steps_S):
        res_ = AdaBoost_R2_T(trans_data, trans_response, test, weight,row_A, N )
        AdaBoost_pre.append(res_)
        LOOCV_MSE = LOOCV_test(trans_data, trans_response,  weight,row_A, N)
        model_error.append(LOOCV_MSE)
        """
        The paper says that:
        In addition, it is not necessary to progress through all S steps once it has been determined that errors are increasing.
        """

        if len(model_error) > 2 and model_error[-1] > model_error[-2]:
            steps_S = i
            break
        # In order to ensure that the results are not random,
        # the weights are adjusted by the built-in method 
        reg = DecisionTreeRegressor(max_depth=2,splitter='random',max_features="log2",random_state=0)
        reg.fit(trans_data, trans_response,sample_weight = weight)
        pre_res = reg.predict(trans_data)
        E_t = calculate_error_rate(trans_response, pre_res, weight)

        bata_T[i] =  E_t / (1 - E_t)

        # Changing the data weights of same-distribution training data
        total_w_S = row_S/(row_A+row_S) + i/(steps_S-1)*(1 - row_S/(row_A+row_S))
        weight[row_A : row_A+row_S] =  (weight[row_A : row_A+row_S] / weight[row_A : row_A+row_S].sum()) * total_w_S
        # Changing the data weights of diff-distribution training data
        beta_t = binary_search(total_w_S,weight,trans_response,pre_res,row_A,beta_t_range = (0.01,1,0.01),tal=0.03)
        if beta_t == None:
            for j in range(row_A):
                weight[j] = weight[j] * np.exp(-bata_T[i] * np.abs(trans_response[j] - pre_res[j]))
            weight[0:row_A] =  weight[0:row_A] * (1-total_w_S) / weight[0:row_A].sum()
        else:
            D_t = np.abs(trans_response[0:row_A] - pre_res[0:row_A]).max()
            for j in range(row_A):
                weight[j] = weight[j] * np.power(beta_t, np.abs(trans_response[j] - pre_res[j])/D_t)
            weight[0:row_A] =  weight[0:row_A] * (1-total_w_S) / weight[0:row_A].sum()

        print('Iter {}-th result :'.format(i))
        print('{} AdaBoost_R2_T model has been instantiated :'.format(len(model_error)), '|| E_t :', E_t )
        print('The LOOCV MSE on TARGET DOMAIN DATA : ',LOOCV_MSE)
        print('The beta_t calculated by binary search is : ',beta_t)
        print('-'*60)
      
    model_error = np.array(model_error)
    min_index = np.random.choice(np.flatnonzero(model_error == model_error.min()))
    print('Two_stage_TrAdaboost_R2 is done')
    print('='*60)
    print('The minimum mean square error :',model_error[min_index])
    print('The prediction responses of test data are :')
    print(AdaBoost_pre[min_index])
    return AdaBoost_pre[min_index]


def LOOCV_test(trans_data, trans_response, weight,row_A, N):
    loo = LeaveOneOut()
    X = np.array(trans_data)
    Y = np.array(trans_response)
    y_pre_loocv = []
    cal = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, _ = Y[train_index], Y[test_index]
        w_train, _ = weight[train_index], weight[test_index]
        if cal <= row_A-1:
            y_pre = AdaBoost_R2_T(X_train, y_train, X_test, w_train,row_A-1, N )
        else:
            y_pre = AdaBoost_R2_T(X_train, y_train, X_test, w_train,row_A, N )
        y_pre_loocv.append(y_pre[0])
    return mean_squared_error(trans_response[row_A:],y_pre_loocv[row_A:])


def calculate_error_rate(response_R, response_H, weight):
    total = np.abs(response_R - response_H).max()
    return np.sum(weight[:] * np.abs(response_R - response_H) / total)

# binary_search strategy
def binary_search(total_w_S,__weight,trans_response,pre_res,row_A,beta_t_range = (0.01,1,0.01),tal=0.03):
    # beta_t_range is the search range of beta_t, default = (0.01,1,0.01)
    # viz., beta_t is searched in the interval of 0 to 1, with the step of 0.01 by binary_search
    
    D_t = np.abs(trans_response[0:row_A] - pre_res[0:row_A]).max()
    _list = np.arange(beta_t_range[0],beta_t_range[1],beta_t_range[2])
    low = 0          
    high = len(_list)-1
    while low <= high:   
        weight = copy.deepcopy(__weight) 
        mid = int(np.floor((low+high)/2))
        guess = _list[mid]
        # test beta_t
        for j in range(row_A):
            weight[j] = weight[j] * np.power(guess, np.abs(trans_response[j] - pre_res[j])/D_t)
        diff = (1-total_w_S) -  weight[0:row_A].sum()
        if abs(diff) <= tal:     
            return guess
        # exceed the convergence crtiterion
        elif diff > 0:
            low = mid + 1   
        else:  
            high = mid -1               
    print("UNABLE TO COVERGEE IN BINARY SEARCHING")
    return None

In [5]:
import pandas as pd
Domain_0 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/1_1/Domain0.csv')
Domain_1 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/1_1/Domain1.csv')
Domain_2 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/1_1/Domain2.csv')
Domain_3 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/1_1/Domain3.csv') 
Domain_Target = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/1_1/Domain4.csv')

In [19]:
Domain_1.shape

(10, 2)

In [15]:
nums_label = 1
n_nums_label = -1*nums_label
Multi_trans_A = {
    'trans_A_1':Domain_0.iloc[:,:n_nums_label],
    'trans_A_2':Domain_1.iloc[:,:n_nums_label],
    'trans_A_3':Domain_2.iloc[:,:n_nums_label],
    'trans_A_4':Domain_3.iloc[:,:n_nums_label]
}
Multi_response_A = {
    'response_A_1':Domain_0.iloc[:,n_nums_label:],
    'response_A_2':Domain_1.iloc[:,n_nums_label:],
    'response_A_3':Domain_2.iloc[:,n_nums_label:],
    'response_A_4':Domain_3.iloc[:,n_nums_label:]
}
trans_S = Domain_Target[:4].iloc[:,:n_nums_label]
response_S = Domain_Target[:4]. iloc[:,n_nums_label:]
test = Domain_Target[4:].iloc[:,:n_nums_label]
steps_S = 10
N = 10

In [16]:
display(response_S, trans_S)

Unnamed: 0,1
0,0.665653
1,0.72749
2,0.0
3,0.697146


Unnamed: 0,0
0,0.522149
1,0.187979
2,0.372427
3,0.299171


In [17]:
Two_stage_TrAdaboost_R2(trans_S, Multi_trans_A, response_S,
                        Multi_response_A, test, steps_S, N)

params initial finished.
Iter 0-th result :
1 AdaBoost_R2_T model has been instantiated : || E_t : 11.673892362115422
The LOOCV MSE on TARGET DOMAIN DATA :  0.07677783249388258
The beta_t calculated by binary search is :  0.87
------------------------------------------------------------
Iter 1-th result :
2 AdaBoost_R2_T model has been instantiated : || E_t : 11.692475635483476
The LOOCV MSE on TARGET DOMAIN DATA :  0.07656561972924959
The beta_t calculated by binary search is :  0.5
------------------------------------------------------------
Iter 2-th result :
3 AdaBoost_R2_T model has been instantiated : || E_t : 12.049216970525732
The LOOCV MSE on TARGET DOMAIN DATA :  0.07516161586543221
The beta_t calculated by binary search is :  0.5
------------------------------------------------------------
Two_stage_TrAdaboost_R2 is done
The minimum mean square error : 0.07516161586543221
The prediction responses of test data are :
[0.36491886 0.36491886 0.36491886 0.59547111 0.59547111 0.36

array([0.36491886, 0.36491886, 0.36491886, 0.59547111, 0.59547111,
       0.36491886])