# 1. Triển khai multiple Source trong TrAdaboost R2

In [1]:
# coding: UTF-8
import keras
import tensorflow as tf
import numpy as np
import copy
from sklearn.tree import DecisionTreeRegressor

In [2]:


# =============================================================================
# Public estimators
# =============================================================================


def AdaBoost_R2_T(trans_S, response_S, test, weight, frozen_N, N=20):

    trans_data = copy.deepcopy(trans_S)
    trans_response = copy.deepcopy(response_S)

    row_S = trans_S.shape[0]
    row_T = test.shape[0]

    test_data = np.concatenate((trans_data, test), axis=0)
    weights = copy.deepcopy(weight)
    # initilize data weights
    _weights = weights / sum(weights)

    # Save prediction responses and bata_t
    bata_T = np.zeros(N)
    result_response = np.ones([row_S + row_T, N])

    # Save the prediction responses of test data
    predict = np.zeros(row_T)

    trans_data = np.asarray(trans_data, order='C')
    trans_response = np.asarray(trans_response, order='C')
    test_data = np.asarray(test_data, order='C')

    Total_S_weight = np.sum(weights[-frozen_N:])
    for i in range(N):
        _weights = calculate_P(_weights, frozen_N, Total_S_weight)
        print("+"*60)
        print(train_reg(
            trans_data, trans_response, test_data, _weights).shape)
        result_response[:, i] = train_reg(
            trans_data, trans_response, test_data, _weights).ravel()
        error_rate = calculate_error_rate(
            response_S, result_response[0: row_S, i], _weights)
        if error_rate > 0.5 or error_rate <= 1e-10:
            break

        bata_T[i] = error_rate / (1 - error_rate)

        # Changing the data weights of unfrozen training data
        D_t = np.abs(result_response[frozen_N:row_S,
                     i] - response_S[frozen_N:row_S]).max()
        for j in range(row_S - frozen_N):
            _weights[frozen_N + j] = _weights[frozen_N + j] * np.power(
                bata_T[i], (1-np.abs(result_response[frozen_N + j, i] - response_S[frozen_N+j])/D_t))

    Cal_res = result_response[row_S:, :]
    # Sort the predictions
    sorted_idx = np.argsort(Cal_res, axis=1)

    # Find index of median prediction for each sample
    weight_cdf = np.cumsum(bata_T[sorted_idx], axis=1)
    # return True - False
    median_or_above = weight_cdf >= 0.5 * weight_cdf[:, -1][:, np.newaxis]
    median_idx = median_or_above.argmax(axis=1)

    median_estimators = sorted_idx[np.arange(row_T), median_idx]
    for j in range(row_T):
        predict[j] = Cal_res[j, median_estimators[j]]
    return predict


def calculate_P(weights, frozen_N, Total_S_weight):
    total = np.sum(weights[-frozen_N:])
    weights[-frozen_N:] / total * Total_S_weight
    return np.asarray(weights, order='C')


def train_reg(trans_data, trans_response, test_data, weights):
    """
    # weight resampling 
    cdf = np.cumsum(weights)
    cdf_ = cdf / cdf[-1]
    uniform_samples = np.random.random_sample(len(trans_data))
    bootstrap_idx = cdf_.searchsorted(uniform_samples, side='right')
    # searchsorted returns a scalar
    bootstrap_idx = np.array(bootstrap_idx, copy=False)
    reg = DecisionTreeRegressor(max_depth=2,splitter='random',max_features="log2",random_state=0)
    reg.fit(trans_data[bootstrap_idx], trans_response[bootstrap_idx])
    return reg.predict(test_dat)
    """
    # In order to ensure that the results are not random,
    # the weights are adjusted by the built-in method
    # reg = DecisionTreeRegressor(
    #     max_depth=2, splitter='random', max_features="log2", random_state=0)
    # print("_"*60)
    # display(trans_data.shape, trans_response.shape)
    # reg.fit(trans_data, trans_response, sample_weight=weights)
    # return reg.predict(test_data)
    return base_LSTM(trans_data, test_data, trans_response, weights)

def base_LSTM(train, test, response, weight, N=20):
    model = keras.Sequential()
    model.add(keras.layers.LSTM(32, input_shape=(train.shape[1], 1)))
    model.add(keras.layers.Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    train = np.reshape(train, (train.shape[0], train.shape[1], 1))
    test = np.reshape(test, (test.shape[0], test.shape[1], 1))
    model.fit(train, response, epochs=N, batch_size=1, verbose=2)
    return model.predict(test)


def calculate_error_rate(response_R, response_H, weight):
    total = np.abs(response_R - response_H).max()
    return np.sum(weight[:] * np.abs(response_R - response_H) / total)


In [3]:
# coding: UTF-8
import copy
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.tree import DecisionTreeRegressor

# =============================================================================
# Public estimators
# =============================================================================


def Two_stage_TrAdaboost_R2(trans_S, Multi_trans_A, response_S, Multi_response_A, test, steps_S, N):
    # prepare trans_A
    trans_A = list(Multi_trans_A.values())[0]
    if len(Multi_trans_A) == 1:
        pass
    else:
        for i in range(len(Multi_trans_A)-1):
            p = i + 1
            trans_A = np.concatenate((trans_A, list(Multi_trans_A.values())[p]), axis=0)
    # prepare response_A
    response_A = list(Multi_response_A.values())[0]
    if len(Multi_response_A) == 1:
        pass 
    else:
        for i in range(len(Multi_response_A)-1):
            p = i + 1
            response_A = np.concatenate((response_A, list(Multi_response_A.values())[p]), axis=0)

    trans_data = np.concatenate((trans_A, trans_S), axis=0)
    trans_response = np.concatenate((response_A, response_S), axis=0)

    row_A = trans_A.shape[0]
    row_S = trans_S.shape[0]

    # Initialize the weights
    weight  = np.ones(row_A+row_S)/(row_A+row_S)
    bata_T = np.zeros(steps_S)
    
    print ('params initial finished.')
    print('='*60)

    # generate a pool of AdaBoost_R2_T
    AdaBoost_pre = []
    model_error = []
    for i in range(steps_S):
        res_ = AdaBoost_R2_T(trans_data, trans_response, test, weight,row_A, N )
        AdaBoost_pre.append(res_)
        LOOCV_MSE = LOOCV_test(trans_data, trans_response,  weight,row_A, N)
        model_error.append(LOOCV_MSE)
        if len(model_error) > 2 and model_error[-1] > model_error[-2]:
            steps_S = i
            break
        # In order to ensure that the results are not random,
        # the weights are adjusted by the built-in method 
        reg = DecisionTreeRegressor(max_depth=2,splitter='random',max_features="log2",random_state=0)
        reg.fit(trans_data, trans_response,sample_weight = weight)
        pre_res = reg.predict(trans_data)
        E_t = calculate_error_rate(trans_response, pre_res, weight)

        bata_T[i] =  E_t / (1 - E_t)

        # Changing the data weights of same-distribution training data
        total_w_S = row_S/(row_A+row_S) + i/(steps_S-1)*(1 - row_S/(row_A+row_S))
        weight[row_A : row_A+row_S] =  (weight[row_A : row_A+row_S] / weight[row_A : row_A+row_S].sum()) * total_w_S
        # Changing the data weights of diff-distribution training data
        beta_t = binary_search(total_w_S,weight,trans_response,pre_res,row_A,beta_t_range = (0.01,1,0.01),tal=0.03)
        if beta_t == None:
            for j in range(row_A):
                weight[j] = weight[j] * np.exp(-bata_T[i] * np.abs(trans_response[j] - pre_res[j]))
            weight[0:row_A] =  weight[0:row_A] * (1-total_w_S) / weight[0:row_A].sum()
        else:
            D_t = np.abs(trans_response[0:row_A] - pre_res[0:row_A]).max()
            for j in range(row_A):
                weight[j] = weight[j] * np.power(beta_t, np.abs(trans_response[j] - pre_res[j])/D_t)
            weight[0:row_A] =  weight[0:row_A] * (1-total_w_S) / weight[0:row_A].sum()

        print('Iter {}-th result :'.format(i))
        print('{} AdaBoost_R2_T model has been instantiated :'.format(len(model_error)), '|| E_t :', E_t )
        print('The LOOCV MSE on TARGET DOMAIN DATA : ',LOOCV_MSE)
        print('The beta_t calculated by binary search is : ',beta_t)
        print('-'*60)
      
    model_error = np.array(model_error)
    min_index = np.random.choice(np.flatnonzero(model_error == model_error.min()))
    print('Two_stage_TrAdaboost_R2 is done')
    print('='*60)
    print('The minimum mean square error :',model_error[min_index])
    print('The prediction responses of test data are :')
    print(AdaBoost_pre[min_index])
    return AdaBoost_pre[min_index]


def LOOCV_test(trans_data, trans_response, weight,row_A, N):
    loo = LeaveOneOut()
    X = np.array(trans_data)
    Y = np.array(trans_response)
    y_pre_loocv = []
    cal = 0
    for train_index, test_index in loo.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, _ = Y[train_index], Y[test_index]
        w_train, _ = weight[train_index], weight[test_index]
        if cal <= row_A-1:
            y_pre = AdaBoost_R2_T(X_train, y_train, X_test, w_train,row_A-1, N )
        else:
            y_pre = AdaBoost_R2_T(X_train, y_train, X_test, w_train,row_A, N )
        y_pre_loocv.append(y_pre[0])
    return mean_squared_error(trans_response[row_A:],y_pre_loocv[row_A:])

def calculate_error_rate(response_R, response_H, weight):
    total = np.abs(response_R - response_H).max()
    return np.sum(weight[:] * np.abs(response_R - response_H) / total)

# binary_search strategy
def binary_search(total_w_S,__weight,trans_response,pre_res,row_A,beta_t_range = (0.01,1,0.01),tal=0.03):
    # beta_t_range is the search range of beta_t, default = (0.01,1,0.01)
    # viz., beta_t is searched in the interval of 0 to 1, with the step of 0.01 by binary_search
    
    D_t = np.abs(trans_response[0:row_A] - pre_res[0:row_A]).max()
    _list = np.arange(beta_t_range[0],beta_t_range[1],beta_t_range[2])
    low = 0          
    high = len(_list)-1
    while low <= high:   
        weight = copy.deepcopy(__weight) 
        mid = int(np.floor((low+high)/2))
        guess = _list[mid]
        # test beta_t
        for j in range(row_A):
            weight[j] = weight[j] * np.power(guess, np.abs(trans_response[j] - pre_res[j])/D_t)
        diff = (1-total_w_S) -  weight[0:row_A].sum()
        if abs(diff) <= tal:     
            return guess
        # exceed the convergence crtiterion
        elif diff > 0:
            low = mid + 1   
        else:  
            high = mid -1               
      
    print("UNABLE TO COVERGEE IN BINARY SEARCHING")
    return None

In [4]:
import pandas as pd
Domain_0 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/Domain0.csv')
Domain_1 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/Domain1.csv')
Domain_2 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/Domain2.csv')
Domain_3 = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/Domain3.csv')
Domain_Target = pd.read_csv('C:/Users/acer/OneDrive - Nuce/Desktop/HỌC/ML/Transfer Learning/Paper/Data/Domain5.csv')

In [5]:
Multi_trans_A = {
    'trans_A_1':Domain_0.iloc[:,:-1],
    'trans_A_2':Domain_1.iloc[:,:-1],
    'trans_A_3':Domain_2.iloc[:,:-1],
    'trans_A_4':Domain_3.iloc[:,:-1]
}
Multi_response_A = {
    'response_A_1':Domain_0.iloc[:,:-1],
    'response_A_2':Domain_1.iloc[:,:-1],
    'response_A_3':Domain_2.iloc[:,:-1],
    'response_A_4':Domain_3.iloc[:,:-1]
}
trans_S = Domain_Target[:4].iloc[:,:-1]
response_S = Domain_Target[:4]. iloc[:,-1:]
test = Domain_Target[4:].iloc[:,:-1]
steps_S = 3
N = 10

In [6]:
display(response_S, trans_S)

Unnamed: 0,1
0,0.325699
1,0.344934
2,0.792876
3,0.262028


Unnamed: 0,0
0,0.514219
1,1.0
2,0.433249
3,0.184482


In [7]:
Two_stage_TrAdaboost_R2(trans_S, Multi_trans_A, response_S, Multi_response_A, test, steps_S,N)

params initial finished.
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Epoch 1/20
44/44 - 2s - loss: 0.2541 - 2s/epoch - 52ms/step
Epoch 2/20
44/44 - 0s - loss: 0.1196 - 106ms/epoch - 2ms/step
Epoch 3/20
44/44 - 0s - loss: 0.0453 - 98ms/epoch - 2ms/step
Epoch 4/20
44/44 - 0s - loss: 0.0264 - 98ms/epoch - 2ms/step
Epoch 5/20
44/44 - 0s - loss: 0.0251 - 100ms/epoch - 2ms/step
Epoch 6/20
44/44 - 0s - loss: 0.0237 - 102ms/epoch - 2ms/step
Epoch 7/20
44/44 - 0s - loss: 0.0234 - 98ms/epoch - 2ms/step
Epoch 8/20
44/44 - 0s - loss: 0.0218 - 102ms/epoch - 2ms/step
Epoch 9/20
44/44 - 0s - loss: 0.0211 - 104ms/epoch - 2ms/step
Epoch 10/20
44/44 - 0s - loss: 0.0200 - 105ms/epoch - 2ms/step
Epoch 11/20
44/44 - 0s - loss: 0.0193 - 106ms/epoch - 2ms/step
Epoch 12/20
44/44 - 0s - loss: 0.0186 - 104ms/epoch - 2ms/step
Epoch 13/20
44/44 - 0s - loss: 0.0180 - 112ms/epoch - 3ms/step
Epoch 14/20
44/44 - 0s - loss: 0.0176 - 103ms/epoch - 2ms/step
Epoch 15/20
44/44 - 0s - loss: 0.0170 - 107ms/

array([0.48414111, 0.27658093, 0.39983696, 0.21004099, 0.63859016,
       0.23142371, 0.30727255, 0.32318285, 0.61331946, 0.46409091,
       0.45541608, 0.58039027, 0.45394582, 0.4468078 , 0.48571655,
       0.44146907])

vector 3 giá trị, theo dạng time-series