### Python packages used in this code

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import random
from random import shuffle
import os
import pickle
import time
import sklearn
import platform
import sys
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import Matern, RBF
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.simplefilter('ignore')
from IPython.display import clear_output
from scipy import io
import math

%matplotlib inline

In [2]:
"""
Environments

--Platform--
OS : macOS-10.16-x86_64-i386-64bit
--Version--
python :  3.9.12 (main, Jun  1 2022, 06:36:29) 
[Clang 12.0.0 ]
numpy : 1.23.1
pandas : 1.4.3
sklearn : 1.1.1
"""

# Print the OS and the interpreter/library versions of the running kernel so that
# they can be compared with the reference environment recorded in the string above.
print('--Platform--')
print('OS :', platform.platform())
print('--Version--')
print('python : ', sys.version)
print('numpy :', np.__version__)
print('pandas :', pd.__version__)
print('sklearn :', sklearn.__version__)

--Platform--
OS : macOS-10.16-x86_64-i386-64bit
--Version--
python :  3.9.12 (main, Jun  1 2022, 06:36:29) 
[Clang 12.0.0 ]
numpy : 1.23.1
pandas : 1.4.3
sklearn : 1.1.1


# Preparation

## Define the model class proposed in the paper

In [3]:
class KernelRidge_HM(BaseEstimator, RegressorMixin):
    """
    Kernel ridge regression with an explicit intercept.
        h(x) = sum_i theta_i * k(x, x_i) + intercept
            x   : descriptor
            x_i : i-th training descriptor
    """
    def __init__(self, gamma=1, lambda1=1, nu=1.5, kernel='linear'):
        """
        Store the hyperparameters (no validation is done here).

        Parameters
        ----------
            gamma   : scale of the kernel; converted to sklearn's length scale as sqrt(1/(2*gamma))
            lambda1 : regularization parameter for theta
            nu      : parameter controlling the smoothness of the Matern kernel
            kernel  : kernel used for the model ('rbf' or 'matern').
                      Any other value, including the default 'linear', is rejected
                      by make_gram with a ValueError when the model is fitted.
        """
        self.gamma = gamma
        self.lambda1 = lambda1
        self.nu = nu
        self.kernel = kernel

    def make_gram(self, X, Y, gamma, nu, kernel):
        """
        Build the Gram matrix K[i, j] = k(X[i], Y[j]).
            In sklearn, the RBF kernel is defined as exp(-|x-x'|^2/(2l^2)).
            To be consistent with the kernel exp(-gamma |x-x'|^2) used in the proposed
            method, we use 'length_scale=np.sqrt(1/(2*gamma))'. The same length scale
            is passed to the Matern kernel.

        Returns
        -------
            pd.DataFrame of shape (len(X), len(Y)) with default integer labels

        Raises
        ------
            ValueError : if kernel is neither 'rbf' nor 'matern'
        """
        # Fail loudly instead of hitting an unbound local variable further down.
        if kernel not in ('rbf', 'matern'):
            raise ValueError(f"Unsupported kernel {kernel!r}; expected 'rbf' or 'matern'.")
        length_scale = np.sqrt(1/(2*gamma))
        if kernel == 'rbf':
            K = RBF(length_scale=length_scale)(X, Y)
        else:
            K = Matern(length_scale=length_scale, nu=nu)(X, Y)
        return pd.DataFrame(K)

    def low_rank_inv(self, A, tol):
        """
        Compute a low-rank approximation of the inverse of A with SVD
        (a pseudo-inverse with an absolute cut-off on the singular values).

        Parameters
        ----------
            A   : matrix to invert
            tol : Threshold below which (or equal to which) singular values are considered zero.

        Returns
        -------
            V_r diag(1/s_r) U_r^T, built from the singular triplets with s > tol
        """
        u, s, vh = np.linalg.svd(A)
        # Singular values come back sorted in descending order, so the ones above
        # the threshold are exactly the leading r entries.
        r = int(np.count_nonzero(s > tol))
        ur = u[:, :r]
        sr = s[:r]
        vhr = vh[:r, :]
        return np.matmul(vhr.T, np.multiply(1/sr[..., np.newaxis], ur.T))

    def fit(self, X, y=None):
        """
        Model fitting
            Solves the normal equations of
                |y - K theta - intercept|^2 + lambda1 * theta^T K theta
            jointly for theta and the intercept:
                [[K K + lambda1 K, K 1], [1^T K, n]] [theta; intercept] = [K; 1^T] y

        Required global variables
        -------------------------
            None

        Attributes set
        --------------
            X             : descriptors
            y             : output
            n_sample      : number of samples
            dim_x         : dimension of the features
            gram          : Gram matrix using X

            theta         : Estimated parameter
            intercept     : Estimated intercept

        Returns
        -------
            self
        """
        # Setting
        tol = 1e-5
        self.X = X
        self.y = y
        self.n_sample, self.dim_x = self.X.shape
        self.gram = self.make_gram(X, X, gamma=self.gamma, nu=self.nu, kernel=self.kernel)

        # Parameter estimation
        K = self.gram.values
        ones_row = np.ones(self.n_sample).reshape(1, -1)
        lhs = np.block([
            [K.dot(K) + self.lambda1*K, K.dot(np.ones(self.n_sample)).reshape(-1, 1)],
            [ones_row.dot(K)          , ones_row.dot(np.ones(self.n_sample)).reshape(1, 1)]
        ])
        rhs_operator = np.vstack([K, ones_row])
        solution = self.low_rank_inv(lhs, tol=tol).dot(rhs_operator).dot(self.y)
        # The first n_sample entries are the kernel weights, the last one is the intercept.
        self.theta = solution[:self.n_sample]
        self.intercept = solution[self.n_sample]
        return self

    def predict(self, X):
        """
        Prediction function
            h(x) = Gram theta + intercept

        Parameters
        ----------
            X : descriptors; must expose '.index' (it is copied onto the prediction)

        Returns
        -------
            y_pred : Gram theta + intercept, indexed like X
        """
        pred_gram = self.make_gram(X, self.X, gamma=self.gamma, nu=self.nu, kernel=self.kernel)
        y_pred = pred_gram.dot(self.theta) + self.intercept
        y_pred.index = X.index
        return y_pred

    def score(self, X, y=None):
        """
        Score function for cross-validation

        Returns
        -------
            -sum((y - y_hat)^2)/n_sample (Consider the minus value because 'GridSearchCV' maximizes the score.)

        NOTE(review): the divisor is the number of TRAINING samples stored by fit,
        not the number of rows in X, so this equals the negative mean squared error
        only when both sets have the same size. Kept as is because changing it
        would alter every cross-validation score -- confirm the intent.
        """
        return -sum((y.values - self.predict(X).values)**2)/self.n_sample

    def get_params(self, deep=True):
        """
        Create parameter dictionary for cross-validation

        Returns
        -------
            {'gamma', 'lambda1', 'nu', 'kernel'}
        """
        return {'gamma' : self.gamma,
                'lambda1' : self.lambda1,
                'nu' : self.nu,
                'kernel' : self.kernel}

    def set_params(self, **parameters):
        """
        For cross-validation: assign every given parameter as an attribute and return self.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [4]:
class AffineTrans(BaseEstimator, RegressorMixin):
    """
    Model class proposed in the paper
        h(x) = intercept_a + <a, Phi_1> + (<b, Phi_2> + intercept_b) * <c, Phi_3>
    Phi_1 and Phi_2 are kernel features of the source features, Phi_3 of the
    descriptors. The input X holds the descriptors in its first dim_x columns and
    the source features in the remaining columns.
    """
    def __init__(self, gamma1=1, gamma2=1, gamma3=1, lambda1=1, lambda2=1, lambda3=1, nu=1.5, kernel='linear'):
        """
        Store the hyperparameters (no validation is done here).

        Parameters
        ----------
            gamma1  : scale of the kernel1 (length scale = sqrt(1/(2*gamma1)))
            gamma2  : scale of the kernel2
            gamma3  : scale of the kernel3
            lambda1 : regularization parameter for a (alpha)
            lambda2 : regularization parameter for b (beta)
            lambda3 : regularization parameter for c (gamma)
            nu      : parameter controlling the smoothness of the Matern kernel
            kernel  : kernel used for the model ('rbf' or 'matern').
                      Any other value, including the default 'linear', is rejected
                      by make_gram with a ValueError when the model is fitted.
        """
        self.gamma1 = gamma1
        self.gamma2 = gamma2
        self.gamma3 = gamma3
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.nu = nu
        self.kernel = kernel

    def make_gram(self, X, Y, gamma, nu, kernel):
        """
        Build the Gram matrix K[i, j] = k(X[i], Y[j]).
            In sklearn, the RBF kernel is defined as exp(-|x-x'|^2/(2l^2)).
            To be consistent with the kernel exp(-gamma |x-x'|^2) used in the proposed
            method, we use 'length_scale=np.sqrt(1/(2*gamma))'. The same length scale
            is passed to the Matern kernel.

        Returns
        -------
            pd.DataFrame of shape (len(X), len(Y)) with default integer labels

        Raises
        ------
            ValueError : if kernel is neither 'rbf' nor 'matern'
        """
        # Fail loudly instead of hitting an unbound local variable further down.
        if kernel not in ('rbf', 'matern'):
            raise ValueError(f"Unsupported kernel {kernel!r}; expected 'rbf' or 'matern'.")
        length_scale = np.sqrt(1/(2*gamma))
        if kernel == 'rbf':
            K = RBF(length_scale=length_scale)(X, Y)
        else:
            K = Matern(length_scale=length_scale, nu=nu)(X, Y)
        return pd.DataFrame(K)

    def low_rank_inv(self, A, tol):
        """
        Compute a low-rank approximation of the inverse of A with SVD
        (a pseudo-inverse with an absolute cut-off on the singular values).

        Parameters
        ----------
            A   : matrix to invert
            tol : Threshold below which (or equal to which) singular values are considered zero.

        Returns
        -------
            V_r diag(1/s_r) U_r^T, built from the singular triplets with s > tol
        """
        u, s, vh = np.linalg.svd(A)
        # Singular values come back sorted in descending order, so the ones above
        # the threshold are exactly the leading r entries.
        r = int(np.count_nonzero(s > tol))
        ur = u[:, :r]
        sr = s[:r]
        vhr = vh[:r, :]
        return np.matmul(vhr.T, np.multiply(1/sr[..., np.newaxis], ur.T))

    def make_diff(self, w_new, w_old):
        """
        Function to calculate parameter changes for algorithm convergence determination
            We use max|w_new - w_old| / max|w_old| for determining the convergence.
            This criterion is used in some algorithms in scikit-learn, for example, see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
            We apply this criterion to each of a, b and c separately and use their maximum value for the convergence decision.

        Returns
        -------
            the relative change, or 0 when max|w_old| < 1e-10 (avoids dividing by ~0;
            note that a change away from an all-zero vector is therefore reported as 0)
        """
        diff1 = np.max(np.abs(w_new-w_old))
        diff2 = np.max(np.abs(w_old))
        if diff2 < 1e-10:
            return 0
        return diff1/diff2

    def _weighted_system(self, weights, gram, ridge):
        """
        Return diag(weights)^2 . gram + ridge * I as an ndarray.
            Scaling row i of the Gram matrix by weights[i]^2 yields the same matrix as
            multiplying by two explicit n x n diagonal matrices, without building them.
        """
        squared = np.square(np.asarray(weights))
        return squared[:, np.newaxis]*np.asarray(gram) + ridge*np.eye(self.n_sample)

    def estimation_from_b(self):
        """
        Block relaxation algorithm
            Cycles through the updates of b, c and (a, intercept_a), each with the
            other blocks held fixed, for at most n_loop iterations. See paper for the
            update formulas. Every iterate is stored in result_a/b/c and the relative
            changes in diff_a/b/c and diff; 'convergence' is set to True once the
            largest relative change drops below 1e-4.

        Returns
        -------
            self
        """
        # Loop-invariant settings
        lambda_n = 1
        tol = 1e-5
        y_values = self.y.values

        # Repeat until convergence
        self.convergence = False
        for _ in range(self.n_loop):
            self.i_count += 1

            K1a1 = self.gram1.dot(self.a) + self.intercept_a
            K3c = self.gram3.dot(self.c)

            # Updating b
            ## Rounding of numbers for stability of estimation
            if np.linalg.norm(self.c) < 1e-2:
                self.b = np.zeros(self.n_sample)
                self.intercept_b = 0
            else:
                tmp_b = self._weighted_system(K3c, self.gram2, lambda_n*self.lambda2)
                # The right-hand side uses the intercept_b of the previous iteration
                # (0.5 on the first pass); it is fixed to 1 only afterwards.
                self.b = self.low_rank_inv(tmp_b, tol=tol).dot(K3c*(y_values - K1a1 - self.intercept_b*K3c))
                self.intercept_b = 1
            ## Rounding of numbers for stability of estimation
            if np.linalg.norm(self.b) < 1e-2:
                self.b = np.zeros(self.n_sample)
            K2b1 = self.gram2.dot(self.b) + self.intercept_b

            # Updating c
            tmp_c = self._weighted_system(K2b1, self.gram3, lambda_n*self.lambda3)
            self.c = self.low_rank_inv(tmp_c, tol=tol).dot(K2b1*(y_values - K1a1))
            ## Rounding of numbers for stability of estimation
            if np.linalg.norm(self.c) < 1e-2:
                self.c = np.zeros(self.n_sample)
            K3c = self.gram3.dot(self.c)

            # Updating a (with the old intercept), then the intercept (with the new a)
            self.a = self.Minv.dot(y_values - self.intercept_a - K2b1*K3c)
            self.intercept_a = np.sum(y_values - self.gram1.dot(self.a) - K2b1*K3c)/(self.n_sample + self.lambda_a)

            # Store the parameters
            self.result_b[self.i_count] = self.b
            self.result_c[self.i_count] = self.c
            self.result_a[self.i_count] = self.a

            # Compute the difference
            diff_a = self.make_diff(w_new=self.result_a[self.i_count], w_old=self.result_a[self.i_count-1])
            diff_b = self.make_diff(w_new=self.result_b[self.i_count], w_old=self.result_b[self.i_count-1])
            diff_c = self.make_diff(w_new=self.result_c[self.i_count], w_old=self.result_c[self.i_count-1])
            diff = np.max([diff_a, diff_b, diff_c])
            self.diff_a[self.i_count] = diff_a
            self.diff_b[self.i_count] = diff_b
            self.diff_c[self.i_count] = diff_c
            self.diff[self.i_count] = diff

            # Check the convergence
            if diff < 1e-4:
                self.convergence = True
                break
        return self

    def fit(self, X, y=None):
        """
        Model fitting

        Required global variables
        -------------------------
            dim_x         : dimension of the descriptor (number of leading columns of X)
            fix_seed      : function defined elsewhere in the notebook, called as
                            fix_seed(373) before b and c are drawn with np.random.randn
                            (presumably seeds the global random generators -- confirm)

        Parameters
        ----------
            X : DataFrame, descriptors in the first dim_x columns, source features after
            y : output; must expose '.values'

        Attributes set
        --------------
            X             : descriptors + source features
            X_train       : descriptors
            X_source      : source features
            n_sample      : number of samples
            dim_x         : dimension of the descriptor (taken from X_train.shape)

            gram1         : Gram matrix using X_source
            gram2         : Gram matrix using X_source
            gram3         : Gram matrix using X_train

            n_loop        : maximum number of iterations

            result_a      : array to store a in all iterations
            result_b      : array to store b in all iterations
            result_c      : array to store c in all iterations
            diff_a        : array to store the relative change of a per iteration
            diff_b        : array to store the relative change of b per iteration
            diff_c        : array to store the relative change of c per iteration
            diff          : array to store the maximum of diff_a, diff_b and diff_c

            Minv          : (K1 + lambda1 I)^{-1}

        Returns
        -------
            self
        """
        # Setting
        self.X_train = X.iloc[:,:dim_x]
        self.X_source = X.iloc[:,dim_x:]
        self.n_sample, self.dim_x = self.X_train.shape
        self.X = X
        self.y = y
        self.n_loop = 1000
        self.result_a = np.zeros([self.n_loop+1, self.n_sample])
        self.result_b = np.zeros([self.n_loop+1, self.n_sample])
        self.result_c = np.zeros([self.n_loop+1, self.n_sample])
        self.diff_a = np.zeros(self.n_loop+1)
        self.diff_b = np.zeros(self.n_loop+1)
        self.diff_c = np.zeros(self.n_loop+1)
        self.diff = np.zeros(self.n_loop+1)

        # Compute Gram matrices
        self.gram1 = self.make_gram(self.X_source, self.X_source, gamma=self.gamma1, nu=self.nu, kernel=self.kernel)
        self.gram2 = self.make_gram(self.X_source, self.X_source, gamma=self.gamma2, nu=self.nu, kernel=self.kernel)
        self.gram3 = self.make_gram(self.X_train, self.X_train, gamma=self.gamma3, nu=self.nu, kernel=self.kernel)

        # Initialization
        # A plain ndarray is handed to pinv so that Minv (and hence a) is an ndarray
        # regardless of how numpy wraps results for pandas inputs.
        self.Minv = np.linalg.pinv(self.gram1.values + self.lambda1*np.eye(self.n_sample), hermitian=True)
        fix_seed(373)
        self.b = np.random.randn(self.n_sample)
        self.c = np.random.randn(self.n_sample)
        self.a = self.Minv.dot(self.y.values)
        self.i_count = 0
        self.lambda_a = 0
        self.lambda_b = 0
        self.intercept_a = np.sum(self.y.values - self.gram1.dot(self.a))/(self.n_sample + self.lambda_a)
        self.intercept_b = 0.5

        # Store the initial values; no change is defined for iteration 0
        self.result_a[self.i_count] = self.a.reshape([-1])
        self.result_b[self.i_count] = self.b.reshape([-1])
        self.result_c[self.i_count] = self.c.reshape([-1])
        self.diff_a[self.i_count] = np.nan
        self.diff_b[self.i_count] = np.nan
        self.diff_c[self.i_count] = np.nan
        self.diff[self.i_count] = np.nan

        # Estimation
        self.estimation_from_b()

        # Keep only the rows of the iterations that were actually run
        n_used = self.i_count + 1
        self.result_a = self.result_a[:n_used,:]
        self.result_b = self.result_b[:n_used,:]
        self.result_c = self.result_c[:n_used,:]
        self.diff_a = self.diff_a[:n_used]
        self.diff_b = self.diff_b[:n_used]
        self.diff_c = self.diff_c[:n_used]
        self.diff = self.diff[:n_used]

        return self

    def predict(self, X):
        """
        Prediction function
            h(x) = <a, Phi_1> + (<b, Phi_2> + intercept_b) * <c, Phi_3> + intercept_a

        Parameters
        ----------
            X : DataFrame with the same column layout as the X given to fit

        Attributes set
        --------------
            pred1 : Gram1 a
            pred2 : Gram2 b
            pred3 : Gram3 c

        Returns
        -------
            y_pred : Gram1 a + intercept_a + (Gram2 b + intercept_b) * (Gram3 c), indexed like X
        """
        # Split with the dimension recorded by fit rather than the notebook global,
        # so a later change of the global cannot mis-split the columns.
        X_source_pred = X.iloc[:,self.dim_x:]
        X_train_pred = X.iloc[:,:self.dim_x]

        pred_gram1 = self.make_gram(X_source_pred, self.X_source, gamma=self.gamma1, nu=self.nu, kernel=self.kernel)
        pred_gram2 = self.make_gram(X_source_pred, self.X_source, gamma=self.gamma2, nu=self.nu, kernel=self.kernel)
        pred_gram3 = self.make_gram(X_train_pred, self.X_train, gamma=self.gamma3, nu=self.nu, kernel=self.kernel)

        self.pred1 = pred_gram1.dot(self.a)
        self.pred2 = pred_gram2.dot(self.b)
        self.pred3 = pred_gram3.dot(self.c)
        y_pred = self.pred1 + self.intercept_a + (self.pred2 + self.intercept_b)*self.pred3
        y_pred.index = X.index

        return y_pred

    def score(self, X, y=None):
        """
        Score function for cross-validation

        Returns
        -------
            -sum((y - y_hat)^2)/n_sample (Consider the minus value because 'GridSearchCV' maximizes the score.)

        NOTE(review): the divisor is the number of TRAINING samples stored by fit,
        not the number of rows in X, so this equals the negative mean squared error
        only when both sets have the same size. Kept as is because changing it
        would alter every cross-validation score -- confirm the intent.
        """
        return -sum((y.values - self.predict(X).values)**2)/self.n_sample

    def get_params(self, deep=True):
        """
        Create parameter dictionary for cross-validation

        Returns
        -------
            {'gamma1', 'gamma2', 'gamma3', 'lambda1', 'lambda2', 'lambda3', 'nu', 'kernel'}
        """
        return {'gamma1' : self.gamma1,
                'gamma2' : self.gamma2,
                'gamma3' : self.gamma3,
                'lambda1' : self.lambda1,
                'lambda2' : self.lambda2,
                'lambda3' : self.lambda3,
                'nu' : self.nu,
                'kernel' : self.kernel}

    def set_params(self, **parameters):
        """
        For cross-validation: assign every given parameter as an attribute and return self.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [5]:
class AffineTrans2(BaseEstimator, RegressorMixin):
    """
    Model class proposed in the paper, without the additive source term
        h(x) = intercept_a + (<b, Phi_2> + intercept_b) * <c, Phi_3>
    Phi_2 are kernel features of the source features, Phi_3 of the descriptors.
    The input X holds the descriptors in its first dim_x columns and the source
    features in the remaining columns. The coefficient vector a is kept at zero.
    """
    def __init__(self, gamma1=1, gamma2=1, gamma3=1, lambda1=1, lambda2=1, lambda3=1, nu=1.5, kernel='linear'):
        """
        Store the hyperparameters (no validation is done here).

        Parameters
        ----------
            gamma1  : scale of the kernel1 (gram1 is still computed from it, but it
                      does not affect the predictions because a is always zero)
            gamma2  : scale of the kernel2 (length scale = sqrt(1/(2*gamma2)))
            gamma3  : scale of the kernel3
            lambda1 : regularization parameter for a (alpha) (not used)
            lambda2 : regularization parameter for b (beta)
            lambda3 : regularization parameter for c (gamma)
            nu      : parameter controlling the smoothness of the Matern kernel
            kernel  : kernel used for the model ('rbf' or 'matern').
                      Any other value, including the default 'linear', is rejected
                      by make_gram with a ValueError when the model is fitted.
        """
        self.gamma1 = gamma1
        self.gamma2 = gamma2
        self.gamma3 = gamma3
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.nu = nu
        self.kernel = kernel

    def make_gram(self, X, Y, gamma, nu, kernel):
        """
        Build the Gram matrix K[i, j] = k(X[i], Y[j]).
            In sklearn, the RBF kernel is defined as exp(-|x-x'|^2/(2l^2)).
            To be consistent with the kernel exp(-gamma |x-x'|^2) used in the proposed
            method, we use 'length_scale=np.sqrt(1/(2*gamma))'. The same length scale
            is passed to the Matern kernel.

        Returns
        -------
            pd.DataFrame of shape (len(X), len(Y)) with default integer labels

        Raises
        ------
            ValueError : if kernel is neither 'rbf' nor 'matern'
        """
        # Fail loudly instead of hitting an unbound local variable further down.
        if kernel not in ('rbf', 'matern'):
            raise ValueError(f"Unsupported kernel {kernel!r}; expected 'rbf' or 'matern'.")
        length_scale = np.sqrt(1/(2*gamma))
        if kernel == 'rbf':
            K = RBF(length_scale=length_scale)(X, Y)
        else:
            K = Matern(length_scale=length_scale, nu=nu)(X, Y)
        return pd.DataFrame(K)

    def low_rank_inv(self, A, tol):
        """
        Compute a low-rank approximation of the inverse of A with SVD
        (a pseudo-inverse with an absolute cut-off on the singular values).

        Parameters
        ----------
            A   : matrix to invert
            tol : Threshold below which (or equal to which) singular values are considered zero.

        Returns
        -------
            V_r diag(1/s_r) U_r^T, built from the singular triplets with s > tol
        """
        u, s, vh = np.linalg.svd(A)
        # Singular values come back sorted in descending order, so the ones above
        # the threshold are exactly the leading r entries.
        r = int(np.count_nonzero(s > tol))
        ur = u[:, :r]
        sr = s[:r]
        vhr = vh[:r, :]
        return np.matmul(vhr.T, np.multiply(1/sr[..., np.newaxis], ur.T))

    def make_diff(self, w_new, w_old):
        """
        Function to calculate parameter changes for algorithm convergence determination
            We use max|w_new - w_old| / max|w_old| for determining the convergence.
            This criterion is used in some algorithms in scikit-learn, for example, see https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
            We apply this criterion to each of a, b and c separately and use their maximum value for the convergence decision.

        Returns
        -------
            the relative change, or 0 when max|w_old| < 1e-10 (avoids dividing by ~0;
            note that a change away from an all-zero vector is therefore reported as 0)
        """
        diff1 = np.max(np.abs(w_new-w_old))
        diff2 = np.max(np.abs(w_old))
        if diff2 < 1e-10:
            return 0
        return diff1/diff2

    def _weighted_system(self, weights, gram, ridge):
        """
        Return diag(weights)^2 . gram + ridge * I as an ndarray.
            Scaling row i of the Gram matrix by weights[i]^2 yields the same matrix as
            multiplying by two explicit n x n diagonal matrices, without building them.
        """
        squared = np.square(np.asarray(weights))
        return squared[:, np.newaxis]*np.asarray(gram) + ridge*np.eye(self.n_sample)

    def estimation_from_b(self):
        """
        Block relaxation algorithm
            Cycles through the updates of b, c and intercept_a (a is reset to zero in
            every pass), each with the other blocks held fixed, for at most n_loop
            iterations. See paper for the update formulas. Every iterate is stored in
            result_a/b/c and the relative changes in diff_a/b/c and diff;
            'convergence' is set to True once the largest relative change drops
            below 1e-4.

        Returns
        -------
            self
        """
        # Loop-invariant settings
        lambda_n = 1
        tol = 1e-5
        y_values = self.y.values

        # Repeat until convergence
        self.convergence = False
        for _ in range(self.n_loop):
            self.i_count += 1

            K1a1 = self.gram1.dot(self.a) + self.intercept_a
            K3c = self.gram3.dot(self.c)

            # Updating b
            ## Rounding of numbers for stability of estimation
            if np.linalg.norm(self.c) < 1e-2:
                self.b = np.zeros(self.n_sample)
                self.intercept_b = 0
            else:
                tmp_b = self._weighted_system(K3c, self.gram2, lambda_n*self.lambda2)
                # The right-hand side uses the intercept_b of the previous iteration
                # (0.5 on the first pass); it is fixed to 1 only afterwards.
                self.b = self.low_rank_inv(tmp_b, tol=tol).dot(K3c*(y_values - K1a1 - self.intercept_b*K3c))
                self.intercept_b = 1
            K2b1 = self.gram2.dot(self.b) + self.intercept_b

            # Updating c
            tmp_c = self._weighted_system(K2b1, self.gram3, lambda_n*self.lambda3)
            self.c = self.low_rank_inv(tmp_c, tol=tol).dot(K2b1*(y_values - K1a1))
            ## Rounding of numbers for stability of estimation
            if np.linalg.norm(self.c) < 1e-2:
                self.c = np.zeros(self.n_sample)
            K3c = self.gram3.dot(self.c)

            # Set zero to a and estimate the intercept
            self.a = np.zeros(self.n_sample)
            self.intercept_a = np.sum(y_values - self.gram1.dot(self.a) - K2b1*K3c)/(self.n_sample + self.lambda_a)

            # Store the parameters
            self.result_b[self.i_count] = self.b
            self.result_c[self.i_count] = self.c
            self.result_a[self.i_count] = self.a

            # Compute the difference
            diff_a = self.make_diff(w_new=self.result_a[self.i_count], w_old=self.result_a[self.i_count-1])
            diff_b = self.make_diff(w_new=self.result_b[self.i_count], w_old=self.result_b[self.i_count-1])
            diff_c = self.make_diff(w_new=self.result_c[self.i_count], w_old=self.result_c[self.i_count-1])
            diff = np.max([diff_a, diff_b, diff_c])
            self.diff_a[self.i_count] = diff_a
            self.diff_b[self.i_count] = diff_b
            self.diff_c[self.i_count] = diff_c
            self.diff[self.i_count] = diff

            # Check the convergence
            if diff < 1e-4:
                self.convergence = True
                break
        return self

    def fit(self, X, y=None):
        """
        Model fitting

        Required global variables
        -------------------------
            dim_x         : dimension of the descriptor (number of leading columns of X)
            fix_seed      : function defined elsewhere in the notebook, called as
                            fix_seed(373) before b and c are drawn with np.random.randn
                            (presumably seeds the global random generators -- confirm)

        Parameters
        ----------
            X : DataFrame, descriptors in the first dim_x columns, source features after
            y : output; must expose '.values'

        Attributes set
        --------------
            X             : descriptors + source features
            X_train       : descriptors
            X_source      : source features
            n_sample      : number of samples
            dim_x         : dimension of the descriptor (taken from X_train.shape)

            gram1         : Gram matrix using X_source
            gram2         : Gram matrix using X_source
            gram3         : Gram matrix using X_train

            n_loop        : maximum number of iterations

            result_a      : array to store a in all iterations
            result_b      : array to store b in all iterations
            result_c      : array to store c in all iterations
            diff_a        : array to store the relative change of a per iteration
            diff_b        : array to store the relative change of b per iteration
            diff_c        : array to store the relative change of c per iteration
            diff          : array to store the maximum of diff_a, diff_b and diff_c

        Returns
        -------
            self
        """
        # Setting
        self.X_train = X.iloc[:,:dim_x]
        self.X_source = X.iloc[:,dim_x:]
        self.n_sample, self.dim_x = self.X_train.shape
        self.X = X
        self.y = y
        self.n_loop = 1000
        self.result_a = np.zeros([self.n_loop+1, self.n_sample])
        self.result_b = np.zeros([self.n_loop+1, self.n_sample])
        self.result_c = np.zeros([self.n_loop+1, self.n_sample])
        self.diff_a = np.zeros(self.n_loop+1)
        self.diff_b = np.zeros(self.n_loop+1)
        self.diff_c = np.zeros(self.n_loop+1)
        self.diff = np.zeros(self.n_loop+1)

        # Compute Gram matrices
        self.gram1 = self.make_gram(self.X_source, self.X_source, gamma=self.gamma1, nu=self.nu, kernel=self.kernel)
        self.gram2 = self.make_gram(self.X_source, self.X_source, gamma=self.gamma2, nu=self.nu, kernel=self.kernel)
        self.gram3 = self.make_gram(self.X_train, self.X_train, gamma=self.gamma3, nu=self.nu, kernel=self.kernel)

        # Initialization
        fix_seed(373)
        self.b = np.random.randn(self.n_sample)
        self.c = np.random.randn(self.n_sample)
        self.a = np.zeros(self.n_sample)
        self.i_count = 0
        self.lambda_a = 0
        self.lambda_b = 0
        self.intercept_a = np.sum(self.y.values - self.gram1.dot(self.a))/(self.n_sample + self.lambda_a)
        self.intercept_b = 0.5

        # Store the initial values; no change is defined for iteration 0
        self.result_a[self.i_count] = self.a.reshape([-1])
        self.result_b[self.i_count] = self.b.reshape([-1])
        self.result_c[self.i_count] = self.c.reshape([-1])
        self.diff_a[self.i_count] = np.nan
        self.diff_b[self.i_count] = np.nan
        self.diff_c[self.i_count] = np.nan
        self.diff[self.i_count] = np.nan

        # Estimation
        self.estimation_from_b()

        # Keep only the rows of the iterations that were actually run
        n_used = self.i_count + 1
        self.result_a = self.result_a[:n_used,:]
        self.result_b = self.result_b[:n_used,:]
        self.result_c = self.result_c[:n_used,:]
        self.diff_a = self.diff_a[:n_used]
        self.diff_b = self.diff_b[:n_used]
        self.diff_c = self.diff_c[:n_used]
        self.diff = self.diff[:n_used]

        return self

    def predict(self, X):
        """
        Prediction function
            h(x) = (<b, Phi_2> + intercept_b) * <c, Phi_3> + intercept_a
            (the term Gram1 a is still evaluated, with a equal to zero)

        Parameters
        ----------
            X : DataFrame with the same column layout as the X given to fit

        Attributes set
        --------------
            pred1 : Gram1 a
            pred2 : Gram2 b
            pred3 : Gram3 c

        Returns
        -------
            y_pred : Gram1 a + intercept_a + (Gram2 b + intercept_b) * (Gram3 c), indexed like X
        """
        # Split with the dimension recorded by fit rather than the notebook global,
        # so a later change of the global cannot mis-split the columns.
        X_source_pred = X.iloc[:,self.dim_x:]
        X_train_pred = X.iloc[:,:self.dim_x]

        pred_gram1 = self.make_gram(X_source_pred, self.X_source, gamma=self.gamma1, nu=self.nu, kernel=self.kernel)
        pred_gram2 = self.make_gram(X_source_pred, self.X_source, gamma=self.gamma2, nu=self.nu, kernel=self.kernel)
        pred_gram3 = self.make_gram(X_train_pred, self.X_train, gamma=self.gamma3, nu=self.nu, kernel=self.kernel)

        self.pred1 = pred_gram1.dot(self.a)
        self.pred2 = pred_gram2.dot(self.b)
        self.pred3 = pred_gram3.dot(self.c)
        y_pred = self.pred1 + self.intercept_a + (self.pred2 + self.intercept_b)*self.pred3
        y_pred.index = X.index

        return y_pred

    def score(self, X, y=None):
        """
        Score function for cross-validation

        Returns
        -------
            -sum((y - y_hat)^2)/n_sample (Consider the minus value because 'GridSearchCV' maximizes the score.)

        NOTE(review): the divisor is the number of TRAINING samples stored by fit,
        not the number of rows in X, so this equals the negative mean squared error
        only when both sets have the same size. Kept as is because changing it
        would alter every cross-validation score -- confirm the intent.
        """
        return -sum((y.values - self.predict(X).values)**2)/self.n_sample

    def get_params(self, deep=True):
        """
        Create parameter dictionary for cross-validation

        Returns
        -------
            {'gamma1', 'gamma2', 'gamma3', 'lambda1', 'lambda2', 'lambda3', 'nu', 'kernel'}
        """
        return {'gamma1' : self.gamma1,
                'gamma2' : self.gamma2,
                'gamma3' : self.gamma3,
                'lambda1' : self.lambda1,
                'lambda2' : self.lambda2,
                'lambda3' : self.lambda3,
                'nu' : self.nu,
                'kernel' : self.kernel}

    def set_params(self, **parameters):
        """
        For cross-validation: assign every given parameter as an attribute and return self.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [6]:
class AffineTrans3(BaseEstimator, RegressorMixin):
    def __init__(self, gamma1=1, gamma3=1, lambda1=1, lambda3=1, nu=1.5, kernel='linear'):
        r"""
        Define the model class proposed in the paper
            h(x) = intercept + <\alpha, \Phi_1> + <\gamma, \Phi_3>
        where \Phi_1 is the kernel on the source features and \Phi_3 the kernel on the descriptors.
        
        Parameters
        ----------
            gamma1  : scale parameter \gamma of kernel1 (see `make_gram`)
            gamma3  : scale parameter \gamma of kernel3 (see `make_gram`)
            lambda1 : regularization parameter for alpha
            lambda3 : regularization parameter for gamma
            nu      : parameter controlling the smoothness of Matern kernel
            kernel  : kernel used for the model ('rbf' or 'matern').
                      The default 'linear' is not implemented by `make_gram`,
                      so one of the two supported names must be set before fitting.
        """
        self.gamma1 = gamma1
        self.gamma3 = gamma3
        self.lambda1 = lambda1
        self.lambda3 = lambda3
        self.nu = nu
        self.kernel = kernel
        
    def make_gram(self, X, Y, gamma, nu, kernel):
        r"""
        Making the Gram matrix between the rows of X and the rows of Y
            In sklearn, RBF kernel is defined as exp(-|x-x'|^2/(2l^2)).
            To be consistent with the kernel used in the proposed method, in which RBF kernel is defined as exp(-\gamma |x-x'|^2), we use 'length_scale=np.sqrt(1/(2*gamma))'.
            The same length scale is passed to the Matern kernel.
        
        Returns
        -------
            pd.DataFrame holding the kernel values (default integer labels)
        
        Raises
        ------
            ValueError : if `kernel` is neither 'rbf' nor 'matern'
        """
        length_scale = np.sqrt(1/(2*gamma))
        if kernel=='rbf':
            K = RBF(length_scale=length_scale)(X,Y)
        elif kernel=='matern':
            K = Matern(length_scale=length_scale, nu=nu)(X,Y)
        else:
            # Without this branch `K` stayed unbound and the method died with an
            # UnboundLocalError that did not mention the offending kernel name.
            raise ValueError("Unsupported kernel %r: use 'rbf' or 'matern'." % (kernel,))
        return pd.DataFrame(K)
    
    def low_rank_inv(self, A, tol):
        """
        Computing the low-rank approximation of an inverse matrix with SVD.
        
        Parameters
        ----------
            A   : matrix to invert
            tol : Threshold below which SVD values are considered zero.
        
        Returns
        -------
            V_r diag(1/s_r) U_r^T, built from the singular triplets whose singular value exceeds `tol`
        """
        u, s, vh = np.linalg.svd(A)
        # Singular values come back in descending order, so the retained ones
        # are exactly the leading r entries.
        r = int(np.count_nonzero(s > tol))
        ur = u[:, :r]
        sr = s[:r]
        vhr = vh[:r, :]
        return np.matmul(vhr.T, np.multiply(1/sr[..., np.newaxis], ur.T))
    
    def fit(self, X, y=None):
        r"""
        Model fitting
            Note that unlike proposed method 1 and proposed method 2, the optimal parameters can be obtained analytically.
            The coefficients are computed as low_rank_inv(M) G y, where
                G = [K1; K3; 1^T]  and  M = G G^T + blockdiag(lambda1*K1, lambda3*K3, 0)
            with K1 / K3 the Gram matrices of the source features / descriptors.
        
        Required global variables
        -------------------------
            dim_x : number of leading columns of X that hold the descriptors
                    (the remaining columns are the source features)
        
        Parameters
        ----------
            X : DataFrame, descriptors followed by source features
            y : target values, one per row of X (required despite the default)
        
        Attributes set
        --------------
            X         : descriptors + source features
            y         : targets as passed in
            X_train   : descriptors
            X_source  : source features
            n_sample  : number of samples
            dim_x     : number of descriptor columns actually split off
            gram1     : Gram matrix using X_source
            gram3     : Gram matrix using X_train
            a         : coefficients for kernel1 (\alpha)
            c         : coefficients for kernel3 (\gamma)
            intercept : intercept
        
        Returns
        -------
            self
        
        Raises
        ------
            ValueError : if `y` is not given
        """
        if y is None:
            raise ValueError("fit() requires the target values `y`.")
        
        # Setting
        self.X_train = X.iloc[:,:dim_x]
        self.X_source = X.iloc[:,dim_x:]
        tol = 1e-5
        self.n_sample, self.dim_x = self.X_train.shape
        self.X = X
        self.y = y
               
        # Compute Gram matrices
        self.gram1 = self.make_gram(self.X_source, self.X_source, gamma=self.gamma1, nu=self.nu, kernel=self.kernel)
        self.gram3 = self.make_gram(self.X_train, self.X_train, gamma=self.gamma3, nu=self.nu, kernel=self.kernel)
        
        # Estimate the optimal parameters (plain arrays keep the block shapes explicit)
        K1 = self.gram1.to_numpy()
        K3 = self.gram3.to_numpy()
        ones_col = np.ones((self.n_sample, 1))
        ones_row = ones_col.T
        tmp_mat = np.block([
            [K1.dot(K1) + self.lambda1 * K1, K1.dot(K3),                     K1.dot(ones_col)],
            [K3.dot(K1),                     K3.dot(K3) + self.lambda3 * K3, K3.dot(ones_col)],
            [ones_row.dot(K1),               ones_row.dot(K3),               np.full((1, 1), float(self.n_sample))]
        ])
        tmp_gram = np.vstack([K1, K3, ones_row])
        tmp_theta = self.low_rank_inv(tmp_mat, tol=tol).dot(tmp_gram).dot(np.asarray(y))
        
        # Split the parameters
        self.a = tmp_theta[:self.n_sample]
        self.c = tmp_theta[self.n_sample:(2*self.n_sample)]
        self.intercept = tmp_theta[2*self.n_sample]
        
        return self
    
    def predict(self, X):
        r"""
        Prediction function
            h(x) = <\alpha, \Phi_1> + <\gamma, \Phi_3> + intercept
        
        The descriptor / source-feature split uses the column count stored by `fit`
        (self.dim_x), so prediction does not depend on the global `dim_x`.
        The two partial predictions are kept in self.pred1 and self.pred3.
            
        Returns
        -------
            y_pred : Gram1 * a + Gram3 * c + intercept, as a Series indexed like X
        """
        X_source_pred = X.iloc[:,self.dim_x:]
        X_train_pred = X.iloc[:,:self.dim_x]
        
        pred_gram1 = self.make_gram(X_source_pred, self.X_source, gamma=self.gamma1, nu=self.nu, kernel=self.kernel)
        pred_gram3 = self.make_gram(X_train_pred, self.X_train, gamma=self.gamma3, nu=self.nu, kernel=self.kernel)
        
        self.pred1 = pred_gram1.dot(self.a)
        self.pred3 = pred_gram3.dot(self.c)
        # The sum is a new Series, so relabelling it leaves pred1 / pred3 untouched.
        y_pred = self.pred1 + self.pred3 + self.intercept
        y_pred.index = X.index
        
        return y_pred

    def score(self, X, y=None):
        r"""
        Score function for cross-validation: the negative mean squared error.
        
        Returns
        -------
            -\sum (y-\hat{y})^2 / n, where n is the number of scored samples
            (Consider the minus value because 'GridSearchCV' maximizes the score.)
        
        Raises
        ------
            ValueError : if `y` is not given
        """
        if y is None:
            raise ValueError("score() requires the observed targets `y`.")
        # Plain arrays make the subtraction positional instead of index-aligned.
        y_obs = np.asarray(y)
        y_hat = np.asarray(self.predict(X))
        # Normalize by the number of scored samples, not by the training size.
        return -np.sum((y_obs - y_hat) ** 2) / len(y_obs)
    
    def get_params(self, deep=True):
        """
        Create parameter dictionary for cross-validation
        (`deep` is accepted for API compatibility and is not used)
        
        Returns
        -------
            {'gamma1', 'gamma3', 'lambda1', 'lambda3', 'nu', 'kernel'}
        """
        return {'gamma1' : self.gamma1,
                'gamma3' : self.gamma3,
                'lambda1' : self.lambda1,
                'lambda3' : self.lambda3,
                'nu' : self.nu,
                'kernel' : self.kernel}
    
    def set_params(self, **parameters):
        """
        For cross-validation: store every keyword argument as an attribute and return self
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

## fix_seed function

In [7]:
def fix_seed(seed):
    """
    Seed NumPy's global random generator and the built-in `random` module with
    `seed`, then store `seed` (as a string) in the PYTHONHASHSEED environment
    variable.
    """
    # NumPy first, then the built-in generator
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
    # for hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)

## Plot function

In [8]:
def plot_scatter(y_obs_list, 
                 y_prd_list, 
                 title_list, 
                 plt_row, 
                 plt_col, 
                 position_list, 
                 col_list,
                 alpha_list,
                 fig_size, 
                 save_name, 
                 title, 
                 show_flg=True):
    """
    Draw a grid of observation-vs-prediction scatter plots and save it to a file.

    One panel is drawn per entry of `position_list`; entry i of every other list
    describes panel i. Each panel gets square axes with identical x/y limits, the
    identity line, and the correlation, MSE and MAE written in its upper-left corner.

    Parameters
    ----------
        y_obs_list    : observed values, one sequence per panel
        y_prd_list    : predicted values, one sequence per panel
        title_list    : panel titles
        plt_row       : number of subplot rows
        plt_col       : number of subplot columns
        position_list : subplot position of each panel (as passed to `fig.add_subplot`)
        col_list      : marker color of each panel
        alpha_list    : marker transparency of each panel
        fig_size      : figure size passed to `plt.figure`
        save_name     : path the figure is saved to
        title         : figure title
        show_flg      : if false, the figure is closed after saving
    """
    fig = plt.figure(figsize=fig_size)

    for i_plt, position in enumerate(position_list):
        y_obs = y_obs_list[i_plt]
        y_prd = y_prd_list[i_plt]
        ax = fig.add_subplot(plt_row, plt_col, position, 
                             title=title_list[i_plt], 
                             xlabel='Observation', 
                             ylabel='Prediction')
        ax.scatter(y_obs, y_prd, color=col_list[i_plt], alpha=alpha_list[i_plt])
        # Common limits taken from the autoscaled scatter, so both axes show the same range.
        xy_min = min(ax.get_xlim()[0], ax.get_ylim()[0])
        xy_max = max(ax.get_xlim()[1], ax.get_ylim()[1])
        ax.axis('equal')
        ax.axis('square')
        ax.set_xlim([xy_min, xy_max])
        ax.set_ylim([xy_min, xy_max])
        ax.grid(color='gray', linestyle='dotted', linewidth=1, alpha=0.5)
        ax.text(0.03, 0.93, 'Corr : '+str(round(np.corrcoef(y_prd, y_obs)[0,1], 4)), size=15, transform=ax.transAxes)
        ax.text(0.03, 0.87, 'MSE : '+str(round(mean_squared_error(y_obs, y_prd), 4)), size=15, transform=ax.transAxes)
        ax.text(0.03, 0.81, 'MAE : '+str(round(mean_absolute_error(y_obs, y_prd), 4)), size=15, transform=ax.transAxes)
        # Identity line across the whole visible range (a fixed [-300, 300]
        # segment stops short whenever the data extend beyond it).
        ax.plot([xy_min, xy_max], [xy_min, xy_max], color='gray', linewidth=0.5)

    # Leave the top 4% of the figure free for the figure title.
    fig.tight_layout(rect=[0,0,1,0.96])
    
    # Title this figure explicitly instead of whatever pyplot considers current.
    fig.suptitle(title, fontsize=20)

    fig.savefig(save_name)
    if not show_flg:
        plt.close(fig)

## Function to avoid zero division

In [9]:
def avoid_zero(x, tsh=1, _add=0.1):
    """
    Keep a scalar away from zero.

    Values with |x| >= tsh are returned unchanged; smaller ones are replaced by
    `tsh` when x >= 0 and by `-tsh` otherwise. `_add` is accepted but not used.
    """
    if not np.abs(x) < tsh:
        return x
    return tsh if x >= 0 else -tsh

# Main code

## Load data

In [10]:
# Input files (MATLAB format).
# CSV alternative, kept for reference:
#   pd.read_csv('../10_Data/SARCOS_train.csv', header=None).set_axis(axis_names, axis=1)
#   pd.read_csv('../10_Data/SARCOS_test.csv', header=None).set_axis(axis_names, axis=1)
train_path = '../10_Data/sarcos_inv.mat'
test_path = '../10_Data/sarcos_inv_test.mat'

# Column names: 'Position1'..'Position7', 'Velocity1'..'Velocity7',
# 'Acceleration1'..'Acceleration7', 'Torque1'..'Torque7' (28 columns).
axis_names = [f'{quantity}{joint}'
              for quantity in ('Position', 'Velocity', 'Acceleration', 'Torque')
              for joint in range(1, 8)]

# Read each .mat file and wrap its array in a labelled DataFrame.
sar_train_all = pd.DataFrame(io.loadmat(train_path)['sarcos_inv'], columns=axis_names)
sar_test = pd.DataFrame(io.loadmat(test_path)['sarcos_inv_test'], columns=axis_names)

# Only the first 30000 training rows are used.
sar_train = sar_train_all.iloc[:30000, :]

# First 21 columns are the inputs; the remaining (torque) columns are the outputs.
x_train, y_train_all = sar_train.iloc[:,0:21], sar_train.iloc[:,21:]
x_test, y_test_all = sar_test.iloc[:,0:21], sar_test.iloc[:,21:]

## User parameter setting

In [11]:
fix_seed(373)

# Experiment grid
target_name_list = ['Torque' + str(joint) for joint in range(1, 8)]  # one run per target torque
n_sample_list = [5, 10, 15, 20, 30, 40, 50]                          # training-set sizes
max_itr = 20                                                         # sample splits per size

# Data layout
dim_x = 21           # leading columns of X treated as descriptors by the model classes
n_all = 30000        # number of row ids that are shuffled to draw training samples
num_SourceTasks = 6  # enters the gamma grids of the source-feature models

# Kernel setting
kernel_name, nu_ = 'rbf', 1.5

## Make training sample ID list

In [12]:
# Build one shuffled list of row ids (0 .. n_all-1) per repetition.
# The main loop takes the first `num_train` ids of sample_list[n_itr] as its training sample.
# NOTE: this initial seed is overridden by fix_seed(n_try) before the first shuffle.
fix_seed(373)
sample_list = list()
for n_try in range(max_itr):
    # Re-seed with the repetition number so each permutation depends only on n_try (and n_all).
    fix_seed(n_try)
    tmp_list = list(range(n_all))
    random.shuffle(tmp_list)
    sample_list.append(tmp_list)

In [13]:
# Storing dataframe
df_result = pd.DataFrame(columns=['data_name','n_sample','n_itr','type', 'MSE', 'Corr', 'MAE', 'R2'])

t0 = time.time()
# Repeat for the different number of samples
for num_train in n_sample_list:
    
    # Repeat for the different target torques
    for target_name in target_name_list:

        # Make target outputs and source features
        y_train = y_train_all[target_name].copy()
        y_test = y_test_all[target_name].copy()
        s_train = y_train_all[[s for s in target_name_list if s!=target_name]].copy()
        s_test = y_test_all[[s for s in target_name_list if s!=target_name]].copy()
        
        # Repeat for the different sample splits
        for n_itr in range(max_itr):
            print(target_name+'   n : '+str(num_train)+',  try : '+str(n_itr))
            t1 = time.time()
            
            # Hyperparameter settings
            alpha_list = np.append(10**np.linspace(-4, 2, 49), 0)
            gamma_list = np.array([1])
            
            # Make training data
            sample_id = sample_list[n_itr][:num_train]
            x_train_tmp = x_train.iloc[sample_id,]
            y_train_tmp = y_train.iloc[sample_id,]
            s_train_tmp = s_train.iloc[sample_id,]

            # Scaling parameters
            ## Inputs
            x_mean_tmp = x_train_tmp.mean()
            x_std_tmp = x_train_tmp.std()
            x_train_scal_tmp = (x_train_tmp - x_mean_tmp) / x_std_tmp.replace(0,1)
            x_test_scal_tmp = (x_test - x_mean_tmp) / x_std_tmp.replace(0,1)
            ## Outputs
            y_mean_tmp = y_train_tmp.mean()
            y_std_tmp = y_train_tmp.std()
            y_train_scal_tmp = (y_train_tmp - y_mean_tmp) / y_std_tmp
            y_test_scal_tmp = (y_test - y_mean_tmp) / y_std_tmp
            # Source features
            s_mean_tmp = s_train_tmp.mean()
            s_std_tmp = s_train_tmp.std()
            s_train_scal_tmp = (s_train_tmp - s_mean_tmp) / s_std_tmp.replace(0,1)
            s_test_scal_tmp = (s_test - s_mean_tmp) / s_std_tmp.replace(0,1)

            # Make combined dataframe
            x_train_adds = pd.merge(x_train_scal_tmp, s_train_scal_tmp, left_index=True, right_index=True)
            x_test_adds = pd.merge(x_test_scal_tmp, s_test_scal_tmp, left_index=True, right_index=True)

            # Model training
            ## Learn without Transfer
            t_tmp = time.time()
            ### Grid search
            gsr_wotl = GridSearchCV(
                KernelRidge_HM(),
                {'lambda1' : alpha_list,
                 'gamma'   : gamma_list/(2*dim_x),
                 'nu'      : [nu_],
                 'kernel'  : [kernel_name]},
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_wotl.fit(x_train_scal_tmp, y_train_scal_tmp)
            model_wotl = KernelRidge_HM(
                lambda1 = gsr_wotl.best_params_['lambda1'],                  
                gamma   = gsr_wotl.best_params_['gamma'],
                nu      = gsr_wotl.best_params_['nu'],
                kernel  = kernel_name
            )
            ### Final model training
            fix_seed(373)
            model_wotl.fit(x_train_scal_tmp, y_train_scal_tmp)
            y_fits_wotl = model_wotl.predict(x_train_scal_tmp)*y_std_tmp + y_mean_tmp
            y_pred_wotl = model_wotl.predict(x_test_scal_tmp)*y_std_tmp + y_mean_tmp
            print('   Learn without transfer has been done.    '+str(time.time()-t_tmp))

            ## Learn using only source features
            t_tmp = time.time()
            ### Grid search
            gsr_only = GridSearchCV(
                KernelRidge_HM(),
                {'lambda1' : alpha_list,
                 'gamma'   : gamma_list/(2*num_SourceTasks),
                 'nu'      : [nu_],
                 'kernel'  : [kernel_name]},
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_only.fit(s_train_scal_tmp, y_train_scal_tmp)
            model_only = KernelRidge_HM(
                lambda1 = gsr_only.best_params_['lambda1'],                  
                gamma   = gsr_only.best_params_['gamma'],
                nu      = gsr_only.best_params_['nu'],
                kernel  = kernel_name
            )
            fix_seed(373)
            ### Final model training
            model_only.fit(s_train_scal_tmp, y_train_scal_tmp)
            y_fits_only = model_only.predict(s_train_scal_tmp)*y_std_tmp + y_mean_tmp
            y_pred_only = model_only.predict(s_test_scal_tmp)*y_std_tmp + y_mean_tmp
            print('   Learn only using the source features has been done.    '+str(time.time()-t_tmp))

            ## Learn with source features
            t_tmp = time.time()
            ### Grid search
            gsr_with = GridSearchCV(
                KernelRidge_HM(),
                {'lambda1' : alpha_list,
                 'gamma'   : gamma_list/(2*(dim_x+num_SourceTasks)),
                 'nu'      : [nu_],
                 'kernel'  : [kernel_name]},
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_with.fit(x_train_adds, y_train_scal_tmp)
            model_with = KernelRidge_HM(
                lambda1 = gsr_with.best_params_['lambda1'],                  
                gamma   = gsr_with.best_params_['gamma'],
                nu      = gsr_with.best_params_['nu'],
                kernel  = kernel_name
            )
            ### Final model training
            fix_seed(373)
            model_with.fit(x_train_adds, y_train_scal_tmp)
            y_fits_with = model_with.predict(x_train_adds)*y_std_tmp + y_mean_tmp
            y_pred_with = model_with.predict(x_test_adds)*y_std_tmp + y_mean_tmp
            print('   Learn with the source features has been done.    '+str(time.time()-t_tmp))

            ## Learn residuals from only source model
            ### Compute the data
            t_tmp = time.time()
            y_train_res = y_train_tmp - y_fits_only
            y_std_res = y_train_res.std()
            if y_std_res == 0:
                y_std_res = 1
            y_mean_res = y_train_res.mean()
            y_train_res_scal = (y_train_res - y_mean_res)/y_std_res
            ### Grid search
            gsr_res = GridSearchCV(
                KernelRidge_HM(),
                {'lambda1' : alpha_list,
                 'gamma'   : gamma_list/(2*dim_x),
                 'nu'      : [nu_],
                 'kernel'  : [kernel_name]},
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            ### Final model training
            fix_seed(373)
            gsr_res.fit(x_train_scal_tmp, y_train_res_scal)
            model_res = KernelRidge_HM(
                lambda1 = gsr_res.best_params_['lambda1'],                  
                gamma   = gsr_res.best_params_['gamma'],
                nu      = gsr_res.best_params_['nu'],
                kernel  = kernel_name
            )
            fix_seed(373)
            model_res.fit(x_train_scal_tmp, y_train_res_scal)
            y_fits_res = model_res.predict(x_train_scal_tmp)*y_std_res + y_mean_res + y_fits_only
            y_pred_res = model_res.predict(x_test_scal_tmp)*y_std_res + y_mean_res + y_pred_only
            print('   Learn the residual has been done.    '+str(time.time()-t_tmp))

            ## Learn ratio from only source model
            t_tmp = time.time()
            ### Compute the data
            y_fits_only_tmp = pd.Series(y_fits_only).apply(avoid_zero,args=(np.max(np.abs(y_fits_only))/20, 0.1))
            y_train_rate = y_train_tmp/y_fits_only_tmp
            y_std_rate = y_train_rate.std()
            y_mean_rate = y_train_rate.mean()
            if y_std_rate == 0:
                y_std_rate = 1
            y_train_rate_scal = (y_train_rate - y_mean_rate)/y_std_rate
            ### Grid search
            gsr_rate = GridSearchCV(
                KernelRidge_HM(),
                {'lambda1' : alpha_list,
                 'gamma'   : gamma_list/(2*dim_x),
                 'nu'      : [nu_],
                 'kernel'  : [kernel_name]},
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_rate.fit(x_train_scal_tmp, y_train_rate_scal)
            model_rate = KernelRidge_HM(
                lambda1 = gsr_rate.best_params_['lambda1'],                  
                gamma   = gsr_rate.best_params_['gamma'],
                nu      = gsr_rate.best_params_['nu'],
                kernel  = kernel_name
            )
            ### Final model training
            fix_seed(373)
            model_rate.fit(x_train_scal_tmp, y_train_rate_scal)
            y_fits_rate = (model_rate.predict(x_train_scal_tmp)*y_std_rate + y_mean_rate) * y_fits_only_tmp
            y_pred_rate = (model_rate.predict(x_test_scal_tmp)*y_std_rate + y_mean_rate) * y_pred_only#_tmp
            print('   Learn the ratio has been done.    '+str(time.time()-t_tmp))

            ## Proposed mathod
            t_tmp = time.time()
            ### Grid search
            SearchParams_AffineTrans = {
                'gamma1'  : [gsr_only.best_params_['gamma']],
                'gamma2'  : [gsr_only.best_params_['gamma']],
                'gamma3'  : [gsr_wotl.best_params_['gamma']],
                'lambda1' : [1e-3, 1e-2, 1e-1, 1],
                'lambda2' : [1e-2, 1e-1, 1, 10],
                'lambda3' : [1e-2, 1e-1, 1, 10],
                'nu'      : [nu_],
                'kernel'  : [kernel_name]
            }
            gsr_AffineTrans = GridSearchCV(
                AffineTrans(),
                SearchParams_AffineTrans,
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_AffineTrans.fit(X=x_train_adds, y=y_train_scal_tmp)
            model_AffineTrans = AffineTrans(
                gamma1  = gsr_AffineTrans.best_params_['gamma1'],
                gamma2  = gsr_AffineTrans.best_params_['gamma2'],
                gamma3  = gsr_AffineTrans.best_params_['gamma3'],
                lambda1 = gsr_AffineTrans.best_params_['lambda1'],
                lambda2 = gsr_AffineTrans.best_params_['lambda2'],
                lambda3 = gsr_AffineTrans.best_params_['lambda3'],
                nu      = gsr_AffineTrans.best_params_['nu'],    
                kernel  = gsr_AffineTrans.best_params_['kernel']
            )
            ### Final model training
            fix_seed(373)
            model_AffineTrans.fit(X=x_train_adds, y=y_train_scal_tmp)
            y_fits_AffineTrans = model_AffineTrans.predict(x_train_adds)*y_std_tmp + y_mean_tmp
            y_pred_AffineTrans = model_AffineTrans.predict(x_test_adds)*y_std_tmp + y_mean_tmp
            print('   Proposed method has been done.    '+str(time.time()-t_tmp))
            
            ## Proposed method 2
            t_tmp = time.time()
            ### Grid search
            SearchParams_AffineTrans2 = {
                'gamma1'  : [gsr_only.best_params_['gamma']],
                'gamma2'  : [gsr_only.best_params_['gamma']],
                'gamma3'  : [gsr_wotl.best_params_['gamma']],
                'lambda1' : [0],
                'lambda2' : [1e-3, 1e-2, 1e-1, 1, 10],
                'lambda3' : [1e-3, 1e-2, 1e-1, 1, 10],
                'nu'      : [nu_],
                'kernel'  : [kernel_name]
            }
            gsr_AffineTrans2 = GridSearchCV(
                AffineTrans2(),
                SearchParams_AffineTrans2,
                scoring = 'neg_mean_squared_error',
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_AffineTrans2.fit(X=x_train_adds, y=y_train_scal_tmp)
            model_AffineTrans2 = AffineTrans2(
                gamma1  = gsr_AffineTrans2.best_params_['gamma1'],
                gamma2  = gsr_AffineTrans2.best_params_['gamma2'],
                gamma3  = gsr_AffineTrans2.best_params_['gamma3'],
                lambda1 = gsr_AffineTrans2.best_params_['lambda1'],
                lambda2 = gsr_AffineTrans2.best_params_['lambda2'],
                lambda3 = gsr_AffineTrans2.best_params_['lambda3'],
                nu      = gsr_AffineTrans.best_params_['nu'],    
                kernel  = gsr_AffineTrans2.best_params_['kernel']
            )
            ### Final model training
            fix_seed(373)
            model_AffineTrans2.fit(X=x_train_adds, y=y_train_scal_tmp)
            y_fits_AffineTrans2 = model_AffineTrans2.predict(x_train_adds)*y_std_tmp + y_mean_tmp
            y_pred_AffineTrans2 = model_AffineTrans2.predict(x_test_adds)*y_std_tmp + y_mean_tmp
            print('   Proposed method 2 has been done.    '+str(time.time()-t_tmp))

            ## Proposed method 3
            t_tmp = time.time()
            ### Grid search
            SearchParams_AffineTrans3 = {
                'gamma1'  : [gsr_only.best_params_['gamma']],
                'gamma3'  : [gsr_wotl.best_params_['gamma']],
                'lambda1' : [1e-3, 1e-2, 1e-1, 1, 10],
                'lambda3' : [1e-3, 1e-2, 1e-1, 1, 10],
                'nu'      : [nu_],
                'kernel'  : [kernel_name]
            }
            gsr_AffineTrans3 = GridSearchCV(
                AffineTrans3(),
                SearchParams_AffineTrans3,
                cv = 5,
                n_jobs = -1,
                verbose = False
            )
            fix_seed(373)
            gsr_AffineTrans3.fit(X=x_train_adds, y=y_train_scal_tmp)
            model_AffineTrans3 = AffineTrans3(
                gamma1  = gsr_AffineTrans3.best_params_['gamma1'],
                gamma3  = gsr_AffineTrans3.best_params_['gamma3'],
                lambda1 = gsr_AffineTrans3.best_params_['lambda1'],
                lambda3 = gsr_AffineTrans3.best_params_['lambda3'],
                nu      = gsr_AffineTrans.best_params_['nu'],    
                kernel  = gsr_AffineTrans3.best_params_['kernel']
            )
            ### Final model training
            fix_seed(373)
            model_AffineTrans3.fit(X=x_train_adds, y=y_train_scal_tmp)
            y_fits_AffineTrans3 = model_AffineTrans3.predict(x_train_adds)*y_std_tmp + y_mean_tmp
            y_pred_AffineTrans3 = model_AffineTrans3.predict(x_test_adds)*y_std_tmp + y_mean_tmp
            print('   Proposed method 3 has been done.    '+str(time.time()-t_tmp))

            # Save results
            ## Plot
            if not os.path.isdir('../30_Output/20_Plot/300_TransferLearning/'+target_name+'/n'+str(num_train)):
                os.makedirs('../30_Output/20_Plot/300_TransferLearning/'+target_name+'/n'+str(num_train))
            plot_scatter(y_obs_list = [y_train_tmp, y_train_tmp, y_train_tmp, y_train_tmp, y_train_tmp, y_train_tmp, y_train_tmp, y_train_tmp,
                                      y_test, y_test, y_test, y_test, y_test, y_test, y_test, y_test],
                        y_prd_list = [y_fits_wotl, y_fits_only, y_fits_with, y_fits_res, y_fits_rate, y_fits_AffineTrans, y_fits_AffineTrans2, y_fits_AffineTrans3,
                                     y_pred_wotl, y_pred_only, y_pred_with, y_pred_res, y_pred_rate, y_pred_AffineTrans, y_pred_AffineTrans2, y_pred_AffineTrans3],
                        title_list = ['Without transfer (train)', 
                                      'Using only source features (train)', 
                                      'With source features (train)', 
                                      'Learning the residual (train)', 
                                      'Learning the ratio (train)', 
                                      'Proposed method (train)',
                                      'Proposed method 2 (train)',
                                      'Proposed method 3 (train)',
                                      'Without transfer (test)', 
                                      'Using only source features (test)', 
                                      'With source features (test)', 
                                      'Learning the residual (test)', 
                                      'Learning the ratio (test)', 
                                      'Proposed method (test)',
                                      'Proposed method 2 (test)',
                                      'Proposed method 3 (test)'],
                        plt_row = 4,
                        plt_col = 4,
                        position_list = [1, 3, 5, 7, 9, 11, 13, 15,
                                         2, 4, 6, 8, 10, 12, 14, 16],
                        col_list = ['steelblue','steelblue','steelblue','steelblue','steelblue','steelblue','steelblue','steelblue',
                                    'tomato', 'tomato', 'tomato', 'tomato', 'tomato', 'tomato', 'tomato', 'tomato'],
                        alpha_list = [1,1,1,1,1,1,1,1,
                                      0.1, 0.1, 0.1, 0.1, 0.1, 0.1,0.1,0.1],
                        fig_size = (20,20),
                        save_name = '../30_Output/20_Plot/300_TransferLearning/'+target_name+'/n'+str(num_train)+'/'+'301_'+target_name+'_n'+str(num_train)+'_'+str(n_itr)+'.png',
                        title=target_name+',   n : '+str(num_train)+',  try : '+str(n_itr),
                        show_flg=False)

            ## Dataframe
            df_result = pd.concat([df_result,
                                pd.DataFrame(np.array([
                                    target_name, 
                                    num_train,
                                    n_itr,
                                    'Without transfer',
                                    mean_squared_error(y_test, y_pred_wotl),
                                    np.corrcoef(y_test, y_pred_wotl)[0,1],
                                    mean_absolute_error(y_test, y_pred_wotl),
                                    r2_score(y_test, y_pred_wotl)
                                 ]).reshape(1, -1), columns=['data_name','n_sample','n_itr','type', 'MSE', 'Corr', 'MAE', 'R2'], 
                                index=[target_name+'_n'+str(num_train)+'_itr'+str(n_itr)+'_WOTL'])], axis=0)
            df_result = pd.concat([df_result,
                                pd.DataFrame(np.array([
                                    target_name, 
                                    num_train,
                                    n_itr,
                                    'Using only source features',
                                    mean_squared_error(y_test, y_pred_only),
                                    np.corrcoef(y_test, y_pred_only)[0,1],
                                    mean_absolute_error(y_test, y_pred_only),
                                    r2_score(y_test, y_pred_only)
                                 ]).reshape(1, -1), columns=['data_name','n_sample','n_itr','type', 'MSE', 'Corr', 'MAE', 'R2'], 
                                index=[target_name+'_n'+str(num_train)+'_itr'+str(n_itr)+'_ONLY'])], axis=0)
            df_result = pd.concat([df_result,
                                pd.DataFrame(np.array([
                                    target_name, 
                                    num_train,
                                    n_itr,
                                    'With source features',
                                    mean_squared_error(y_test, y_pred_with),
                                    np.corrcoef(y_test, y_pred_with)[0,1],
                                    mean_absolute_error(y_test, y_pred_with),
                                    r2_score(y_test, y_pred_with)
                                 ]).reshape(1, -1), columns=['data_name','n_sample','n_itr','type', 'MSE', 'Corr', 'MAE', 'R2'], 
                                index=[target_name+'_n'+str(num_train)+'_itr'+str(n_itr)+'_WITH'])], axis=0)
            # Append one metrics row per remaining method. The rows differ only in
            # the 'type' label, the prediction vector being scored and the index
            # suffix, so they are driven from a single table.
            # NOTE(review): 'Leraning the residuals' is misspelled, but it is kept
            # verbatim because it ends up in the saved CSV and code outside this
            # view may match on it -- confirm before correcting.
            result_columns = ['data_name','n_sample','n_itr','type', 'MSE', 'Corr', 'MAE', 'R2']
            remaining_methods = [
                ('Leraning the residuals', '_RES', y_pred_res),
                ('Learning the rates', '_RATE', y_pred_rate),
                ('Proposed method', '_AffineTrans', y_pred_AffineTrans),
                ('Proposed method 2', '_AffineTrans2', y_pred_AffineTrans2),
                ('Proposed method 3', '_AffineTrans3', y_pred_AffineTrans3),
            ]
            # Shared index prefix: <target>_n<num_train>_itr<n_itr>
            row_prefix = target_name+'_n'+str(num_train)+'_itr'+str(n_itr)

            new_rows = []
            for method_label, index_suffix, y_pred_method in remaining_methods:
                # np.array() over this mixed str/number list coerces every entry
                # to a string; that is kept on purpose so these rows have the same
                # cell types as the rows already stored in df_result.
                metric_values = np.array([
                    target_name,
                    num_train,
                    n_itr,
                    method_label,
                    mean_squared_error(y_test, y_pred_method),
                    np.corrcoef(y_test, y_pred_method)[0,1],
                    mean_absolute_error(y_test, y_pred_method),
                    r2_score(y_test, y_pred_method)
                ]).reshape(1, -1)
                new_rows.append(pd.DataFrame(metric_values,
                                             columns=result_columns,
                                             index=[row_prefix+index_suffix]))

            # A single concat appends all five rows in order, copying df_result
            # once instead of once per method.
            df_result = pd.concat([df_result, *new_rows], axis=0)
            # Rewrite the whole results table to disk on every pass through this
            # loop body, so the rows collected so far are always saved.
            result_csv_path = '../30_Output/30_csv/300_TransferLearning_Result.csv'
            df_result.to_csv(result_csv_path)

            # Replace the previous progress output with the seconds elapsed since
            # t1 and since t0 (both timers are set outside this view).
            clear_output(wait=True)
            since_t1 = time.time() - t1
            since_t0 = time.time() - t0
            print(since_t1, ' / ', since_t0)
# Final report once the loops have finished: clear the last progress output,
# then print the seconds elapsed since t0 followed by a completion marker.
clear_output(wait=True)
since_t0 = time.time() - t0
print(since_t0)
print('*** Succeeded ***')

36051.32017827034
*** Succeeded ***
