<center>


# Domain Adaptation

</center>

### Advanced Machine Learning - Assignment
### Mohammad Poul Doust 



### Exercise 1: Subspace alignment
#### Tasks:
<ul>
<li>Implement the subspace alignment method as a function in Python taking as input the initial data matrices S and T as well as the parameter d and outputting the accuracy of the final classifier</li>
<li>Test the proposed implementation on Office/Caltech data set and compare its performance with a 1-NN classifier on the raw data.</li>
</ul>

In [38]:
# Imports
import scipy.io as sio
import numpy as np
from scipy.linalg import eigh
from sklearn.preprocessing import scale
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
#superpass warnings
import warnings
warnings.filterwarnings('ignore')
import ot


In [39]:
# Find top d eigenvectors
def PCA(data, d):
    """ 
    Calculate the eigenvectors for the data

    Parameters:
    ----------
    data : numpy array
        Data to find eigenvectors
    d : int
        Number of the eigen vectors to return
  
    Returns: 
    -------
    eigen_vectors_to_return: numpy array
        The highest d eigenvectors for the data
  
    """
    covariance = np.cov(data)
    eigen_vals, eigen_vecs = eigh(covariance)
    eigen_vectors_to_return = np.column_stack(eigen_vecs[:, -i] for i in range(1, d + 1))
    eigen_vals_to_return = [eigen_vals[-i] for i in range(1, d + 1)]
    return eigen_vectors_to_return


In [40]:
def fit_predict_1_NN(X, y, X_test, y_test):
    """ 
    1-Nearest neighbor classifier

    Parameters: 
    ----------
    X : numpy array
        Training data features
    y : numpy array
        Training data labels
    X_test : numpy array
        Testing data features
    y_test : numpy array
        Testing data labels
  
    Returns: 
    -------
    accuracy: float
        Accuracy affter fitting 1-NN classifier trained on X,y  and evaluated on X_test, Y_test
  
    """
    nbrs = KNeighborsClassifier(n_neighbors=1).fit(X, y)
    preds = nbrs.predict(X_test)
    accuracy = sklearn.metrics.accuracy_score(y_test, preds)
    return accuracy

In [41]:
def fit_subspace_alignment(S, T, **kwargs):
    """ 
    Domain Adaptation using Subspace Alignment. 
   
    Parameters: 
    ----------
    S : Mat
        Source Domain Data Matrix
    T : Mat
        Target Domain Data Matrix
    d : float
        Number of eigen vectors to consider.
  
    Returns:
    -------    
    accuracy: float
        accuracy affter fitting 1-NN classifier trained on projected Source Domain and evaluated on Target Domain
  
    """
    d = kwargs.pop('d')
    X_s = S['fts']
    y_s = S['labels']
    X_t = T['fts']
    y_t = T['labels']

    #scale
    X_s = scale(X_s)
    X_t = scale(X_t)

#     print('X_s.shape: ', X_s.shape)
#     print('X_s.shape: ', X_s.shape)
    # 1. Find top eigen vectors in source and target domain
    X_s_pca = PCA(X_s.T, d)
    X_t_pca = PCA(X_t.T, d)
    
#     print('X_s_pca.shape: ', X_s_pca.shape)
#     print('X_s_pca.shape: ', X_t_pca.shape)
    
    # 2. Compute Alignment Matrix
    allignment_mat = X_s_pca.T.dot(X_t_pca)
    
    # 3. Align Source to target
    X_a = X_s_pca.dot(allignment_mat)
    # X_a =X_s_pca.dot(X_s_pca.T).dot(X_t_pca)

    # 4. Project source and target
    S_a = X_s.dot(X_a)
    T_t = X_t.dot(X_t_pca)
    
    # 5. Fit 1-NN classifier:
    accuracy = fit_predict_1_NN(S_a, y_s.ravel(), T_t, y_t)    
    
    return accuracy


In [42]:
def evaluate_da(datasets_folder_path, datasets, algorithm, **kwargs):
    """ 
    Generic method to evaluate different domain adaptation methods against different datasets. 
   
    Parameters: 
    ----------
    datasets_folder_path : str
        The folder path that contains datasets to be evaluated
    datasets : list(str)
        List of strings of datasets names to be evaluated
    algorithm : function
        Function represents the domain adaptation method, in our scenario (fit_subspace_alignment or fit_entropic_regularized_ot)
  
    * kwargs : dict
        Dict of or arguments to be passed for domain adaptation metods. In our scenario(d or reg_e)

    
    Note:
    -------
    This function print information about the algorithm being evaluated
    Additionally, it calculates the accuracy on each pair of datasets and compare it
    to the performance without domain adaptation.
    Finally, prints information about the mean accuracy with/without domain adapation for all experiments (pair of datasets)

  
    """
    print("Algorithm: ", algorithm.__name__)
    dataset_no = len(datasets)
    exp_no = 1
    da_experiments_accuracy= []
    experiments_accuracy= []
    for i in range(dataset_no):
        for j in range(dataset_no):
            source_dataset_name = datasets[i]
            target_dataset_name = datasets[j]
            if i == j:
                continue
            print(exp_no, ". Experiment {Source: ", source_dataset_name,", Target: ",target_dataset_name,"}")
            S = sio.loadmat(datasets_folder_path+source_dataset_name )
            T = sio.loadmat(datasets_folder_path+target_dataset_name )

            acc_da = algorithm(S, T, **kwargs)
            acc = fit_predict_1_NN(S['fts'], S['labels'], T['fts'], T['labels'])
            print("\t Using Domain Adaptation: Accuracy: ", acc_da)
            print("\t Without Domain Adaptation: Accuracy: ", acc, "\n")
            da_experiments_accuracy.append(acc_da)
            experiments_accuracy.append(acc)
            exp_no+=1

    print("Using Domain Adaptation: Mean Accuracy= ", np.mean(da_experiments_accuracy))
    print("Without Domain Adaptation Mean Accuracy= ", np.mean(experiments_accuracy))


In [43]:
datasets_folder_path=r"F:\MLDM\3rd Semester\Advanced ML\Domain Adaptation\surf\\"
datasets = ["amazon.mat", "caltech10.mat", "dslr.mat", "webcam.mat"]

evaluate_da(datasets_folder_path, datasets, fit_subspace_alignment, d=80)

Algorithm:  fit_subspace_alignment
1 . Experiment {Source:  amazon.mat , Target:  caltech10.mat }
	 Using Domain Adaptation: Accuracy:  0.39893143365983974
	 Without Domain Adaptation: Accuracy:  0.24220837043633126 

2 . Experiment {Source:  amazon.mat , Target:  dslr.mat }
	 Using Domain Adaptation: Accuracy:  0.3630573248407643
	 Without Domain Adaptation: Accuracy:  0.18471337579617833 

3 . Experiment {Source:  amazon.mat , Target:  webcam.mat }
	 Using Domain Adaptation: Accuracy:  0.3864406779661017
	 Without Domain Adaptation: Accuracy:  0.24067796610169492 

4 . Experiment {Source:  caltech10.mat , Target:  amazon.mat }
	 Using Domain Adaptation: Accuracy:  0.4164926931106472
	 Without Domain Adaptation: Accuracy:  0.21711899791231734 

5 . Experiment {Source:  caltech10.mat , Target:  dslr.mat }
	 Using Domain Adaptation: Accuracy:  0.4267515923566879
	 Without Domain Adaptation: Accuracy:  0.08917197452229299 

6 . Experiment {Source:  caltech10.mat , Target:  webcam.mat }
	

### Exercise 2: Entropic regularized optimal transport
#### Tasks:
<ul>
<li>Implement all these steps as a function in Python taking as input the initial data matrices S and T
as well as the parameter reg_e</li>
<li>Test the proposed implementation on Office/Caltech data set and compare
its performance with the algorithm from Exercise 1</li>
</ul>

In [44]:
def fit_entropic_regularized_ot(S, T, **kwargs):
    """ 
    Domain Adaptation using Entropic Regularized Optimal Transport. 
   
    Parameters:
    ----------
    S : Mat
        Source Domain Data Matrix
    T : Mat
        Target Domain Data Matrix
    reg_e : float
        Entropic Regularization Parameter.
  
    Returns: 
    -------
    accuracy: float
        accuracy affter fitting 1-NN classifier trained on transported Source Domain and evaluated on Target Domain
  
    """
    reg_par = kwargs.pop('reg_e')
    if reg_par is not None:
        reg_e = reg_par
    else:
        reg_e= 1e-3
        
    X_s = data_source['fts']
    y_s = data_source['labels']
    X_t = data_target['fts']
    y_t = data_target['labels']

    ns = X_s.shape[0]
    nt = X_t.shape[0]
    
    #1. Define two uniform vectors a and b that have the size equal to ns and nt, respectively.
    a = np.random.uniform(0, 1, ns)
    b = np.random.uniform(0, 1, nt)
    
    #2. Calculate the loss matrix M where an element with index (i, j) is a distance between the row xi~S and row yi~T
    M = ot.dist(X_s, X_t, metric='euclidean')
    
    #Normalize M
#     Without normalizing on the max was giving better results!
#     M /= M.max()
    
    #3. Fit S to T, G coupling Matrix
    G = ot.sinkhorn(a, b, M, reg_e)
    
    #4. Transport from S to T
    S_a = G.dot(X_t)

    #5. Fit 1-NN classifier
    accuracy = fit_predict_1_NN(S_a,y_s.ravel(),X_t, y_t)
    
    return accuracy


In [45]:
datasets_folder_path=r"F:\MLDM\3rd Semester\Advanced ML\Domain Adaptation\surf\\"
datasets = ["amazon.mat", "caltech10.mat", "dslr.mat", "webcam.mat"]

evaluate_da(datasets_folder_path, datasets, fit_entropic_regularized_ot, reg_e= 0.1)

Algorithm:  fit_entropic_regularized_ot
1 . Experiment {Source:  amazon.mat , Target:  caltech10.mat }
	 Using Domain Adaptation: Accuracy:  0.7070063694267515
	 Without Domain Adaptation: Accuracy:  0.24220837043633126 

2 . Experiment {Source:  amazon.mat , Target:  dslr.mat }
	 Using Domain Adaptation: Accuracy:  0.7388535031847133
	 Without Domain Adaptation: Accuracy:  0.18471337579617833 

3 . Experiment {Source:  amazon.mat , Target:  webcam.mat }
	 Using Domain Adaptation: Accuracy:  0.7133757961783439
	 Without Domain Adaptation: Accuracy:  0.24067796610169492 

4 . Experiment {Source:  caltech10.mat , Target:  amazon.mat }
	 Using Domain Adaptation: Accuracy:  0.7133757961783439
	 Without Domain Adaptation: Accuracy:  0.21711899791231734 

5 . Experiment {Source:  caltech10.mat , Target:  dslr.mat }
	 Using Domain Adaptation: Accuracy:  0.6942675159235668
	 Without Domain Adaptation: Accuracy:  0.08917197452229299 

6 . Experiment {Source:  caltech10.mat , Target:  webcam.mat

# Bonus : Sinkhorn Iterative Algorithm

In [52]:
def my_sinkhorn(a, b, M,  reg=0.1, numItermax=1000):
    M = -reg * M
    K = np.exp(M)
#     print("K: ", K.shape)
    current_iter = 0
    while current_iter < numItermax:
        sum_columns = K.sum(1)
        K = K * (a / sum_columns).reshape((-1, 1))
        sum_rows = K.sum(0)
        K = K * (b / sum_rows)
        current_iter += 1

    K /= K.sum()
#     K /= K.max()

    return K

#### Comparison with with POT library

In [53]:
from time import time
import numpy as np
import ot

# Example adopted from pot manual
a = np.array([.5, .5])
b = np.array([.5, .5])
M = np.array([[0., 1.], [1., 0.]])

start = time()
pot_version = ot.sinkhorn(a, b, M, 1)
print(f'POT execution tim: {time() - start} sec')

start = time()
manual_version = my_sinkhorn(a, b, M, reg=1)
print(f'Manual version execution tim: {time() - start} sec')

print('pot_version: ', pot_version)
print('manual_version: ', manual_version)




POT execution tim: 0.0009965896606445312 sec
Manual version execution tim: 0.012939453125 sec
pot_version:  [[0.36552929 0.13447071]
 [0.13447071 0.36552929]]
manual_version:  [[0.36552929 0.13447071]
 [0.13447071 0.36552929]]


In [54]:
from time import time
import numpy as np
import ot
import scipy.io as sio

data_source = sio.loadmat(r"F:\MLDM\3rd Semester\Advanced ML\Domain Adaptation\surf\webcam.mat")
data_target = sio.loadmat(r"F:\MLDM\3rd Semester\Advanced ML\Domain Adaptation\surf\dslr.mat")

X_s = data_source['fts']
y_s = data_source['labels']
X_t = data_target['fts']
y_t = data_target['labels']

ns = X_s.shape[0]
nt = X_t.shape[0]

a = np.random.uniform(0, 1, ns)
b = np.random.uniform(0, 1, nt)

M = ot.dist(X_s, X_t, metric='euclidean')

start = time()
ot.sinkhorn(a, b, M, 1)
print(f'POT execution tim: {time() - start} sec')

start = time()
test2 = my_sinkhorn(a, b,M,reg=1, )
print(f'manual version execution tim: {time() - start} sec')



POT execution tim: 0.04886960983276367 sec
manual version execution tim: 0.16156768798828125 sec
