In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append("./src/")

In [None]:
import numpy as np
import pandas as pd
import ot as pot
import torch
import pickle
import gc

from torch import nn
from torch import optim

import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from DANN import DANN
from DAN import DAN
from CORAL import coral
from optimalTransport1D import optimalTransport1D
from nnModel import FC_embedding, EmbeddingModel

from utils import *
from io_utils import *

In [None]:
class DanseNet(nn.Module):
    """Small fully connected network made of a feature extractor and a head.

    ``input_layer`` maps the inputs to a 16-unit hidden representation and
    ``output_layer`` maps that representation to ``output_dim`` values.
    """

    def __init__(self, input_dim, output_dim):
        """Build the two sub-networks.

        Args:
            input_dim: number of input features per example.
            output_dim: number of values produced per example.
        """
        super().__init__()
        width = 16
        # Layers are created in forward order so that seeded initialisation
        # consumes the random stream in a fixed sequence.
        extractor = [
            nn.Linear(input_dim, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
        ]
        head = [
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, output_dim),
        ]
        self.input_layer = nn.Sequential(*extractor)
        self.output_layer = nn.Sequential(*head)

    def forward(self, inputs):
        """Return the head output and keep the hidden activation.

        The output of ``input_layer`` is stored on ``self.hidden_rep`` at every
        call, so it can be read back after the forward pass.
        """
        hidden = self.input_layer(inputs)
        self.hidden_rep = hidden
        return self.output_layer(hidden)

## execution environment

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook can still be
# executed on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## convert categorical features to embedding

In [None]:
# Source (mobile) and target (desktop) feature matrices with their labels.
source = np.load("./data/mobile_trans.npy")
source_label = np.load("./data/mobile_label.npy")
target = np.load("./data/desktop_trans.npy")
target_label = np.load("./data/desktop_label.npy")

# Embedding tables, indexed first by column position and then by integer code.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("./data/embedding_dict_kaggle.pkl", "rb") as file:
    embedding_dict = pickle.load(file)

# Exclude target rows holding a value that never occurs in the source; the
# check is done independently for each of the first 8 columns.
exclude_index = []
for col in range(8):
    unseen = np.setdiff1d(np.unique(target[:, col]), np.unique(source[:, col]))
    exclude_index.append(np.isin(target[:, col], unseen))
keep_rows = ~np.any(exclude_index, axis=0)
target_cate = target[keep_rows]
target_num_label = target_label[keep_rows]

# Replace every categorical code by its embedding vector ...
source_cate_embedding = [
    embedding_dict[col][source[:, col].astype(int)] for col in range(8)
]
target_cate_embedding = [
    embedding_dict[col][target_cate[:, col].astype(int)] for col in range(8)
]

# ... and append the remaining columns (index 8 onwards) unchanged.
source_num = np.c_[np.concatenate(source_cate_embedding, axis=1), source[:, 8:]]
target_num = np.c_[np.concatenate(target_cate_embedding, axis=1), target_cate[:, 8:]]

target_num_size = int(target_num.shape[0] // 12)

# Release the unfiltered target arrays before the names are rebound.
del target, target_label
gc.collect()

# From here on `target` / `target_label` refer to the filtered rows only.
target, target_label, target_size = target_cate, target_num_label, target_num_size

In [None]:
def cate_adaptation(target, source, n_cate=8, embeddings=None):
    """Adapt the categorical columns of ``target`` towards ``source`` and embed them.

    For each of the first ``n_cate`` columns an optimal-transport plan is
    computed between the target and the source category frequencies, using the
    dissimilarity derived from ``of_uni_cate`` as cost. The plan is normalised
    per target category, and each target category's embedding is replaced by
    the plan-weighted combination of the source category embeddings.

    Args:
        target: 2-D array whose first ``n_cate`` columns hold category codes
            (cast to ``int`` for indexing); remaining columns pass through.
        source: 2-D array with the same column layout as ``target``.
        n_cate: number of leading categorical columns (default 8).
        embeddings: per-column embedding tables indexed by column position;
            defaults to the notebook-level ``embedding_dict``.

    Returns:
        2-D array: adapted embeddings of the categorical columns concatenated
        with ``target[:, n_cate:]``.
    """
    if embeddings is None:
        embeddings = embedding_dict

    adapted_tables = []
    for c in range(n_cate):
        sim, modality = of_uni_cate(np.r_[target[:, c], source[:, c]])
        distance = similarity_to_dissimilarity(sim)

        target_modality, target_counts = np.unique(target[:, c], return_counts=True)
        source_modality, source_counts = np.unique(source[:, c], return_counts=True)
        target_density = target_counts / target_counts.sum()
        source_density = source_counts / source_counts.sum()

        # Keep only the cost rows/columns of categories observed on each side.
        # np.isin replaces the deprecated np.in1d (same result for 1-D input).
        # NOTE(review): assumes `modality` is ordered like np.unique's output - confirm in utils.
        target_rows = np.where(np.isin(modality, target_modality))[0]
        source_cols = np.where(np.isin(modality, source_modality))[0]
        cost = distance[target_rows][:, source_cols]

        Gs = pot.emd(target_density.tolist(), source_density.tolist(), cost.tolist())

        # Row-normalise the plan; all-zero rows are left as zeros.
        norm_array = Gs.sum(axis=1)
        norm_array[norm_array == 0] = 1
        ot_plan = Gs / norm_array[:, None]

        # Scatter the plan into a (n_codes x n_codes) matrix addressed by the
        # integer category codes, then mix the embedding rows with it.
        table = embeddings[c]
        trans_matrix = np.zeros((table.shape[0], table.shape[0]))
        trans_matrix[np.ix_(target_modality.astype(int), source_modality.astype(int))] = ot_plan
        adapted_tables.append(trans_matrix.dot(table))

    embedded_columns = [
        adapted_tables[c][target[:, c].astype(int)] for c in range(n_cate)
    ]
    target_cate_num = np.concatenate(embedded_columns, axis=1)
    return np.c_[target_cate_num, target[:, n_cate:]]

### Choose the subset of data to investigate

In [None]:
# Index of the source K-fold split used by the single-split cells below
# (selects `source_splits[s]` and the "./model/..._{s}" file names).
s = 0 # valid values: 0, 1, 2, 3

In [None]:
# Cut the source rows into 4 shuffled folds; every entry of `source_splits`
# is a (train_index, valid_index) pair of row-index arrays.
kf = KFold(n_splits=4, shuffle=True, random_state=12345)
source_splits = [(train_rows, valid_rows) for train_rows, valid_rows in kf.split(source)]

# Select the fold chosen through `s`.
train_index, valid_index = source_splits[s]

# Embedded features / labels for training (xs, ys) and validation (xv, yv).
xs, xv = source_num[train_index], source_num[valid_index]
ys, yv = source_label[train_index], source_label[valid_index]

### Create naive source black-box model (Neural networks)

In [None]:
torch.manual_seed(12345)
nn_model_path = "./model/kaggle_nn_{}".format(s)
try:
    # Reuse the stored model for this split when it can be loaded.
    clf = load_model(nn_model_path)
except Exception as err:
    # `except Exception` instead of a bare `except`, so KeyboardInterrupt /
    # SystemExit propagate instead of silently starting a training run.
    print("Could not load {} ({!r}); training a new model.".format(nn_model_path, err))
    dansnet = DanseNet(input_dim=xs.shape[1], output_dim=1)
    # Use the notebook-level `device` rather than a second hard-coded device.
    clf = DAN(dansnet, device=device)
    # The source features are passed where the DAN cell passes target data and
    # beta is 0. NOTE(review): presumably this disables the adaptation term,
    # i.e. plain source-only training - confirm in DAN.py.
    clf.fit(xs, ys, xs, xv, yv, epoch=100, batch_size=128, lr=0.01, beta=0, early_stop=False, verbose=True)
    save_model(clf, nn_model_path)

### Adaptation methods that require no retraining:
* Identical
* CORAL
* 1D OT

In [None]:
# Identical
pred_test = clf.predict_prob(target_num)
performance(pred_test, target_label)

In [None]:
# CORAL
x_coral = coral(target_num, xs)
pred_coral = clf.predict_prob(x_coral)
performance(pred_coral, target_label)

In [None]:
# 1D OT num
opt = optimalTransport1D()
tsf, _ = opt.fit_transform(target_num[:,17:], xs[:,17:], njobs=20)
x_ot_num = np.c_[target_num[:,:17], tsf]
pred_num = clf.predict_prob(x_ot_num)
performance(pred_num, target_label)

In [None]:
# 1D OT cate
x_ot_cate = cate_adaptation(target, source[train_index])
pred_cate = clf.predict_prob(x_ot_cate)
performance(pred_cate, target_label)

In [None]:
# 1D OT
x_ot = np.c_[x_ot_cate[:,:17], tsf]
pred_ot = clf.predict_prob(x_ot)
performance(pred_ot, target_label)

### Adaptation methods that require retraining:
* DANN
* DAN

In [None]:
# DAN
np.random.seed(12345)
torch.manual_seed(12345)
try:
    clf = load_model("./model/kaggle_dan_{}".format(s))
except:
    dansnet = DanseNet(input_dim=xs.shape[1], output_dim=1)
    clf = DAN(dansnet, device=torch.device("cuda"))
    indext = reduce_dataset(target_num, xs)
    clf.fit(xs, ys, target_num[indext], xv, yv, epoch=100, batch_size=128, lr=0.01, beta=0.5, early_stop=False, verbose=False)
    save_model(clf, "./model/kaggle_dan_{}".format(s))

In [None]:
# DAN
pred_dan = clf.predict_prob(target_num)
performance(pred_dan, target_label)

In [None]:
# DANN 
np.random.seed(12345)
torch.manual_seed(12345)
if s == 0:
    e, b, a = 50, 1.0, 50
elif s == 1:
    e, b, a = 50, 1.0, 100
elif s == 2:
    e, b, a = 100, 0.5, 200
else:
    e, b, a = 50, 1.0, 200
try:
    clf = load_model("./model/kaggle_dann_{}".format(s))
except:
    dansnet = DanseNet(input_dim=xs.shape[1], output_dim=1)
    clf = DANN(dansnet, device=torch.device("cuda"))
    indext = reduce_dataset(target_num, xs)
    clf.fit(xs, ys, target_num[indext], xv, yv, epoch=e, batch_size=128, lr=0.01, beta=b, alpha=a,
            early_stop=False, verbose=False)
    save_model(clf, "./model/kaggle_dann_{}".format(s))

In [None]:
# DANN
# Score the unmodified embedded target features with the DANN model that the
# previous cell left in `clf`.
pred_dann = clf.predict_prob(target_num)
performance(pred_dann, target_label)

### Weakly supervised adaptation with feature selection for neural networks

In [None]:
ratio = 0.01 # percentage of labeled target data
repitition = 30 # experiments repetition
n_sample = 200 # number of bootstrap examples

# Column groups handed to `greedy_search` as `explore_features`: tuples over
# columns 0-16 followed by one single-column tuple for each of columns 17-128.
# NOTE(review): presumably each tuple is adapted as one unit - confirm in utils.
cate_fea = [(0,), (1, 2, 3), (4,), (5, 6, 7, 8), (9, 10, 11), (12,), (13, 14, 15), (16,)]
num_fea = [(i, ) for i in range(17, 129)]

# Seed NumPy so the sampled target subsets are the same on every run instead of
# depending on how much of the global random stream earlier cells consumed.
np.random.seed(12345)

# The sampling population and size are the same for every split/repetition.
choice_range = range(target_num.shape[0])
n = int(target_num.shape[0] * ratio)

# The loop uses its own split variables so that the notebook-level `s`,
# `train_index` and `valid_index` (read by later cells) keep pointing at the
# split selected at the top of the notebook.
for split in range(4):
    # Load Dataset
    split_train_index, split_valid_index = source_splits[split]
    xs, ys = source_num[split_train_index], source_label[split_train_index]
    xv, yv = source_num[split_valid_index], source_label[split_valid_index]

    # Load Model (a stored NN model must already exist for every split)
    clf = load_model("./model/kaggle_nn_{}".format(split))

    # Numerical Adaptation
    opt = optimalTransport1D()
    tsf, _ = opt.fit_transform(target_num[:,17:], xs[:,17:], njobs=20)

    # Categorical Adaptation
    x_ot_cate = cate_adaptation(target, source[split_train_index])
    x_ot = np.c_[x_ot_cate[:,:17], tsf]

    bootstrap_perf = []
    for i in range(repitition):
        # Sample the labeled target subset
        index = np.random.choice(choice_range, n, replace=False)

        # Compute perf_list and feature_list
        perf_list, feature_list, pred_list, perf_dist_list, index_list = greedy_search(
            clf, target_num[index], x_ot[index],
            target_label[index],
            explore_features=cate_fea + num_fea,
            n_sample=n_sample,
            verbose=False)

        save_model(perf_list, "./model/bootstrap_perf_list_{}_{}_{}".format(ratio, split, i))
        save_model(feature_list, "./model/bootstrap_feature_list_{}_{}_{}".format(ratio, split, i))
        save_model(pred_list, "./model/bootstrap_pred_list_{}_{}_{}".format(ratio, split, i))
        save_model(index, "./model/bootstrap_index_{}_{}_{}".format(ratio, split, i))
        save_model(perf_dist_list, "./model/bootstrap_perf_dist_list_{}_{}_{}".format(ratio, split, i))
        save_model(index_list, "./model/bootstrap_index_list_{}_{}_{}".format(ratio, split, i))

        # Perf on all test data
        pred = best_prediction(clf, target_num, x_ot, feature_list, len(feature_list)-1, repeat=1)
        perf = performance(pred, target_label)
        bootstrap_perf.append(perf)
        print("Period:", split, "Bootstrap performance:", perf, "Selected features:", len(feature_list))

    save_model(bootstrap_perf, "./model/bootstrap_perf_{}_{}".format(ratio, split))

### Create naive source black-box model (LGB)

In [None]:
# Rebind the split arrays to the raw `source` matrix (not the embedded
# `source_num`) for the LightGBM experiments.
# NOTE(review): `train_index` / `valid_index` are read from notebook state -
# verify they still refer to the intended split when this cell runs.
xs, ys = source[train_index], source_label[train_index]
xv, yv = source[valid_index], source_label[valid_index]

In [None]:
lgb_model_path = "./model/kaggle_lgb_{}".format(s)
try:
    # Reuse the stored booster for this split when it can be loaded.
    clf = load_model(lgb_model_path)
except Exception as err:
    # `except Exception` instead of a bare `except`, so KeyboardInterrupt /
    # SystemExit propagate instead of silently starting a training run.
    print("Could not load {} ({!r}); training a new model.".format(lgb_model_path, err))

    # The first 8 columns hold categorical codes; LightGBM documents this
    # argument as a list, so the range is materialised.
    categorical_columns = list(range(8))
    lgb_train = lgb.Dataset(xs, ys, categorical_feature=categorical_columns)
    lgb_valid = lgb.Dataset(xv, yv, categorical_feature=categorical_columns)

    params = {
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.04,
        'max_depth': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'random_state': 12345
    }

    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments were removed from `lgb.train` in LightGBM 4.0; with newer
    # versions use `callbacks=[lgb.early_stopping(50), lgb.log_evaluation(10)]`.
    clf = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_valid], num_boost_round=5000,
                    early_stopping_rounds=50, verbose_eval=10)
    save_model(clf, lgb_model_path)

### Adaptation methods that require no retraining:
* Identical
* CORAL
* 1D OT

In [None]:
# Identical
pred_test = clf.predict(target)
performance(pred_test, target_label)

In [None]:
# CORAL
tsf = coral(target[:,8:], xs[:,8:])
x_coral = np.c_[target[:,:8], tsf]
pred_coral = clf.predict(x_coral)
performance(pred_coral, target_label)

In [None]:
# 1D OT num
opt = optimalTransport1D()
tsf, w_distance = opt.fit_transform(target[:,8:], xs[:,8:], njobs=20)
x_ot_num = np.c_[target[:,:8], tsf]
pred_num = clf.predict(x_ot_num)
performance(pred_num, target_label)

In [None]:
# 1D OT cate
np.random.seed(12345)

repeat = 10
ot_plans = []
transport_plan = []
source_modalities = []
target_modalities = []

# Get transformation plan
for c in range(8):
    sim, modality = of_uni_cate(np.r_[target[:,c], xs[:,c]])
    distance = similarity_to_dissimilarity(sim)
    
    target_modality, counts = np.unique(target[:,c], return_counts=True)
    target_density = counts / counts.sum()
    target_modalities.append(target_modality)

    source_modality, counts = np.unique(xs[:,c], return_counts=True)
    source_density = counts / counts.sum()
    source_modalities.append(source_modality)

    Gs = pot.emd(target_density.tolist(), source_density.tolist(), 
                 distance[np.where(np.in1d(modality, target_modality))[0]][:,np.where(np.in1d(modality, source_modality))[0]].tolist())

    norm_array = Gs.sum(axis=1)
    norm_array[norm_array==0] = 1
    ot_plan = (Gs.T / norm_array).T
    ot_plans.append(ot_plan)

    mapping = {}
    for m in target_modality:
        index = np.where(target_modality==m)[0][0]
        mapping[m] = ot_plan[index]

    transport_plan.append(mapping)


# Perform stochastical transformation
preds = []
for r in range(repeat):
    data = target.copy()

    for c in range(8):
        for m, values in transport_plan[c].items():
            if np.sum(target[:, c]==m) > 0:
                data[target[:, c]==m, c] = source_modalities[c][
                    np.random.choice(
                    len(source_modalities[c]), 
                    size=np.sum(target[:, c]==m), 
                    p=values)]

    pred = clf.predict(data)
    preds.append(pred)
    
# Get prediction results
pred_cate = np.array(preds).mean(axis=0)
performance(pred_cate, target_label)

In [None]:
# 1D OT
preds = []
for r in range(repeat):
    data = x_ot_num.copy()

    for c in range(8):
        for m, values in transport_plan[c].items():
            if np.sum(x_ot_num[:, c]==m) > 0:
                data[x_ot_num[:, c]==m, c] = source_modalities[c][
                    np.random.choice(
                    len(source_modalities[c]), 
                    size=np.sum(x_ot_num[:, c]==m), 
                    p=values)]

    pred = clf.predict(data)
    preds.append(pred)

# Get prediction results
pred_ot = np.array(preds).mean(axis=0)
performance(pred_ot, target_label)

### Weakly supervised adaptation with feature selection for LGB

In [None]:
ratio = 0.01 # percentage of labeled target data
repitition = 30 # experiments repetition
n_sample = 200 # number of bootstrap examples
repeat = 10 # number of stochastic transformation

num_fea = [(i, ) for i in range(120)]


def _stochastic_cate_transform(features, transport_plan, source_modalities, repeat, n_cate=8):
    """Stack ``repeat`` random categorical re-codings of ``features``.

    For each of the first ``n_cate`` columns, every row holding target category
    ``m`` is given a source category drawn from ``transport_plan[c][m]``, a
    probability vector over ``source_modalities[c]``. Draws come from NumPy's
    global random state.

    Args:
        features: 2-D array whose first ``n_cate`` columns hold category values.
        transport_plan: per-column dict {target category: probability vector}.
        source_modalities: per-column array of source category values.
        repeat: number of independent re-codings to generate.
        n_cate: number of leading categorical columns (default 8).

    Returns:
        Array of shape ``(repeat * len(features), n_columns)`` holding the
        re-coded copies stacked one below the other.
    """
    n_rows = features.shape[0]
    # Pre-allocate the stacked result instead of growing it with np.r_.
    stacked = np.empty((repeat * n_rows,) + features.shape[1:], dtype=features.dtype)
    for r in range(repeat):
        data = stacked[r * n_rows:(r + 1) * n_rows]  # view into `stacked`
        data[...] = features
        for c in range(n_cate):
            column = features[:, c]
            for m, values in transport_plan[c].items():
                mask = column == m
                count = np.sum(mask)
                if count > 0:
                    data[mask, c] = source_modalities[c][
                        np.random.choice(len(source_modalities[c]), size=count, p=values)]
    return stacked


# Seed NumPy so the sampled subsets and the stochastic re-codings are the same
# on every run.
np.random.seed(12345)

# The sampling population and size are the same for every split/repetition.
choice_range = range(target.shape[0])
n = int(target.shape[0] * ratio)

# The loop uses its own split variables so that the notebook-level `s`,
# `train_index` and `valid_index` are not overwritten.
for split in range(4):
    # Load Dataset
    split_train_index, split_valid_index = source_splits[split]
    xs, ys = source[split_train_index], source_label[split_train_index]
    xv, yv = source[split_valid_index], source_label[split_valid_index]

    # Load Models (a stored LGB model must already exist for every split)
    clf = load_model("./model/kaggle_lgb_{}".format(split))
    # Expose `predict` under the `predict_prob` name as well.
    clf.predict_prob = clf.predict

    # Numerical transformation
    opt = optimalTransport1D()
    tsf, w_distance = opt.fit_transform(target[:,8:], xs[:,8:], njobs=20)
    x_ot_num = np.c_[target[:,:8], tsf]

    # Stochastic categorical transformation
    ot_plans = []
    transport_plan = []
    source_modalities = []
    target_modalities = []

    # Get transformation plan
    for c in range(8):
        sim, modality = of_uni_cate(np.r_[target[:,c], xs[:,c]])
        distance = similarity_to_dissimilarity(sim)

        target_modality, counts = np.unique(target[:,c], return_counts=True)
        target_density = counts / counts.sum()
        target_modalities.append(target_modality)

        source_modality, counts = np.unique(xs[:,c], return_counts=True)
        source_density = counts / counts.sum()
        source_modalities.append(source_modality)

        # np.isin replaces the deprecated np.in1d (same result for 1-D input).
        target_rows = np.where(np.isin(modality, target_modality))[0]
        source_cols = np.where(np.isin(modality, source_modality))[0]
        Gs = pot.emd(target_density.tolist(), source_density.tolist(),
                     distance[target_rows][:, source_cols].tolist())

        # Row-normalise the plan; all-zero rows are left as zeros.
        norm_array = Gs.sum(axis=1)
        norm_array[norm_array==0] = 1
        ot_plan = (Gs.T / norm_array).T
        ot_plans.append(ot_plan)

        # Row k of the plan belongs to the k-th (unique) target category.
        transport_plan.append(dict(zip(target_modality, ot_plan)))

    bootstrap_perf = []
    for i in range(repitition):
        # Sample labeled target data
        index = np.random.choice(choice_range, n, replace=False)

        # Perform stochastical transformation on the sampled rows
        cate_adapt = _stochastic_cate_transform(x_ot_num[index], transport_plan, source_modalities, repeat)

        # Compute perf_list and feature_list
        perf_list, feature_list, pred_list, perf_dist_list, index_list = greedy_search_cate(
            clf,
            target[np.tile(index, repeat)],
            cate_adapt,
            target_label[index],
            explore_features=num_fea,
            repeat=repeat,
            n_sample=n_sample,
            verbose=False)

        save_model(perf_list, "./model/cate_bootstrap_perf_list_{}_{}_{}".format(ratio, split, i))
        save_model(feature_list, "./model/cate_bootstrap_feature_list_{}_{}_{}".format(ratio, split, i))
        save_model(pred_list, "./model/cate_bootstrap_pred_list_{}_{}_{}".format(ratio, split, i))
        # Fixed: the name previously had one placeholder for three arguments,
        # so every split/repetition overwrote the same file.
        save_model(index, "./model/cate_bootstrap_index_{}_{}_{}".format(ratio, split, i))
        save_model(perf_dist_list, "./model/cate_bootstrap_perf_dist_list_{}_{}_{}".format(ratio, split, i))
        save_model(index_list, "./model/cate_bootstrap_index_list_{}_{}_{}".format(ratio, split, i))

        # Adapt selected features (stochastic transformation of all target rows)
        x_ot = _stochastic_cate_transform(x_ot_num, transport_plan, source_modalities, repeat)

        # Perf on all test data
        pred = best_prediction(clf, np.tile(target, (repeat, 1)), x_ot, feature_list, len(feature_list)-1, repeat=repeat)
        perf = performance(pred, target_label)
        bootstrap_perf.append(perf)
        print("Period:", split, "Bootstrap performance:", perf, "Selected features:", len(feature_list))
    save_model(bootstrap_perf, "./model/cate_bootstrap_perf_{}_{}".format(ratio, split))