In [None]:
import os
import sys
# Make the project-local modules (imported in the next cell) resolvable:
# both directories are appended after the existing search path entries.
sys.path.extend(["../src/", "../model/"])

In [None]:
import numpy as np
import torch

from collections import defaultdict

from io_utils import load_dataset, load_model, model_log, save_pickle
from metric import performance_logloss, performance_pr_auc

from labelshift_correction import build_pivot_dataset, adjust_model
from train_utils import sample_validation_data, LogScore

from coordinate_ot_adaptation_acc import adaptation, Discretize

from feature_selection_sup import feature_selection_sup
from feature_selection_acc import feature_selection

from subdomain_tools import sub_domain_ind, get_pi, pred_ij, weight_pred, \
    sij_minimization_sup, sij_minimization_unsup, \
    optimal_kts_unsup, optimal_kts_unsup_reg, optimal_counter_weight_sup

### Setting

In [None]:
# --- Experiment identity -------------------------------------------------
# `task` and `model_type` are used to locate datasets, models, logs and results.
task, model_type = "kaggle", "nn"

# --- Data layout ---------------------------------------------------------
# Columns [0, cate_index) are kept as-is; columns from cate_index onwards are
# discretized before adaptation. `num_dim` is forwarded to `adaptation(...)`.
num_dim = 43
cate_index = 8
# Target periods the experiment loops over.
period = list(range(3))

# When False, results are pickled and metrics are written to the log files.
test_flag = False

# --- Sample sizes --------------------------------------------------------
n_label = 200                      # target examples drawn by sample_validation_data
n_source_pred = n_pseudo = 10000   # source reference sample / target pseudo-label sample

# --- Training mode: "sup" or "unsup" --------------------------------------
train_mode = "unsup"

# Tags under which results are logged: `version` for the sub-domain vote,
# `basic_version` for the kt=1, ks=1 baseline.
version = "exp_subunsup_sparse"
basic_version = "exp_unsup_sparse"

# --- Dataset / model lookup keys ------------------------------------------
source_version = data_type = "uni"
source_domain, target_domain = "source", "target"

# Worker count forwarded to the adaptation / feature-selection helpers.
njobs = 20

In [None]:
# Split points passed to `sub_domain_ind`, keyed by the number of sub-domains
# they produce (k sub-domains <-> k - 1 split points).
source_spoints = {1: [], 2: [12814], 3: [12201, 37891]}

# Same structure for the target domain, one table per target period.
target_spoints_p = {
    # period 0 -- not possible to get 3 subdomains
    0: {1: [], 2: [10507]},
    # period 1 -- not possible to get 3 subdomains
    1: {1: [], 2: [3303]},
    # period 2
    2: {1: [], 2: [18278], 3: [5965, 16411]},
}


for seed in range(10):
    for p in period:
        # Re-seed both RNGs so that each (seed, period) run starts from the same state.
        torch.manual_seed(seed)
        np.random.seed(seed)

        print("Period:", p, seed, flush=True)

        # Load the source data (last argument fixed to 0) and the target data for period p.
        data_root = "../data/"
        source_train, source_train_label, source_test, source_test_label = load_dataset(
            data_root, task, source_domain, data_type, 0)
        target_train, target_train_label, target_test, target_test_label = load_dataset(
            data_root, task, target_domain, data_type, p)

        # Discretize the columns from cate_index onwards with a transform fitted on
        # source and target jointly; the first cate_index columns are kept unchanged.
        source_train_cate, source_train_num = source_train[:, :cate_index], source_train[:, cate_index:]
        target_train_cate, target_train_num = target_train[:, :cate_index], target_train[:, cate_index:]

        discret = Discretize()
        discret.fit(np.vstack([source_train_num, target_train_num]))

        source_train = np.hstack([source_train_cate, discret.transform(source_train_num)])
        target_train = np.hstack([target_train_cate, discret.transform(target_train_num)])

        # Overwrite label column 0 with the row position ("time order" of the examples).
        source_train_label[:, 0] = np.arange(len(source_train_label))
        target_train_label[:, 0] = np.arange(len(target_train_label))

        # Reference source model, adjusted with the source/target ratios of
        # (# rows with label column 1 equal to 0) over (sum of label column 1).
        source_y = source_train_label[:, 1]
        target_y = target_train_label[:, 1]
        model = load_model("../model/", task, source_domain, model_type, 0, source_version)
        source_factor = (source_y == 0).sum() / source_y.sum()
        target_factor = (target_y == 0).sum() / target_y.sum()
        model = adjust_model(model, target_factor, source_factor)

        # Reference predictions on a sample of the source training set.
        source_train_index, _ = sample_validation_data(
            task, source_train_label, ratio=1.0, number_examples=n_source_pred)
        source_pred = model.predict(source_train[source_train_index])

        # Labelled target sample (used by the supervised variants).
        target_train_index, _ = sample_validation_data(
            task, target_train_label, ratio=1.0, number_examples=n_label)
        target_sample = target_train[target_train_index]
        target_sample_label = target_train_label[target_train_index]

        # Pseudo-label target sample: n_pseudo distinct rows, kept in their original order.
        pseudo_mask = np.zeros(target_train.shape[0], dtype=bool)
        pseudo_mask[np.random.choice(pseudo_mask.size, n_pseudo, replace=False)] = True
        target_pseudo = target_train[pseudo_mask]
        target_pseudo_label = target_train_label[pseudo_mask]

        # Per-configuration containers filled by the first pass.
        pscore_t = {}                 # kt -> list of fitted LogScore objects (for pi computation)
        adapt_ts = defaultdict(dict)  # [kt][ks][i][j] -> fitted adaptation (for trans_ij computation)
        fmask_ts = defaultdict(dict)  # [kt][ks][i][j] -> feature mask of trans_ij
        model_ts = defaultdict(dict)  # [kt][ks][i][j] -> adjusted model
        sij_ts = defaultdict(dict)    # [kt][ks] -> s_ij weights

        # First pass: for every number of target sub-domains (kt) and source sub-domains (ks),
        # fit one adaptation per (target i, source j) pair and estimate the s_ij weights.
        # Fail early on a bad mode instead of after the (expensive) adaptation fits.
        if train_mode not in ("sup", "unsup"):
            raise ValueError("Unexpected train_mode: {!r} (expected 'sup' or 'unsup')".format(train_mode))

        for kt in target_spoints_p[p]:
            print("##### Target kt:", kt, flush=True)

            target_splits = target_spoints_p[p][kt]

            # sub-domains of the full target set, the labelled sample and the pseudo-label sample
            target_subdata, target_sublabel = sub_domain_ind(target_train, target_train_label, target_splits)
            target_spdata, target_splabel = sub_domain_ind(target_sample, target_sample_label, target_splits)
            target_pseudata, target_pseulabel = sub_domain_ind(target_pseudo, target_pseudo_label, target_splits)

            # one LogScore per target sub-domain, later fed to get_pi
            pscore = []
            for i in range(kt):
                logscore = LogScore(cate_index)
                logscore.fit(target_subdata[i])
                pscore.append(logscore)
            pscore_t[kt] = pscore

            for ks in source_spoints:
                print("## Source ks:", ks, flush=True)
                source_subdata, source_sublabel = sub_domain_ind(source_train, source_train_label, source_spoints[ks])

                # The lists are registered first and filled in place below.
                adapts, fmasks, models = [], [], []
                adapt_ts[kt][ks] = adapts
                fmask_ts[kt][ks] = fmasks
                model_ts[kt][ks] = models

                for i in range(kt):
                    adapti, fmaski, modeli = [], [], []
                    adapts.append(adapti)
                    fmasks.append(fmaski)
                    models.append(modeli)

                    for j in range(ks):
                        print("Adaptation i={}, j={}".format(i+1, j+1))

                        # A fresh copy of the source model is loaded for every (i, j) pair and
                        # adjusted with the whole-source factor and the factor of target sub-domain i.
                        model = load_model("../model/", task, source_domain, model_type, 0, source_version)
                        source_factor = (source_train_label[:, 1]==0).sum() / source_train_label[:, 1].sum()
                        target_factor = (target_sublabel[i][:, 1]==0).sum() / target_sublabel[i][:, 1].sum()
                        model = adjust_model(model, target_factor, source_factor)
                        modeli.append(model)

                        # The pivot dataset uses the factor of source sub-domain j instead
                        # (target_factor is unchanged, so it is not recomputed).
                        source_factor = (source_sublabel[j][:, 1]==0).sum() / source_sublabel[j][:, 1].sum()
                        source_train_sub, source_train_label_sub, source_index = build_pivot_dataset(
                            source_subdata[j], source_sublabel[j], target_factor, source_factor)

                        # source subdomain prediction
                        source_pred_sub = model.predict(source_train_sub)

                        # adaptation
                        adapt = adaptation(cate_dim=cate_index, num_dim=num_dim)
                        adapt.fit(target_subdata[i], source_train_sub, lmbda=1e-1)
                        adapti.append(adapt)

                        # NOTE(review): these transforms are only consumed by the feature selection
                        # below, but they are still computed for every configuration so that any
                        # random state they may consume is left exactly as before.
                        target_sample_trans = adapt.transform(target_spdata[i], repeat=5, njobs=njobs)
                        target_pseudo_trans = adapt.transform(target_pseudata[i], repeat=5, njobs=njobs)

                        if (kt, ks) == (1, 1):
                            # Feature selection is only run for the no-sub-domain configuration
                            # in this pass; mapped pairs get their own selection in the second pass.
                            shared_params = {
                                "model": model,
                                "repeat": 5,
                                "n_bootstrap": 100,
                                "njobs": njobs,
                                "bootstrap_tol": 0.2,
                                "max_feature": 4,
                                "verbose": False,
                            }
                            if train_mode == "sup":
                                # supervised feature selection on the labelled target sample
                                feature_mask = feature_selection_sup(
                                    valid=target_spdata[i],
                                    valid_trans=target_sample_trans,
                                    ref_label=target_splabel[i][:, 1],
                                    **shared_params)
                            else:
                                # unsupervised feature selection against the source predictions
                                feature_mask = feature_selection(
                                    valid=target_pseudata[i],
                                    valid_trans=target_pseudo_trans,
                                    ref_label=source_pred_sub,
                                    delta=0.05,
                                    **shared_params)
                        else:
                            # NOTE(review): the original assigned to a misspelled name
                            # (`feature_maks`), so the mask computed for (1, 1) was silently
                            # reused here. The constant 1 is presumably a "no selection"
                            # placeholder -- confirm that pred_ij accepts a scalar mask.
                            feature_mask = 1
                        fmaski.append(feature_mask)

                # Sub-domain membership and per-pair predictions on the labelled sample
                # and on the whole target training set.
                pi = get_pi(pscore, target_sample)
                predij = pred_ij(models, adapts, fmasks, target_sample, repeat=5, njobs=njobs)

                pi_all = get_pi(pscore, target_train)
                predij_all = pred_ij(models, adapts, fmasks, target_train, repeat=5, njobs=njobs)

                if train_mode == "sup":
                    sij_softmax = sij_minimization_sup(pi, predij, target_sample_label[:, 1],
                                                       lr=100, max_iter=1000, n_bootstrap=10, tol=1e-4)
                else:
                    sij_softmax = sij_minimization_unsup(pi_all, predij_all, source_pred,
                                                         lr=100, max_iter=1000, n_bootstrap=10, tol=1e-4)
                sij_ts[kt][ks] = sij_softmax
                
        # For each (kt, ks), view the s_ij weights as a (kt, ks) matrix and map every
        # target sub-domain (row) to the source sub-domain (column) with the largest weight.
        domain_mapping = defaultdict(dict)
        for kt, sij_by_ks in sij_ts.items():
            for ks, sij in sij_by_ks.items():
                domain_mapping[kt][ks] = np.argmax(sij.reshape((kt, ks)), axis=1)

        # save exp results (skipped when test_flag is set)
        if not test_flag:
            result_path = "../data/results/{}/{}_{}_{}_{}_{}"
            first_pass_results = (
                ("sij_ts", sij_ts),
                ("domain_mapping", domain_mapping),
                ("fmask_ts", fmask_ts),
            )
            for result_name, result in first_pass_results:
                save_pickle(result, result_path.format(
                    task, model_type, version, p, seed, result_name))
        
        
        # Second pass: keep, for every target sub-domain i, only the source sub-domain j chosen
        # in domain_mapping, redo the feature selection for that pair and compute predictions.
        # Fail early on a bad mode instead of after the first (expensive) transform.
        if train_mode not in ("sup", "unsup"):
            raise ValueError("Unexpected train_mode: {!r} (expected 'sup' or 'unsup')".format(train_mode))

        adapt_map_ts = defaultdict(dict)   # [kt][ks][i] -> [adaptation of the mapped pair]
        fmask_map_ts = defaultdict(dict)   # [kt][ks][i] -> [feature mask of the mapped pair]
        model_map_ts = defaultdict(dict)   # [kt][ks][i] -> [model of the mapped pair]
        # sij_ts is rebound on purpose: the first-pass weights have been saved/consumed above.
        sij_ts = defaultdict(dict)

        pred_ts = defaultdict(dict)        # predictions on the labelled target sample
        pred_all_ts = defaultdict(dict)    # predictions on the whole target training set
        for kt in domain_mapping:
            print("##### Target kt:", kt, flush=True)

            target_splits = target_spoints_p[p][kt]

            # sub-domains of the full target set, the labelled sample and the pseudo-label sample
            target_subdata, target_sublabel = sub_domain_ind(target_train, target_train_label, target_splits)
            target_spdata, target_splabel = sub_domain_ind(target_sample, target_sample_label, target_splits)
            target_pseudata, target_pseulabel = sub_domain_ind(target_pseudo, target_pseudo_label, target_splits)

            pscore = pscore_t[kt]
            for ks in domain_mapping[kt]:
                print("## Source ks:", ks, flush=True)
                source_subdata, source_sublabel = sub_domain_ind(source_train, source_train_label, source_spoints[ks])

                # The lists are registered first and filled in place below.
                adapts, fmasks, models = [], [], []
                adapt_map_ts[kt][ks] = adapts
                fmask_map_ts[kt][ks] = fmasks
                model_map_ts[kt][ks] = models

                for i in range(kt):
                    # source sub-domain mapped to target sub-domain i
                    j = domain_mapping[kt][ks][i]

                    # Reuse the model and adaptation fitted in the first pass; each is wrapped
                    # in a one-element list so pred_ij sees a single source per target sub-domain.
                    model = model_ts[kt][ks][i][j]
                    models.append([model])
                    adapt = adapt_ts[kt][ks][i][j]
                    adapts.append([adapt])

                    # get target_factor and source_factor
                    source_factor = (source_sublabel[j][:, 1]==0).sum() / source_sublabel[j][:, 1].sum()
                    target_factor = (target_sublabel[i][:, 1]==0).sum() / target_sublabel[i][:, 1].sum()

                    # adjusting source train dataset
                    source_train_sub, source_train_label_sub, source_index = build_pivot_dataset(
                        source_subdata[j], source_sublabel[j], target_factor, source_factor)

                    # source subdomain prediction
                    source_pred_sub = model.predict(source_train_sub)

                    # Both transforms are computed regardless of the mode, as before, so that any
                    # random state they may consume is unchanged.
                    target_sample_trans = adapt.transform(target_spdata[i], repeat=5, njobs=njobs)
                    target_pseudo_trans = adapt.transform(target_pseudata[i], repeat=5, njobs=njobs)

                    shared_params = {
                        "model": model,
                        "repeat": 5,
                        "n_bootstrap": 100,
                        "njobs": njobs,
                        "bootstrap_tol": 0.2,
                        "max_feature": 4,
                        "verbose": False,
                    }
                    if train_mode == "sup":
                        # supervised feature selection on the labelled target sample
                        feature_mask = feature_selection_sup(
                            valid=target_spdata[i],
                            valid_trans=target_sample_trans,
                            ref_label=target_splabel[i][:, 1],
                            **shared_params)
                    else:
                        # unsupervised feature selection against the source predictions
                        feature_mask = feature_selection(
                            valid=target_pseudata[i],
                            valid_trans=target_pseudo_trans,
                            ref_label=source_pred_sub,
                            delta=0.05,
                            **shared_params)
                    fmasks.append([feature_mask])

                pi = get_pi(pscore, target_sample)
                predij = pred_ij(models, adapts, fmasks, target_sample, repeat=5, njobs=njobs)

                pi_all = get_pi(pscore, target_train)
                predij_all = pred_ij(models, adapts, fmasks, target_train, repeat=5, njobs=njobs)

                # Only one source per target sub-domain is left, so every weight is 1.
                sij_softmax = np.ones((kt, 1))
                sij_ts[kt][ks] = sij_softmax

                pred_ts[kt][ks] = weight_pred(pi, predij, sij_softmax)
                pred_all_ts[kt][ks] = weight_pred(pi_all, predij_all, sij_softmax)

        
        # Vote weights over the (kt, ks) configurations: fitted against the labelled target
        # sample in supervised mode, against the source reference predictions otherwise.
        if train_mode == "sup":
            counter_weight = optimal_counter_weight_sup(pred_ts, target_sample_label[:, 1],
                                                        n_bootstrap=10, max_iter=1000, tol=1e-5, lr=1000)
        elif train_mode == "unsup":
            counter_weight = optimal_kts_unsup_reg(pred_all_ts, source_pred,
                                                   n_bootstrap=10, max_iter=1000, tol=1e-5, lr=1000)
        else:
            # A bare `raise` outside an `except` block only produces an unrelated
            # "No active exception to reraise" RuntimeError; raise a meaningful error instead.
            raise ValueError("Unexpected train_mode: {!r} (expected 'sup' or 'unsup')".format(train_mode))


        # Final prediction on the target test set: weighted vote over all (kt, ks) configurations.
        pred_final = np.zeros(target_test.shape[0])
        for kt in counter_weight:
            for ks in counter_weight[kt]:
                pi = get_pi(pscore_t[kt], target_test)
                predij = pred_ij(model_map_ts[kt][ks], adapt_map_ts[kt][ks], fmask_map_ts[kt][ks],
                                 target_test, repeat=10, njobs=njobs)
                pred = weight_pred(pi, predij, sij_ts[kt][ks])
                pred_final += pred * counter_weight[kt][ks]

        print("Vote Weights:", dict(counter_weight))

        if not test_flag:
            result_path = "../data/results/{}/{}_{}_{}_{}_{}"
            save_pickle(fmask_map_ts, result_path.format(
                task, model_type, version, p, seed, "fmask_map_ts"))
            save_pickle(counter_weight, result_path.format(
                task, model_type, version, p, seed, "counter_weight"))

        # Metrics are always evaluated (and logged) in this order: logloss, then pr_auc.
        metrics = (
            ("logloss", "../logs/logloss/", performance_logloss),
            ("pr_auc", "../logs/pr_auc/", performance_pr_auc),
        )

        # performance of the weighted vote, logged under `version`
        for metric_name, log_dir, metric_fn in metrics:
            perf = metric_fn(pred_final, target_test_label[:, 1])
            if not test_flag:
                model_log(log_dir, task, source_domain, model_type, p, source_version,
                          "{}: {}".format(version, perf))
            print("Target Prediction {} kt={}, ks={}".format(metric_name, "All", "All"), perf, flush=True)

        # no subdomains: baseline with a single target and a single source domain
        best_kt = 1
        best_ks = 1
        pi = get_pi(pscore_t[best_kt], target_test)
        predij = pred_ij(model_map_ts[best_kt][best_ks], adapt_map_ts[best_kt][best_ks],
                         fmask_map_ts[best_kt][best_ks], target_test, repeat=10, njobs=njobs)
        pred = weight_pred(pi, predij, sij_ts[best_kt][best_ks])

        # performance of the baseline, logged under `basic_version`
        for metric_name, log_dir, metric_fn in metrics:
            perf = metric_fn(pred, target_test_label[:, 1])
            if not test_flag:
                model_log(log_dir, task, source_domain, model_type, p, source_version,
                          "{}: {}".format(basic_version, perf))
            print("Target Prediction {} kt={}, ks={}".format(metric_name, best_kt, best_ks), perf, flush=True)