In [1]:
import sys

# Put the sibling source and model directories on the import search path
# (presumably where the local modules imported in the next cell live).
sys.path.extend(["../src/", "../model/"])

In [2]:
import numpy as np
import torch

from io_utils import load_dataset, load_model, model_log
from metric import performance_logloss, performance_pr_auc

from coordinate_ot_adaptation import adaptation
from labelshift_correction import build_pivot_dataset, adjust_model
from confident_feature_selection import scda
from train_utils import sample_validation_data

### Kaggle

In [3]:
# Experiment configuration for the Kaggle task. These names are read by the
# adaptation cell further down.

# Type of the loaded source model.
model_type = "nn"
# Version of the embedding matrix & prediction model that we use.
source_version = "opt"

# Dataset identification passed to the loaders.
task = "kaggle"
data_type = "cate"
source_domain = "source"
target_domain = "target"

# Feature layout handed to the adaptation / feature-selection routines.
num_dim = 43
cate_index = 8

# Periods that are evaluated one after another.
period = list(range(4))

# Sampling ratio for the validation subsets; also embedded in the tag that
# labels this run in the log files.
ratio = 0.2
version = f"scda_conf{ratio}"

### Kaggle Adaptation

In [4]:
# Run the label-shift-corrected adaptation + feature selection for every
# (seed, period) pair, then score the source model on the transformed target
# test set and append PR-AUC / logloss to the log files.

njobs = 20

# Number of transformed copies produced by adapt.transform. Each count is needed
# in two places that have to agree, so it is defined exactly once.
valid_repeat = 5   # target-train transform and the "repeat" argument given to scda
test_repeat = 20   # target-test transform and the reshape that averages predictions

for seed in range(10):
    for p in period:
        # Re-seed both RNGs at the start of every (seed, period) run.
        torch.manual_seed(seed)
        np.random.seed(seed)

        print("Period:", p, seed, flush=True)
        model = load_model("../model/", task, source_domain, model_type, p, source_version)
        source_train, source_train_label, source_test, source_test_label = load_dataset(
            "../data/", task, source_domain, data_type, p)
        target_train, target_train_label, target_test, target_test_label = load_dataset(
            "../data/", task, target_domain, data_type, p)

        # get target_factor and source_factor:
        # (#rows whose label column 1 is 0) / (sum of label column 1), i.e. the
        # negative:positive ratio if column 1 is a 0/1 indicator.
        target_factor = (target_train_label[:, 1]==0).sum() / target_train_label[:, 1].sum()
        source_factor = (source_train_label[:, 1]==0).sum() / source_train_label[:, 1].sum()

        # adjusting the classifier
        model = adjust_model(model, target_factor, source_factor)

        # adjusting source train dataset
        source_train, source_train_label, source_index = build_pivot_dataset(
            source_train, source_train_label[:,1], target_factor, source_factor)

        # source and target data undersampling
        # NOTE(review): only the feature arrays are subset below; the label
        # arrays are not re-indexed (they are not read again in this loop).
        source_train_index, _ = sample_validation_data(task, source_train_label, ratio)
        source_train = source_train[source_train_index]

        target_train_index, _ = sample_validation_data(task, target_train_label, ratio)
        target_train = target_train[target_train_index]

        # source train prediction baseline
        pred_source = model.predict(source_train)

        # init adaptation & fit & transform
        adapt = adaptation(cate_dim=cate_index, num_dim=num_dim)
        adapt.fit(target_train, source_train, lmbda=1e-1)

        target_train_trans = adapt.transform(target_train, repeat=valid_repeat, njobs=njobs)

        params = {
            "model": model,
            "valid": target_train,
            "valid_trans": target_train_trans,
            # The source-train predictions are what scda receives as "valid_label".
            "valid_label": pred_source,
            "cate_index": cate_index,
            # Presumably has to equal the repeat used for valid_trans above.
            "repeat": valid_repeat,
            # One singleton cluster per column of target_train.
            "feature_cluster":[[i] for i in range(target_train.shape[-1])],
            "best": None,
            "feature_mask": None,
            "n_bootstrap": 100,
            "verbose": True,
        }

        # greedy feature selection
        feature_mask, best_history = scda(**params)

        # target test transformation based on selected features
        target_test_trans = adapt.transform(
            target_test, repeat=test_repeat, interpolation=feature_mask, njobs=njobs)

        # Average the predictions over the test_repeat transformed copies.
        # Assumes the copies are stacked block-wise along the first axis so that
        # each reshaped row holds one full copy -- TODO confirm in adapt.transform.
        pred = model.predict(target_test_trans).reshape(test_repeat, -1).mean(axis=0)

        perf = performance_pr_auc(pred, target_test_label[:, 1])
        model_log("../logs/pr_auc/", task, source_domain, model_type, p, source_version,
                 "{}: {}".format(version, perf))
        print("Target Prediction pr_auc", perf, flush=True)

        perf = performance_logloss(pred, target_test_label[:, 1])
        model_log("../logs/logloss/", task, source_domain, model_type, p, source_version,
                 "{}: {}".format(version, perf))
        print("Target Prediction logloss", perf, flush=True)

Period: 0 0
Length: 754
Votes Percentage: 1.00
Selected Features: [34]
Length: 754
Votes Percentage: 0.73
Selected Features: [34 35]
Length: 754
Votes Percentage: 0.61
Selected Features: [34 35 46]
Length: 754
Votes Percentage: 0.37
Selected Features: [34 35 37 46]
Target Prediction pr_auc 0.624303247573127
Target Prediction logloss -0.15113897732637063
Period: 1 0
Length: 754
Votes Percentage: 0.98
Selected Features: [45]
Length: 754
Votes Percentage: 0.91
Selected Features: [45 47]
Length: 754
Votes Percentage: 0.62
Selected Features: [45 47 49]
Length: 754
Votes Percentage: 0.63
Selected Features: [45 47 48 49]
Length: 754
Votes Percentage: 0.76
Selected Features: [ 8 45 47 48 49]
Length: 754
Votes Percentage: 0.63
Selected Features: [ 8 40 45 47 48 49]
Length: 754
Votes Percentage: 0.40
Selected Features: [ 8 35 40 45 47 48 49]
Target Prediction pr_auc 0.653497575265429
Target Prediction logloss -0.14906506960847604
Period: 2 0
Length: 754
Votes Percentage: 0.89
Selected Features: 