In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.append("./src/")
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [22]:
import numpy as np
import pandas as pd
import ot as pot
import torch
import pickle
import gc

from torch import nn
from torch import optim

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from collections import defaultdict

from DANN import DANN
from DAN import DAN
from CORAL import coral
from optimalTransport1D import optimalTransport1D
from nnModel import FC_embedding, EmbeddingModel

from utils import *
from io_utils import *

In [4]:
class DanseNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU()
        )
        self.output_layer = nn.Sequential(
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, output_dim)
        )

    def forward(self, inputs):
        self.hidden_rep = self.input_layer(inputs)
        return self.output_layer(self.hidden_rep)

In [5]:
device = torch.device("cuda")

### Preprocessing

In [None]:
trans = pd.read_csv("./data/train_transaction.csv")
identity = pd.read_csv("./data/train_identity.csv")
trans = trans.merge(identity[["TransactionID", "DeviceType"]], on="TransactionID")

In [None]:
source = trans[trans.DeviceType=="mobile"]
target = trans[trans.DeviceType=="desktop"]

nan_percent = trans.isnull().sum(axis=0) / trans.shape[0]

ignore = nan_percent[nan_percent > 0.01].index.values.tolist() + ["isFraud", "TransactionDT", "DeviceType"]

source_label = source["isFraud"].values
target_label = target["isFraud"].values

source = source[[f for f in source.columns if f not in ignore]]
target = target[[f for f in target.columns if f not in ignore]]

source = pd.merge(identity, source, how="right", on="TransactionID")
target = pd.merge(identity, target, how="right", on="TransactionID")

nan_percent = source.append(target).isnull().sum(axis=0) / source.append(target).shape[0]

ignore = nan_percent[nan_percent > 0.01].index.values.tolist() + ["TransactionID", "DeviceType"]

source = source[[f for f in source.columns if f not in ignore]]
target = target[[f for f in target.columns if f not in ignore]]

source_index = np.where(~np.any(source.isnull().values, axis=1))[0]
target_index = np.where(~np.any(target.isnull().values, axis=1))[0]

cates = ["id_12", "id_15", "id_28", "id_29", "id_31", "id_35", "id_36", "id_37", "id_38", 
         "ProductCD", "card1", "card2", "card3", "card4", "card5", "card6"]
no_cates = [c for c in source.columns if c not in cates]
source = source[cates+no_cates]
target = target[cates+no_cates]

for c in cates:
    encoder = LabelEncoder()
    encoder.fit(source[c].append(target[c]).astype(str))
    source[c] = encoder.transform(source[c].astype(str))
    target[c] = encoder.transform(target[c].astype(str))
    
cates = ["id_15", "id_31","ProductCD", "card2", "card3", "card4", "card5", "card6"]
no_cates = [c for c in source.columns if c not in cates]
source = source[cates+no_cates]
source.drop("card1", inplace=True, axis=1)
target = target[cates+no_cates]
target.drop("card1", inplace=True, axis=1)

source = source.values[source_index]
target = target.values[target_index]

source_label = source_label[source_index]
target_label = target_label[target_index]


min_values = np.min(np.r_[source, target], axis=0)

source = source - min_values
target = target - min_values

for i in range(8, 120):
    source[:,i] = np.log(1 + source[:,i])
    target[:,i] = np.log(1 + target[:,i])
    
np.random.seed(12345)
torch.manual_seed(12345)

embd = FC_embedding()

params = {
    "epoch": 50,
    "batch_size": 128,
    "learning_rate": 0.001,
    "pos_weight": 1,
    "model": embd
}
model = EmbeddingModel(**params)

source_size = int(source.shape[0] / 4)

model.fit(source[:source_size*3], source_label[:source_size*3], 
          source[source_size*3:], source_label[source_size*3:], verbose=True)

model.save_embedding_dict("./data/embedding_dict_kaggle.pkl")

np.save("./data/mobile_trans", source)
np.save("./data/mobile_label", source_label)
np.save("./data/desktop_trans", target)
np.save("./data/desktop_label", target_label)

### Use embedding

In [6]:
source = np.load("./data/mobile_trans.npy")
target = np.load("./data/desktop_trans.npy")
source_label = np.load("./data/mobile_label.npy")
target_label = np.load("./data/desktop_label.npy")

with open("./data/embedding_dict_kaggle.pkl", "rb") as file:
    embedding_dict = pickle.load(file)
    
# exclude target unique modality
exclude_index = []
for i in range(8):
    target_modality = np.unique(target[:, i])
    source_modality = np.unique(source[:, i])
    exclude = [m for m in target_modality if m not in source_modality]
    exclude_index.append(np.isin(target[:, i], exclude))
target_cate = target[~np.any(exclude_index, axis=0)]
target_num_label = target_label[~np.any(exclude_index, axis=0)]

source_cate_embedding = []
for i in range(8):
    source_cate_embedding.append(embedding_dict[i]["weight"].cpu()[source[:, i].astype(int)])
    
target_cate_embedding = []
for i in range(8):
    target_cate_embedding.append(embedding_dict[i]["weight"].cpu()[target_cate[:, i].astype(int)])
    
source_num = np.concatenate(source_cate_embedding, axis=1)
source_num = np.c_[source_num, source[:, 8:]]

target_num = np.concatenate(target_cate_embedding, axis=1)
target_num = np.c_[target_num, target_cate[:, 8:]]

target_num_size = int(target_num.shape[0] // 12)

del(target)
del(target_label)
gc.collect()
target = target_cate
target_label = target_num_label
target_size = target_num_size

In [7]:
def cate_adaptation(target, source):
    """
    input: target & source with categorical features
    return: adapted embed target
    """
    ot_plans = []
    source_modalities = []
    target_modalities = []

    for c in range(8):
        sim, modality = of_uni_cate(np.r_[target[:,c], source[:,c]])
        distance = similarity_to_dissimilarity(sim)

        target_modality, counts = np.unique(target[:,c], return_counts=True)
        target_density = counts / counts.sum()
        target_modalities.append(target_modality)

        source_modality, counts = np.unique(source[:,c], return_counts=True)
        source_density = counts / counts.sum()
        source_modalities.append(source_modality)

        Gs = pot.emd(target_density.tolist(), source_density.tolist(), 
                     distance[np.where(np.in1d(modality, target_modality))[0]][:,np.where(np.in1d(modality, source_modality))[0]].tolist())

        norm_array = Gs.sum(axis=1)
        norm_array[norm_array==0] = 1
        ot_plan = (Gs.T / norm_array).T
        ot_plans.append(ot_plan)


    target_embedding = []
    for c, ot_plan in enumerate(ot_plans):
        trans_matrix = np.zeros((embedding_dict[c]['weight'].shape[0], embedding_dict[c]['weight'].shape[0]))
        for i, target_index in enumerate(target_modalities[c]):
            trans_matrix[int(target_index), list(map(int, source_modalities[c]))] = ot_plan[i]

        target_embedding.append(trans_matrix.dot(embedding_dict[c]['weight'].cpu().numpy()))

    target_cate_embedding = []
    for i in range(8):
        target_cate_embedding.append(target_embedding[i][target[:, i].astype(int)])

    target_cate_num = np.concatenate(target_cate_embedding, axis=1)
    target_cate_num = np.c_[target_cate_num, target[:, 8:]]
    return target_cate_num

In [8]:
source_splits = []
kf = KFold(n_splits=4, shuffle=True, random_state=12345)
for i, (train_index, valid_index) in enumerate(kf.split(source)):
    source_splits.append((train_index, valid_index))

In [9]:
s = 0 #0, 1, 2, 3
train_index, valid_index = source_splits[s]

xs, ys = source_num[train_index], source_label[train_index]
xv, yv = source_num[valid_index], source_label[valid_index]

### Train baseline NN model

In [10]:
torch.manual_seed(12345)
try:
    clf = load_model("./model/kaggle_nn_{}".format(s))
except:
    dansnet = DanseNet(input_dim=xs.shape[1], output_dim=1)
    clf = DAN(dansnet, device=torch.device("cuda"))
    clf.fit(xs, ys, xs, xv, yv, epoch=100, batch_size=128, lr=0.01, beta=0, early_stop=False, verbose=True)
    save_model(clf, "./model/kaggle_nn_{}".format(s))

#### No retraining adaptation

In [12]:
# Identity
pred_test = clf.predict_prob(target_num)
performance(pred_test, target_label)

0.5912323528278833

In [13]:
# 1D OT num
opt = optimalTransport1D()
tsf, _ = opt.fit_transform(target_num[:,17:], xs[:,17:], njobs=20)
x_ot_num = np.c_[target_num[:,:17], tsf]
pred_num = clf.predict_prob(x_ot_num)
performance(pred_num, target_label)

0.5872259859741512

In [19]:
# 1D OT cate
x_ot_cate = cate_adaptation(target, source[train_index])
pred_cate = clf.predict_prob(x_ot_cate)
performance(pred_cate, target_label)

0.6218406030378054

In [20]:
# 1D OT
x_ot = np.c_[x_ot_cate[:,:17], tsf]
pred_ot = clf.predict_prob(x_ot)
performance(pred_ot, target_label)

0.631372621120201

In [21]:
# CORAL
x_coral = coral(target_num, xs)
pred_coral = clf.predict_prob(x_coral)
performance(pred_coral, target_label)

0.5489617944336429

#### Retrain adaptation

In [24]:
# DAN
np.random.seed(12345)
torch.manual_seed(12345)
try:
    clf = load_model("./model/kaggle_dan_{}".format(s))
except:
    dansnet = DanseNet(input_dim=xs.shape[1], output_dim=1)
    clf = DAN(dansnet, device=torch.device("cuda"))
    indext = reduce_dataset(target_num, xs)
    clf.fit(xs, ys, target_num[indext], xv, yv, epoch=100, batch_size=128, lr=0.01, beta=0.5, early_stop=False, verbose=False)
    save_model(clf, "./model/kaggle_dan_{}".format(s))

In [25]:
# DAN
pred_dan = clf.predict_prob(target_num)
performance(pred_dan, target_label)

0.6488761052360881

In [26]:
# DANN 
np.random.seed(12345)
torch.manual_seed(12345)
if s == 0:
    e, b, a = 50, 1.0, 50
elif s == 1:
    e, b, a = 50, 1.0, 100
elif s == 2:
    e, b, a = 100, 0.5, 200
else:
    e, b, a = 50, 1.0, 200
try:
    clf = load_model("./model/kaggle_dann_{}".format(s))
except:
    dansnet = DanseNet(input_dim=xs.shape[1], output_dim=1)
    clf = DANN(dansnet, device=torch.device("cuda"))
    indext = reduce_dataset(target_num, xs)
    clf.fit(xs, ys, target_num[indext], xv, yv, epoch=e, batch_size=128, lr=0.01, beta=b, alpha=a,
            early_stop=False, verbose=False)
    save_model(clf, "./model/kaggle_dann_{}".format(s))

In [27]:
pred_dann = clf.predict_prob(target_num)
performance(pred_dann, target_label)

0.6398225143360166

#### weakly supervised adaptation with feature selection

In [None]:
ratio = 0.01
n_sample = 200

cate_fea = [(0,), (1, 2, 3), (4,), (5, 6, 7, 8), (9, 10, 11), (12,), (13, 14, 15), (16,)]
num_fea = [(i, ) for i in range(17, 129)]

for s in range(4):
    # Load Dataset
    train_index, valid_index = source_splits[s]
    xs, ys = source_num[train_index], source_label[train_index]
    xv, yv = source_num[valid_index], source_label[valid_index]

    # Load Model
    clf = load_model("kaggle_nn_{}".format(s))
    
    # Numerical Adaptation
    opt = optimalTransport1D()
    tsf, _ = opt.fit_transform(target_num[:,17:], xs[:,17:], njobs=20)

    # Categorical Adaptation
    x_ot_cate = cate_adaptation(target, source[train_index])
    x_ot = np.c_[x_ot_cate[:,:17], tsf]

    bootstrap_perf = []
    for i in range(30):
        # Sample
        choice_range = range(target_num.shape[0])
        n = int(target_num.shape[0] * ratio)
        index = np.random.choice(choice_range, n, replace=False)
        
        # Compute perf_list and feature_list
        perf_list, feature_list, pred_list, perf_dist_list, index_list = greedy_search(clf, target_num[index], x_ot[index], 
                                                                           target_label[index], 
                                                                           explore_features=cate_fea + num_fea, 
                                                                           n_sample=n_sample,
                                                                           verbose=False)

        save_model(perf_list, "./model/bootstrap_perf_list_{}_{}_{}".format(ratio, s, i))
        save_model(feature_list, "./model/bootstrap_feature_list_{}_{}_{}".format(ratio, s, i))
        save_model(pred_list, "./model/bootstrap_pred_list_{}_{}_{}".format(ratio, s, i))
        save_model(index, "./model/bootstrap_index_{}_{}_{}".format(ratio, s, i))
        save_model(perf_dist_list, "./model/bootstrap_perf_dist_list_{}_{}_{}".format(ratio, s, i))
        save_model(index_list, "./model/bootstrap_index_list_{}_{}_{}".format(ratio, s, i))
        
        # Perf on all test data
        pred = best_prediction(clf, target_num, x_ot, feature_list, len(feature_list)-1, repeat=1)
        perf = performance(pred, target_label)
        bootstrap_perf.append(perf)
        print("Bootstrap:", perf, len(feature_list))

    save_model(bootstrap_perf, "./model/bootstrap_perf_{}_{}".format(ratio, s))

### Train LightGBM model

In [28]:
xs, ys = source[train_index], source_label[train_index]
xv, yv = source[valid_index], source_label[valid_index]

In [29]:
try:
    clf = load_model("./model/kaggle_lgb_{}".format(s))
except:
    lgb_train = lgb.Dataset(xs, ys, categorical_feature=range(8))
    lgb_valid = lgb.Dataset(xv, yv, categorical_feature=range(8))

    params = {
        'boosting': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.04,
        'max_depth': 3,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'scale_pos_weight': 1,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'random_state': 12345
    }

    clf = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_valid], num_boost_round=5000, 
                    early_stopping_rounds=50, verbose_eval=10)
    save_model(clf, "./model/kaggle_lgb_{}".format(s))

In [30]:
# Identity
pred_test = clf.predict(target)
performance(pred_test, target_label)

0.6711815591119094

In [31]:
# 1D OT num
opt = optimalTransport1D()
tsf, w_distance = opt.fit_transform(target[:,8:], xs[:,8:], njobs=20)
x_ot_num = np.c_[target[:,:8], tsf]
pred_num = clf.predict(x_ot_num)
performance(pred_num, target_label)

0.6497218833919742

In [32]:
# 1D OT cate
np.random.seed(12345)

repeat = 20
ot_plans = []
transport_plan = []
source_modalities = []
target_modalities = []

for c in range(8):
    sim, modality = of_uni_cate(np.r_[target[:,c], xs[:,c]])
    distance = similarity_to_dissimilarity(sim)
    
    target_modality, counts = np.unique(target[:,c], return_counts=True)
    target_density = counts / counts.sum()
    target_modalities.append(target_modality)

    source_modality, counts = np.unique(xs[:,c], return_counts=True)
    source_density = counts / counts.sum()
    source_modalities.append(source_modality)

    Gs = pot.emd(target_density.tolist(), source_density.tolist(), 
                 distance[np.where(np.in1d(modality, target_modality))[0]][:,np.where(np.in1d(modality, source_modality))[0]].tolist())

    norm_array = Gs.sum(axis=1)
    norm_array[norm_array==0] = 1
    ot_plan = (Gs.T / norm_array).T
    ot_plans.append(ot_plan)

    mapping = {}
    for m in target_modality:
        index = np.where(target_modality==m)[0][0]
        mapping[m] = source_modality[np.random.choice(range(ot_plan.shape[-1]), size=repeat, p=ot_plan[index])]

    transport_plan.append(mapping)

In [33]:
preds = []
for r in range(repeat):
    data = target.copy()

    for c in range(8):
        for m, v in transport_plan[c].items():
            data[data[:, c]==m, c] = v[r]

    pred = clf.predict(data)
    preds.append(pred)

In [34]:
pred_cate = np.array(preds).mean(axis=0)
performance(pred_cate, target_label)

0.6955292752723653

In [35]:
# 1D OT
preds = []
for r in range(repeat):
    data = x_ot_num.copy()

    for c in range(8):
        for m, v in transport_plan[c].items():
            data[data[:, c]==m, c] = v[r]

    pred = clf.predict(data)
    preds.append(pred)

In [36]:
pred_ot = np.array(preds).mean(axis=0)
performance(pred_ot, target_label)

0.6704625127135608

In [37]:
# CORAL
tsf = coral(target[:,8:], xs[:,8:])
x_coral = np.c_[target[:,:8], tsf]
pred_coral = clf.predict(x_coral)
performance(pred_coral, target_label)

0.6041227022400151

#### weakly supervised adaptation with feature selection

In [None]:
ratio = 0.01
n_sample = 200
num_fea = [(i, ) for i in range(120)]
repeat = 10

for s in range(4):
    # Load Dataset
    train_index, valid_index = source_splits[s]
    xs, ys = source[train_index], source_label[train_index]
    xv, yv = source[valid_index], source_label[valid_index]

    # Load Models
    clf = load_model("./model/kaggle_lgb_{}".format(s))
    clf.predict_prob = clf.predict

    # Numerical transformation
    opt = optimalTransport1D()
    tsf, w_distance = opt.fit_transform(target[:,8:], xs[:,8:], njobs=20)
    x_ot_num = np.c_[target[:,:8], tsf]

    # Stochastic categorical transformation
    ot_plans = []
    transport_plan = []
    source_modalities = []
    target_modalities = []

    for c in range(8):
        sim, modality = of_uni_cate(np.r_[target[:,c], xs[:,c]])
        distance = similarity_to_dissimilarity(sim)

        target_modality, counts = np.unique(target[:,c], return_counts=True)
        target_density = counts / counts.sum()
        target_modalities.append(target_modality)

        source_modality, counts = np.unique(xs[:,c], return_counts=True)
        source_density = counts / counts.sum()
        source_modalities.append(source_modality)

        Gs = pot.emd(target_density.tolist(), source_density.tolist(), 
                     distance[np.where(np.in1d(modality, target_modality))[0]][:,np.where(np.in1d(modality, source_modality))[0]].tolist())

        norm_array = Gs.sum(axis=1)
        norm_array[norm_array==0] = 1
        ot_plan = (Gs.T / norm_array).T
        ot_plans.append(ot_plan)

        mapping = {}
        for m in target_modality:
            index = np.where(target_modality==m)[0][0]
            mapping[m] = source_modality[np.random.choice(range(ot_plan.shape[-1]), size=repeat, p=ot_plan[index])]

        transport_plan.append(mapping)


    bootstrap_perf = []
    for i in range(10):
        choice_range = range(target.shape[0])
        n = int(target.shape[0] * ratio)
        index = np.random.choice(choice_range, n, replace=False)

        cate_adapt = None
        for r in range(repeat):
            data = x_ot_num[index].copy()
            for c in range(8):
                for m, v in transport_plan[c].items():
                    data[data[:, c]==m, c] = v[r]
            if cate_adapt is None:
                cate_adapt = data
            else:
                cate_adapt = np.r_[cate_adapt, data]

        # Compute perf_list and feature_list
        perf_list, feature_list, pred_list, perf_dist_list, index_list = greedy_search_cate(clf, 
                                                                                            target[np.tile(index, repeat)], 
                                                                                            cate_adapt, 
                                                                                            target_label[index], 
                                                                                            explore_features=num_fea, 
                                                                                            repeat=repeat,
                                                                                            n_sample=n_sample,
                                                                                            verbose=False)

        save_model(perf_list, "./model/cate_bootstrap_perf_list_{}_{}_{}".format(ratio, s, i))
        save_model(feature_list, "./model/cate_bootstrap_feature_list_{}_{}_{}".format(ratio, s, i))
        save_model(pred_list, "./model/cate_bootstrap_pred_list_{}_{}_{}".format(ratio, s, i))
        save_model(index, "./model/cate_bootstrap_index_{}".format(ratio, s, i))
        save_model(perf_dist_list, "./model/cate_bootstrap_perf_dist_list_{}_{}_{}".format(ratio, s, i))
        save_model(index_list, "./model/cate_bootstrap_index_list_{}_{}_{}".format(ratio, s, i))

        x_ot = None
        for r in range(repeat):
            data = x_ot_num.copy()
            for c in range(8):
                for m, v in transport_plan[c].items():
                    data[data[:, c]==m, c] = v[r]
            if x_ot is None:
                x_ot = data
            else:
                x_ot = np.r_[x_ot, data]

        # Perf on all test data
        pred = best_prediction(clf, np.tile(target, (repeat, 1)), x_ot, feature_list, len(feature_list)-1, repeat=repeat)
        perf = performance(pred, target_label)
        bootstrap_perf.append(perf)
        print("Bootstrap:", perf, len(feature_list))
    save_model(bootstrap_perf, "./model/cate_bootstrap_perf_{}_{}".format(ratio, s))