# Import Modules

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.manifold import TSNE
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader

import utils

# Prepare Data

In [3]:
source_X = pd.read_csv("./deep_occupancy_detection/data/1_X_train.csv").values
target_X = pd.read_csv("./deep_occupancy_detection/data/2_X_train.csv").values

source_y_task = pd.read_csv("./deep_occupancy_detection/data/1_Y_train.csv").values.reshape(-1)
target_y_task = pd.read_csv("./deep_occupancy_detection/data/2_Y_train.csv").values.reshape(-1)

scaler = preprocessing.StandardScaler()
source_X = scaler.fit_transform(source_X)
target_X = scaler.fit_transform(target_X)

source_target_X = np.concatenate([source_X, target_X], axis=0)
source_target_y_domain = np.concatenate([np.zeros(source_X.shape[0]), np.ones(target_X.shape[0])], axis=0)

# 1. Covariate Shift

In [4]:
# 1. Transform d-dimentional array into 1-dimentional
tsne = TSNE(n_components=1, learning_rate="auto", init="random", perplexity=3)
# TODO: Understand Argumetns for t-SNE
source_X_tsne = tsne.fit_transform(source_X)

# 2. Continuous to categorical
num_bins = 100
# TODO: Found non-gaussian distribution case, not suitable for standardization
source_X_tsne = scaler.fit_transform(source_X_tsne)
source_X_tsne, bins = pd.cut(source_X_tsne[:, 0], num_bins, labels=False, retbins=True)

# 3. Count y for every unique value of x(≒ p(y|x))
p_y_of_x_source = pd.DataFrame()
for unique_X in range(num_bins):
    total = len(source_y_task[source_X_tsne == unique_X])
    count_1 = sum(source_y_task[source_X_tsne == unique_X])
    count_0 = total - count_1
    p_y_of_x_source[unique_X] = [count_0, count_1]
p_y_of_x_source /= len(source_y_task)
p_y_of_x_source /= p_y_of_x_source.values.sum(axis=0)
p_y_of_x_source = p_y_of_x_source.fillna(0)

print(p_y_of_x_source)

    0         1    2    3    4     5         6         7    8    9   ...   90  \
0  1.0  0.777778  1.0  1.0  1.0  0.25  0.142857  0.777778  0.0  0.0  ...  0.2   
1  0.0  0.222222  0.0  0.0  0.0  0.75  0.857143  0.222222  1.0  1.0  ...  0.8   

    91   92   93   94        95   96   97   98        99  
0  0.1  0.0  0.0  0.2  0.142857  0.7  0.5  0.4  0.666667  
1  0.9  1.0  1.0  0.8  0.857143  0.3  0.5  0.6  0.333333  

[2 rows x 100 columns]


In [5]:
# Same Process #1. ~ #3. for Target
# TODO: Source and Target should have same bins for pd.cut()

target_X_tsne = tsne.fit_transform(target_X)
target_X_tsne = scaler.fit_transform(target_X_tsne)
target_X_tsne = pd.cut(target_X_tsne[:, 0], bins, labels=False)
p_y_of_x_target = pd.DataFrame()
for unique_X in range(num_bins):
    total = len(target_y_task[target_X_tsne == unique_X])
    count_1 = sum(target_y_task[target_X_tsne == unique_X])
    count_0 = total - count_1
    p_y_of_x_target[unique_X] = [count_0, count_1]
p_y_of_x_target /= len(target_y_task)
p_y_of_x_target /= p_y_of_x_target.values.sum(axis=0)
p_y_of_x_target = p_y_of_x_target.fillna(0)
print(p_y_of_x_target)

    0    1    2    3    4    5    6    7    8    9   ...        90   91  \
0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  ...  0.272727  0.8   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.727273  0.2   

         92   93   94   95   96   97   98   99  
0  0.666667  1.0  1.0  1.0  1.0  0.9  0.0  0.0  
1  0.333333  0.0  0.0  0.0  0.0  0.1  1.0  1.0  

[2 rows x 100 columns]


In [6]:
# 4. Calculate distribution gap between source and target
p_y_of_x_target = p_y_of_x_target.values.reshape(-1)
p_y_of_x_source = p_y_of_x_source.values.reshape(-1)
mse = sum((p_y_of_x_source-p_y_of_x_target)**2) / (num_bins*2)
print(f"Conditional Distribution Gap MSE: {mse}")

Conditional Distribution Gap MSE: 0.24704834116243604


# 2. Marginal Distribution Discrepancy between Source and Target

In [None]:
source_target_X = torch.Tensor(source_target_X)
source_target_y_domain = torch.Tensor(source_target_y_domain)

source_target_X = source_target_X.to(utils.DEVICE)
source_target_y_domain = source_target_y_domain.to(utils.DEVICE)

source_target_ds = TensorDataset(source_target_X, source_target_y_domain)
source_target_loader = DataLoader(source_target_ds, batch_size=16, shuffle=True)

In [5]:
criterion = nn.BCELoss()
num_repeats = 10
num_epochs = 100

In [15]:
accs = []
for _ in range(num_repeats):
    domain_classifier = utils.Decoder(input_size=source_target_X.shape[1], output_size=1).to(utils.DEVICE)
    optimizer = optim.Adam(domain_classifier.parameters(), lr=0.001)

    # TODO: See Convergence, if needed should introduce early stopping
    for _ in range(num_epochs):
        for source_target_X_batch, source_target_y_domain_batch in source_target_loader:
            # Forward
            pred_y = domain_classifier(source_target_X_batch)
            pred_y = torch.sigmoid(pred_y).reshape(-1)
            loss = criterion(pred_y, source_target_y_domain_batch)

            # Backward
            optimizer.zero_grad()
            loss.backward()

            # Update Params
            optimizer.step()

    # Evaluation
    pred_y = domain_classifier(source_target_X)
    pred_y = torch.sigmoid(pred_y).reshape(-1)
    pred_y = pred_y > 0.5

    acc = sum(pred_y == source_target_y_domain) / source_target_y_domain.shape[0]
    accs.append(acc.item())

print(f"Domain Classification Accuracy: {np.mean(accs)}")

Domain Classification Accuracy: 0.7404029846191407


# 3. Common Model Minimizing Loss of Both Domains

In [None]:
source_X = torch.Tensor(source_X)
target_X = torch.Tensor(target_X)
source_y_task = torch.Tensor(source_y_task)
target_y_task = torch.Tensor(target_y_task)

source_X = source_X.to(utils.DEVICE)
target_X = target_X.to(utils.DEVICE)
source_y_task = source_y_task.to(utils.DEVICE)
target_y_task = target_y_task.to(utils.DEVICE)

source_ds = TensorDataset(source_X, source_y_task)
target_ds = TensorDataset(target_X, target_y_task)

source_loader = DataLoader(source_ds, batch_size=16, shuffle=True)
target_loader = DataLoader(target_ds, batch_size=16, shuffle=True)

In [10]:
losses = []
for _ in range(num_repeats):
    task_classifier = utils.Decoder(input_size=source_X.shape[1], output_size=1).to(utils.DEVICE)
    optimizer = optim.Adam(task_classifier.parameters(), lr=0.001)
    for _ in range(num_epochs):
        for (source_X_batch, source_y_task_batch), (target_X_batch, target_y_task_batch) in zip(source_loader, target_loader):
            # Forward
            pred_source_y_task = task_classifier(source_X_batch)
            pred_target_y_task = task_classifier(target_X_batch)
            pred_source_y_task = torch.sigmoid(pred_source_y_task).reshape(-1)
            pred_target_y_task = torch.sigmoid(pred_target_y_task).reshape(-1)
            loss = criterion(pred_source_y_task, source_y_task_batch)
            loss += criterion(pred_target_y_task, target_y_task_batch)

            # Backward
            optimizer.zero_grad()
            loss.backward()

            # Update Params
            optimizer.step()

    # Evaluation
    pred_y = task_classifier(source_X)
    pred_y = torch.sigmoid(pred_y).reshape(-1)
    loss = criterion(pred_y, source_y_task)

    pred_y = task_classifier(target_X)
    pred_y = torch.sigmoid(pred_y).reshape(-1)
    loss += criterion(pred_y, target_y_task)
    losses.append(loss.item())

print(f"Common Model's Cross Entropy Loss: {np.mean(losses)}")

Common Model's Cross Entropy Loss: 0.6571887731552124
