In [2]:
# ---------------------------
# Cell 2: Imports
# ---------------------------
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [3]:
# ---------------------------
# Cell 3: Load & preprocess data
# ---------------------------
math = pd.read_csv("student-mat.csv", sep=",")
por = pd.read_csv("student-por.csv", sep=";")
# NOTE(review): the two files are read with different separators — confirm each
# one matches the delimiter actually used by its file on disk.

# Both frames must expose the same columns; otherwise the concat below would
# silently pad the missing ones with NaN and the encoded features would differ.
assert set(math.columns) == set(por.columns), "math and por CSVs have different columns"

# One-hot encode on the stacked frame so both subjects share one dummy-column layout.
# dtype=float keeps the dummy columns numeric: with boolean dummies next to integer
# columns, `.values` falls back to an object-dtype array.
full = pd.concat([math, por])
full = pd.get_dummies(full, drop_first=True, dtype=float)

# Split back by position: the math rows were stacked first, the Portuguese rows after.
math_enc = full.iloc[:len(math), :]
por_enc = full.iloc[len(math):, :]

# G3 is the regression target; every other encoded column is a feature.
X_math = math_enc.drop("G3", axis=1).values
y_math = math_enc["G3"].values
X_por = por_enc.drop("G3", axis=1).values
y_por = por_enc["G3"].values

print("Math:", X_math.shape, y_math.shape)
print("Portuguese:", X_por.shape, y_por.shape)


Math: (395, 41) (395,)
Portuguese: (649, 41) (649,)


In [4]:
# ---------------------------
# Cell 4: Build simple networks
# ---------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"

input_dim = X_por.shape[1]
hidden_dim = 64
ensemble_size = 3

# Shared trunk: maps the raw features down to a hidden_dim // 2 representation.
feature_extractor = nn.Sequential(
    nn.Linear(input_dim, hidden_dim),
    nn.ReLU(),
    nn.Linear(hidden_dim, hidden_dim // 2),
    nn.ReLU(),
)

# Regression heads: `ensemble_size` independently initialised copies of the same small MLP.
predictors = [
    nn.Sequential(nn.Linear(hidden_dim // 2, 16), nn.ReLU(), nn.Linear(16, 1))
    for _ in range(ensemble_size)
]

# Two-way classifier over the shared representation.
domain_disc = nn.Sequential(
    nn.Linear(hidden_dim // 2, 32),
    nn.ReLU(),
    nn.Linear(32, 2),  # source=0, target=1
)

# Move every module to the selected device; the last expression is left bare
# so the notebook still displays the discriminator's repr.
feature_extractor.to(device)
for p in predictors:
    p.to(device)
domain_disc.to(device)


Sequential(
  (0): Linear(in_features=32, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=2, bias=True)
)

In [5]:
# ---------------------------
# Cell 5: Gradient reversal trick
# ---------------------------
class GRL(torch.autograd.Function):
    """Gradient reversal function.

    The forward pass returns the input values unchanged; the backward pass
    multiplies the incoming gradient by ``-lambda_``.
    """

    @staticmethod
    def forward(ctx, x, lambda_):
        # Keep the scale on the context so backward() can read it.
        ctx.lambda_ = lambda_
        passthrough = x.view_as(x)
        return passthrough

    @staticmethod
    def backward(ctx, grad_output):
        flipped_scale = -ctx.lambda_
        # Second slot is None: no gradient is produced for lambda_.
        return flipped_scale * grad_output, None


In [8]:
from itertools import chain

In [12]:
# ---------------------------
# Cell 6: Hyperparameter tuning with grid search
# ---------------------------

def train_dann_once(X_src, y_src, X_tgt,
                    hidden_dim=128,
                    epochs=200,
                    batch_size=32,
                    lambda_adv=0.5,
                    dropout_rate=0.3,
                    ensemble_size=3,
                    lr=1e-3,
                    seed=None):
    """Train a domain-adversarial regressor on labelled source and unlabelled target data.

    A shared feature extractor feeds (a) an ensemble of regression heads trained
    with MSE on the source labels and (b) a two-class domain discriminator that
    sees the features through the gradient-reversal function ``GRL``.

    Args:
        X_src: 2-D source feature array; ``X_src.shape[1]`` sizes the first layer.
        y_src: source targets, reshaped to a column vector.
        X_tgt: 2-D target feature array, scaled with the scaler fitted on ``X_src``.
        hidden_dim: width of the first hidden layer; the feature size is ``hidden_dim // 2``.
        epochs: number of passes over the paired loaders.
        batch_size: batch size used for both loaders.
        lambda_adv: scale passed to ``GRL`` for the reversed gradient.
        dropout_rate: dropout probability in the feature extractor.
        ensemble_size: number of regression heads (default 3).
        lr: Adam learning rate shared by all three optimizers (default 1e-3).
        seed: if not None, passed to ``torch.manual_seed`` before any module or
            loader is created; None leaves the global RNG state untouched.

    Returns:
        ``(feature_extractor, predictors, scaler)`` where ``predictors`` is a list
        of heads. The modules are never switched to eval mode here, so callers
        should do that before inference.
    """
    if seed is not None:
        torch.manual_seed(seed)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Standardise with statistics from the source domain only.
    scaler = StandardScaler()
    X_src = scaler.fit_transform(X_src)
    X_tgt = scaler.transform(X_tgt)

    X_src_t = torch.tensor(X_src, dtype=torch.float32)
    y_src_t = torch.tensor(y_src, dtype=torch.float32).view(-1, 1)
    X_tgt_t = torch.tensor(X_tgt, dtype=torch.float32)

    src_loader = DataLoader(TensorDataset(X_src_t, y_src_t), batch_size=batch_size, shuffle=True)
    # Target labels are never used during training, so the dataset carries features only.
    tgt_loader = DataLoader(TensorDataset(X_tgt_t), batch_size=batch_size, shuffle=True)

    feat_dim = hidden_dim // 2

    # Feature extractor with dropout
    feature_extractor = nn.Sequential(
        nn.Linear(X_src.shape[1], hidden_dim),
        nn.ReLU(),
        nn.Dropout(dropout_rate),
        nn.Linear(hidden_dim, feat_dim),
        nn.ReLU(),
        nn.Dropout(dropout_rate)
    ).to(device)

    # Ensemble of predictors
    predictors = [
        nn.Sequential(
            nn.Linear(feat_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        ).to(device)
        for _ in range(ensemble_size)
    ]

    # Domain discriminator
    domain_disc = nn.Sequential(
        nn.Linear(feat_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 2)
    ).to(device)

    mse_loss = nn.MSELoss()
    ce_loss = nn.CrossEntropyLoss()

    opt_f = optim.Adam(feature_extractor.parameters(), lr=lr)
    # One optimizer over the parameters of every head.
    head_params = [w for head in predictors for w in head.parameters()]
    opt_p = optim.Adam(head_params, lr=lr)
    opt_d = optim.Adam(domain_disc.parameters(), lr=lr)

    for epoch in range(epochs):
        # zip stops at the shorter loader, so each epoch runs
        # min(len(src_loader), len(tgt_loader)) steps.
        for (xs, ys), (xt,) in zip(src_loader, tgt_loader):
            xs, ys, xt = xs.to(device), ys.to(device), xt.to(device)

            # Extract features
            zs = feature_extractor(xs)
            zt = feature_extractor(xt)

            # --- Label loss: MSE of the averaged head outputs against the source labels ---
            preds = [p(zs) for p in predictors]
            preds_mean = torch.mean(torch.stack(preds), dim=0)
            loss_label = mse_loss(preds_mean, ys)

            # --- Domain loss: source rows labelled 0, target rows labelled 1 ---
            z_all = torch.cat([zs, zt], dim=0)
            d_labels = torch.cat([
                torch.zeros(len(zs), dtype=torch.long, device=device),
                torch.ones(len(zt), dtype=torch.long, device=device),
            ])

            # GRL leaves the features unchanged going forward and scales the gradient
            # flowing back into the feature extractor by -lambda_adv.
            z_rev = GRL.apply(z_all, lambda_adv)
            d_preds = domain_disc(z_rev)
            loss_domain = ce_loss(d_preds, d_labels)

            loss_total = loss_label + loss_domain

            # Backprop
            opt_f.zero_grad(); opt_p.zero_grad(); opt_d.zero_grad()
            loss_total.backward()
            opt_f.step(); opt_p.step(); opt_d.step()

    return feature_extractor, predictors, scaler


def evaluate(feat, predictors, scaler, X_test, y_test):
    """Score the averaged ensemble prediction on a labelled test set.

    Args:
        feat: feature-extractor module; switched to eval mode.
        predictors: non-empty iterable of head modules; each is switched to eval mode.
        scaler: fitted object whose ``transform`` is applied to ``X_test``.
        X_test: unscaled test features.
        y_test: ground-truth targets compared against the mean prediction.

    Returns:
        ``(rmse, mae, r2)`` of the mean of the heads' predictions.

    Raises:
        ValueError: if ``predictors`` is empty.
    """
    if len(predictors) == 0:
        raise ValueError("evaluate() needs at least one predictor")

    feat.eval()
    for p in predictors:
        p.eval()

    # Run on the device the model already lives on; fall back to the CUDA
    # availability check only when the extractor has no parameters.
    first_param = next(feat.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    X_test = scaler.transform(X_test)
    X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device)

    with torch.no_grad():
        z = feat(X_test_t)
        # One flat prediction vector per head, averaged element-wise below.
        preds = [p(z).cpu().numpy().flatten() for p in predictors]
        preds_mean = np.mean(preds, axis=0)

    rmse = np.sqrt(mean_squared_error(y_test, preds_mean))
    mae = mean_absolute_error(y_test, preds_mean)
    r2 = r2_score(y_test, preds_mean)
    return rmse, mae, r2


# ---------------------------
# Run grid search
# ---------------------------
hidden_dims = [64, 128, 256]
lambda_vals = [0.1, 0.3, 0.5, 1.0]
dropouts = [0.2, 0.3, 0.5]
GRID_SEED = 42  # fixed seed so a re-run of this cell starts every config from the same RNG state

# One (hidden_dim, lambda, dropout, rmse, mae, r2) tuple per configuration.
results = []

# NOTE(review): every config is scored on (X_math, y_math), the same target data
# the model adapts to, so the "best" config below is selected using target labels.
for hd in hidden_dims:
    for lam in lambda_vals:
        for dr in dropouts:
            # Re-seed the torch RNG before each run: weight init, dropout and
            # loader shuffling inside train_dann_once are otherwise unseeded.
            torch.manual_seed(GRID_SEED)
            feat, preds, scaler = train_dann_once(X_por, y_por, X_math, 
                                                  hidden_dim=hd, 
                                                  epochs=200, 
                                                  lambda_adv=lam, 
                                                  dropout_rate=dr)
            rmse, mae, r2 = evaluate(feat, preds, scaler, X_math, y_math)
            results.append((hd, lam, dr, rmse, mae, r2))
            print(f"[hd={hd}, lambda={lam}, dropout={dr}] RMSE={rmse:.4f}, MAE={mae:.4f}, R²={r2:.4f}")

# Show best setting: lowest RMSE (index 3 of each tuple); ties keep the earliest config.
best = min(results, key=lambda x: x[3])
print("\nBest config:")
print(f"hidden_dim={best[0]}, lambda={best[1]}, dropout={best[2]} -> RMSE={best[3]:.4f}, MAE={best[4]:.4f}, R²={best[5]:.4f}")


[hd=64, lambda=0.1, dropout=0.2] RMSE=2.6898, MAE=1.6795, R²=0.6544
[hd=64, lambda=0.1, dropout=0.3] RMSE=2.7589, MAE=1.7429, R²=0.6364
[hd=64, lambda=0.1, dropout=0.5] RMSE=3.4184, MAE=2.5705, R²=0.4419
[hd=64, lambda=0.3, dropout=0.2] RMSE=2.6166, MAE=1.5953, R²=0.6730
[hd=64, lambda=0.3, dropout=0.3] RMSE=2.8910, MAE=1.8095, R²=0.6008
[hd=64, lambda=0.3, dropout=0.5] RMSE=3.2037, MAE=2.2130, R²=0.5098
[hd=64, lambda=0.5, dropout=0.2] RMSE=2.5980, MAE=1.6043, R²=0.6776
[hd=64, lambda=0.5, dropout=0.3] RMSE=2.7144, MAE=1.6733, R²=0.6481
[hd=64, lambda=0.5, dropout=0.5] RMSE=3.8035, MAE=3.0718, R²=0.3090
[hd=64, lambda=1.0, dropout=0.2] RMSE=2.6733, MAE=1.6558, R²=0.6587
[hd=64, lambda=1.0, dropout=0.3] RMSE=2.8071, MAE=1.8509, R²=0.6236
[hd=64, lambda=1.0, dropout=0.5] RMSE=3.6959, MAE=2.9254, R²=0.3476
[hd=128, lambda=0.1, dropout=0.2] RMSE=2.7146, MAE=1.6695, R²=0.6480
[hd=128, lambda=0.1, dropout=0.3] RMSE=2.6789, MAE=1.6362, R²=0.6572
[hd=128, lambda=0.1, dropout=0.5] RMSE=2.8270,