In [6]:
import sys
import time
from pathlib import Path
from pprint import pprint

import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

sys.path.append(str(Path("..") / "datasets" / "general-descriptors-datasets"))

import numpy as np
from chem_data import MolLogP, MolMelt, NPLogP, NPZetaP, ProtSol
from JOPLEn.enums import NormType
from sklearn.metrics import mean_squared_error
from JOPLEn.multitask import MTJOPLEn
from JOPLEn.partitioner import VPartition
import matplotlib.pyplot as plt


def rmse(y_true, y_pred):  # noqa: ANN001, ANN201
    """Return the root-mean-squared error between targets and predictions.

    The ``squared=False`` keyword of ``mean_squared_error`` was deprecated in
    scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly
    here. The per-output MSE is square-rooted first and then averaged, which
    reproduces what ``squared=False`` used to return (identical to a plain
    ``sqrt(MSE)`` for single-output targets).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values, in any layout accepted by
        ``sklearn.metrics.mean_squared_error``.
    y_pred : array-like
        Predicted values, matching ``y_true``.

    Returns
    -------
    float
        The RMSE, averaged uniformly over outputs.
    """
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    # Plain float keeps printed result lists readable regardless of NumPy's
    # scalar repr.
    return float(np.mean(np.sqrt(per_output_mse)))


# --- Arguments for the MTJOPLEn constructor ---------------------------------
n_partitions = 100
n_cells = 2

# --- Arguments forwarded to MTJOPLEn.fit ------------------------------------
max_iters = 10_000
print_epochs = 100
mu = 1e-3
norm_type = NormType.L21
# One relative learning-rate entry per task (two tasks are fitted below).
rel_lr = [1, 1]

# Penalty weights. Previously tried values: lam_task = 0.5, lam_core = 1.0
lam_core = 0.25
lam_task = 0.05
core_alpha = 0.0
task_alpha = 0.0

In [11]:
# Load the NPLogP and NPZetaP datasets, restrict both to the descriptors they
# have in common, and create one train/test split per task.
np_logp = NPLogP(drop_corr=True)
np_zp = NPZetaP(drop_corr=True)

nplp_x, nplp_y, _ = np_logp.get_data()
npzp_x, npzp_y, _ = np_zp.get_data()

# `intersect1d` returns the shared names in sorted order. Asking for the
# matching indices as well lets both matrices be re-ordered into that same
# order, so column j means the same feature in both tasks and in
# `shared_features`. (Boolean `isin` masks would keep each dataset's own
# column order, which is only correct if both feature lists are already
# sorted identically.)
# NOTE(review): assumes feature names are unique within each dataset.
shared_features, nplp_cols, npzp_cols = np.intersect1d(
    np_logp.features,
    np_zp.features,
    return_indices=True,
)

nplp_x = nplp_x[:, nplp_cols]
npzp_x = npzp_x[:, npzp_cols]

# Both tasks must expose exactly the shared feature set, in the same order.
assert nplp_x.shape[1] == npzp_x.shape[1] == len(shared_features)

print("NPLogP:", nplp_x.shape, nplp_y.shape)
print("NPZetaP:", npzp_x.shape, npzp_y.shape)

nplp_x_train, nplp_x_test, nplp_y_train, nplp_y_test = train_test_split(
    nplp_x,
    nplp_y,
    test_size=0.2,
    random_state=0,
)
npzp_x_train, npzp_x_test, npzp_y_train, npzp_y_test = train_test_split(
    npzp_x,
    npzp_y,
    test_size=0.2,
    random_state=0,
)

# The full matrices are no longer needed once the splits exist.
del nplp_x, nplp_y, npzp_x, npzp_y

# Per-task lists, ordered [NPLogP, NPZetaP]; every later cell relies on this
# ordering.
x_train = [nplp_x_train, npzp_x_train]
y_train = [nplp_y_train, npzp_y_train]
x_test = [nplp_x_test, npzp_x_test]
y_test = [nplp_y_test, npzp_y_test]

NPLogP: (147, 1097) (147, 1)
NPZetaP: (206, 1097) (206, 1)


In [12]:
# Baseline 1: predict the training-set mean for every test sample.
dummy_pred = []
for task_y_train, task_y_test in zip(y_train, y_test):
    train_mean = np.mean(task_y_train)
    constant_pred = np.full(task_y_test.shape, train_mean).flatten()
    dummy_pred.append(rmse(task_y_test, constant_pred))

print("Dummy")
print(dummy_pred)

# Baseline 2: an independent LightGBM regressor per task on all shared features.
lgbm_pred = []
for task_x_train, task_y_train, task_x_test, task_y_test in zip(
    x_train,
    y_train,
    x_test,
    y_test,
):
    model = LGBMRegressor(verbose=-1)
    model.fit(task_x_train, task_y_train.flatten())
    task_pred = model.predict(task_x_test).flatten()
    lgbm_pred.append(rmse(task_y_test, task_pred))

print("LGBM")
print(lgbm_pred)

Dummy
[1.7924417419445602, 34.109224940003955]
LGBM
[0.9158572511174042, 17.234382478016517]


In [None]:
# Fit the multitask JOPLEn model on both tasks and time the fit.
jp = MTJOPLEn(
    VPartition,
    n_cells=n_cells,
    n_partitions=n_partitions,
)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()

# NOTE(review): the held-out test split is passed as the validation data. If
# `fit` uses it for early stopping or model selection, the test metrics
# reported afterwards are optimistic — confirm against MTJOPLEn.fit.
history = jp.fit(
    x_train,
    y_train,
    print_epochs=print_epochs,
    lam_core=lam_core,
    lam_task=lam_task,
    mu=mu,
    max_iters=max_iters,
    verbose=True,
    lst_val_x=x_test,
    lst_val_y=y_test,
    norm_type=norm_type,
    core_alpha=core_alpha,
    task_alpha=task_alpha,
    rel_lr=rel_lr,
)

end_time = time.perf_counter()

In [None]:
# Report the fit time, list the features JOPLEn selected per task, retrain
# LightGBM on just those features, and tabulate all RMSEs.
print("Time:", (end_time - start_time))

print(len(history["b_n_features"]))

# A feature counts as selected when its weight norm exceeds this threshold.
sel_threshold = 1e-3

# get the selected features for each task
# wb_norm: one Frobenius norm per feature, taken over axes 0 and 2 of jp.cwb.
# ws_norm: one 2-norm per (task, feature), taken over axis 2 of jp.cws.
# The trailing entry along the feature axis is dropped in both cases
# (presumably the bias/intercept slot — TODO confirm).
wb_norm = np.linalg.norm(jp.cwb.get(), axis=(0, 2), ord="fro")[:-1]
ws_norm = np.linalg.norm(jp.cws.get(), axis=2, ord=2)[:, :-1]

wb_sel_idx = wb_norm > sel_threshold
# Task-specific selection excludes anything already selected via wb.
ws_sel_idx = (ws_norm > sel_threshold) & ~wb_sel_idx

# print("Core features:")
# pprint(sorted(shared_features[wb_sel_idx].tolist()))

for i, idx in enumerate(ws_sel_idx):
    print(f"Task {i} features:")
    pprint(sorted(shared_features[idx].tolist()))

# train LGBM using the selected features
# print("LGBM with features")

# Each task keeps the wb-selected features plus its own ws-selected ones.
task_masks = [wb_sel_idx | task_idx for task_idx in ws_sel_idx]

# Write to new names instead of overwriting x_train / x_test: re-running this
# cell would otherwise apply full-width masks to already-reduced matrices.
x_train_sel = [x[:, mask] for x, mask in zip(x_train, task_masks)]
x_test_sel = [x[:, mask] for x, mask in zip(x_test, task_masks)]

masked_pred = []
for xtr, ytr, xte, yte in zip(x_train_sel, y_train, x_test_sel, y_test):
    lgbm = LGBMRegressor(verbose=-1)
    lgbm.fit(xtr, ytr.flatten())
    y_pred = lgbm.predict(xte)
    masked_pred.append(rmse(yte, y_pred.flatten()))

# combine into a table using pandas
print(
    pd.DataFrame(
        {
            "Dummy": dummy_pred,
            "LGBM": lgbm_pred,
            "LGBM via JOPLEn": masked_pred,
        },
        index=["NPLogP", "NPZetaP"],
    )
)

In [None]:
# Plot three quantities recorded in the fit history, one panel each.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
axs[0].plot(history["raw_loss"])
axs[1].plot(history["ws_norm"])
axs[2].plot(history["objective"])

# set titles
axs[0].set_title("Raw Training Loss")
# This panel shows history["ws_norm"]; elsewhere in the notebook the `ws`
# quantities are the per-task ones (`wb` being the core ones), so the panel is
# labelled as the task norm rather than "Core Norm".
# NOTE(review): confirm against the history keys documented by MTJOPLEn.fit.
axs[1].set_title("Task Norm")
axs[2].set_title("Objective Function")

plt.show()

# fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# axs[0].matshow(jp.cwb[0].get())
# axs[1].matshow(jp.cws[0].get())
# plt.show()