In [61]:
import torch
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import torch
import numpy as np
from tqdm.notebook import tqdm  # Use notebook-friendly tqdm
import matplotlib.pyplot as plt
from argparse import Namespace
import sys
sys.path.append('..')
sys.path.append('data')
# Import your project modules (adjust the import paths as needed)
from main_transductive import pretrain
from src.utils import set_random_seed, create_optimizer, WBLogger
from src.datasets.data_util import load_dataset, load_processed_graph
from src.models import build_model
from src.evaluation import node_classification_evaluation
from src.utils import build_args, load_best_configs  # if needed


In [62]:
# ----------------------
# Configuration Settings
# ----------------------
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fixed seed, handed to the project's seeding helper for reproducibility.
seed = 0
set_random_seed(seed)

In [63]:
# Training hyperparameters, grouped by the stage that uses them.

# --- Dataset ---
dataset_name = "CPDB_cdgps"  # Replace with your dataset's name
num_edge_types = 6

# --- Model architecture ---
encoder_type = "rgcn"  # Replace with your encoder type
decoder_type = "rgcn"  # Replace with your decoder type
num_hidden = 64
num_layers = 3
num_heads = 4
activation = "prelu"
weight_decomposition = {'type': 'basis', 'num_bases': 2}
vertical_stacking = True

# --- Regularisation / masking ---
in_drop = 0.2
attn_drop = 0.1
mask_rate = 0.5
replace_rate = 0.05
drop_edge_rate = 0.0

# --- Pre-training ---
max_epoch = 100  # Total training epochs
optimizer = "adam"  # Adjust as needed (e.g., "sgd", "adam")
loss_fn = "sce"  # Replace with your loss function identifier
alpha_l = 3
lr = 0.01
weight_decay = 1e-3
use_scheduler = True  # Set True to use a learning rate scheduler

# --- Fine-tuning / evaluation ---
max_epoch_f = 200  # For fine tuning
lr_f = 0.005  # Learning rate for linear evaluation phase
weight_decay_f = 1e-4
linear_prob = False

# --- Checkpointing and logging ---
load_model = False  # Set True to load a checkpoint
save_model = True  # Set True to save your trained model
logs = True  # Set True if you want to use WBLogger

In [64]:
# ----------------------
# Create a Namespace for Args
# ----------------------
# Collect every setting in one ordered mapping, then expand it into the
# Namespace that build_model() receives. Keys are listed in the same order
# the attributes were originally declared.
arg_values = {
    "device": device,
    "seeds": [seed],
    "dataset": dataset_name,
    "max_epoch": max_epoch,
    "max_epoch_f": max_epoch_f,
    "num_hidden": num_hidden,
    "num_layers": num_layers,
    "encoder": encoder_type,
    "decoder": decoder_type,
    "activation": activation,
    "in_drop": in_drop,
    "attn_drop": attn_drop,
    "mask_rate": mask_rate,
    "drop_edge_rate": drop_edge_rate,
    "alpha_l": alpha_l,
    "num_heads": num_heads,
    "weight_decomposition": weight_decomposition,
    "vertical_stacking": vertical_stacking,
    "replace_rate": replace_rate,
    "num_edge_types": num_edge_types,
    "optimizer": optimizer,
    "loss_fn": loss_fn,
    "lr": lr,
    "weight_decay": weight_decay,
    "lr_f": lr_f,
    "weight_decay_f": weight_decay_f,
    "linear_prob": linear_prob,
    "load_model": load_model,
    "save_model": save_model,
    "logging": logs,
    "scheduler": use_scheduler,
    # Placeholder; overwritten with graph.x.shape[1] once the graph is loaded.
    "num_features": 6,
    "num_out_heads": 1,
    "residual": False,
    "norm": None,
    "negative_slope": 0.2,
    "concat_hidden": False,
    "return_hidden": False,
}

args = Namespace(**arg_values)

In [65]:
# ----------------------
# Load Dataset and Build Model
# ----------------------
# The graph is read from a preprocessed file (load_dataset(dataset_name) is
# not used here); the file name is derived from the dataset name.
graph_path = f'../data/real/multidim_graph/6d/{dataset_name}_multiomics.pt'
graph = load_processed_graph(graph_path)

# Feature width is the second dimension of graph.x; the class count is the
# largest value in graph.y plus one.
num_features = graph.x.shape[1]
num_classes = graph.y.max().item() + 1

# Replace the placeholder feature count in args before building the model.
args.num_features = num_features

model = build_model(args)
# Kept as the cell's final expression so the notebook displays the model.
model.to(device)


PreModel(
  (encoder): RGCN(
    (rgcn_layers): ModuleList(
      (0-2): 3 x RGCNConv(64, 64)
    )
    (activation): PReLU(num_parameters=1)
    (head): Identity()
  )
  (decoder): RGCN(
    (rgcn_layers): ModuleList(
      (0): RGCNConv(64, 64)
    )
    (activation): PReLU(num_parameters=1)
    (head): Identity()
  )
  (encoder_to_decoder): Linear(in_features=64, out_features=64, bias=False)
)

In [66]:
# ----------------------
# Create Optimizer and Scheduler
# ----------------------
# The optimizer *name* is read from args.optimizer rather than from the
# notebook-level `optimizer` variable: this cell rebinds `optimizer` to the
# optimizer object, so re-running the cell would otherwise pass that object
# to create_optimizer() as if it were the name.
optimizer = create_optimizer(args.optimizer, model, lr, weight_decay)

scheduler = None
if use_scheduler:
    def scheduler_fn(epoch):
        """Cosine learning-rate multiplier: 1.0 at epoch 0, 0.0 at epoch == max_epoch."""
        return (1 + np.cos(epoch * np.pi / max_epoch)) * 0.5

    # LambdaLR multiplies the optimizer's initial lr by scheduler_fn(epoch).
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler_fn)


In [67]:
# ----------------------
# Setup Logger (Optional)
# ----------------------
# Logging is opt-in: without `logs`, downstream code receives None.
logger = None
if logs:
    logger = WBLogger(name="notebook_training")


In [68]:
# ----------------------
# Train the Model
# ----------------------
# The feature matrix is moved to the training device before being handed over.
features_on_device = graph.x.to(device)

# pretrain() takes its arguments positionally; the order below is unchanged.
model = pretrain(
    model, graph, features_on_device,
    optimizer, max_epoch, device, scheduler,
    num_classes, lr_f, weight_decay_f, max_epoch_f,
    linear_prob, num_edge_types, logger,
)

2025-03-21 00:27:06,782 - INFO - start training..
# Epoch 99: train_loss: 0.1408: 100%|██████████| 100/100 [00:29<00:00,  3.40it/s]


In [69]:
# Move the model to CPU after training so the saved state_dict holds CPU tensors.
model = model.cpu()

# Optionally save or load the model checkpoint.
if save_model:
    torch.save(model.state_dict(), "../checkpoints/emb_extraction_model.pt")
if load_model:
    # map_location="cpu": the model is on CPU at this point, so remap the
    # checkpoint's tensors to CPU instead of the device they were saved from
    # (a CUDA-saved checkpoint would otherwise fail on a CPU-only machine).
    # NOTE(review): this path differs from the save path above — confirm that
    # "checkpoint.pt" is the file that is meant to be loaded.
    model.load_state_dict(torch.load("checkpoint.pt", map_location="cpu"))


In [70]:
# ----------------------
# Evaluate the Model
# ----------------------
model.to(device)
model.eval()

# Run the node-classification evaluation with linear probing switched off.
eval_results = node_classification_evaluation(
    model, graph, graph.x, num_classes,
    lr_f, weight_decay_f, max_epoch_f, device,
    linear_prob=False,
)

# The result is six pairs, unpacked here in the order accuracy, AUC, AUPR,
# precision, recall, F1. Each pair holds the final value and its "estp"
# counterpart (reported as "early-stopping" in the evaluation log).
(
    (test_acc, estp_test_acc),
    (test_auc, estp_test_auc),
    (test_aupr, estp_test_aupr),
    (test_precision, estp_test_precision),
    (test_recall, estp_test_recall),
    (test_f1, estp_f1),
) = eval_results


num parameters for finetuning: 66281
Number of positive labels: tensor(196.)
Number of negative labels: tensor(195.)


# Epoch: 199, train_loss: 0.0738, val_loss: 0.8964, val_acc:0.8877551020408163, val_aupr: 0.9635, test_loss: 1.2509, test_acc: 0.9106,test_auc: 0.9322, test_aupr: 0.9200: 100%|██████████| 200/200 [01:07<00:00,  2.97it/s]

--- TestAcc: 0.9106, early-stopping-TestAcc: 0.9024, Best ValAcc: 0.8980 in epoch 164 --- 
--- TestAUPR: 0.9200, early-stopping-TestAUPR: 0.9219, Best ValAUPR: 0.9648 in epoch 197 --- 





In [71]:
# Final evaluation metrics, keyed by the names used in the experiment log.
final_metrics = {
    "test_accuracy": test_acc,
    "test_estp_accuracy": estp_test_acc,
    "test_auc": test_auc,
    "test_estp_auc": estp_test_auc,
    "test_aupr": test_aupr,
    "test_estp_aupr": estp_test_aupr,
    "test_precision": test_precision,
    "test_estp_precision": estp_test_precision,
    "test_recall": test_recall,
    "test_estp_recall": estp_test_recall,
    "test_f1": test_f1,
    "test_estp_f1": estp_f1,
}

# `logger` is None when `logs` is False, so only record the metrics when a
# logger was actually created; calling .note() on None would raise
# AttributeError.
if logger is not None:
    logger.note(final_metrics, step=max_epoch)

In [72]:
# Extract latent embeddings from the trained model for visualisation.
# Requires `model`, `graph` and `num_edge_types` from the cells above.
#
# FIXME: as written this cell does not run. Its recorded output is
#   TypeError: PreModel.forward() got an unexpected keyword argument 'return_hidden'
# so PreModel.forward() has no `return_hidden` parameter, and everything below
# the call (hidden_list, embeddings_np) is never defined.
# NOTE(review): the right replacement depends on PreModel's API, which is not
# visible in this notebook — check the model class for an embedding/encoder
# entry point and its exact signature before changing the call.

# Switch the model to evaluation mode.
model.eval()

# Extract embeddings from the model.
# This assumes the model has a method or a flag to return the embeddings
# (here: a forward() that can optionally return the hidden states).
with torch.no_grad():
    # Intended contract: forward() returns (final_output, hidden_list) when
    # asked for hidden states. PreModel.forward() does not accept this flag
    # (see the FIXME above).
    # NOTE(review): graph and graph.x are passed without .to(device), while the
    # model was moved to `device` in the evaluation cell — confirm placement.
    final_output, hidden_list = model(graph, graph.x, num_edge_types, return_hidden=True)

# Use the hidden representation from the last layer as the embeddings.
embeddings_last_hidden = hidden_list[-1]

# Move the embeddings to CPU and convert them to a NumPy array for t-SNE.
embeddings_np = embeddings_last_hidden.cpu().numpy()


TypeError: PreModel.forward() got an unexpected keyword argument 'return_hidden'

In [None]:

# Project the embeddings down to two dimensions with t-SNE (fixed random_state).
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_np)

# Scatter-plot the 2-D projection using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=5)
ax.set_title("t-SNE Visualization of Latent Embeddings")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")
plt.show()