### Import

In [1]:
import gc 
import os
import threading
import tqdm
import time
import copy
import random
from datetime import datetime

import numpy as np
import pandas as pd


from rdkit import Chem

import torch
from torch import Tensor
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter


from torch_geometric.loader.link_neighbor_loader import LinkNeighborLoader
import torch_geometric.transforms as T
from torch_geometric.data import (
                                    HeteroData,
                                    Data, 
                                    Batch
                                 )   
from torch_geometric.nn import (
                                GATv2Conv,
                                SAGPooling,
                                global_add_pool,
                                HeteroConv,
                                Linear,
                                to_hetero
                                )

from sklearn.model_selection import StratifiedShuffleSplit, KFold, train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    roc_auc_score, 
    precision_recall_curve, 
    auc, 
    average_precision_score, 
    matthews_corrcoef
    )

# mhgat
import tqdm, os
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import ray

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search import ConcurrencyLimiter
from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune.schedulers import AsyncHyperBandScheduler

### Seed all randomness

In [2]:
def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Usage example:
seed_everything(29)  # Set the seed to 42

### Load HeteroData

In [3]:
# data_dict = data.to_dict()
fnm = '../prep_data/hetero_graph/hetero_data_dict.pt'
data = torch.load(fnm)

In [4]:
data

HeteroData(
  [1mdrug[0m={ node_id=[1007] },
  [1mside_effect[0m={ node_id=[5587] },
  [1m(drug, known, side_effect)[0m={ edge_index=[2, 132063] },
  [1m(drug, struct, drug)[0m={
    edge_index=[2, 15844],
    edge_attr=[15844]
  },
  [1m(drug, word, drug)[0m={
    edge_index=[2, 83865],
    edge_attr=[83865]
  },
  [1m(drug, target, drug)[0m={
    edge_index=[2, 3363],
    edge_attr=[3363]
  },
  [1m(drug, se_encoded, drug)[0m={
    edge_index=[2, 65854],
    edge_attr=[65854]
  },
  [1m(side_effect, name, side_effect)[0m={
    edge_index=[2, 299170],
    edge_attr=[299170]
  },
  [1m(side_effect, dg_encoded, side_effect)[0m={
    edge_index=[2, 101114],
    edge_attr=[101114]
  },
  [1m(side_effect, atc, side_effect)[0m={
    edge_index=[2, 26140],
    edge_attr=[26140]
  }
)

### Load Transformation Maps

In [5]:
DB_TO_ID_DICT = {}
drug_id_mol_graph_tup = []
ID_TO_DB_DICT = {}
MEDRAID_TO_ID_DICT = {}
ID_TO_MEDRAID_DICT = {}

In [6]:
dict_list = [DB_TO_ID_DICT, ID_TO_DB_DICT, MEDRAID_TO_ID_DICT, ID_TO_MEDRAID_DICT, drug_id_mol_graph_tup]
file_names = ['db_to_id.pt', 'id_to_db.pt', 'uml_to_id.pt', 'id_to_uml.pt', 'drug_to_mol.pt']

for data_dict, fnm in zip(dict_list, file_names):
    full_path = f"../prep_data/hetero_graph/{fnm}"
    loaded_data = torch.load(full_path)
    
    if isinstance(data_dict, dict):
        data_dict.update(loaded_data)
    elif isinstance(data_dict, list):
        data_dict.extend(loaded_data)
    else:
        # If it's neither a dict nor a list, just replace it
        index = dict_list.index(data_dict)
        dict_list[index] = loaded_data

### HeteroData Undirected

In [7]:
data = T.ToUndirected()(data)

### Molecule Featurization Utils

In [8]:
# Bond featurization
def get_bond_features(bond):
    # Simplified list of bond types
    permitted_bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, 
                            Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC, 'Unknown']
    bond_type = bond.GetBondType() if bond.GetBondType() in permitted_bond_types else 'Unknown'
    
    # Features: Bond type, Is in a ring
    features = one_of_k_encoding_unk(bond_type, permitted_bond_types) \
               + [bond.IsInRing()]
    
    return np.array(features, dtype=np.float32)

def get_mol_edge_list_and_feat_mtx(mol_graph):
    n_features = [(atom.GetIdx(), atom_features(atom)) for atom in mol_graph.GetAtoms()]
    n_features.sort() # to make sure that the feature matrix is aligned according to the idx of the atom
    _, n_features = zip(*n_features)
    # n_features = torch.stack(n_features)
    n_features = torch.tensor(n_features, dtype=torch.float32)

    edge_list = torch.LongTensor([(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol_graph.GetBonds()])
    undirected_edge_list = torch.cat([edge_list, edge_list[:, [1, 0]]], dim=0) if len(edge_list) else edge_list 

    # Extract bond features
    bond_features = [get_bond_features(bond) for bond in mol_graph.GetBonds()]
    undirected_bond_features = bond_features + bond_features  # duplicate for undirected edges
    edge_attr = torch.tensor(undirected_bond_features, dtype=torch.float32)

    return undirected_edge_list.T, n_features, edge_attr 


def one_of_k_encoding_unk(x, allowable_set):
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))

def all_of_k_encoding_unk(x, allowable_set):
    enc = np.zeros(len(allowable_set))
    for idx, side_eff_id in enumerate(allowable_set):
        if side_eff_id in x:
            enc[idx] = 1
    return enc
    
def atom_features(atom,
                explicit_H=True,
                use_chirality=False):

    results = one_of_k_encoding_unk(
        atom.GetSymbol(),
        ['C','N','O', 'S','F','Si','P', 'Cl','Br','Mg','Na','Ca','Fe','As','Al','I','B','V','K','Tl',
            'Yb','Sb','Sn','Ag','Pd','Co','Se','Ti','Zn','H', 'Li','Ge','Cu','Au','Ni','Cd','In',
            'Mn','Zr','Cr','Pt','Hg','Pb','Unknown'
        ]) + [atom.GetDegree()/10, atom.GetImplicitValence(), 
                atom.GetFormalCharge(), atom.GetNumRadicalElectrons()] + \
                one_of_k_encoding_unk(atom.GetHybridization(), [
                Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
                Chem.rdchem.HybridizationType.SP3, Chem.rdchem.HybridizationType.
                                    SP3D, Chem.rdchem.HybridizationType.SP3D2
                ]) + [atom.GetIsAromatic()]
    # In case of explicit hydrogen(QM8, QM9), avoid calling `GetTotalNumHs`
    if explicit_H:
        results = results + [atom.GetTotalNumHs()]

    if use_chirality:
        try:
            results = results + one_of_k_encoding_unk(
            atom.GetProp('_CIPCode'),
            ['R', 'S']) + [atom.HasProp('_ChiralityPossible')]
        except:
            results = results + [False, False
                            ] + [atom.HasProp('_ChiralityPossible')]

    results = np.array(results).astype(np.float32)

    return results #torch.from_numpy(results)

### Molecule Featurization

In [9]:
MOL_EDGE_LIST_FEAT_MTX = {DB_TO_ID_DICT[drug_id]: get_mol_edge_list_and_feat_mtx(mol) 
                                for drug_id, mol in drug_id_mol_graph_tup}
len(MOL_EDGE_LIST_FEAT_MTX.keys())





1007

In [10]:
MOL_EDGE_LIST_FEAT_MTX[998][1].shape, MOL_EDGE_LIST_FEAT_MTX[998][2].shape

(torch.Size([57, 55]), torch.Size([16, 6]))

### CV Split

In [11]:
def get_kfold_data(data, k=10, shuffle=True, num_neighbors=[10, 4], batch_size=64):
    kf = KFold(n_splits=k, shuffle=shuffle)
    kf.get_n_splits()
    train_val_data_X = data['drug', 'known', 'side_effect'].edge_index.T.numpy()
    for train_index, test_index in kf.split(train_val_data_X):
        train_index_, valid_index_ = train_test_split(train_index, test_size=0.1)
        train_set = train_index_
        valid_set = valid_index_
        
        train_data_cv = copy.deepcopy(data)
        train_data_cv['drug', 'known', 'side_effect'].edge_index = torch.tensor(train_val_data_X[train_set].T)
        train_data_cv['side_effect', 'rev_known', 'drug'].edge_index = torch.tensor(train_val_data_X[train_set].T)[[1, 0]]
    
        val_data_cv = copy.deepcopy(data)
        val_data_cv['drug', 'known', 'side_effect'].edge_index = torch.tensor(train_val_data_X[valid_set].T)
        val_data_cv['side_effect', 'rev_known', 'drug'].edge_index = torch.tensor(train_val_data_X[valid_set].T)[[1, 0]]
    
        
        
        test_data_cv = copy.deepcopy(data)
        test_data_cv['drug', 'known', 'side_effect'].edge_index = torch.tensor(train_val_data_X[test_index].T)
        test_data_cv['side_effect', 'rev_known', 'drug'].edge_index = torch.tensor(train_val_data_X[test_index].T)[[1, 0]]
        
        # use RandomLinkSplit to get disjoint train ratio an other pyg transforms
        transform = T.RandomLinkSplit(
            num_val=0.0,
            num_test=0.0,
            disjoint_train_ratio=0.3236238313900354,
            neg_sampling_ratio=0.0,
            add_negative_train_samples=False,
            edge_types=('drug', 'known', 'side_effect'),
            rev_edge_types=('side_effect', 'rev_known', 'drug'), 
        )
        train_cv, _, _ = transform(train_data_cv)
        
        transform = T.RandomLinkSplit(
            num_val=0.0,
            num_test=0.0,
            disjoint_train_ratio=0.99,
            neg_sampling_ratio=1.0,
            add_negative_train_samples=True,
            edge_types=('drug', 'known', 'side_effect'),
            rev_edge_types=('side_effect', 'rev_known', 'drug'), 
        )
        
        val_cv, _, _ = transform(val_data_cv)
       

        test_cv, _, _ = transform(test_data_cv)
        # Define seed edges:
        edge_label_index = train_cv['drug', 'known', 'side_effect'].edge_label_index
        edge_label = train_cv['drug', 'known', 'side_effect'].edge_label

        train_loader = LinkNeighborLoader(
            data=train_cv,
            num_neighbors=num_neighbors,
            neg_sampling_ratio=1.0,
            edge_label_index=(("drug", "known", "side_effect"), edge_label_index),
            edge_label=edge_label,
            batch_size=batch_size,
            shuffle=True,
            # disjoint=True,
        )
        
        edge_label_index = val_cv['drug', 'known', 'side_effect'].edge_label_index
        edge_label = val_cv['drug', 'known', 'side_effect'].edge_label
        # num_neighbors is a dictionary, it uses the specified number for each edge type
        val_loader = LinkNeighborLoader(
            data=val_cv,
            num_neighbors=num_neighbors,
            edge_label_index=(("drug", "known", "side_effect"), edge_label_index),
            edge_label=edge_label,
            batch_size=batch_size,
            shuffle=False,
        )
        
        
        
        edge_label_index = test_cv['drug', 'known', 'side_effect'].edge_label_index
        edge_label = test_cv['drug', 'known', 'side_effect'].edge_label

        test_loader = LinkNeighborLoader(
            data=test_cv,
            num_neighbors=num_neighbors,
            edge_label_index= (("drug", "known", "side_effect"), edge_label_index), 
            edge_label=edge_label,
            batch_size=batch_size,
            shuffle=False
        )
        yield train_loader, val_loader, test_loader


### Hyper Param - DataLoader

In [12]:
def load_data(config):
    # data_dict = data.to_dict()
    fnm = '/root/SDV-HGNN/prep_data/hetero_graph/hetero_data_dict.pt'
    data = torch.load(fnm)
    # create undirected edges for drug known side-effects
    data = T.ToUndirected()(data)
    
    # use RandomLinkSplit to get disjoint train ratio an other pyg transforms
    transform = T.RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        disjoint_train_ratio=config['dtr'], #0.3
        neg_sampling_ratio=1.0,
        add_negative_train_samples=False,
        edge_types=('drug', 'known', 'side_effect'),
        rev_edge_types=('side_effect', 'rev_known', 'drug'), 
    )
    train_data, val_data, test_data = transform(data)
    
    num_neighbors=[config['num_neigh1'],config['num_neigh2']]  #[8, 5]
    batch_size=config['batch_size'] #64   

    # Define seed edges:
    edge_label_index = train_data['drug', 'known', 'side_effect'].edge_label_index
    edge_label = train_data['drug', 'known', 'side_effect'].edge_label

    train_loader = LinkNeighborLoader(
        data=train_data,
        num_neighbors=num_neighbors,
        neg_sampling_ratio=1.0,
        edge_label_index=(("drug", "known", "side_effect"), edge_label_index),
        edge_label=edge_label,
        batch_size=batch_size,
        shuffle=True,
        # disjoint=True,
    )
    
    edge_label_index = val_data['drug', 'known', 'side_effect'].edge_label_index
    edge_label = val_data['drug', 'known', 'side_effect'].edge_label
    # num_neighbors is a dictionary, it uses the specified number for each edge type
    val_loader = LinkNeighborLoader(
        data=val_data,
        num_neighbors=num_neighbors,
        edge_label_index=(("drug", "known", "side_effect"), edge_label_index),
        edge_label=edge_label,
        batch_size=batch_size,
        shuffle=False,
    )
    
    
    
    edge_label_index = test_data['drug', 'known', 'side_effect'].edge_label_index
    edge_label = test_data['drug', 'known', 'side_effect'].edge_label

    test_loader = LinkNeighborLoader(
        data=test_data,
        num_neighbors=num_neighbors,
        edge_label_index= (("drug", "known", "side_effect"), edge_label_index), 
        edge_label=edge_label,
        batch_size=batch_size,
        shuffle=False
    )
    return data, train_loader, val_loader, test_loader


### Model

#### MHGNN Hetero

In [13]:
class HeteroMHGNN(nn.Module):
    def __init__(self, metadata, in_channels, outer_n_head, hidden_dims, heads, use_edge_attr=None):
        super().__init__()
        
        self.convs = nn.ModuleList()
        self.norms = nn.ModuleDict()
        self.skips = nn.ModuleDict()
        self.final_norms = nn.ModuleDict()
        hidden_dims = [hidden_dims] * outer_n_head
        heads = [heads] * outer_n_head
        # Define which edge types should use edge attributes
        if use_edge_attr is None:
            use_edge_attr = {edge_type: False for edge_type in metadata[1]}
        
        for i, (out_dim, head) in enumerate(zip(hidden_dims, heads)):
            conv_dict = {}
            for edge_type in metadata[1]:
                src, _, dst = edge_type
                if i == 0:
                    in_channels = in_channels
                else:
                    in_channels = hidden_dims[i-1] * heads[i-1]
                
                if use_edge_attr[edge_type]:
                    conv_dict[edge_type] = GATv2Conv(in_channels, out_dim, heads=head, add_self_loops=False, edge_dim=1)
                else:
                    conv_dict[edge_type] = GATv2Conv(in_channels, out_dim, heads=head, add_self_loops=False)
            
            self.convs.append(HeteroConv(conv_dict, aggr='sum'))
            
            for node_type in metadata[0]:
                self.norms[f'{node_type}_{i}'] = nn.LayerNorm(out_dim * head)
                if i == 0:
                    self.skips[f'{node_type}_{i}'] = Linear(in_channels, out_dim * head)
                else:
                    self.skips[f'{node_type}_{i}'] = Linear(hidden_dims[i-1] * heads[i-1], out_dim * head)
        
        self.node_types = metadata[0]
        for node_type in metadata[0]:
            self.final_norms[f'{node_type}'] = nn.LayerNorm(out_dim * head *len(heads))
        
        # Initialize skips with xavier init
        for skip in self.skips.values():
            nn.init.xavier_uniform_(skip.weight)
            nn.init.zeros_(skip.bias)

    def forward(self, x_dict, edge_index_dict, edge_attr_dict):
        x_repr_dict = {node_type: [] for node_type in self.node_types}
        # edge_attr_dict = {key: value.to(torch.float32) for key, value in edge_attr_dict.items()}

        
        for i, conv in enumerate(self.convs):
            skip_x = {}
            for node_type in self.node_types:
                skip_x[node_type] = self.skips[f'{node_type}_{i}'](x_dict[node_type])
            
            x_dict_new = conv(x_dict, edge_index_dict, edge_attr_dict)
            
            for node_type in self.node_types:
                # skip_x = self.skips[f'{node_type}_{i}'](x_dict[node_type])
                x = x_dict_new[node_type]
                x = self.norms[f'{node_type}_{i}'](x) + skip_x[node_type]
                x = self.norms[f'{node_type}_{i}'](x)
                x = F.elu(x)
                x_repr_dict[node_type].append(x)
                x_dict[node_type] = x
        
        # Concatenate all representations for each node type
        for node_type in self.node_types:
            x_repr_dict[node_type] = self.final_norms[f'{node_type}'](torch.cat(x_repr_dict[node_type], dim=1))
        
        return x_repr_dict

# Specify which edge types should use edge attributes
use_edge_attr = {
    ('drug', 'known', 'side_effect'): False,
    ('drug', 'struct', 'drug'): True,
    ('drug', 'word', 'drug'): True,
    ('drug', 'target', 'drug'): True,
    ('drug', 'se_encoded', 'drug'): True,
    ('side_effect', 'name', 'side_effect'): True,
    ('side_effect', 'dg_encoded', 'side_effect'): True,
    ('side_effect', 'atc', 'side_effect'): True,
    ('side_effect', 'rev_known', 'drug'): False
}


#### MHGNN - Outer HGNN

In [14]:
# class MHGNN(nn.Module):
#     def __init__(self, input_dim, outer_n_head, hidden_dims, heads):
#         super().__init__()
#         self.GATLayers = nn.ModuleList()
#         self.norms = nn.ModuleList() 
#         self.skips = nn.ModuleList()
#         for i in range(outer_n_head):
#             self.GATLayers.append(GATv2Conv(input_dim, hidden_dims, heads=heads, 
#                                     add_self_loops=False, name=f'GATLayer{i}'))
#             self.norms.append(nn.LayerNorm(hidden_dims * heads))
#             self.skips.append(nn.Linear(input_dim, hidden_dims * heads))
#             input_dim = hidden_dims * heads     
        
#         # # # initialize skips with xavier init
#         for skip in self.skips:
#             nn.init.xavier_uniform_(skip.weight)
#             nn.init.zeros_(skip.bias)

#     def forward(self, x, edge_index, edge_attr):
#         x_repr = []
#         for idx, (layer, skip, norm) in enumerate(zip(self.GATLayers, self.skips, self.norms)): # norm, self.norms
#             skip_x = skip(x)
#             x = layer(x, edge_index)
#             x = norm(x) + skip_x  # Add skip connection
#             x = norm(x)     # Apply normalization
#             x = F.elu(x)    # Apply activation
#             x_repr.append(x)
#             # x = F.elu(norm(x))
#             # x += skip_x
#             # x_repr.append(F.elu(x))
#             # if idx < len(self.GATLayers) - 1:
#             #     x = F.elu(x) # norm(x)
#         x_repr = torch.cat(x_repr, dim=1)
#         return x_repr

#### DVModel

In [15]:
class DrugInterView_Block(nn.Module):
    def __init__(self, n_heads, in_features, head_out_feats):
        super().__init__()
        self.n_heads = n_heads
        self.in_features = in_features
        self.out_features = head_out_feats

        self.feature_conv = GATv2Conv(in_features, head_out_feats, n_heads, edge_dim=6)

        self.readout = SAGPooling(n_heads * head_out_feats, min_score=-1)

    def forward(self, mol_data):
        mol_data.x = self.feature_conv(mol_data.x, mol_data.edge_index, mol_data.edge_attr)
        mol_data_att_x, att_edge_index, att_edge_attr, h_att_batch, att_perm, h_att_scores = self.readout(mol_data.x, mol_data.edge_index, batch=mol_data.batch)

        mol_data_global_graph_emb = global_add_pool(mol_data_att_x, h_att_batch)

        return mol_data, mol_data_global_graph_emb, h_att_scores, h_att_batch

In [16]:
class FinalDrugMolEmb(nn.Module):
    def __init__(self, in_features, mol_n_head, heads_out_feat_params, blocks_params):
        super().__init__()
        self.in_features = in_features
        self.n_blocks = mol_n_head # len(blocks_params)

        self.inital_norm = nn.LayerNorm(self.in_features)

        self.blocks = nn.ModuleList()
        self.net_norms = nn.ModuleList()

        for i in range(mol_n_head):
            block = DrugInterView_Block(blocks_params, in_features, heads_out_feat_params)
            self.blocks.append(block)
            self.net_norms.append(nn.LayerNorm(heads_out_feat_params * blocks_params))
            in_features = heads_out_feat_params * blocks_params
       
    def forward(self, mol_data):
        repr_mol = []
        mol_data.x = self.inital_norm(mol_data.x)
        attention_weights = []
        attention_batch = []
        for idx, (block, norm) in enumerate(zip(self.blocks, self.net_norms)):
            mol_data, mol_data_global_graph_emb, mol_data_att_x, h_att_batch = block(mol_data)
            attention_weights.append(mol_data_att_x)
            attention_batch.append((mol_data.batch, h_att_batch))
            repr_mol.append(mol_data_global_graph_emb)
            if idx < len(self.blocks) - 1:
                mol_data.x = F.elu(norm(mol_data.x))
        # concat all the global graph embeddings
        mol_data_global_graph_emb = torch.cat(repr_mol, dim=1)
        return mol_data_global_graph_emb, attention_weights, attention_batch

In [17]:
class DVModel(torch.nn.Module):
    def __init__(self, config, n_side_effect, edge_metadata):
        super().__init__()
        # Instantiate node embeddings:
        outer_emb_dim = config['mol_heads_out'] * config['mol_block_param'] * config['mol_n_head']
        
        self.seff_emb = torch.nn.Embedding(n_side_effect, outer_emb_dim)
        # DV for Drug
        self.drug_emb = FinalDrugMolEmb(in_features=55, mol_n_head=config['mol_n_head'],
                                        heads_out_feat_params=config['mol_heads_out'], 
                                        blocks_params=config['mol_block_param']) 
        # heads_out_feat_params=[64, 64], blocks_params=[3, 3])

        
        self.inital_norm_outer_drug = nn.LayerNorm(outer_emb_dim)
        # self.inital_norm_outer_se = nn.LayerNorm(outer_emb_dim)
        
        # gnn_model = MHGNN(input_dim=outer_emb_dim, outer_n_head=config['outer_n_head'],
        #                   hidden_dims=config['outer_hidden_dims'], #[128, 128]
        #                   heads=config['outer_block']) 
        # hidden_dims=[64, 64, 64], heads=[2, 2, 2])

        # gnn_model = to_hetero(gnn_model, metadata=edge_metadata)

        gnn_model = HeteroMHGNN(edge_metadata, in_channels=outer_emb_dim, outer_n_head=config['outer_n_head'], 
                                hidden_dims=config['outer_hidden_dims'], heads=config['outer_block'], use_edge_attr=use_edge_attr)
        classifier_model = VanillaClassifier()
        
        # Instantiate Outer GNNs
        self.gnn = gnn_model # outer message passing
        
        # Instantiate classifier:
        self.classifier = classifier_model
        
        torch.nn.init.xavier_uniform_(self.seff_emb.weight)
        
    def __create_graph_data(self, drug_ids, device):
        drug_ids_ = drug_ids.cpu().numpy().astype(int).tolist()
        final_data = []
        for id in drug_ids_:
            _ = MOL_EDGE_LIST_FEAT_MTX[id]
            final_data.append(Data(x= _[1]  , edge_index=_[0], edge_attr=_[2]))
        return Batch.from_data_list(final_data).to(device)        
       
    
    def forward(self, data: HeteroData) -> Tensor:
        drug_list_of_graph_data = self.__create_graph_data(data["drug"].node_id, data["drug"].node_id.device)
        
        drug_dv, attention_weights, h_att_batch = self.drug_emb(drug_list_of_graph_data)
        
        # layer normalization of input features for outer gnn:
        x_dict = {
            "drug":  self.inital_norm_outer_drug(drug_dv),
            "side_effect": self.seff_emb(data["side_effect"].node_id)
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        # x_dict = self.gnn(x_dict, data.edge_index_dict, data.edge_attr_dict)
        # Forward pass
        x_dict = self.gnn(x_dict, data.edge_index_dict, data.edge_attr_dict)
        pred = self.classifier(
            x_dict["drug"],
            x_dict["side_effect"],
            data["drug", "known", "side_effect"].edge_label_index,
        )

        return pred, attention_weights, h_att_batch

#### Edge Classifier

In [18]:
# Our final classifier applies the hammard-product between source and destination
# node embeddings to derive edge-level predictions:
class VanillaClassifier(torch.nn.Module):
    def forward(self, x_drug: Tensor, x_se: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_drug = x_drug[edge_label_index[0]]
        edge_feat_se = x_se[edge_label_index[1]]

        # Apply hammard-product to get a prediction per supervision edge:
        return (edge_feat_drug * edge_feat_se).sum(dim=-1)

### Train Utils

#### Train Loop

In [19]:
def do_train_compute(batch, device, model):
    # batch = batch.to(device)
    pred, _, _ = model(batch)
    actual = batch["drug", "known", "side_effect"].edge_label
    return pred, actual

# def do_train_compute(batch, device, model):
#     # batch = batch.to(device)
#     pred = model(batch)
#     actual = batch.edge_label
#     return pred, actual


def evaluate_metrics(probas_pred, ground_truth):
    # compute binary classification metrics using sklearn
    # convert to numpy array
    probas_pred = probas_pred.numpy()
    
    ground_truth = ground_truth.numpy()
    
    # convert to binary predictions
    binary_pred = np.where(probas_pred > 0.5, 1, 0)

    
    # compute metrics
    accuracy = accuracy_score(ground_truth, binary_pred)
    precision = precision_score(ground_truth, binary_pred)
    recall = recall_score(ground_truth, binary_pred)
    f1 = f1_score(ground_truth, binary_pred)
    roc_auc = roc_auc_score(ground_truth, probas_pred)
    precision_, recall_, _ = precision_recall_curve(ground_truth, probas_pred)
    pr_auc = auc(recall_, precision_)
    average_precision = average_precision_score(ground_truth, probas_pred)
    return accuracy, precision, recall, f1, roc_auc, pr_auc, average_precision

def train_loop(model, model_name, writer, train_loader, val_loader, loss_fn, optimizer, n_epochs, device, scheduler=None, early_stopping_patience=3, early_stopping_counter=0):
    early_stop = False
    best_val_metrics = -float("inf") #-float("inf")
    best_model_path = f"saved_models/{model_name}/best_model.pth"
    # make best_model_path parent directory if it doesn't exist
    os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
    
    print("Starting training loop at", datetime.today().strftime("%Y-%m-%d %H:%M:%S"))
    
    total_train_val_steps = len(train_loader) + len(val_loader)
    epoch_progress_bar = tqdm.notebook.tqdm(range(1, (total_train_val_steps*n_epochs)+1), desc="MiniBatches")
    epoch = 0
    for _ in epoch_progress_bar:
        epoch += 1
        start_time = time.time()
        train_loss = 0
        val_loss = 0
        train_probas_pred = []
        train_ground_truth = []
        val_probas_pred = []
        val_ground_truth = []
        print("Epoch", epoch)
        
        model.train()
        for idx, batch in enumerate(train_loader):
            batch = batch.to(device)
            lr = optimizer.param_groups[0]['lr']
            optimizer.zero_grad()
            out, actual = do_train_compute(batch, device, model)
            pred = torch.sigmoid(out)
            train_probas_pred.append(pred.detach().cpu())
            train_ground_truth.append(actual.detach().cpu())
            loss = loss_fn(out, actual)
            loss.backward()
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Adjust max_norm as needed

            optimizer.step()
            train_loss += loss.item()
            epoch_progress_bar.set_postfix_str(f"Epoch {epoch} - LR {lr:.7f} - Train Batch {idx+1}/{len(train_loader)} - Train loss: {train_loss/(idx+1):.4f}")
            epoch_progress_bar.update()
            writer.add_scalar("Training Loss MiniBatch", loss.item(), idx)
            batch = batch.to("cpu")
            # if scheduler is not None: # cosine annealing scheduler
            #     scheduler.step()
        
        train_loss /= len(train_loader)
        writer.add_scalar("Training Loss Epoch", train_loss, epoch)
        model.eval()
        with torch.no_grad():
            train_probas_pred = torch.cat(train_probas_pred, dim=0)
            train_ground_truth = torch.cat(train_ground_truth, dim=0)
            train_accuracy, train_precision, train_recall, train_f1, \
                train_roc_auc, train_pr_auc, train_average_precision = evaluate_metrics(train_probas_pred, train_ground_truth)
            writer.add_scalar("Training Accuracy", train_accuracy, epoch)
            writer.add_scalar("Training Precision", train_precision, epoch)
            writer.add_scalar("Training Recall", train_recall, epoch)
            writer.add_scalar("Training F1", train_f1, epoch)
            writer.add_scalar("Training ROC AUC", train_roc_auc, epoch)
            writer.add_scalar("Training PR AUC", train_pr_auc, epoch)
            writer.add_scalar("Training Average Precision", train_average_precision, epoch)

            for idx_, batch in enumerate(val_loader):
                batch = batch.to(device)
                out, actual = do_train_compute(batch, device, model)
                pred = torch.sigmoid(out)
                val_probas_pred.append(pred.detach().cpu())
                val_ground_truth.append(actual.detach().cpu())
                loss = loss_fn(out, actual)
                val_loss += loss.item()
                epoch_progress_bar.set_postfix_str(f"Epoch {epoch} - LR {lr:.7f} - Val Batch {idx_+1}/{len(val_loader)} - Val loss: {val_loss/(idx+1):.4f}")
                epoch_progress_bar.update()
                writer.add_scalar("Validation Loss MiniBatch", loss.item(), idx_)
                batch = batch.to("cpu")
            val_loss /= len(val_loader)
            val_probas_pred = torch.cat(val_probas_pred, dim=0)
            val_ground_truth = torch.cat(val_ground_truth, dim=0)
            val_accuracy, val_precision, val_recall, val_f1, \
                val_roc_auc, val_pr_auc, val_average_precision = evaluate_metrics(val_probas_pred, val_ground_truth)
            
            writer.add_scalar("Validation Loss Epoch", val_loss, epoch)
            writer.add_scalar("Validation Accuracy", val_accuracy, epoch)
            writer.add_scalar("Validation Precision", val_precision, epoch)
            writer.add_scalar("Validation Recall", val_recall, epoch)
            writer.add_scalar("Validation F1", val_f1, epoch)
            writer.add_scalar("Validation ROC AUC", val_roc_auc, epoch)
            writer.add_scalar("Validation PR AUC", val_pr_auc, epoch)
            writer.add_scalar("Validation Average Precision", val_average_precision, epoch)
            
            if val_f1 > best_val_metrics:
                best_val_metrics = val_f1
                early_stopping_counter = 0
                torch.save(model.state_dict(), best_model_path)
                print("New best model saved!") 
            else:
                early_stopping_counter += 1
                print("Early stopping counter:", early_stopping_counter)
                if early_stopping_counter >= early_stopping_patience:
                    print("Early stopping triggered!")
                    early_stop = True
        
        if scheduler is not None:
            scheduler.step(val_f1) #
      
        
        epoch_progress_bar.set_postfix_str("Train loss: {:.4f}, Train f1: {:.4f}, Train auc: {:.4f}, Train pr_auc: {:.4f},\
                                            Val loss: {:.4f}, Val f1: {:.4f}, Val auc: {:.4f}, Val pr_auc: {:.4f},\
                                            Best val f1: {:.4f}".format(train_loss, train_f1, train_roc_auc, train_pr_auc,\
                                            val_loss, val_f1, val_roc_auc, val_pr_auc, best_val_metrics))
        epoch_progress_bar.update()
        print("Epoch Number:", epoch)   
        print("Epoch time:", time.time() - start_time)
        print("Train loss:", train_loss)
        print("Train accuracy:", train_accuracy)
        print("Train precision:", train_precision)
        print("Train recall:", train_recall)
        print("Train f1:", train_f1)
        print("Train roc_auc:", train_roc_auc)
        print("Train pr_auc:", train_pr_auc)
        print("Train average_precision:", train_average_precision)
        
        print("Val loss:", val_loss)
        print("Val accuracy:", val_accuracy)
        print("Val precision:", val_precision)
        print("Val recall:", val_recall)
        print("Val f1:", val_f1)
        print("Val roc_auc:", val_roc_auc)
        print("Val pr_auc:", val_pr_auc)
        print("Val average_precision:", val_average_precision)
        print("Best val_f1:", val_f1)
        print()
        train.report({"val_f1": val_f1})
        if early_stop:
            break
        if epoch == n_epochs:
            print("Training completed!")
            break
    
    # load best model 
    # model.load_state_dict(torch.load(best_model_path))
    return {"val_f1": best_val_metrics, "train_loss": train_loss}

### Run RayTune

In [20]:
def ray_tune_sdv(config):
    model_name ="ray_sdvhgnn"
    # Define the log directory where TensorBoard logs will be stored
    log_dir = f"logs/{model_name}/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    # log_dir = f"{log_dir}/{ray.train.get_context().get_trial_dir()}"
    os.makedirs(log_dir, exist_ok=True)
    # Create a SummaryWriter
    writer = SummaryWriter(log_dir)
    
    # load your data
    data, train_loader, val_loader, test_loader = load_data(config['load_data'])
    # Initialize model
    sdv_model = DVModel(config['model'], data["side_effect"].num_nodes, data.metadata())

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: '{device}'")

    sdv_model = sdv_model.to(device)
    
    # Define loss function and optimizer
    criterion = F.binary_cross_entropy_with_logits
    optimizer = torch.optim.Adam(sdv_model.parameters(), lr=config['training']['lr'], weight_decay=config['training']['decay']) #, weight_decay=5e-4
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="max", factor=0.5, patience=2, min_lr=1e-6
        )
    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.96 ** (epoch))
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 
    #                                                                  T_0=len(train_loader), 
    #                                                                  T_mult=1, 
    #                                                                  eta_min=1e-5, 
    #                                                                  verbose=False)
    # on plateau scheduler
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=1e-6, eps=1e-08)
    
    print(f"Total Number of Parameters: {sum(p.numel() for p in sdv_model.parameters())}")
    print(f"Total Number of Trainable Parameters: {sum(p.numel() for p in sdv_model.parameters() if p.requires_grad)}")
  
    sdv_model_metric = train_loop(sdv_model, model_name, writer, train_loader, val_loader, criterion, optimizer, 
                        n_epochs=config['training']['n_epochs'], device=device, scheduler=scheduler,
                        early_stopping_patience=5)
    return sdv_model_metric

In [21]:
config = {
    'load_data': {
        'dtr': tune.quniform(0.1, 0.7, 0.05),
        'batch_size': tune.choice([64, 128, 256]), #16, 32, 
        'num_neigh1': tune.qrandint(4, 12, 2), #tune.choice(list(range(4, 12, 2))),
        'num_neigh2': tune.qrandint(4, 12, 2), #tune.choice(list(range(4, 12, 2))),
        # 'num_neigh3': tune.qrandint(4, 12, 2), #tune.choice(list(range(4, 12, 2))),
        # 'num_neigh': tune.choice([[8, 5], [6, 4], [4, 8], [5, 5], [7, 5]])
    },
    'training': {
        'lr':  tune.loguniform(1e-5, 1e-2),
        'n_epochs': tune.qrandint(5, 21, 5),
        'decay':  tune.loguniform(1e-5, 1e-2)
    },
    'model': {
        'mol_n_head': tune.choice([2, 3, 4]),
        'mol_heads_out': tune.choice([16, 32, 64, 128]),
        'mol_block_param': tune.choice([2, 3]), #1
        
        'outer_n_head': tune.choice([2, 3]), # 1
        'outer_hidden_dims': tune.choice([32, 64, 128, 256]),
        'outer_block': tune.choice([2, 3]) #1
    }
}

In [22]:
os.makedirs('ray_results_sdv', exist_ok=True)

In [23]:
ray.init(num_cpus=4, num_gpus=1)
scheduler = AsyncHyperBandScheduler(time_attr="training_iteration",
                                     max_t=20, grace_period=3)

algo = OptunaSearch()  #
trainable_with_resources = tune.with_resources(ray_tune_sdv, {"cpu": 4, "gpu":1})

def stop_fn(trial_id: str, result: dict) -> bool:
    # global trial_done
    early_stop = result.get('early_stop', False)
    print("Here Global ", early_stop)
    return early_stop #trial_id.info["early_stop"]


tuner = tune.Tuner(
    trainable_with_resources,
    tune_config=tune.TuneConfig(
        metric="val_f1",
        mode="max",
        search_alg=algo,
        scheduler=scheduler,
        num_samples=20,
    ),
    run_config=train.RunConfig(
        # stop={"training_iteration": 5},
        local_dir='/root/SDV-HGNN/hyperparam/ray_results_sdv',
        log_to_file=True,
        verbose=1, 
        # stop={"training_iteration" : 1}
    ),
    
    param_space=config,
)

2024-08-05 18:17:57,342	INFO worker.py:1642 -- Started a local Ray instance.


In [24]:
results = tuner.fit()

0,1
Current time:,2024-08-05 19:02:02
Running for:,00:44:04.44
Memory:,10.6/19.4 GiB

Trial name,status,loc,load_data/batch_size,load_data/dtr,load_data/num_neigh1,load_data/num_neigh2,model/mol_block_para m,model/mol_heads_out,model/mol_n_head,model/outer_block,model/outer_hidden_d ims,model/outer_n_head,training/decay,training/lr,training/n_epochs,iter,total time (s),val_f1,train_loss
ray_tune_sdv_679d5c9b,RUNNING,172.17.88.72:1870627,128,0.65,6,12,2,128,4,3,256,2,4.41475e-05,0.00064983,15,,,,
ray_tune_sdv_90cbd3fa,PENDING,,128,0.45,12,6,2,16,2,2,256,2,0.00861366,5.93264e-05,5,,,,
ray_tune_sdv_2753f3f0,TERMINATED,172.17.88.72:1870627,128,0.15,10,4,2,16,2,3,32,3,3.48181e-05,0.000739041,10,11.0,332.161,0.845518,0.360245
ray_tune_sdv_bcbd48ff,TERMINATED,172.17.88.72:1870627,256,0.2,6,8,3,32,4,2,128,2,0.000793064,3.44379e-05,15,3.0,72.4173,0.739009,
ray_tune_sdv_52220c6f,TERMINATED,172.17.88.72:1870627,256,0.25,6,6,3,32,2,2,64,2,0.00678095,7.48425e-05,15,3.0,68.1297,0.691893,
ray_tune_sdv_ee9f818f,TERMINATED,172.17.88.72:1870627,128,0.6,8,12,2,64,3,3,32,2,0.000119228,0.00294401,10,11.0,997.158,0.867695,0.316711
ray_tune_sdv_e4cb0611,TERMINATED,172.17.88.72:1870627,64,0.1,4,10,2,64,4,3,32,3,0.00375349,3.71767e-05,15,3.0,233.67,0.766526,
ray_tune_sdv_cb794737,TERMINATED,172.17.88.72:1870627,128,0.45,6,6,2,32,2,3,256,3,0.000424171,7.35499e-05,15,3.0,476.149,0.754992,


[2m[36m(ray_tune_sdv pid=1870627)[0m 2024-08-05 18:18:01.061217: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=1870627)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=1870627)[0m 2024-08-05 18:18:01.205650: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=1870627)[0m 2024-08-05 18:18:01.655606: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open sh

[2m[36m(ray_tune_sdv pid=1870627)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=1870627)[0m Total Number of Parameters: 871056
[2m[36m(ray_tune_sdv pid=1870627)[0m Total Number of Trainable Parameters: 871056
[2m[36m(ray_tune_sdv pid=1870627)[0m Starting training loop at 2024-08-05 18:18:03
[2m[36m(ray_tune_sdv pid=1870627)[0m MiniBatches:   0%|          | 0/3310 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=1870627)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=1870627)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=1870627)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=1870627)[0m Epoch time: 31.430431127548218
[2m[36m(ray_tune_sdv pid=1870627)[0m Train loss: 2.125873408009929
[2m[36m(ray_tune_sdv pid=1870627)[0m Train accuracy: 0.6868176942007951
[2m[36m(ray_tune_sdv pid=1870627)[0m Train precision: 0.6857160780377642
[2m[36m(ray_tune_sdv pid=1870627)[0m Train recall: 0.6897835552470499
[2m[36m(ray_tune_sdv pid=1870627)[0m Train f1: 0.6877438026928401
[2m

In [24]:
results = tuner.fit()

0,1
Current time:,2024-08-05 10:30:17
Running for:,09:43:17.14
Memory:,11.3/19.4 GiB

Trial name,# failures,error file
ray_tune_sdv_1d2160e6,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_1d2160e6_1_batch_size=128,dtr=0.1500,num_neigh1=10,num_neigh2=6,mol_block_param=3,mol_heads_out=16,mol_n_head=2,outer_2024-08-05_00-47-00/error.txt"
ray_tune_sdv_f5b47831,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_f5b47831_2_batch_size=128,dtr=0.5500,num_neigh1=8,num_neigh2=12,mol_block_param=2,mol_heads_out=64,mol_n_head=2,outer_2024-08-05_00-47-02/error.txt"
ray_tune_sdv_1d96640c,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_1d96640c_3_batch_size=128,dtr=0.3500,num_neigh1=12,num_neigh2=6,mol_block_param=3,mol_heads_out=32,mol_n_head=3,outer_2024-08-05_00-58-53/error.txt"
ray_tune_sdv_afbfd312,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_afbfd312_6_batch_size=32,dtr=0.5000,num_neigh1=12,num_neigh2=10,mol_block_param=1,mol_heads_out=16,mol_n_head=3,outer_2024-08-05_02-03-25/error.txt"
ray_tune_sdv_796b45b0,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_796b45b0_8_batch_size=16,dtr=0.5500,num_neigh1=12,num_neigh2=4,mol_block_param=2,mol_heads_out=32,mol_n_head=2,outer__2024-08-05_03-13-49/error.txt"
ray_tune_sdv_2c093bc7,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_2c093bc7_12_batch_size=64,dtr=0.3500,num_neigh1=8,num_neigh2=10,mol_block_param=2,mol_heads_out=32,mol_n_head=4,outer_2024-08-05_04-44-15/error.txt"
ray_tune_sdv_67363dff,1,"/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_67363dff_19_batch_size=16,dtr=0.5000,num_neigh1=8,num_neigh2=10,mol_block_param=2,mol_heads_out=128,mol_n_head=2,oute_2024-08-05_06-22-51/error.txt"

Trial name,status,loc,load_data/batch_size,load_data/dtr,load_data/num_neigh1,load_data/num_neigh2,model/mol_block_para m,model/mol_heads_out,model/mol_n_head,model/outer_block,model/outer_hidden_d ims,model/outer_n_head,training/decay,training/lr,training/n_epochs,iter,total time (s),val_f1
ray_tune_sdv_44b6dbab,RUNNING,172.17.88.72:1043506,16,0.5,6,10,2,128,4,1,64,3,9.25228e-05,0.000119418,10,3.0,2757.28,0.854303
ray_tune_sdv_b1763bdc,PENDING,,16,0.45,10,10,2,128,4,1,64,3,0.000154732,6.65609e-05,15,,,
ray_tune_sdv_60a4baa2,TERMINATED,172.17.88.72:243144,16,0.35,4,8,1,32,2,1,128,1,0.00831433,0.000110584,20,4.0,1220.78,0.746451
ray_tune_sdv_eaed376e,TERMINATED,172.17.88.72:243144,256,0.35,10,8,2,128,4,1,64,1,0.000177137,0.000139957,20,4.0,136.468,0.8515
ray_tune_sdv_166023d0,TERMINATED,172.17.88.72:442284,128,0.65,10,4,1,32,3,1,64,2,0.00035002,0.000349099,25,4.0,325.434,0.839782
ray_tune_sdv_1d56158e,TERMINATED,172.17.88.72:584131,256,0.2,4,6,1,32,3,2,256,2,0.00177304,0.00310383,30,4.0,91.2687,0.477986
ray_tune_sdv_da3d87f5,TERMINATED,172.17.88.72:584131,64,0.4,12,8,3,16,4,2,128,1,0.00717747,1.01307e-05,30,4.0,466.793,0.792162
ray_tune_sdv_08409a75,TERMINATED,172.17.88.72:584131,64,0.65,10,6,3,32,4,1,64,3,0.00699544,0.000633977,25,4.0,865.88,0.745407
ray_tune_sdv_27b48cce,TERMINATED,172.17.88.72:676413,256,0.5,4,12,2,16,4,1,256,2,0.000897008,0.00425494,25,4.0,166.942,0.723562
ray_tune_sdv_08fbf58e,TERMINATED,172.17.88.72:676413,16,0.6,6,10,2,16,4,1,64,3,6.32488e-05,0.000220617,10,4.0,2854.4,0.842335


[2m[36m(ray_tune_sdv pid=85167)[0m 2024-08-05 00:47:02.824066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=85167)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=85167)[0m 2024-08-05 00:47:02.931750: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=85167)[0m 2024-08-05 00:47:03.353484: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared obj

[2m[36m(ray_tune_sdv pid=85167)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=85167)[0m Total Number of Parameters: 12081200
[2m[36m(ray_tune_sdv pid=85167)[0m Total Number of Trainable Parameters: 12081200
[2m[36m(ray_tune_sdv pid=85167)[0m Starting training loop at 2024-08-05 00:47:05
[2m[36m(ray_tune_sdv pid=85167)[0m MiniBatches:   0%|          | 0/4965 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=85167)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=85167)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=85167)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=85167)[0m Epoch time: 42.505014419555664
[2m[36m(ray_tune_sdv pid=85167)[0m Train loss: 8.557623839186084
[2m[36m(ray_tune_sdv pid=85167)[0m Train accuracy: 0.6499021896889001
[2m[36m(ray_tune_sdv pid=85167)[0m Train precision: 0.6549474920096536
[2m[36m(ray_tune_sdv pid=85167)[0m Train recall: 0.6336215056477567
[2m[36m(ray_tune_sdv pid=85167)[0m Train f1: 0.644108024889345
[2m[36m(ray_tune_sdv pid=851

2024-08-05 00:58:49,482	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_1d2160e6
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=85167, ip=172.17.88.72, actor_id=9c68c0fda2d2ee05cafa7ebe01000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/p

[2m[36m(ray_tune_sdv pid=85167)[0m Early stopping counter: 2
[2m[36m(ray_tune_sdv pid=85167)[0m Epoch Number: 15
[2m[36m(ray_tune_sdv pid=85167)[0m Epoch time: 48.68236684799194
[2m[36m(ray_tune_sdv pid=85167)[0m Train loss: 0.8214024605770265
[2m[36m(ray_tune_sdv pid=85167)[0m Train accuracy: 0.6783618350476431
[2m[36m(ray_tune_sdv pid=85167)[0m Train precision: 0.6832890214642371
[2m[36m(ray_tune_sdv pid=85167)[0m Train recall: 0.6649208051997223
[2m[36m(ray_tune_sdv pid=85167)[0m Train f1: 0.673979787642318
[2m[36m(ray_tune_sdv pid=85167)[0m Train roc_auc: 0.7461999319237989
[2m[36m(ray_tune_sdv pid=85167)[0m Train pr_auc: 0.7289931525285591
[2m[36m(ray_tune_sdv pid=85167)[0m Train average_precision: 0.7290512643959883
[2m[36m(ray_tune_sdv pid=85167)[0m Val loss: 0.475911690849037
[2m[36m(ray_tune_sdv pid=85167)[0m Val accuracy: 0.7734363168256853
[2m[36m(ray_tune_sdv pid=85167)[0m Val precision: 0.7450128918442123
[2m[36m(ray_tune_sdv pid

[2m[36m(ray_tune_sdv pid=118766)[0m 2024-08-05 00:58:53.025370: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=118766)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=118766)[0m 2024-08-05 00:58:53.157885: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=118766)[0m 2024-08-05 00:58:53.633617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared

[2m[36m(ray_tune_sdv pid=118766)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=118766)[0m Total Number of Parameters: 3463536
[2m[36m(ray_tune_sdv pid=118766)[0m Total Number of Trainable Parameters: 3463536
[2m[36m(ray_tune_sdv pid=118766)[0m Starting training loop at 2024-08-05 00:58:55
[2m[36m(ray_tune_sdv pid=118766)[0m MiniBatches:   0%|          | 0/16525 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=118766)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=118766)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=118766)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=118766)[0m Epoch time: 74.63467860221863
[2m[36m(ray_tune_sdv pid=118766)[0m Train loss: 1.1212072403945588
[2m[36m(ray_tune_sdv pid=118766)[0m Train accuracy: 0.7924984511599091
[2m[36m(ray_tune_sdv pid=118766)[0m Train precision: 0.7929370400372279
[2m[36m(ray_tune_sdv pid=118766)[0m Train recall: 0.7917498451159909
[2m[36m(ray_tune_sdv pid=118766)[0m Train f1: 0.792342997873055
[2m[36m(ray_tun

2024-08-05 01:27:12,628	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_f5b47831
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=118766, ip=172.17.88.72, actor_id=d22e9cdfca45853b6e9a144101000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/

[2m[36m(ray_tune_sdv pid=118766)[0m Early stopping counter: 5
[2m[36m(ray_tune_sdv pid=118766)[0m Early stopping triggered!
[2m[36m(ray_tune_sdv pid=118766)[0m Epoch Number: 23
[2m[36m(ray_tune_sdv pid=118766)[0m Epoch time: 73.0198814868927
[2m[36m(ray_tune_sdv pid=118766)[0m Train loss: 0.2601345425003951
[2m[36m(ray_tune_sdv pid=118766)[0m Train accuracy: 0.8920458456666897
[2m[36m(ray_tune_sdv pid=118766)[0m Train precision: 0.8694375973015049
[2m[36m(ray_tune_sdv pid=118766)[0m Train recall: 0.9226440421284505
[2m[36m(ray_tune_sdv pid=118766)[0m Train f1: 0.8952509768560265
[2m[36m(ray_tune_sdv pid=118766)[0m Train roc_auc: 0.9528794244011581
[2m[36m(ray_tune_sdv pid=118766)[0m Train pr_auc: 0.9354063460260873
[2m[36m(ray_tune_sdv pid=118766)[0m Train average_precision: 0.9354097331591823
[2m[36m(ray_tune_sdv pid=118766)[0m Val loss: 0.34769916347259483
[2m[36m(ray_tune_sdv pid=118766)[0m Val accuracy: 0.8613130395274875
[2m[36m(ray_tune

[2m[36m(ray_tune_sdv pid=199927)[0m 2024-08-05 01:27:16.241734: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=199927)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=199927)[0m 2024-08-05 01:27:16.361608: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=199927)[0m 2024-08-05 01:27:16.810939: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared

[2m[36m(ray_tune_sdv pid=199927)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=199927)[0m Total Number of Parameters: 3069969
[2m[36m(ray_tune_sdv pid=199927)[0m Total Number of Trainable Parameters: 3069969
[2m[36m(ray_tune_sdv pid=199927)[0m Starting training loop at 2024-08-05 01:27:18
[2m[36m(ray_tune_sdv pid=199927)[0m MiniBatches:   0%|          | 0/7440 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=199927)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=199927)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=199927)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=199927)[0m Epoch time: 61.95915412902832
[2m[36m(ray_tune_sdv pid=199927)[0m Train loss: 0.960689291306433
[2m[36m(ray_tune_sdv pid=199927)[0m Train accuracy: 0.7712226519187603
[2m[36m(ray_tune_sdv pid=199927)[0m Train precision: 0.7719212624044249
[2m[36m(ray_tune_sdv pid=199927)[0m Train recall: 0.7699380696108392
[2m[36m(ray_tune_sdv pid=199927)[0m Train f1: 0.770928390582055
[2m[36m(ray_tune_

2024-08-05 01:43:01,214	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_1d96640c
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=199927, ip=172.17.88.72, actor_id=742260a08bab5854f348a2d701000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/

[2m[36m(ray_tune_sdv pid=199927)[0m Early stopping counter: 3
[2m[36m(ray_tune_sdv pid=199927)[0m Epoch Number: 15
[2m[36m(ray_tune_sdv pid=199927)[0m Epoch time: 61.25422191619873
[2m[36m(ray_tune_sdv pid=199927)[0m Train loss: 0.23890056938036092
[2m[36m(ray_tune_sdv pid=199927)[0m Train accuracy: 0.905671092841496
[2m[36m(ray_tune_sdv pid=199927)[0m Train precision: 0.8845986206189268
[2m[36m(ray_tune_sdv pid=199927)[0m Train recall: 0.9330665007977932
[2m[36m(ray_tune_sdv pid=199927)[0m Train f1: 0.9081863648328508
[2m[36m(ray_tune_sdv pid=199927)[0m Train roc_auc: 0.958371954044573
[2m[36m(ray_tune_sdv pid=199927)[0m Train pr_auc: 0.9407552101855567
[2m[36m(ray_tune_sdv pid=199927)[0m Train average_precision: 0.9407740946228113
[2m[36m(ray_tune_sdv pid=199927)[0m Val loss: 0.36304385625365854
[2m[36m(ray_tune_sdv pid=199927)[0m Val accuracy: 0.8580948053914887
[2m[36m(ray_tune_sdv pid=199927)[0m Val precision: 0.8969279838845057
[2m[36m(

[2m[36m(ray_tune_sdv pid=243144)[0m 2024-08-05 01:43:04.333846: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=243144)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=243144)[0m 2024-08-05 01:43:04.456269: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=243144)[0m 2024-08-05 01:43:04.912242: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared

[2m[36m(ray_tune_sdv pid=243144)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=243144)[0m Total Number of Parameters: 533488
[2m[36m(ray_tune_sdv pid=243144)[0m Total Number of Trainable Parameters: 533488
[2m[36m(ray_tune_sdv pid=243144)[0m Starting training loop at 2024-08-05 01:43:07
[2m[36m(ray_tune_sdv pid=243144)[0m MiniBatches:   0%|          | 0/79260 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=243144)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=243144)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=243144)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=243144)[0m Epoch time: 312.40179347991943
[2m[36m(ray_tune_sdv pid=243144)[0m Train loss: 0.5869106317525504
[2m[36m(ray_tune_sdv pid=243144)[0m Train accuracy: 0.7080617681261324
[2m[36m(ray_tune_sdv pid=243144)[0m Train precision: 0.6875594236817085
[2m[36m(ray_tune_sdv pid=243144)[0m Train recall: 0.7627173648484192
[2m[36m(ray_tune_sdv pid=243144)[0m Train f1: 0.7231909328683522
[2m[36m(ray_tun

2024-08-05 03:13:45,673	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_afbfd312
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=243144, ip=172.17.88.72, actor_id=781e4485aeb62f4c3b48bd5701000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/

[2m[36m(ray_tune_sdv pid=243144)[0m Early stopping counter: 5
[2m[36m(ray_tune_sdv pid=243144)[0m Early stopping triggered!
[2m[36m(ray_tune_sdv pid=243144)[0m Epoch Number: 12
[2m[36m(ray_tune_sdv pid=243144)[0m Epoch time: 349.3505003452301
[2m[36m(ray_tune_sdv pid=243144)[0m Train loss: 0.2503468232451606
[2m[36m(ray_tune_sdv pid=243144)[0m Train accuracy: 0.8990534784666351
[2m[36m(ray_tune_sdv pid=243144)[0m Train precision: 0.8773720014321518
[2m[36m(ray_tune_sdv pid=243144)[0m Train recall: 0.9277804070042593
[2m[36m(ray_tune_sdv pid=243144)[0m Train f1: 0.9018723834935825
[2m[36m(ray_tune_sdv pid=243144)[0m Train roc_auc: 0.9555655734036856
[2m[36m(ray_tune_sdv pid=243144)[0m Train pr_auc: 0.9371963910074358
[2m[36m(ray_tune_sdv pid=243144)[0m Train average_precision: 0.9372091419550816
[2m[36m(ray_tune_sdv pid=243144)[0m Val loss: 0.3419129687066012
[2m[36m(ray_tune_sdv pid=243144)[0m Val accuracy: 0.8705891261547781
[2m[36m(ray_tune

[2m[36m(ray_tune_sdv pid=442284)[0m 2024-08-05 03:13:49.211852: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=442284)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=442284)[0m 2024-08-05 03:13:49.366434: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=442284)[0m 2024-08-05 03:13:49.923787: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared

[2m[36m(ray_tune_sdv pid=442284)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=442284)[0m Total Number of Parameters: 755793
[2m[36m(ray_tune_sdv pid=442284)[0m Total Number of Trainable Parameters: 755793
[2m[36m(ray_tune_sdv pid=442284)[0m Starting training loop at 2024-08-05 03:13:52
[2m[36m(ray_tune_sdv pid=442284)[0m MiniBatches:   0%|          | 0/18600 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=442284)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=442284)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=442284)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=442284)[0m Epoch time: 82.69226813316345
[2m[36m(ray_tune_sdv pid=442284)[0m Train loss: 0.44980051536355825
[2m[36m(ray_tune_sdv pid=442284)[0m Train accuracy: 0.8203733636218019
[2m[36m(ray_tune_sdv pid=442284)[0m Train precision: 0.8107837043733755
[2m[36m(ray_tune_sdv pid=442284)[0m Train recall: 0.8358015522840126
[2m[36m(ray_tune_sdv pid=442284)[0m Train f1: 0.8231025705374108
[2m[36m(ray_tun

2024-08-05 04:34:53,681	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_796b45b0
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=442284, ip=172.17.88.72, actor_id=33b7acd3f48c55df1f708d4001000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/

[2m[36m(ray_tune_sdv pid=442284)[0m Early stopping counter: 5
[2m[36m(ray_tune_sdv pid=442284)[0m Early stopping triggered!
[2m[36m(ray_tune_sdv pid=442284)[0m Epoch Number: 9
[2m[36m(ray_tune_sdv pid=442284)[0m Epoch time: 512.4765110015869
[2m[36m(ray_tune_sdv pid=442284)[0m Train loss: 0.35597524380245726
[2m[36m(ray_tune_sdv pid=442284)[0m Train accuracy: 0.8512769326082467
[2m[36m(ray_tune_sdv pid=442284)[0m Train precision: 0.8493410919048434
[2m[36m(ray_tune_sdv pid=442284)[0m Train recall: 0.8540476354374613
[2m[36m(ray_tune_sdv pid=442284)[0m Train f1: 0.8516878614700785
[2m[36m(ray_tune_sdv pid=442284)[0m Train roc_auc: 0.9212951111360024
[2m[36m(ray_tune_sdv pid=442284)[0m Train pr_auc: 0.9121047417147821
[2m[36m(ray_tune_sdv pid=442284)[0m Train average_precision: 0.9121072573194796
[2m[36m(ray_tune_sdv pid=442284)[0m Val loss: 0.3594594492984858
[2m[36m(ray_tune_sdv pid=442284)[0m Val accuracy: 0.8449946993790701
[2m[36m(ray_tune

[2m[36m(ray_tune_sdv pid=584131)[0m 2024-08-05 04:34:56.965528: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=584131)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=584131)[0m 2024-08-05 04:34:57.079407: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=584131)[0m 2024-08-05 04:34:57.548757: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared

[2m[36m(ray_tune_sdv pid=584131)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=584131)[0m Total Number of Parameters: 6814545
[2m[36m(ray_tune_sdv pid=584131)[0m Total Number of Trainable Parameters: 6814545
[2m[36m(ray_tune_sdv pid=584131)[0m Starting training loop at 2024-08-05 04:34:59
[2m[36m(ray_tune_sdv pid=584131)[0m MiniBatches:   0%|          | 0/5610 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=584131)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch time: 22.500514030456543
[2m[36m(ray_tune_sdv pid=584131)[0m Train loss: 8.397460721343396
[2m[36m(ray_tune_sdv pid=584131)[0m Train accuracy: 0.5678419309039281
[2m[36m(ray_tune_sdv pid=584131)[0m Train precision: 0.5697261539958169
[2m[36m(ray_tune_sdv pid=584131)[0m Train recall: 0.5543303360151444
[2m[36m(ray_tune_sdv pid=584131)[0m Train f1: 0.5619228093741755
[2m[36m(ray_tun

[2m[36m(ray_tune_sdv pid=584131)[0m   _warn_prf(average, modifier, msg_start, len(result))


[2m[36m(ray_tune_sdv pid=584131)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch Number: 3
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch time: 22.73238182067871
[2m[36m(ray_tune_sdv pid=584131)[0m Train loss: 1.3000049820865494
[2m[36m(ray_tune_sdv pid=584131)[0m Train accuracy: 0.6544723142451491
[2m[36m(ray_tune_sdv pid=584131)[0m Train precision: 0.6679876479670612
[2m[36m(ray_tune_sdv pid=584131)[0m Train recall: 0.6142451490771415
[2m[36m(ray_tune_sdv pid=584131)[0m Train f1: 0.6399901380670612
[2m[36m(ray_tune_sdv pid=584131)[0m Train roc_auc: 0.6927003267134313
[2m[36m(ray_tune_sdv pid=584131)[0m Train pr_auc: 0.6832907621951525
[2m[36m(ray_tune_sdv pid=584131)[0m Train average_precision: 0.6833077350014592
[2m[36m(ray_tune_sdv pid=584131)[0m Val loss: 5.233905923983693
[2m[36m(ray_tune_sdv pid=584131)[0m Val accuracy: 0.5215810995002271
[2m[36m(ray_tune_sdv pid=584131)[0m Val precision: 0.5111791009649329
[2m[36m(ray_tu

2024-08-05 05:15:27,919	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_2c093bc7
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=584131, ip=172.17.88.72, actor_id=80f157e4f1c3b311db043cfb01000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/

[2m[36m(ray_tune_sdv pid=584131)[0m Early stopping counter: 5
[2m[36m(ray_tune_sdv pid=584131)[0m Early stopping triggered!
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch Number: 9
[2m[36m(ray_tune_sdv pid=584131)[0m Epoch time: 112.60000777244568
[2m[36m(ray_tune_sdv pid=584131)[0m Train loss: 0.26303892350629954
[2m[36m(ray_tune_sdv pid=584131)[0m Train accuracy: 0.8920815642155935
[2m[36m(ray_tune_sdv pid=584131)[0m Train precision: 0.8735250167465347
[2m[36m(ray_tune_sdv pid=584131)[0m Train recall: 0.9169213294750791
[2m[36m(ray_tune_sdv pid=584131)[0m Train f1: 0.8946972595690781
[2m[36m(ray_tune_sdv pid=584131)[0m Train roc_auc: 0.952237958562157
[2m[36m(ray_tune_sdv pid=584131)[0m Train pr_auc: 0.9334805749692235
[2m[36m(ray_tune_sdv pid=584131)[0m Train average_precision: 0.9334851300936433
[2m[36m(ray_tune_sdv pid=584131)[0m Val loss: 0.3619089825823001
[2m[36m(ray_tune_sdv pid=584131)[0m Val accuracy: 0.8617673784643344
[2m[36m(ray_tune

[2m[36m(ray_tune_sdv pid=676413)[0m 2024-08-05 05:15:31.487385: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=676413)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=676413)[0m 2024-08-05 05:15:31.602511: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=676413)[0m 2024-08-05 05:15:32.123082: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared

[2m[36m(ray_tune_sdv pid=676413)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=676413)[0m Total Number of Parameters: 2714546
[2m[36m(ray_tune_sdv pid=676413)[0m Total Number of Trainable Parameters: 2714546
[2m[36m(ray_tune_sdv pid=676413)[0m Starting training loop at 2024-08-05 05:15:34
[2m[36m(ray_tune_sdv pid=676413)[0m MiniBatches:   0%|          | 0/7775 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=676413)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=676413)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=676413)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=676413)[0m Epoch time: 39.813029289245605
[2m[36m(ray_tune_sdv pid=676413)[0m Train loss: 2.454293180922955
[2m[36m(ray_tune_sdv pid=676413)[0m Train accuracy: 0.6846758163748226
[2m[36m(ray_tune_sdv pid=676413)[0m Train precision: 0.6880288341075111
[2m[36m(ray_tune_sdv pid=676413)[0m Train recall: 0.6757595835305253
[2m[36m(ray_tune_sdv pid=676413)[0m Train f1: 0.6818390189861327
[2m[36m(ray_tun

2024-08-05 08:33:55,181	ERROR tune_controller.py:1502 -- Trial task failed for trial ray_tune_sdv_67363dff
Traceback (most recent call last):
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/mayank/miniconda3/envs/dsn/lib/python3.7/site-packages/ray/_private/worker.py", line 2547, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(ValueError): [36mray::ImplicitFunc.train()[39m (pid=676413, ip=172.17.88.72, actor_id=f93903f808c984e92143c9af01000000, repr=ray_tune_sdv)
  File "/home/mayank/miniconda3/envs/dsn/lib/

[2m[36m(ray_tune_sdv pid=676413)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=676413)[0m Epoch Number: 10
[2m[36m(ray_tune_sdv pid=676413)[0m Epoch time: 700.7611758708954
[2m[36m(ray_tune_sdv pid=676413)[0m Train loss: 0.31510290125895996
[2m[36m(ray_tune_sdv pid=676413)[0m Train accuracy: 0.8708187411263606
[2m[36m(ray_tune_sdv pid=676413)[0m Train precision: 0.8605532956616172
[2m[36m(ray_tune_sdv pid=676413)[0m Train recall: 0.8850544249881684
[2m[36m(ray_tune_sdv pid=676413)[0m Train f1: 0.87263191294772
[2m[36m(ray_tune_sdv pid=676413)[0m Train roc_auc: 0.9379635853948995
[2m[36m(ray_tune_sdv pid=676413)[0m Train pr_auc: 0.9246884827724127
[2m[36m(ray_tune_sdv pid=676413)[0m Train average_precision: 0.9246907909466541
[2m[36m(ray_tune_sdv pid=676413)[0m Val loss: 0.3437093556139377
[2m[36m(ray_tune_sdv pid=676413)[0m Val accuracy: 0.8536650007572315
[2m[36m(ray_tune_sdv pid=676413)[0m Val precision: 0.8280075848023035
[2m[36m(ray_t

[2m[36m(ray_tune_sdv pid=1043506)[0m 2024-08-05 08:33:58.304616: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
[2m[36m(ray_tune_sdv pid=1043506)[0m To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[2m[36m(ray_tune_sdv pid=1043506)[0m 2024-08-05 08:33:58.412968: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[2m[36m(ray_tune_sdv pid=1043506)[0m 2024-08-05 08:33:58.877383: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open sh

[2m[36m(ray_tune_sdv pid=1043506)[0m Device: 'cuda'
[2m[36m(ray_tune_sdv pid=1043506)[0m Total Number of Parameters: 7641586
[2m[36m(ray_tune_sdv pid=1043506)[0m Total Number of Trainable Parameters: 7641586
[2m[36m(ray_tune_sdv pid=1043506)[0m Starting training loop at 2024-08-05 08:34:01
[2m[36m(ray_tune_sdv pid=1043506)[0m MiniBatches:   0%|          | 0/49530 [00:00<?, ?it/s]
[2m[36m(ray_tune_sdv pid=1043506)[0m Epoch 1
[2m[36m(ray_tune_sdv pid=1043506)[0m New best model saved!
[2m[36m(ray_tune_sdv pid=1043506)[0m Epoch Number: 1
[2m[36m(ray_tune_sdv pid=1043506)[0m Epoch time: 861.0219705104828
[2m[36m(ray_tune_sdv pid=1043506)[0m Train loss: 0.4382792355245637
[2m[36m(ray_tune_sdv pid=1043506)[0m Train accuracy: 0.8237955513487932
[2m[36m(ray_tune_sdv pid=1043506)[0m Train precision: 0.8197043045924375
[2m[36m(ray_tune_sdv pid=1043506)[0m Train recall: 0.8301940369143398
[2m[36m(ray_tune_sdv pid=1043506)[0m Train f1: 0.8249158249158249
[

2024-08-05 10:30:27,190	ERROR tune.py:1139 -- Trials did not complete: [ray_tune_sdv_1d2160e6, ray_tune_sdv_f5b47831, ray_tune_sdv_1d96640c, ray_tune_sdv_afbfd312, ray_tune_sdv_796b45b0, ray_tune_sdv_2c093bc7, ray_tune_sdv_67363dff]
2024-08-05 10:30:27,192	INFO tune.py:1144 -- Total run time: 35007.40 seconds (34997.14 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59", trainable=...)
- ray_tune_sdv_b1763bdc: FileNotFoundError('Could not fetch metrics for ray_tune_sdv_b1763bdc: both result.json and progress.csv were not found at /root/SDV-HGNN/hyperparam/ray_results/ray_tune_sdv_2024-08-05_00-46-59/ray_tune_sdv_b1763bdc_22_batch_size=16,dtr=0.4500,num_neigh1=10,num_neigh2=10,mol_block_param=2,mol_heads_out=128,mol_n_head=4,out_2024-08-05_09-32-19')


In [3]:
best_params_list = "load_data/batch_size	load_data/dtr	load_data/num_neigh1	load_data/num_neigh2	model/mol_block_para m	model/mol_heads_out	model/mol_n_head	model/outer_block	model/outer_hidden_d ims	model/outer_n_head	training/decay	training/lr	training/n_epochs"
best_params_list = best_params_list.split("\t")
best_params = "32	0.5	12	10	1	16	3	2	128	3	1.01015e-05	0.00143381	25"
best_params = best_params.split("\t")


['load_data/batch_size',
 'load_data/dtr',
 'load_data/num_neigh1',
 'load_data/num_neigh2',
 'model/mol_block_para m',
 'model/mol_heads_out',
 'model/mol_n_head',
 'model/outer_block',
 'model/outer_hidden_d ims',
 'model/outer_n_head',
 'training/decay',
 'training/lr',
 'training/n_epochs']

In [6]:
best_param_dict = dict(zip(best_params_list, best_params))
best_param_dict

{'load_data/batch_size': '32',
 'load_data/dtr': '0.5',
 'load_data/num_neigh1': '12',
 'load_data/num_neigh2': '10',
 'model/mol_block_para m': '1',
 'model/mol_heads_out': '16',
 'model/mol_n_head': '3',
 'model/outer_block': '2',
 'model/outer_hidden_d ims': '128',
 'model/outer_n_head': '3',
 'training/decay': '1.01015e-05',
 'training/lr': '0.00143381',
 'training/n_epochs': '25'}

In [26]:
print("Best config is:", results.get_best_result().config)

192