In [1]:
# %pip install numpy pandas xgboost scikit-learn lightgbm torch

In [2]:
import numpy as np
import pandas as pd

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import DataLoader, TensorDataset

from utils.utils import extract_and_merge_datasets, extract_embeddings

## Prepare

In [3]:
# MATLAB files that get merged into one dataset, in merge order.
dataset_files = [
    'data/Cdataset/Cdataset.mat',
    'data/Fdataset/Fdataset.mat',
    'data/Ydataset/Ydataset.mat',
    'data/iDrug/iDrug.mat',
]

In [4]:
merged_data = extract_and_merge_datasets(dataset_files)

# One summary line per merged entry: NumPy arrays report their shape,
# anything else reports how many IDs it holds.
for entry_name, entry in merged_data.items():
    summary = entry.shape if isinstance(entry, np.ndarray) else f"{len(entry)} IDs"
    print(f"{entry_name}: {summary}")

drug_AtcS: (2026, 2026)
drug_ChemS: (2026, 2026)
drug_DDIS: (2026, 2026)
drug_GepS: (2026, 2026)
drug_SideS: (2026, 2026)
drug_TargetS: (2026, 2026)
disease_DoS: (4220, 4220)
disease_PhS: (4220, 4220)
Wdname: (4220,)
Wrname: (2026,)


In [5]:
# Persist both ID vectors as bare single-column CSVs (no index column, no header row).
id_exports = {
    'Wdname': 'data/other/Wdname.csv',
    'Wrname': 'data/other/Wrname.csv',
}
for id_key, csv_path in id_exports.items():
    pd.DataFrame(merged_data[id_key]).to_csv(csv_path, index=None, header=None)

In [6]:
# Display the 'drug_GepS' matrix as the cell output (NumPy abbreviates large arrays with "...").
merged_data['drug_GepS']

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], shape=(2026, 2026))

## VAE (ignore)

In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
class ComplexVAE(nn.Module):
    """Fully connected variational autoencoder with a configurable hidden stack.

    The encoder is a sequence of ``Linear -> ReLU`` layers followed by two
    linear heads that produce the latent mean and log-variance. The decoder
    mirrors the encoder widths in reverse and ends with a linear layer plus a
    sigmoid, so reconstructions lie in [0, 1].

    Args:
        input_dim: Number of features per input sample.
        latent_dim: Size of the latent vector.
        hidden_dims: Widths of the encoder's hidden layers, in order; the
            decoder uses the same widths reversed. Any iterable of ints is
            accepted, including an empty one (the heads then connect directly
            to the input / latent vector).
    """

    def __init__(self, input_dim, latent_dim, hidden_dims=(256, 128, 64)):
        super().__init__()

        # Immutable default + local snapshot: the original used a shared list
        # as the default value, and a caller-owned list could change under us.
        hidden_dims = tuple(hidden_dims)

        encoder_layers = []
        in_dim = input_dim
        for h_dim in hidden_dims:
            encoder_layers.append(nn.Linear(in_dim, h_dim))
            encoder_layers.append(nn.ReLU())
            in_dim = h_dim
        self.encoder = nn.Sequential(*encoder_layers)

        # in_dim is now the encoder's output width: hidden_dims[-1], or
        # input_dim when there are no hidden layers (indexing hidden_dims
        # directly would raise IndexError in that case).
        self.fc_mean = nn.Linear(in_dim, latent_dim)
        self.fc_log_var = nn.Linear(in_dim, latent_dim)

        decoder_layers = []
        in_dim = latent_dim
        for h_dim in reversed(hidden_dims):
            decoder_layers.append(nn.Linear(in_dim, h_dim))
            decoder_layers.append(nn.ReLU())
            in_dim = h_dim
        self.decoder = nn.Sequential(*decoder_layers)

        # in_dim is the decoder's output width: hidden_dims[0], or latent_dim
        # when there are no hidden layers.
        self.final_layer = nn.Linear(in_dim, input_dim)

    def encode(self, x):
        """Return ``(z_mean, z_log_var)`` for a batch of inputs."""
        h = self.encoder(x)
        z_mean = self.fc_mean(h)
        z_log_var = self.fc_log_var(h)
        return z_mean, z_log_var

    def reparameterize(self, z_mean, z_log_var):
        """Sample ``z = mean + eps * std`` with ``eps ~ N(0, I)`` (reparameterisation trick)."""
        std = torch.exp(0.5 * z_log_var)
        eps = torch.randn_like(std)
        return z_mean + eps * std

    def decode(self, z):
        """Map latent vectors back to input space; outputs pass through a sigmoid."""
        h = self.decoder(z)
        return torch.sigmoid(self.final_layer(h))

    def forward(self, x):
        """Return ``(reconstruction, z_mean, z_log_var)`` for a batch of inputs."""
        z_mean, z_log_var = self.encode(x)
        z = self.reparameterize(z_mean, z_log_var)
        return self.decode(z), z_mean, z_log_var

In [None]:
class VAE(nn.Module):
    """Minimal VAE: one 128-unit hidden layer in the encoder and one in the decoder."""

    def __init__(self, input_dim, latent_dim):
        super().__init__()

        # Encoder trunk plus the two heads (mean / log-variance).
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2_mean = nn.Linear(128, latent_dim)
        self.fc2_log_var = nn.Linear(128, latent_dim)

        # Decoder.
        self.fc3 = nn.Linear(latent_dim, 128)
        self.fc4 = nn.Linear(128, input_dim)

    def encode(self, x):
        """Return ``(z_mean, z_log_var)`` for a batch of inputs."""
        hidden = F.relu(self.fc1(x))
        return self.fc2_mean(hidden), self.fc2_log_var(hidden)

    def reparameterize(self, z_mean, z_log_var):
        """Draw a latent sample as ``mean + noise * std`` with standard-normal noise."""
        sigma = (0.5 * z_log_var).exp()
        noise = torch.randn_like(sigma)
        return z_mean + noise * sigma

    def decode(self, z):
        """Reconstruct inputs from latent vectors; outputs pass through a sigmoid."""
        return torch.sigmoid(self.fc4(F.relu(self.fc3(z))))

    def forward(self, x):
        """Return ``(reconstruction, z_mean, z_log_var)``."""
        mu, log_var = self.encode(x)
        sample = self.reparameterize(mu, log_var)
        return self.decode(sample), mu, log_var

In [9]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a ``(batch, seq_len, d_model)`` tensor.

    Args:
        d_model: Feature width of the tensors passed to ``forward``.
        dropout: Dropout probability applied after the positions are added.
        max_len: Largest sequence length for which positions are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim div_term; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        # Buffer: moves with .to(device) and is saved in state_dict, but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :seq_len])``; the sequence axis is dim 1."""
        return self.dropout(x + self.pe[:, :x.size(1), :])

class TransformerVAE(nn.Module):
    """VAE whose encoder and decoder are Transformer stacks over a length-1 sequence.

    Each sample is embedded to ``d_model`` features and given a singleton
    sequence axis at dim 1, so every tensor handed to the transformer layers is
    ``(batch, 1, d_model)``. The layers are therefore built with
    ``batch_first=True``; with the PyTorch default (``batch_first=False``) the
    batch axis would be treated as the sequence axis and samples in a batch
    would attend to each other.

    Args:
        input_dim: Number of features per input sample.
        latent_dim: Size of the latent vector.
        nhead: Attention heads per layer (must divide ``d_model``).
        num_encoder_layers: Number of Transformer encoder layers.
        num_decoder_layers: Number of Transformer decoder layers.
        d_model: Internal feature width (previously hard-coded to 128).
    """

    def __init__(self, input_dim, latent_dim, nhead=8, num_encoder_layers=6, num_decoder_layers=6,
                 d_model=128):
        super().__init__()

        self.encoder_embedding = nn.Linear(input_dim, d_model)
        self.decoder_embedding = nn.Linear(latent_dim, d_model)

        self.positional_encoding = PositionalEncoding(d_model=d_model)

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_encoder_layers
        )

        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True),
            num_layers=num_decoder_layers
        )

        self.fc2_mean = nn.Linear(d_model, latent_dim)
        self.fc2_log_var = nn.Linear(d_model, latent_dim)
        self.fc_out = nn.Linear(d_model, input_dim)

    def encode(self, x):
        """Return ``(z_mean, z_log_var)`` for inputs of shape ``(batch, input_dim)``."""
        x = self.encoder_embedding(x).unsqueeze(1)  # (batch, 1, d_model)
        x = self.positional_encoding(x)  # Add positional encoding
        encoded = self.transformer_encoder(x)
        h = encoded.mean(dim=1)  # Pool over the (length-1) sequence axis
        z_mean = self.fc2_mean(h)
        z_log_var = self.fc2_log_var(h)
        return z_mean, z_log_var

    def reparameterize(self, z_mean, z_log_var):
        """Sample ``z = mean + eps * std`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5 * z_log_var)
        eps = torch.randn_like(std)
        return z_mean + eps * std

    def decode(self, z):
        """Map latent vectors ``(batch, latent_dim)`` to reconstructions in [0, 1]."""
        z = self.decoder_embedding(z).unsqueeze(1)  # (batch, 1, d_model)
        z = self.positional_encoding(z)  # Add positional encoding
        decoded = self.transformer_decoder(z, z)  # Same tensor serves as target and memory
        return torch.sigmoid(self.fc_out(decoded.mean(dim=1)))

    def forward(self, x):
        """Return ``(reconstruction, z_mean, z_log_var)``."""
        z_mean, z_log_var = self.encode(x)
        z = self.reparameterize(z_mean, z_log_var)
        return self.decode(z), z_mean, z_log_var

In [None]:
def vae_loss(recon_x, x, z_mean, z_log_var):
    """Summed (not averaged) VAE objective: squared reconstruction error plus KL term.

    The KL term is ``-0.5 * sum(1 + log_var - mean^2 - exp(log_var))`` taken
    over every element of the latent tensors.
    """
    reconstruction = F.mse_loss(recon_x, x, reduction='sum')
    kl_terms = 1 + z_log_var - z_mean.pow(2) - z_log_var.exp()
    kl = -0.5 * kl_terms.sum()
    return reconstruction + kl

In [None]:
# Input widths equal the side lengths of the square similarity matrices
# printed above (2026 x 2026 for drugs, 4220 x 4220 for diseases).
input_dim_drug, input_dim_disease = 2026, 4220
# Both VAEs share the same latent size.
latent_dim_drug = latent_dim_disease = 64
# Optimisation settings shared by both training runs.
learning_rate = 1e-4
batch_size = 1024
epochs = 1000

In [None]:
# Keys of the similarity matrices that feed each VAE, in the order they are combined.
drug_similarity_keys = ('drug_AtcS', 'drug_ChemS', 'drug_DDIS',
                        'drug_GepS', 'drug_SideS', 'drug_TargetS')
disease_similarity_keys = ('disease_DoS', 'disease_PhS')

drug_matrices = [merged_data[key] for key in drug_similarity_keys]
disease_matrices = [merged_data[key] for key in disease_similarity_keys]

In [13]:
# Element-wise average over the list of matrices: one combined matrix per entity type.
drug_combined = np.asarray(drug_matrices).mean(axis=0)
disease_combined = np.asarray(disease_matrices).mean(axis=0)
# Alternative kept for reference: stack the matrices instead of averaging them.
# drug_combined = np.stack(drug_matrices, axis=0)
# disease_combined = np.stack(disease_matrices, axis=0)

In [None]:
# Show the shape of the averaged drug matrix as the cell output.
drug_combined.shape

In [None]:
# Materialise both combined matrices as float32 tensors on the selected device.
drug_data = torch.tensor(drug_combined, dtype=torch.float32, device=device)
disease_data = torch.tensor(disease_combined, dtype=torch.float32, device=device)

# Each matrix row is one sample; batches are reshuffled on every pass.
loader_options = dict(batch_size=batch_size, shuffle=True)
drug_loader = DataLoader(TensorDataset(drug_data), **loader_options)
disease_loader = DataLoader(TensorDataset(disease_data), **loader_options)

In [None]:
# Sanity check: print the tensor shape of one batch from the drug loader.
first_batch = next(iter(drug_loader), None)
if first_batch is not None:
    print(first_batch[0].shape)

In [None]:
# Architecture used for both models. VAE and TransformerVAE take the same
# (input_dim, latent_dim) keyword arguments and can be swapped in here.
vae_cls = ComplexVAE

drug_vae = vae_cls(input_dim=input_dim_drug, latent_dim=latent_dim_drug).to(device)
disease_vae = vae_cls(input_dim=input_dim_disease, latent_dim=latent_dim_disease).to(device)

# One AdamW optimiser per model, sharing the same learning rate.
optimizer_drug = torch.optim.AdamW(drug_vae.parameters(), lr=learning_rate)
optimizer_disease = torch.optim.AdamW(disease_vae.parameters(), lr=learning_rate)

In [None]:
def train_vae(model, data_loader, optimizer, epochs):
    """Train `model` for `epochs` passes over `data_loader`.

    After every epoch, prints the accumulated loss divided by the number of
    samples in the loader's dataset. Uses the notebook-level `device` and
    `vae_loss`. Returns nothing; the model is updated in place.
    """
    model.train()
    for epoch_idx in range(epochs):
        running_loss = 0
        for batch in data_loader:
            # Each batch is a sequence whose first item is the input tensor.
            inputs = batch[0].to(device)
            optimizer.zero_grad()
            reconstruction, mu, log_var = model(inputs)
            batch_loss = vae_loss(reconstruction, inputs, mu, log_var)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()
        mean_loss = running_loss / len(data_loader.dataset)
        print(f'Epoch {epoch_idx + 1}, Loss: {mean_loss}')

In [None]:
# Fit the drug VAE; train_vae prints one loss line per epoch.
print("Training Drug VAE...")
train_vae(drug_vae, drug_loader, optimizer_drug, epochs)

In [None]:
# Fit the disease VAE; train_vae prints one loss line per epoch.
print("Training Disease VAE...")
train_vae(disease_vae, disease_loader, optimizer_disease, epochs)

In [None]:
def get_embeddings(model, data_loader):
    """Encode every sample of ``data_loader.dataset`` and return the latent means.

    Rows of the result are in *dataset order* (row ``i`` belongs to dataset
    item ``i``), regardless of how ``data_loader`` itself iterates. The loaders
    in this notebook are built with ``shuffle=True``; iterating them directly
    would return the embeddings in a random order that no longer lines up with
    the drug / disease indices.

    Args:
        model: Module exposing ``encode(x) -> (z_mean, z_log_var)``.
        data_loader: ``DataLoader`` whose batches carry the input tensor as
            their first item. Only its dataset, batch size and collate
            function are reused.

    Returns:
        ``np.ndarray`` of shape ``(len(dataset), latent_dim)``.

    Raises:
        ValueError: From ``np.vstack`` if the dataset is empty.
    """
    # Local import: the notebook's top-level torch imports are commented out.
    from torch.utils.data import DataLoader

    # Sequential pass over the same dataset so output rows match dataset indices.
    auto_batched = data_loader.batch_size is not None
    ordered_loader = DataLoader(
        data_loader.dataset,
        batch_size=data_loader.batch_size if auto_batched else 1,
        shuffle=False,
        collate_fn=data_loader.collate_fn if auto_batched else None,
    )

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in ordered_loader:
            x = batch[0]
            if target_device is not None:
                x = x.to(target_device)
            z_mean, _ = model.encode(x)
            embeddings.append(z_mean.cpu().numpy())
    return np.vstack(embeddings)

In [None]:
# Encode both datasets with their trained VAEs, then report the resulting shapes.
drug_embeddings = get_embeddings(drug_vae, drug_loader)
disease_embeddings = get_embeddings(disease_vae, disease_loader)

shape_report = (
    ("Drug embeddings shape:", drug_embeddings),
    ("Disease embeddings shape:", disease_embeddings),
)
for label, embedding_matrix in shape_report:
    print(label, embedding_matrix.shape)

In [23]:
# Write each embedding matrix to CSV without an index column (the header row is kept).
embedding_exports = (
    (drug_embeddings, 'data/Embedding/drug_embeddings.csv'),
    (disease_embeddings, 'data/Embedding/disease_embeddings.csv'),
)
for embedding_matrix, csv_path in embedding_exports:
    pd.DataFrame(embedding_matrix).to_csv(csv_path, index=None)

## Create Small Embedding File

In [7]:
# llm
# Leading columns are dropped by position: the first 2 of the drug table and
# the first 3 of the disease table. The remaining columns become the matrix.
drug_embeddings_llm = pd.read_csv('data/Embedding/db2llm_emb.csv').iloc[:, 2:].values

disease_embeddings_llm = pd.read_csv('data/Embedding/omim2llm_emb.csv').iloc[:, 3:].values

In [8]:
# kg
# Both arrays come from the same file with the same column slice (first 2
# columns dropped), so parse the CSV once instead of twice.
drug_embeddings_kg = pd.read_csv('data/Embedding/kg_embeddings.csv').iloc[:, 2:].values

# Independent copy so the two names never alias the same buffer, as before.
disease_embeddings_kg = drug_embeddings_kg.copy()
# NOTE(review): drug and disease KG arrays are identical here. The recorded
# shape (6246 rows = 2026 + 4220) suggests one table holds both entity types;
# confirm that extract_embeddings(..., type='kg') picks the right rows for each.

In [9]:
# Report (drug, disease) array shapes for each embedding family, one line per family.
llm_shapes = f'LLM Embedding Shape: {drug_embeddings_llm.shape}, {disease_embeddings_llm.shape}'
kg_shapes = f'KG Embedding Shape: {drug_embeddings_kg.shape}, {disease_embeddings_kg.shape}'
print(llm_shapes, kg_shapes, sep='\n')

LLM Embedding Shape: (2026, 1024), (4220, 1024)
KG Embedding Shape: (6246, 128), (6246, 128)


In [None]:
# Extract the 'Fdataset' subset from the VAE embeddings. drug_embeddings and
# disease_embeddings are only defined by the get_embeddings cell in the VAE
# section, so that section must have been run first.
# NOTE(review): presumably writes CSVs like the iDrug calls below — confirm in utils.
extract_embeddings('Fdataset', merged_data, drug_embeddings, disease_embeddings)

In [None]:
# Extract the 'Cdataset' subset from the VAE embeddings. drug_embeddings and
# disease_embeddings are only defined by the get_embeddings cell in the VAE
# section, so that section must have been run first.
extract_embeddings('Cdataset', merged_data, drug_embeddings, disease_embeddings)

In [None]:
# Extract the 'Ydataset' subset from the VAE embeddings. drug_embeddings and
# disease_embeddings are only defined by the get_embeddings cell in the VAE
# section, so that section must have been run first.
extract_embeddings('Ydataset', merged_data, drug_embeddings, disease_embeddings)

In [10]:
# Extract the iDrug subset once per embedding family; `type` selects the family tag.
idrug_sources = {
    'llm': (drug_embeddings_llm, disease_embeddings_llm),
    'kg': (drug_embeddings_kg, disease_embeddings_kg),
}
for embedding_type, (drug_matrix, disease_matrix) in idrug_sources.items():
    extract_embeddings('iDrug', merged_data, drug_matrix, disease_matrix, type=embedding_type)

Extracted drug embeddings shape: (1321, 1024)
Extracted disease embeddings shape: (3966, 1024)
Files have been saved at: data/iDrug/iDrug_drug_embedding_llm.csv and data/iDrug/iDrug_disease_embedding_llm.csv
Extracted drug embeddings shape: (1321, 128)
Extracted disease embeddings shape: (3966, 128)
Files have been saved at: data/iDrug/iDrug_drug_embedding_kg.csv and data/iDrug/iDrug_disease_embedding_kg.csv
