<a href="https://colab.research.google.com/github/manthanjshah/Drug-Reprocessing/blob/main/HINGRIL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the drug table from the Colab working directory.
# The preview printed below shows two columns: drugName and SMILES.
df = pd.read_csv('/content/drugs_with_smiles.csv')

# Print the first five rows as a quick sanity check of column names and values.
print(df.head())


      drugName                                             SMILES
0     clobazam       CN1C(=O)CC(=O)N(C2=C1C=CC(=C2)Cl)C3=CC=CC=C3
1  vinorelbine  CCC1=CC2CC(C3=C(CN(C2)C1)C4=CC=CC=C4N3)(C5=C(C...
2   carvedilol  COC1=CC=CC=C1OCCNCC(COC2=CC=CC3=C2C4=CC=CC=C4N3)O
3   benazepril  CCOC(=O)C(CCC1=CC=CC=C1)NC2CCC3=CC=CC=C3N(C2=O...
4  leflunomide            CC1=C(C=NO1)C(=O)NC2=CC=C(C=C2)C(F)(F)F


Final Code

In [None]:
import pandas as pd
import numpy as np
import math
import random
import scipy.sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt

# Partition function
def partition(ls, size):
    """Cut ``ls`` into consecutive slices of at most ``size`` items.

    The last slice is shorter when ``len(ls)`` is not a multiple of ``size``.
    Returns a list of slices (same type as ``ls[a:b]``).
    """
    chunks = []
    for start in range(0, len(ls), size):
        stop = start + size
        chunks.append(ls[start:stop])
    return chunks

# Generate Negative Samples
def NegativeGenerate(DrugDisease, AllDrug, AllDisease):
    """Randomly draw as many unobserved (drug, disease) pairs as there are known pairs.

    Args:
        DrugDisease: known associations, one ``[drug, disease]`` row per pair.
            Row items must be hashable (ints or strings).
        AllDrug: sequence of candidate drug ids.
        AllDisease: sequence of candidate disease ids.

    Returns:
        A list of ``[drug, disease]`` lists, ``len(DrugDisease)`` long, containing
        no repeats and no pair that equals a row of ``DrugDisease``.

    Raises:
        ValueError: if fewer distinct unobserved pairs exist than are requested.
            Rejection sampling could never finish in that case.
    """
    # Sets make each membership test O(1); scanning the lists made the whole
    # sampling quadratic in the number of pairs.
    known = {tuple(pair) for pair in DrugDisease}
    wanted = len(DrugDisease)

    # Count how many candidate pairs are actually free before sampling.
    drug_pool, disease_pool = set(AllDrug), set(AllDisease)
    blocked = sum(
        1 for pair in known
        if len(pair) == 2 and pair[0] in drug_pool and pair[1] in disease_pool
    )
    available = len(drug_pool) * len(disease_pool) - blocked
    if wanted > available:
        raise ValueError(
            f'Cannot draw {wanted} negative pairs: only {available} unobserved pairs exist.'
        )

    NegativeSample = []
    drawn = set()
    while len(NegativeSample) < wanted:
        # Same two draws per attempt as before, so a seeded run yields the same pairs.
        counterR = random.randint(0, len(AllDrug) - 1)
        counterD = random.randint(0, len(AllDisease) - 1)
        candidate = (AllDrug[counterR], AllDisease[counterD])
        if candidate in known or candidate in drawn:
            continue
        drawn.add(candidate)
        NegativeSample.append(list(candidate))
    return NegativeSample

# Main function
def main(options):
    """Run k-fold cross-validation of a random forest on drug-disease pairs.

    Reads the association, node-attribute and embedding files under ``/content``,
    writes the fold assignment, per-fold positive splits, the shuffled edge list
    and the sampled negatives back to ``/content``, then prints the ROC AUC of
    every fold and of the mean ROC curve.

    Args:
        options: object with integer attributes ``dataset`` (1 prints
            'B-Dataset', anything else 'F-Dataset'; it only affects the printed
            label), ``fold_num`` (number of folds) and ``tree_number``
            (``n_estimators`` of the forest).

    Raises:
        ValueError: if the positives cannot be cut into exactly ``fold_num`` folds.
    """
    dataset = 'B-Dataset' if options.dataset == 1 else 'F-Dataset'
    print('fold', options.fold_num)
    print(dataset)

    # Load data
    DrDiNum18416 = pd.read_csv('/content/DrDiNum.csv', header=None)
    DrPrNum3243 = pd.read_csv('/content/DrPrNum.csv', header=None)
    DiPrNum71840 = pd.read_csv('/content/DiPrNum.csv', header=None)

    # Shuffle and partition the data
    RandomList = random.sample(range(0, len(DrDiNum18416)), len(DrDiNum18416))
    print('len(RandomList)', len(RandomList))
    folds = partition(RandomList, math.ceil(len(RandomList) / options.fold_num))
    if len(folds) != options.fold_num:
        raise ValueError(
            f'{len(RandomList)} rows cannot be split into {options.fold_num} non-empty folds'
        )
    print('len(NewRandomList[0])', len(folds[0]))
    # The last fold can be shorter than the others. It is padded with -1 in the
    # saved table only; padding with 0 used to inject row 0 into that fold and
    # leak it between train and test. The in-memory folds stay ragged.
    pd.DataFrame(folds).fillna(-1).astype(int).to_csv(
        '/content/NewRandomList.csv', header=None, index=False)

    # Cross-validation split: fold i is held out, every other fold is training data.
    train_rows = [
        [row for k, fold in enumerate(folds) if k != i for row in fold]
        for i in range(options.fold_num)
    ]
    for i in range(options.fold_num):
        DrDiNum18416.iloc[train_rows[i]].to_csv(
            f'/content/DrDiIs_train{i}.csv', header=None, index=False)
        DrDiNum18416.iloc[folds[i]].to_csv(
            f'/content/DrDiIs_test{i}.csv', header=None, index=False)
        print(i)

    # Combine data
    DTIs_train = pd.concat([DrDiNum18416, DrPrNum3243, DiPrNum71840]).sample(frac=1.0)
    DTIs_train.to_csv('/content/AllDrDiIs_train.txt', sep='\t', header=None, index=False)

    # Generate negative samples
    Dr = pd.read_csv('/content/drugName.csv', header=0, names=['id', 'name'])
    Pr = pd.read_csv('/content/diseaseName.csv', header=0, names=['id', 'name'])
    NegativeSample = NegativeGenerate(DrDiNum18416.values.tolist(), Dr['id'].values.tolist(), Pr['id'].values.tolist())
    NegativeSample = pd.DataFrame(NegativeSample)
    NegativeSample.to_csv('/content/NegativeSample.csv', header=None, index=False)

    # Load additional data
    Negative = pd.read_csv('/content/NegativeSample.csv', header=None)
    Attribute = pd.read_csv('/content/AllNodeAttribute.csv', header=None, index_col=0).iloc[:, 1:]
    Embedding = pd.read_csv('/content/AllEmbedding_DeepWalk.txt', sep=' ', header=None, skiprows=1)
    Embedding = Embedding.sort_values(0, ascending=True).set_index([0])
    print(Embedding)

    # Prepare training and testing data for each fold. The same row numbers
    # select positives and negatives, because there is one negative per positive.
    data_train, data_test, labels_train, labels_test = [], [], [], []
    for i in range(options.fold_num):
        train_pairs = _labelled_pairs(DrDiNum18416.iloc[train_rows[i]], Negative.iloc[train_rows[i]])
        data_train.append(_pair_features(train_pairs, Attribute, Embedding).values)
        labels_train.append(train_pairs[2].values)
        print(len(train_pairs))

        test_pairs = _labelled_pairs(DrDiNum18416.iloc[folds[i]], Negative.iloc[folds[i]])
        data_test.append(_pair_features(test_pairs, Attribute, Embedding).values)
        labels_test.append(test_pairs[2].values)
        print(len(test_pairs))
        print(i)

    # Cross-validation and ROC AUC computation
    print(f"{options.fold_num}-CV")
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 1000)

    for i in range(options.fold_num):
        best_RandomF = RandomForestClassifier(n_estimators=options.tree_number)
        best_RandomF.fit(data_train[i], labels_train[i])

        # Column 1 of predict_proba is the probability of the positive class.
        y_score_RandomF = best_RandomF.predict_proba(data_test[i])

        fpr, tpr, thresholds = roc_curve(labels_test[i], y_score_RandomF[:, 1])
        # Resample every fold's curve on a common FPR grid so they can be averaged.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        print('ROC fold %d (AUC=%0.4f)' % (i, roc_auc))

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Mean ROC (AUC=%0.4f)' % mean_auc)


def _labelled_pairs(positives, negatives):
    """Stack positive and negative pair tables and put the 1/0 label in column 2."""
    positives = positives.copy()
    positives[2] = 1
    negatives = negatives.copy()
    negatives[2] = 0
    return pd.concat([positives, negatives])


def _pair_features(pairs, Attribute, Embedding):
    """Return one row per pair: attributes + embedding of column 0, then of column 1."""
    sides = []
    for column in (0, 1):
        node_ids = pairs[column].values.tolist()
        sides.append(
            pd.concat([Attribute.loc[node_ids], Embedding.loc[node_ids]], axis=1).reset_index(drop=True)
        )
    return pd.concat(sides, axis=1)

# Entry point: build an options object and run the pipeline.
if __name__ == '__main__':
    import optparse
    import sys

    # Inside a notebook kernel sys.argv holds the kernel's own arguments, so the
    # command-line parser is bypassed and fixed options are used instead.
    if 'ipykernel' in sys.modules or any('jupyter' in arg for arg in sys.argv):
        class Options:
            # Minimal stand-in exposing the same three attributes the parser produces.
            def __init__(self, dataset, fold_num, tree_number):
                self.dataset = dataset
                self.fold_num = fold_num
                self.tree_number = tree_number

        options = Options(dataset=1, fold_num=5, tree_number=100)  # Example options, customize as needed
    else:
        # Script mode: -d/--dataset, -f/--fold_num, -t/--tree_number (all ints,
        # defaults matching the notebook options above).
        optparser = optparse.OptionParser()
        optparser.add_option('-d', '--dataset', dest='dataset', type='int', default=1)
        optparser.add_option('-f', '--fold_num', dest='fold_num', type='int', default=5)
        optparser.add_option('-t', '--tree_number', dest='tree_number', type='int', default=100)
        (options, args) = optparser.parse_args()

    main(options)


fold 5
B-Dataset
len(RandomList) 18416
len(NewRandomList[0]) 3684
0
1
2
3
4
            1         2         3         4         5         6         7   \
0                                                                            
0     0.036320  0.114171 -0.111191 -0.306078 -0.150277 -0.242947  0.091169   
1     0.305414  0.095406  0.015667 -0.047628 -0.128059 -0.174679  0.038152   
2    -0.256181  0.181465  0.079849 -0.457883 -0.287999 -0.031660  0.076192   
3     0.001911  0.204960  0.024538 -0.404577 -0.621718 -0.363406  0.318680   
4     0.641858  0.440286  0.384397  0.007248 -0.172761 -0.396514 -0.045816   
...        ...       ...       ...       ...       ...       ...       ...   
3391  0.001263 -0.053739 -0.086223 -0.316386  0.019711 -0.248028  0.339029   
3392 -0.100863 -0.370944  0.241698 -0.030355 -0.287542 -0.235261  0.051112   
3393  0.371044 -0.184225  0.645274  0.192262 -0.186283 -0.598438  0.013114   
3394  0.492821 -0.352814  0.275192  0.157270 -0.334156 -0.314840  

Code for the Graph Transformer

In [None]:
!pip install torch torchvision torchaudio
!pip install torch-geometric
!pip install rdkit-pypi

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, Batch
from torch_geometric.nn import TransformerConv, global_mean_pool
from torch_geometric.loader import DataLoader
import numpy as np
import random


# Load the drug table; the file must already be in the Colab working directory
# (the recorded run of this cell failed with FileNotFoundError on this path).
file_path = '/content/drugs_with_smiles.csv'
drug_data = pd.read_csv(file_path)
# Keep only the two columns used below and drop drugs without a SMILES string.
drug_data = drug_data[['drugName', 'SMILES']].dropna(subset=['SMILES'])
# Force SMILES to str so every remaining value can be handed to the SMILES parser.
drug_data['SMILES'] = drug_data['SMILES'].astype(str)


def atom_features(atom):
    """Describe one atom as a float32 vector of four values.

    Order: atomic number, degree, formal charge, aromatic flag (stored as 1.0/0.0).
    ``atom`` must provide the four ``Get*`` accessors used below.
    """
    descriptor = (
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        atom.GetIsAromatic(),
    )
    return np.asarray(descriptor, dtype=np.float32)

def smiles_to_graph(smiles):
    """Turn a SMILES string into a graph: one node per atom, two directed edges per bond.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        A ``Data`` object with ``x`` of shape (num_atoms, 4) and ``edge_index`` of
        shape (2, 2 * num_bonds), or ``None`` when the string cannot be parsed or
        the parsed molecule has no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Stack the per-atom vectors into one array first: building a tensor from a
    # list of ndarrays is slow and triggers a PyTorch warning.
    atom_matrix = np.stack([atom_features(atom) for atom in mol.GetAtoms()])
    x = torch.tensor(atom_matrix, dtype=torch.float)

    edges = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        # Store both directions so message passing treats the bond as undirected.
        edges.append([i, j])
        edges.append([j, i])

    if edges:
        edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds (single atom or ion) still needs a (2, 0)
        # edge_index; transposing an empty 1-D tensor would leave it 1-D.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=x, edge_index=edge_index)


def extract_features(smiles):
    """Compute six whole-molecule descriptors for a SMILES string.

    Returns a list in this order: molecular weight, logP, number of rotatable
    bonds, TPSA, heavy-atom count, atom count. Returns ``None`` when ``smiles``
    is not a string or cannot be parsed.
    """
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    return [
        Descriptors.MolWt(mol),              # molecular_weight
        Descriptors.MolLogP(mol),            # logP
        Descriptors.NumRotatableBonds(mol),  # num_rotatable_bonds
        Descriptors.TPSA(mol),               # tpsa
        Descriptors.HeavyAtomCount(mol),     # num_heavy_atoms
        mol.GetNumAtoms(),                   # num_atoms
    ]


# Build three parallel lists: position k in each describes the same drug.
drug_graphs = []
drug_names = []
drug_features = []
for name, smiles in zip(drug_data['drugName'], drug_data['SMILES']):
    graph = smiles_to_graph(smiles)
    features = extract_features(smiles)
    # Both helpers signal failure with None. Test for that explicitly instead of
    # relying on the truthiness of a graph object.
    if graph is None or features is None:
        continue
    drug_graphs.append(graph)
    drug_features.append(features)
    drug_names.append(name)


class GraphTransformer(torch.nn.Module):
    """Graph encoder: two TransformerConv layers, per-graph mean pooling, two linear layers.

    Args:
        input_dim: number of features per node.
        hidden_dim: per-head width of the conv layers and width of the first linear layer.
        output_dim: size of the embedding returned for each graph.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        n_heads = 8
        # The second conv and the first linear layer both take hidden_dim * 8 inputs.
        wide_dim = hidden_dim * n_heads
        self.conv1 = TransformerConv(input_dim, hidden_dim, heads=n_heads, dropout=0.1)
        self.conv2 = TransformerConv(wide_dim, hidden_dim, heads=n_heads, dropout=0.1)
        self.fc1 = torch.nn.Linear(wide_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        """Return one ``output_dim`` vector per graph in ``data``.

        ``data`` must expose ``x``, ``edge_index`` and ``batch``.
        """
        node_state = F.relu(self.conv1(data.x, data.edge_index))
        node_state = F.relu(self.conv2(node_state, data.edge_index))
        # Average node states within each graph of the batch.
        graph_state = global_mean_pool(node_state, data.batch)
        return self.fc2(F.relu(self.fc1(graph_state)))


# Mini-batches of 32 molecule graphs, reshuffled every epoch.
batch_size = 32
train_loader = DataLoader(drug_graphs, batch_size=batch_size, shuffle=True)


# Input width is read from the first graph's node-feature matrix.
model = GraphTransformer(input_dim=drug_graphs[0].x.shape[1], hidden_dim=128, output_dim=64)
# Module-level optimizer; train_model_with_contrastive_loss uses it by name.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def contrastive_loss(embedding1, embedding2, label, margin=1.0):
    """Mean contrastive loss over a batch of embedding pairs.

    Pairs with ``label`` 1 are penalised by their squared Euclidean distance.
    Pairs with ``label`` 0 are penalised by the squared shortfall of their
    distance below ``margin`` (zero once they are at least ``margin`` apart).
    """
    dist = F.pairwise_distance(embedding1, embedding2, p=2)
    pull_together = label * torch.pow(dist, 2)
    shortfall = torch.clamp(margin - dist, min=0.0)
    push_apart = (1 - label) * torch.pow(shortfall, 2)
    return torch.mean(pull_together + push_apart)

def train_model_with_contrastive_loss(model, loader, epochs=10, margin=1.0):
    """Train ``model`` with a contrastive loss over all pairs inside each batch.

    Uses the module-level ``optimizer`` for the parameter updates and prints the
    mean batch loss after every epoch.

    NOTE(review): each pair's label is a coin flip (``random.random() > 0.5``),
    so the similar/dissimilar target carries no information about the molecules.
    Confirm this is intended before relying on the embeddings.

    Args:
        model: callable mapping a batch to a (num_graphs, dim) embedding tensor.
        loader: iterable of batches.
        epochs: number of passes over ``loader``.
        margin: margin passed to ``contrastive_loss``.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        batches_used = 0
        for batch in loader:
            optimizer.zero_grad()

            batch_embeddings = model(batch)
            num_samples = batch_embeddings.shape[0]
            if num_samples < 2:
                # A lone graph (e.g. the last, short batch) has no pairs; stacking
                # an empty pair list used to raise here.
                continue

            # Row/column indices of every i < j pair, in the same order a nested
            # loop over i then j would visit them.
            first, second = torch.triu_indices(num_samples, num_samples, offset=1)
            labels = [1 if random.random() > 0.5 else 0 for _ in range(first.numel())]
            labels_tensor = torch.tensor(labels, dtype=torch.float)

            loss = contrastive_loss(
                batch_embeddings[first], batch_embeddings[second], labels_tensor, margin=margin
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            batches_used += 1

        # Average over the batches that contributed a loss (avoids dividing by zero).
        print(f'Epoch {epoch + 1}, Loss: {total_loss / max(batches_used, 1)}')


# Train the graph encoder for 10 epochs on the drug graphs (updates `model` in place).
train_model_with_contrastive_loss(model, train_loader, epochs=10, margin=1.0)


def similarity(A, B):
    """Tanimoto-style similarity of two 1-D tensors: A.B / (|A|^2 + |B|^2 - A.B).

    Returns the plain integer 0 when the denominator is zero, otherwise a
    zero-dimensional tensor.
    """
    overlap = torch.dot(A, B)
    combined = A.norm()**2 + B.norm()**2 - overlap
    if combined != 0:
        return overlap / combined
    return 0

def compute_similarity_matrix(graphs, model, features):
    """Compute the pairwise drug similarity matrix from embeddings plus descriptors.

    Every graph is embedded with ``model``. The embedding is concatenated with
    that drug's descriptor row from ``features``, and every pair of combined
    vectors is scored with A.B / (|A|^2 + |B|^2 - A.B).

    Args:
        graphs: list of graph objects accepted by ``Batch.from_data_list``.
        model: trained encoder returning a (1, dim) tensor for a one-graph batch.
        features: one descriptor list per graph, in the same order as ``graphs``.

    Returns:
        A (num_drugs, num_drugs) float ndarray. Entries whose denominator is
        zero are 0.
    """
    model.eval()
    embeddings = []
    with torch.no_grad():
        for graph in graphs:
            embeddings.append(model(Batch.from_data_list([graph])))

    if not embeddings:
        return np.zeros((0, 0))

    # cat along dim 0 keeps the (num_drugs, dim) shape even for a single drug;
    # stack().squeeze() collapsed that case to 1-D and broke the concatenate.
    drug_embeddings = torch.cat(embeddings, dim=0).numpy()
    features_array = np.array(features)
    combined_features = np.concatenate([drug_embeddings, features_array], axis=1)

    # All pairwise dot products at once; the diagonal holds the squared norms.
    gram = combined_features @ combined_features.T
    squared_norms = np.diag(gram)
    denominator = squared_norms[:, None] + squared_norms[None, :] - gram

    similarity_matrix = np.zeros(gram.shape)
    np.divide(gram, denominator, out=similarity_matrix, where=denominator != 0)
    return similarity_matrix

# Score every drug against every other drug using the trained encoder.
similarity_matrix = compute_similarity_matrix(drug_graphs, model, drug_features)


# Label rows and columns with drug names and save next to the notebook
# (relative path, unlike the /content inputs).
similarity_df = pd.DataFrame(similarity_matrix, index=drug_names, columns=drug_names)
similarity_df.to_csv('drug_similarity_matrix.csv', index=True)


print(similarity_df)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drugs_with_smiles.csv'

Final prediction

In [15]:
import pandas as pd
import numpy as np
import math
import random
import scipy.sparse as sp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import StratifiedKFold, cross_val_score
import matplotlib.pyplot as plt

# Partition function
def partition(ls, size):
    """Cut ``ls`` into consecutive slices of at most ``size`` items.

    The last slice is shorter when ``len(ls)`` is not a multiple of ``size``.
    Returns a list of slices (same type as ``ls[a:b]``).
    """
    chunks = []
    for start in range(0, len(ls), size):
        stop = start + size
        chunks.append(ls[start:stop])
    return chunks

# Generate Negative Samples
def NegativeGenerate(DrugDisease, AllDrug, AllDisease):
    """Randomly draw as many unobserved (drug, disease) pairs as there are known pairs.

    Args:
        DrugDisease: known associations, one ``[drug, disease]`` row per pair.
            Row items must be hashable (ints or strings).
        AllDrug: sequence of candidate drug ids.
        AllDisease: sequence of candidate disease ids.

    Returns:
        A list of ``[drug, disease]`` lists, ``len(DrugDisease)`` long, containing
        no repeats and no pair that equals a row of ``DrugDisease``.

    Raises:
        ValueError: if fewer distinct unobserved pairs exist than are requested.
            Rejection sampling could never finish in that case.
    """
    # Sets make each membership test O(1); scanning the lists made the whole
    # sampling quadratic in the number of pairs.
    known = {tuple(pair) for pair in DrugDisease}
    wanted = len(DrugDisease)

    # Count how many candidate pairs are actually free before sampling.
    drug_pool, disease_pool = set(AllDrug), set(AllDisease)
    blocked = sum(
        1 for pair in known
        if len(pair) == 2 and pair[0] in drug_pool and pair[1] in disease_pool
    )
    available = len(drug_pool) * len(disease_pool) - blocked
    if wanted > available:
        raise ValueError(
            f'Cannot draw {wanted} negative pairs: only {available} unobserved pairs exist.'
        )

    NegativeSample = []
    drawn = set()
    while len(NegativeSample) < wanted:
        # Same two draws per attempt as before, so a seeded run yields the same pairs.
        counterR = random.randint(0, len(AllDrug) - 1)
        counterD = random.randint(0, len(AllDisease) - 1)
        candidate = (AllDrug[counterR], AllDisease[counterD])
        if candidate in known or candidate in drawn:
            continue
        drawn.add(candidate)
        NegativeSample.append(list(candidate))
    return NegativeSample

def make_prediction(model, drug_name, Attribute, Embedding, drug_name_to_id,
                    disease_ids=None, top_k=10):
    """Rank candidate nodes by predicted probability of interacting with one drug.

    Each feature row is the drug's attributes + embedding followed by the
    candidate's attributes + embedding. It is passed to ``model`` as a plain array.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        drug_name: name of the drug to score.
        Attribute: DataFrame of node attributes indexed by node id.
        Embedding: DataFrame of node embeddings indexed by node id.
        drug_name_to_id: dict mapping drug names to node ids.
        disease_ids: optional iterable of candidate ids to score. When omitted,
            every id in ``Attribute`` that also has an embedding is scored.
            NOTE(review): that default covers all node ids, not only diseases.
            Pass the disease id list to restrict the ranking.
        top_k: number of rows to return (default 10).

    Returns:
        DataFrame with columns ``disease_id`` and ``interaction_probability``,
        sorted by descending probability and truncated to ``top_k`` rows.

    Raises:
        ValueError: if the drug name is unknown, the drug id has no attributes or
            embedding, or an explicitly passed candidate id has none.
    """
    # Ensure the drug name is valid
    if drug_name not in drug_name_to_id:
        raise ValueError(f"Drug name {drug_name} not found in the drug name mapping.")

    drug_id = drug_name_to_id[drug_name]
    if drug_id not in Attribute.index or drug_id not in Embedding.index:
        raise ValueError(f"Drug ID {drug_id} not found in Attribute or Embedding datasets.")

    if disease_ids is None:
        # Keep Attribute's order, but skip ids that have no embedding row;
        # looking those up used to raise KeyError.
        candidates = Attribute.index[Attribute.index.isin(Embedding.index)].tolist()
    else:
        candidates = list(disease_ids)
        unknown = [c for c in candidates if c not in Attribute.index or c not in Embedding.index]
        if unknown:
            raise ValueError(
                f"{len(unknown)} disease id(s) not found in Attribute or Embedding datasets, "
                f"e.g. {unknown[:5]}."
            )

    if not candidates:
        return pd.DataFrame({'disease_id': [], 'interaction_probability': []})

    # One drug row, repeated once per candidate.
    drug_vector = np.hstack([
        Attribute.loc[[drug_id]].values[:1],
        Embedding.loc[[drug_id]].values[:1],
    ])
    drug_block = np.repeat(drug_vector, len(candidates), axis=0)
    candidate_block = np.hstack([
        Attribute.loc[candidates].values,
        Embedding.loc[candidates].values,
    ])
    features = np.hstack([drug_block, candidate_block])

    # Column 1 of predict_proba is the probability of the positive class.
    probabilities = model.predict_proba(features)[:, 1]

    predictions_df = pd.DataFrame({
        'disease_id': candidates,
        'interaction_probability': probabilities
    })
    return predictions_df.sort_values(by='interaction_probability', ascending=False).head(top_k)


# Main function
def main(options):
    """Build the cross-validation folds, train on the first fold, and rank targets for one drug.

    Reads the association, node-attribute and embedding files under ``/content``,
    writes the fold assignment, per-fold positive splits, the shuffled edge list
    and the sampled negatives back to ``/content``, prints the ROC AUC of the
    trained fold, and prints the ten highest-scoring predictions for a
    hard-coded drug name.

    Args:
        options: object with integer attributes ``dataset`` (1 prints
            'B-Dataset', anything else 'F-Dataset'; it only affects the printed
            label), ``fold_num`` (number of folds) and ``tree_number``
            (``n_estimators`` of the forest).

    Raises:
        ValueError: if the positives cannot be cut into exactly ``fold_num`` folds.
    """
    dataset = 'B-Dataset' if options.dataset == 1 else 'F-Dataset'
    print('fold', options.fold_num)
    print(dataset)

    # Load data
    DrDiNum18416 = pd.read_csv('/content/DrDiNum.csv', header=None)
    DrPrNum3243 = pd.read_csv('/content/DrPrNum.csv', header=None)
    DiPrNum71840 = pd.read_csv('/content/DiPrNum.csv', header=None)

    # Shuffle and partition the data
    RandomList = random.sample(range(0, len(DrDiNum18416)), len(DrDiNum18416))
    print('len(RandomList)', len(RandomList))
    folds = partition(RandomList, math.ceil(len(RandomList) / options.fold_num))
    if len(folds) != options.fold_num:
        raise ValueError(
            f'{len(RandomList)} rows cannot be split into {options.fold_num} non-empty folds'
        )
    print('len(NewRandomList[0])', len(folds[0]))
    # The last fold can be shorter than the others. It is padded with -1 in the
    # saved table only; padding with 0 used to inject row 0 into that fold and
    # leak it between train and test. The in-memory folds stay ragged.
    pd.DataFrame(folds).fillna(-1).astype(int).to_csv(
        '/content/NewRandomList.csv', header=None, index=False)

    # Cross-validation split: fold i is held out, every other fold is training data.
    train_rows = [
        [row for k, fold in enumerate(folds) if k != i for row in fold]
        for i in range(options.fold_num)
    ]
    for i in range(options.fold_num):
        DrDiNum18416.iloc[train_rows[i]].to_csv(
            f'/content/DrDiIs_train{i}.csv', header=None, index=False)
        DrDiNum18416.iloc[folds[i]].to_csv(
            f'/content/DrDiIs_test{i}.csv', header=None, index=False)
        print(i)

    # Combine data
    DTIs_train = pd.concat([DrDiNum18416, DrPrNum3243, DiPrNum71840]).sample(frac=1.0)
    DTIs_train.to_csv('/content/AllDrDiIs_train.txt', sep='\t', header=None, index=False)

    # Generate negative samples
    Dr = pd.read_csv('/content/drugName.csv', header=0, names=['id', 'name'])
    Pr = pd.read_csv('/content/diseaseName.csv', header=0, names=['id', 'name'])
    NegativeSample = NegativeGenerate(DrDiNum18416.values.tolist(), Dr['id'].values.tolist(), Pr['id'].values.tolist())
    NegativeSample = pd.DataFrame(NegativeSample)
    NegativeSample.to_csv('/content/NegativeSample.csv', header=None, index=False)
    drug_name_to_id = dict(zip(Dr['name'], Dr['id']))
    print(drug_name_to_id)

    # Load additional data
    Negative = pd.read_csv('/content/NegativeSample.csv', header=None)
    Attribute = pd.read_csv('/content/AllNodeAttribute.csv', header=None, index_col=0).iloc[:, 1:]
    Embedding = pd.read_csv('/content/AllEmbedding_DeepWalk.txt', sep=' ', header=None, skiprows=1)
    Embedding = Embedding.sort_values(0, ascending=True).set_index([0])
    print(Embedding)

    # Prepare training and testing data for each fold. The same row numbers
    # select positives and negatives, because there is one negative per positive.
    # NOTE(review): features are built for every fold although only fold 0 is
    # trained below.
    data_train, data_test, labels_train, labels_test = [], [], [], []
    for i in range(options.fold_num):
        train_pairs = _labelled_pairs(DrDiNum18416.iloc[train_rows[i]], Negative.iloc[train_rows[i]])
        data_train.append(_pair_features(train_pairs, Attribute, Embedding).values)
        labels_train.append(train_pairs[2].values)
        print(len(train_pairs))

        test_pairs = _labelled_pairs(DrDiNum18416.iloc[folds[i]], Negative.iloc[folds[i]])
        data_test.append(_pair_features(test_pairs, Attribute, Embedding).values)
        labels_test.append(test_pairs[2].values)
        print(len(test_pairs))
        print(i)

    # Cross-validation and ROC AUC computation
    print(f"{options.fold_num}-CV")
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 1000)

    # Only the first fold is trained and evaluated; its model is reused for the
    # prediction below, and the "mean" ROC is therefore that single fold's curve.
    for i in range(1):
        best_RandomF = RandomForestClassifier(n_estimators=options.tree_number)
        best_RandomF.fit(data_train[i], labels_train[i])

        # Column 1 of predict_proba is the probability of the positive class.
        y_score_RandomF = best_RandomF.predict_proba(data_test[i])

        fpr, tpr, thresholds = roc_curve(labels_test[i], y_score_RandomF[:, 1])
        # Resample the curve on a common FPR grid so curves can be averaged.
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        print('ROC fold %d (AUC=%0.4f)' % (i, roc_auc))

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Mean ROC (AUC=%0.4f)' % mean_auc)

    print(Dr['id'].values.tolist())

    drug_name = 'benazepril'  # Must be a drug *name* present in drugName.csv

    # NOTE(review): the model was fitted on fold 0's training pairs only.
    predictions = make_prediction(best_RandomF, drug_name, Attribute, Embedding, drug_name_to_id)
    print("\nTop 10 predicted interactions for drug:", drug_name)
    print(predictions)


def _labelled_pairs(positives, negatives):
    """Stack positive and negative pair tables and put the 1/0 label in column 2."""
    positives = positives.copy()
    positives[2] = 1
    negatives = negatives.copy()
    negatives[2] = 0
    return pd.concat([positives, negatives])


def _pair_features(pairs, Attribute, Embedding):
    """Return one row per pair: attributes + embedding of column 0, then of column 1."""
    sides = []
    for column in (0, 1):
        node_ids = pairs[column].values.tolist()
        sides.append(
            pd.concat([Attribute.loc[node_ids], Embedding.loc[node_ids]], axis=1).reset_index(drop=True)
        )
    return pd.concat(sides, axis=1)


# Entry point: build an options object and run the pipeline.
if __name__ == '__main__':
    import optparse
    import sys

    # Inside a notebook kernel sys.argv holds the kernel's own arguments, so the
    # command-line parser is bypassed and fixed options are used instead.
    if 'ipykernel' in sys.modules or any('jupyter' in arg for arg in sys.argv):
        class Options:
            # Minimal stand-in exposing the same three attributes the parser produces.
            def __init__(self, dataset, fold_num, tree_number):
                self.dataset = dataset
                self.fold_num = fold_num
                self.tree_number = tree_number

        options = Options(dataset=1, fold_num=5, tree_number=100)  # Example options, customize as needed
    else:
        # Script mode: -d/--dataset, -f/--fold_num, -t/--tree_number (all ints,
        # defaults matching the notebook options above).
        optparser = optparse.OptionParser()
        optparser.add_option('-d', '--dataset', dest='dataset', type='int', default=1)
        optparser.add_option('-f', '--fold_num', dest='fold_num', type='int', default=5)
        optparser.add_option('-t', '--tree_number', dest='tree_number', type='int', default=100)
        (options, args) = optparser.parse_args()

    main(options)


fold 5
B-Dataset
len(RandomList) 18416
len(NewRandomList[0]) 3684
0
1
2
3
4
{'vinorelbine': 1, 'carvedilol': 2, 'benazepril': 3, 'leflunomide': 4, 'lamotrigine': 5, 'nefazodone': 6, 'trandolapril': 7, 'clopidogrel': 8, 'gemcitabine': 9, 'troglitazone': 10, 'pioglitazone': 11, 'zafirlukast': 12, 'mycophenolate mofetil': 13, 'fluvastatin': 14, 'meloxicam': 15, 'docetaxel': 16, 'olanzapine': 17, 'candesartan': 18, 'telmisartan': 19, 'bosentan': 20, 'cerivastatin': 21, 'rosiglitazone': 22, 'fexofenadine': 23, 'montelukast': 24, 'alitretinoin': 25, 'rofecoxib': 26, 'desloratadine': 27, 'gefitinib': 28, 'sorafenib': 29, 'sunitinib': 30, 'celecoxib': 31, 'sildenafil citrate': 32, 'rosuvastatin calcium': 33, 'valsartan': 34, 'imatinib mesylate': 35, 'sitagliptin phosphate': 36, 'atorvastatin calcium': 37, 'erlotinib hydrochloride': 38, 'quetiapine fumarate': 39, 'ezetimibe': 40, 'dasatinib': 41, 'venlafaxine hydrochloride': 42, 'acebutolol': 43, 'acetaminophen': 44, 'acetazolamide': 45, 'acety