In [None]:
# Load ChemBERTa model and tokenizer for drug feature extraction
from transformers import AutoTokenizer, AutoModel
import torch

# ChemBERTa tokenizer and encoder are both loaded from the same pretrained identifier
chem_checkpoint = "seyonec/ChemBERTa-zinc-base-v1"
chem_tokenizer = AutoTokenizer.from_pretrained(chem_checkpoint)
chem_model = AutoModel.from_pretrained(chem_checkpoint)

In [None]:
# Load ESM2 model and tokenizer for protein feature extraction
from transformers import AutoTokenizer, AutoModel

# Initialize the tokenizer and model for ESM2 from a single checkpoint location.
# The path is a raw string: in a regular literal "\D" and "\e" are invalid escape
# sequences, which recent Python versions warn about and plan to reject. The resulting
# string value is unchanged.
# NOTE(review): this is a Windows-local path while the dataset paths later in the
# notebook point at /content/drive — confirm which environment this cell targets.
esm_checkpoint = r"D:\Drugllm\esm2_t6_8M_UR50D"
esm_tokenizer = AutoTokenizer.from_pretrained(esm_checkpoint)
esm_model = AutoModel.from_pretrained(esm_checkpoint)

In [None]:
# Report the hidden size declared in the ChemBERTa model config
chem_hidden_size = chem_model.config.hidden_size
print(chem_hidden_size)

In [None]:
# Report the hidden size declared in the ESM2 model config
esm_hidden_size = esm_model.config.hidden_size
print(esm_hidden_size)

In [None]:
# Define functions to extract features from drugs and proteins using pre-trained models
def extract_chem_features(smiles):
    """Extract ChemBERTa features from SMILES strings.

    The input is tokenized with the module-level ``chem_tokenizer``, passed
    through ``chem_model`` with gradient tracking disabled, and the last
    hidden state is averaged over dimension 1.

    Args:
        smiles: SMILES input, handed to ``chem_tokenizer`` unchanged.

    Returns:
        numpy.ndarray: the mean-pooled embedding with singleton dimensions
        squeezed out. If tokenization or the forward pass raises, a warning
        is emitted and a zero vector of length
        ``chem_model.config.hidden_size`` is returned instead.
    """
    import warnings  # only needed on the failure path below

    try:
        # Tokenize the SMILES string
        tokens = chem_tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
        # Generate embeddings using the ChemBERTa model (inference only, no autograd)
        with torch.no_grad():
            hidden_states = chem_model(**tokens).last_hidden_state
            embeddings = hidden_states.mean(dim=1).squeeze().numpy()
        return embeddings
    except Exception as exc:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate and a long-running apply can be stopped by the user.
        warnings.warn(
            f"ChemBERTa feature extraction failed for {repr(smiles)[:80]}: {exc!r}; "
            "returning a zero vector",
            RuntimeWarning,
            stacklevel=2,
        )
        # Size the fallback from the loaded model rather than a hard-coded width,
        # so it stays consistent if a different checkpoint is loaded.
        return np.zeros(chem_model.config.hidden_size)

def extract_esm_features(sequence):
    """Extract ESM2 features from protein sequences.

    The input is tokenized with the module-level ``esm_tokenizer``, passed
    through ``esm_model`` with gradient tracking disabled, and the last
    hidden state is averaged over dimension 1.

    Args:
        sequence: protein sequence input, handed to ``esm_tokenizer`` unchanged.

    Returns:
        numpy.ndarray: the mean-pooled embedding with singleton dimensions
        squeezed out. If tokenization or the forward pass raises, a warning
        is emitted and a zero vector of length
        ``esm_model.config.hidden_size`` is returned instead.
    """
    import warnings  # only needed on the failure path below

    try:
        # Tokenize the protein sequence
        tokens = esm_tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
        # Generate embeddings using the ESM2 model (inference only, no autograd)
        with torch.no_grad():
            hidden_states = esm_model(**tokens).last_hidden_state
            embeddings = hidden_states.mean(dim=1).squeeze().numpy()
        return embeddings
    except Exception as exc:
        # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate and a long-running apply can be stopped by the user.
        warnings.warn(
            f"ESM2 feature extraction failed for {repr(sequence)[:80]}: {exc!r}; "
            "returning a zero vector",
            RuntimeWarning,
            stacklevel=2,
        )
        # Size the fallback from the loaded model rather than a hard-coded width,
        # so it stays consistent if a different checkpoint is loaded.
        return np.zeros(esm_model.config.hidden_size)

In [None]:
# Collapse bind_db to one row per distinct 'Drug' and one row per distinct 'Target',
# so each entity only has to be featurized once.
unique_drugs = bind_db.loc[:, ['Drug']].drop_duplicates()
unique_proteins = bind_db.loc[:, ['Target']].drop_duplicates()

In [None]:
# Import tqdm for progress bars during feature extraction
from tqdm import tqdm

In [None]:
# Compute ChemBERTa features for every distinct BindDB drug.
# tqdm.pandas() registers the progress_apply method used below.
tqdm.pandas()
bind_drug_column = unique_drugs['Drug']
unique_drugs['drug_features'] = bind_drug_column.progress_apply(extract_chem_features)

In [None]:
# Compute ESM2 features for every distinct BindDB target
bind_target_column = unique_proteins['Target']
unique_proteins['protein_features'] = bind_target_column.progress_apply(extract_esm_features)

In [None]:
# Merge extracted features back into the BindDB dataset.
# This cell reassigns bind_db, so first drop any feature columns left behind by a
# previous run; otherwise re-running it would create suffixed duplicates
# (drug_features_x / drug_features_y, protein_features_x / protein_features_y).
bind_db = bind_db.drop(columns=['drug_features', 'protein_features'], errors='ignore')
# validate='many_to_one' makes pandas raise if the lookup frames ever contain a
# repeated key, which would silently duplicate dataset rows.
bind_db = bind_db.merge(unique_drugs, on='Drug', how='left', validate='many_to_one')
bind_db = bind_db.merge(unique_proteins, on='Target', how='left', validate='many_to_one')

In [None]:
# Write the featurized BindDB frame to disk through torch.save
bind_featurized_path = '/content/drive/MyDrive/DrugPLM-Cindy-2025/Code_and_Data/Data/BindDB/BindDB_featurized.pt'
torch.save(bind_db, bind_featurized_path)

In [None]:
# Collapse davis_db to one row per distinct 'Drug' and one row per distinct 'Target',
# so each entity only has to be featurized once.
unique_drugs = davis_db.loc[:, ['Drug']].drop_duplicates()
unique_proteins = davis_db.loc[:, ['Target']].drop_duplicates()

In [None]:
# Compute ChemBERTa features for every distinct Davis drug.
# tqdm.pandas() registers the progress_apply method used below.
tqdm.pandas()
davis_drug_column = unique_drugs['Drug']
unique_drugs['drug_features'] = davis_drug_column.progress_apply(extract_chem_features)

In [None]:
# Compute ESM2 features for every distinct Davis target
davis_target_column = unique_proteins['Target']
unique_proteins['protein_features'] = davis_target_column.progress_apply(extract_esm_features)

In [None]:
# Merge extracted features back into the Davis dataset.
# This cell reassigns davis_db, so first drop any feature columns left behind by a
# previous run; otherwise re-running it would create suffixed duplicates
# (drug_features_x / drug_features_y, protein_features_x / protein_features_y).
davis_db = davis_db.drop(columns=['drug_features', 'protein_features'], errors='ignore')
# validate='many_to_one' makes pandas raise if the lookup frames ever contain a
# repeated key, which would silently duplicate dataset rows.
davis_db = davis_db.merge(unique_drugs, on='Drug', how='left', validate='many_to_one')
davis_db = davis_db.merge(unique_proteins, on='Target', how='left', validate='many_to_one')

In [None]:
# Write the featurized Davis frame to disk through torch.save
davis_featurized_path = '/content/drive/MyDrive/DrugPLM-Cindy-2025/Code_and_Data/Data/Davis/Davis_featurized.pt'
torch.save(davis_db, davis_featurized_path)

In [None]:
# Collapse kiba_db to one row per distinct 'Drug' and one row per distinct 'Target',
# so each entity only has to be featurized once.
unique_drugs = kiba_db.loc[:, ['Drug']].drop_duplicates()
unique_proteins = kiba_db.loc[:, ['Target']].drop_duplicates()

In [None]:
# Compute ChemBERTa features for every distinct Kiba drug.
# tqdm.pandas() registers the progress_apply method used below.
tqdm.pandas()
kiba_drug_column = unique_drugs['Drug']
unique_drugs['drug_features'] = kiba_drug_column.progress_apply(extract_chem_features)

In [None]:
# Compute ESM2 features for every distinct Kiba target
kiba_target_column = unique_proteins['Target']
unique_proteins['protein_features'] = kiba_target_column.progress_apply(extract_esm_features)

In [None]:
# Merge extracted features back into the Kiba dataset.
# This cell reassigns kiba_db, so first drop any feature columns left behind by a
# previous run; otherwise re-running it would create suffixed duplicates
# (drug_features_x / drug_features_y, protein_features_x / protein_features_y).
kiba_db = kiba_db.drop(columns=['drug_features', 'protein_features'], errors='ignore')
# validate='many_to_one' makes pandas raise if the lookup frames ever contain a
# repeated key, which would silently duplicate dataset rows.
kiba_db = kiba_db.merge(unique_drugs, on='Drug', how='left', validate='many_to_one')
kiba_db = kiba_db.merge(unique_proteins, on='Target', how='left', validate='many_to_one')

In [None]:
# Write the featurized Kiba frame to disk through torch.save
kiba_featurized_path = '/content/drive/MyDrive/DrugPLM-Cindy-2025/Code_and_Data/Data/Kiba/Kiba_featurized.pt'
torch.save(kiba_db, kiba_featurized_path)

In [None]:
# Read the featurized BindDB file back in for further analysis.
# weights_only=False lets torch.load unpickle arbitrary Python objects, so only point
# this at files you trust.
bind_featurized_file = '/content/drive/MyDrive/DrugPLM-Cindy-2025/Code_and_Data/Data/BindDB/BindDB_featurized.pt'
bind_db = torch.load(bind_featurized_file, weights_only=False)
# Show the first few rows as the cell output
bind_db.head()