<div>
    <h1 align="center"><font color="blue"> DELIVERABLE 3 </font></h1>
</div>

<div>
    <h4 align="left"><font color="green"> Downloading Libraries </font></h4>
</div>

In [1]:
pip install rdkit-pypi torch_geometric faiss-cpu sacremoses langchain langchain-community langchain-openai --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.2/65.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━

In [2]:
# rdkit-pypi: Helps me work with chemical structures and SMILES strings for molecules.
# torch_geometric: Allows me to build graph neural networks (GNNs) for processing molecular data.
# faiss-cpu: Used for fast similarity searches with embeddings, like finding similar compounds.
# sacremoses: Likely needed for text processing, possibly for the language model part.
# bitsandbytes: Helps with memory-efficient model training, especially for large language models.
                                             
print("---------- ALL LIBRARIES HAVE BEEN DOWNLOADED ----------")

---------- ALL LIBRARIES HAVE BEEN DOWNLOADED ----------


<div>
    <h4 align="left"><font color="green"> Importing Libraries </font></h4>
</div>

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.data import Data, Batch
from torch_geometric.loader import DataLoader
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

import faiss

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, PretrainedConfig
from rdkit.Chem import Descriptors
from tqdm import tqdm
import gc
import os
import ast
import re
from torch.cuda.amp import GradScaler, autocast
from torch.amp import GradScaler, autocast
from sklearn.model_selection import train_test_split
from rdkit import RDLogger

from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

print("---------- ALL LIBRARIES HAVE BEEN IMPORTED ----------")

# torch, torch.nn, and torch.nn.functional: For building and training neural networks, like my GNN model.
# torch_geometric modules (GINConv, global_add_pool, Data, Batch, DataLoader): Help me create and process graph-based data for molecules.
# rdkit modules (Chem, AllChem, DataStructs, Descriptors): lets me work with chemical structures, generate fingerprints, and calculate properties like logP.
# faiss: For efficient similarity searches using embeddings.
# transformers modules (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig): For loading and using large language models (LLMs) like BioMistral.
# tqdm: Adds progress bars to loops, so I can see how long processes take.
# gc: Helps manage memory by cleaning up unused objects.
# ast and re: For parsing strings and extracting information from text, like LLM outputs.
# torch.cuda.amp (GradScaler, autocast): Optimizes training on GPUs to save memory and speed up computations.
# bitsandbytes: Reduces memory usage for LLMs.

---------- ALL LIBRARIES HAVE BEEN IMPORTED ----------


<div>
    <h2 align="center"><font color="purple"> Deliverable 1 Code </font></h2>
</div>

In [4]:
# Set device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


<div>
    <h3 align="left"><font color="red"> STEP 01: Data Loading and Preprocessing </font></h3>
</div>

In [5]:
df = pd.read_csv('/kaggle/input/smiles/SMILES_Big_Data_Set.csv')
print("Dataset columns:", df.columns.tolist())

# Standardizing SMILES strings to ensure consistency and track invalid ones.
invalid_smiles_count = 0
def standardize_smiles(smiles):
    global invalid_smiles_count
    try:
        mol = Chem.MolFromSmiles(smiles)  # Convert SMILES to RDKit molecule object.
        if mol is None:
            invalid_smiles_count += 1 
            return None
        return Chem.MolToSmiles(mol, isomericSmiles=True)  # Convert back to standardized SMILES.
    except:
        invalid_smiles_count += 1  # Increment counter if conversion fails.
        return None

df['standard_smiles'] = df['SMILES'].apply(standardize_smiles) 
df = df.dropna(subset=['standard_smiles']).drop_duplicates(subset=['standard_smiles'])
print(f"Removed {invalid_smiles_count} invalid SMILES strings.")


df['pIC50'] = pd.to_numeric(df['pIC50'], errors='coerce') 
df['num_atoms'] = pd.to_numeric(df['num_atoms'], errors='coerce')  
df['logP'] = pd.to_numeric(df['logP'], errors='coerce') 
df = df.dropna() 

# Creating a column of RDKit molecule objects for later use, like generating fingerprints.
df['mol'] = df['standard_smiles'].apply(Chem.MolFromSmiles)

Dataset columns: ['SMILES', 'pIC50', 'mol', 'num_atoms', 'logP']
Removed 0 invalid SMILES strings.


<div>
    <h3 align="left"><font color="red"> STEP 02: Generating Fingerprints (Morgan Fingerprints) </font></h3>
</div>

In [6]:
# Creating Morgan fingerprints to represent molecular structures numerically for GNN input.
def generate_morgan_fingerprint(mol, radius=2, n_bits=2048):
    if mol is None:
        return None
    try:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)  # Generate 2048-bit Morgan fingerprint with radius 2.
        arr = np.zeros((n_bits,), dtype=np.float32)
        DataStructs.ConvertToNumpyArray(fp, arr)  # Convert fingerprint to NumPy array of 0s and 1s.
        return arr
    except:
        return None

df['morgan_fp'] = df['mol'].apply(generate_morgan_fingerprint)  
df = df[df['morgan_fp'].notnull()]  # Remove rows where fingerprint generation failed.
fp_matrix = np.stack(df['morgan_fp'].values)  # Stack all fingerprints into a single NumPy array for GNN training.
print(f"Fingerprint matrix shape: {fp_matrix.shape}")

Fingerprint matrix shape: (14823, 2048)


<div>
    <h3 align="left"><font color="red"> STEP 03: GNN for Fingerprint Embedding (GIN) </font></h3>
</div>

In [7]:
# Defining a Graph Neural Network (GNN) to create compact embeddings from Morgan fingerprints.
class FingerprintGNN(nn.Module):
    def __init__(self, input_dim=2048, hidden_dim=512, output_dim=256):
        super().__init__()
        self.fp_to_node = nn.Linear(input_dim, hidden_dim)  # Reduce 2048-bit fingerprint to 512 dimensions.
        self.conv1 = GINConv(nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),  # First linear layer for graph convolution.
            nn.ReLU(),  # Activation
            nn.Linear(hidden_dim, hidden_dim)  # Second linear layer for feature transformation.
        ))
        self.conv2 = GINConv(nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),  # Second graph convolution layer.
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        ))
        self.lin = nn.Linear(hidden_dim, output_dim)  # Final layer to output 256-dimensional embedding.

    def forward(self, x, edge_index, batch):
        x = self.fp_to_node(x)  # Transform input fingerprint to hidden dimension.
        x = self.conv1(x, edge_index).relu() 
        x = self.conv2(x, edge_index) 
        pooled = global_add_pool(x, batch)  # Aggregate node features into a single embedding per graph.
        return self.lin(pooled)  

data_list = []
for fp in df['morgan_fp']:
    node_feat = torch.FloatTensor(fp).unsqueeze(0)  # Convert fingerprint to tensor and add batch dimension.
    edge_index = torch.tensor([[0], [0]], dtype=torch.long) 
    data = Data(x=node_feat, edge_index=edge_index) 
    data_list.append(data)

batch_size = 128  # Set batch size for efficient training.
loader = DataLoader(data_list, batch_size=batch_size, shuffle=False)  # Create DataLoader for batching graphs.

# Training the GNN model using an autoencoder-like loss.
gin_model = FingerprintGNN().to(device)  
optimizer = torch.optim.Adam(gin_model.parameters(), lr=0.001)  # Set up Adam optimizer.
target_projection = nn.Linear(2048, 256).to(device)  # Linear layer to project fingerprints to 256 dimensions for loss calculation.

# Ensure=ing target_projection parameters are optimized along with GNN.
combined_params = list(gin_model.parameters()) + list(target_projection.parameters())
optimizer = torch.optim.Adam(combined_params, lr=0.001) 

epochs = 10

print("\nTraining GIN model...")
for epoch in range(epochs):
    gin_model.train()  
    target_projection.train() 
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        out = gin_model(batch.x, batch.edge_index, batch.batch)  # Get GNN embeddings.
        target = target_projection(batch.x) 
        loss = F.mse_loss(out, target)  # Calculate MSE loss between GNN and projected embeddings.
        loss.backward()
        optimizer.step()  # Update model weights.
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader)}")

# Generating embeddings for all fingerprints using the trained GNN.
print("\nGenerating GNN embeddings...")
gin_model.eval()  
target_projection.eval() 
embeddings = []
with torch.no_grad():  # Disable gradient tracking to save memory.
    for batch in loader:
        batch = batch.to(device) 
        emb = gin_model(batch.x, batch.edge_index, batch.batch)  # Generate embeddings.
        embeddings.append(emb.cpu().numpy()) 
embedding_matrix = np.vstack(embeddings) 
print(f"Embedding matrix shape: {embedding_matrix.shape}")


Training GIN model...
Epoch 1, Loss: 0.001420613777209555
Epoch 2, Loss: 0.00047226974957397785
Epoch 3, Loss: 0.0003054972710680008
Epoch 4, Loss: 0.0003364992929885469
Epoch 5, Loss: 0.0005024994230740864
Epoch 6, Loss: 0.0010846421520972367
Epoch 7, Loss: 0.003103570847527188
Epoch 8, Loss: 0.001280652909591826
Epoch 9, Loss: 0.0014062831012716774
Epoch 10, Loss: 0.0006510658833952941

Generating GNN embeddings...
Embedding matrix shape: (14823, 256)


<div>
    <h4 align="left"><font color="green"> Saving preprocessed data, embeddings, trained model </font></h4>
</div>

In [8]:
# Saving my processed data and trained GNN model for later use.
df['gnn_embedding'] = embedding_matrix.tolist() 
df.to_csv('preprocessed_data_with_embeddings.csv', index=False) 

# Saving the GNN model's weights to a file.
torch.save(gin_model.state_dict(), "gin_model.pth") 

print("Data Saved!")

Data Saved!


<div>
    <h4 align="left"><font color="green"> Checking if required columns exist in df </font></h4>
</div>

In [9]:
# Checking if my DataFrame has the necessary columns for later steps.
if 'gnn_embedding' not in df.columns or 'standard_smiles' not in df.columns:
    raise ValueError("Required columns 'gnn_embedding' or 'standard_smiles' not found in DataFrame.")
else:
    print("Required Columns Exist!")

# Resetting the DataFrame index to align with the embedding matrix.
df = df.reset_index(drop=True)  # Ensure row indices match embedding matrix to avoid mismatches.

Required Columns Exist!


<div>
    <h3 align="left"><font color="red"> STEP 04: HNSW Index for GNN Embeddings </font></h3>
</div>

In [10]:
# Converting GNN embeddings to a NumPy array for Faiss.
embedding_matrix = np.stack(df['gnn_embedding'].values).astype(np.float32)  
embedding_dim = embedding_matrix.shape[1] 

index = faiss.IndexHNSWFlat(embedding_dim, 32)  # Create HNSW index with M=32 (graph degree).
index.hnsw.efConstruction = 200  # Set construction parameter for better index quality.
index.hnsw.efSearch = 100  # Set search parameter for better accuracy.
faiss.normalize_L2(embedding_matrix)  # Normalize embeddings for cosine similarity.

index.add(embedding_matrix)  # Index all embeddings for similarity searches.
print(f"Indexed {embedding_matrix.shape[0]} compounds.")

# Saving the index to a file for later use.
faiss.write_index(index, "gnn_hnsw_index.faiss")

Indexed 14823 compounds.


<div>
    <h3 align="left"><font color="red"> STEP 05: HNSW Search Function </font></h3>
</div>

In [11]:
# Defining a function to find compounds similar to a query fingerprint using the HNSW index.
def search_similar_compounds(query_fp, gin_model, index, top_k=5, device='cpu'):
    """
    Search for compounds similar to the query fingerprint using HNSW index.
    """
    try:
        # Setting up the GNN model to generate embeddings for the query.
        gin_model.eval() 
        gin_model.to(device) 

        query_fp = np.array(query_fp, dtype=np.float32)  
        node_feat = torch.FloatTensor(query_fp).unsqueeze(0).to(device) 
        edge_index = torch.tensor([[0], [0]], dtype=torch.long).to(device)  # Create self-loop for single-node graph.
        data = Data(x=node_feat, edge_index=edge_index)  # Wrap in Data object.
        batch = torch.zeros(1, dtype=torch.long).to(device)  # Batch tensor for single graph.

        with torch.no_grad(): 
            query_embedding = gin_model(data.x, data.edge_index, batch).cpu().numpy()  # Get 256-dimensional embedding.
        
        query_embedding = query_embedding.astype(np.float32) 
        faiss.normalize_L2(query_embedding)

        # Searching for the top_k most similar compounds.
        _, indices = index.search(query_embedding, top_k)  

        # Retrieving the SMILES strings of similar compounds.
        similar_smiles = df.iloc[indices[0]]['standard_smiles'].values.tolist() 
        return similar_smiles
    
    except Exception as e:
        print(f"Error during similarity search: {e}")
        return []  

print("Similar Compound Search Function made!")

Similar Compound Search Function made!


<div>
    <h4 align="left"><font color="green"> Example Search Using HNSW </font></h4>
</div>

In [12]:
print("\nSearching for similar compounds...")

# Testing the similarity search with a sample SMILES string.
query_smiles = "NS(=O)(=O)N1CCC(NC(=O)c2cnn3ccc(N4CCCC4c4cc(F)ccc4F)nc23)CC1"
query_mol = Chem.MolFromSmiles(query_smiles)  # Convert SMILES to RDKit molecule.
if query_mol is None:
    print("Error: Invalid query SMILES string.")
else:
    query_fp = generate_morgan_fingerprint(query_mol)  # Generate Morgan fingerprint for query.
    if query_fp is None:
        print("Error: Failed to generate fingerprint for query molecule.")
    else:
        # Using the search function to find similar compounds.
        similar_compounds = search_similar_compounds(query_fp, gin_model, index, top_k=5, device=device)  # Find top 5 similar compounds.
        print("\nTop 5 Similar Compounds:")
        for i, smiles in enumerate(similar_compounds, 1):
            print(f"{i}. {smiles}")


Searching for similar compounds...

Top 5 Similar Compounds:
1. NS(=O)(=O)N1CCC(NC(=O)c2cnn3ccc(N4CCCC4c4cc(F)ccc4F)nc23)CC1
2. CC(C)N(C=Nc1ccn(C2CCC(CO)O2)c(=O)n1)C(C)C
3. CCCS(=O)(=O)CC1CC(N(C)c2[nH]cnc3nccc2-3)C1
4. CS(=O)(=O)c1cnc2ccccc2n1
5. CCCC1CCCc2ncc(C(=O)OC)c(=O)n21


<div>
    <h2 align="center"><font color="purple"> Deliverable 2 Code </font></h2>
</div>

<div>
    <h4 align="left"><font color="green"> Suppress Warnings </font></h4>
</div>

In [13]:
RDLogger.DisableLog('rdApp.*')

print("Suppress command executed!")

Suppress command executed!


<div>
    <h4 align="left"><font color="green"> Validating and Retrieving data from Deliverable 01 </font></h4>
</div>

In [14]:
# Loading preprocessed data from Deliverable 1
df = pd.read_csv('/kaggle/working/preprocessed_data_with_embeddings.csv')
print("Loaded columns:", df.columns)

# Validating required columns
required_columns = ['standard_smiles', 'gnn_embedding']
missing = [col for col in required_columns if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Loading FAISS HNSW index
d = len(df['gnn_embedding'].iloc[0])  # Embedding dimension
index = faiss.read_index('/kaggle/working/gnn_hnsw_index.faiss')
print("HNSW index loaded with", index.ntotal, "embeddings")

Loaded columns: Index(['SMILES', 'pIC50', 'mol', 'num_atoms', 'logP', 'standard_smiles',
       'morgan_fp', 'gnn_embedding'],
      dtype='object')
HNSW index loaded with 14823 embeddings


<div>
    <h4 align="left"><font color="green"> Loading Pre-trained GIN Model </font></h4>
</div>

In [15]:
gin_model = FingerprintGNN().to(device)
gin_model.load_state_dict(torch.load('/kaggle/working/gin_model.pth'))
gin_model.eval()

print("Loaded pretrained GIN model successfully.")

Loaded pretrained GIN model successfully.


<div>
    <h3 align="left"><font color="red"> R2.2 (Step 01) </font></h3>
</div>

In [16]:
# Dictionary to store LLM configurations (only one active at a time)
llm_configs = {
    # 'BioGPT': {
    #     'model_name': 'microsoft/biogpt',
    #     'tokenizer': AutoTokenizer.from_pretrained('microsoft/biogpt'),
    #     'model': AutoModelForCausalLM.from_pretrained('microsoft/biogpt').to(device)
    # },
    'MolT5': {
        'model_name': 'laituan245/molt5-large-smiles2caption',
        'tokenizer': AutoTokenizer.from_pretrained('laituan245/molt5-large-smiles2caption'),
        'model': AutoModelForSeq2SeqLM.from_pretrained('laituan245/molt5-large-smiles2caption').to(device)
    },
    # 'ChemBERTa': {
    #     'model_name': 'DeepChem/ChemBERTa-77M-MTR',
    #     'tokenizer': AutoTokenizer.from_pretrained('DeepChem/ChemBERTa-77M-MTR'),
    #     'model': AutoModelForSequenceClassification.from_pretrained('DeepChem/ChemBERTa-77M-MTR').to(device)
    # }
}
active_llm = 'MolT5'  # Change to 'BioGPT', 'MolT5', or 'ChemBERTa' to switch models

tokenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

2025-06-09 08:30:42.908154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749457843.095892      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749457843.149957      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

<div>
    <h3 align="left"><font color="red"> R2.2 (Step 02) & R2.3 (Step 01) </font></h3>
</div>

In [17]:
# Set environment variable for CUDA memory optimization
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Fine-tune active LLM on SMILES dataset with diverse prompts
def fine_tune_llm(model, tokenizer, smiles_list, epochs=2, batch_size=1, accumulation_steps=4):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    scaler = torch.cuda.amp.GradScaler()  # Enable mixed precision
    # Determine model type using config
    config = model.config
    print(f"Model config class: {config.__class__.__name__}")  # Debug model type
    if isinstance(config, PretrainedConfig) and hasattr(config, 'model_type'):
        model_type = config.model_type
        if model_type in ['biogpt', 'gpt2', 'llama']:  # Causal LM examples
            model_type = 'causal'
        elif model_type in ['t5', 'molt5']:  # Seq2Seq examples
            model_type = 'seq2seq'
        elif model_type in ['bert', 'roberta', 'chemberta']:  # Classification examples
            model_type = 'classification'
        else:
            raise ValueError(f"Unsupported model type: {model_type}")
    else:
        raise ValueError("Unable to determine model type from configuration")

    # Create training examples with varied targets
    templates = [
        "This compound, a potential drug candidate, may exhibit anti-inflammatory properties.",
        "A novel structure for drug development with possible antimicrobial effects.",
        "This chemical could be a new lead for cancer therapy research."
    ]
    train_data = [f"{smiles}\t{templates[i % len(templates)]}" for i, smiles in enumerate(smiles_list)]

    for epoch in range(epochs):
        np.random.shuffle(train_data)  # Shuffle to improve learning
        optimizer.zero_grad()
        for i in tqdm(range(0, len(train_data), batch_size), desc=f"Epoch {epoch+1}"):
            batch = train_data[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
            
            with torch.cuda.amp.autocast():
                if model_type == 'causal':
                    outputs = model(**inputs, labels=inputs['input_ids'])
                elif model_type == 'seq2seq':
                    decoder_input_ids = inputs['input_ids'].clone()
                    decoder_input_ids[:, 1:] = decoder_input_ids[:, :-1].clone()  # Shift for teacher forcing
                    decoder_input_ids[:, 0] = tokenizer.pad_token_id  # Start with pad token
                    outputs = model(**inputs, decoder_input_ids=decoder_input_ids, labels=inputs['input_ids'])
                elif model_type == 'classification':
                    labels = torch.zeros(len(batch), dtype=torch.long).to(device)  # Dummy labels
                    outputs = model(**inputs, labels=labels)
                loss = outputs.loss / accumulation_steps  # Scale loss for accumulation

            scaler.scale(loss).backward()
            if (i + 1) % accumulation_steps == 0 or i + 1 == len(train_data):
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
    model.eval()

# Generate recommendation using dataset-tuned LLM
def generate_dataset_tuned_recommendation(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    
    if hasattr(model.config, 'model_type') and model.config.model_type in ['biogpt', 'gpt2', 'llama']:
        outputs = model.generate(
            **inputs,
            max_length=200,
            num_return_sequences=1,
            do_sample=True,
            top_k=40,
            top_p=0.92,
            temperature=0.8,
            no_repeat_ngram_size=2
        )
    elif hasattr(model.config, 'model_type') and model.config.model_type in ['t5', 'molt5']:
        outputs = model.generate(
            **inputs,
            max_length=200,
            num_return_sequences=1,
            do_sample=True,
            top_k=40,
            top_p=0.92,
            temperature=0.8,
            no_repeat_ngram_size=2,
            decoder_start_token_id=tokenizer.pad_token_id
        )
    elif hasattr(model.config, 'model_type') and model.config.model_type in ['bert', 'roberta', 'chemberta']:
        return "ChemBERTa is a classification model and cannot generate recommendations directly."
    else:
        raise ValueError(f"Unsupported model type for generation: {model.config.model_type if hasattr(model.config, 'model_type') else 'Unknown'}")
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

print("DATASET TUNING COMPLETE")

DATASET TUNING COMPLETE


<div>
    <h3 align="left"><font color="red"> R2.4 </font></h3>
</div>

In [18]:
# Generate recommendation using Dataset
smiles_list = df['standard_smiles'].tolist()[:100]  # Reduced to 100 SMILES for memory
print(f"Tuning on first 100 SMILES: {smiles_list[:5]}... (total {len(smiles_list)})")  # Debug
fine_tune_llm(llm_configs[active_llm]['model'], llm_configs[active_llm]['tokenizer'], smiles_list)
prompt = "Propose a novel chemical compound for drug development, including a SMILES string and its potential therapeutic application."
dataset_tuned_rec = generate_dataset_tuned_recommendation(
    llm_configs[active_llm]['model'], llm_configs[active_llm]['tokenizer'], prompt
)

print(f"Dataset-Tuned {active_llm} Recommendation: {dataset_tuned_rec}")

  scaler = torch.cuda.amp.GradScaler()  # Enable mixed precision


Tuning on first 100 SMILES: ['O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1', 'O=c1cc(-c2nc(-c3ccc(-c4cn(CCP(=O)(O)O)nn4)cc3)[nH]c2-c2ccc(F)cc2)cc[nH]1', 'NC(=O)c1ccc2c(c1)nc(C1CCC(O)CC1)n2CCCO', 'NCCCn1c(C2CCNCC2)nc2cc(C(N)=O)ccc21', 'CNC(=S)Nc1cccc(-c2cnc3ccccc3n2)c1']... (total 100)
Model config class: T5Config



  with torch.cuda.amp.autocast():
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.

Epoch 1:   1%|          | 1/100 [00:00<01:18,  1.26it/s][A
Epoch 1:   2%|▏         | 2/100 [00:01<01:00,  1.62it/s][A
Epoch 1:   3%|▎         | 3/100 [00:01<00:58,  1.66it/s][A
Epoch 1:   4%|▍         | 4/100 [00:02<01:02,  1.55it/s][A
Epoch 1:   5%|▌         | 5/100 [00:03<00:57,  1.66it/s][A
Epoch 1:   6%|▌         | 6/100 [00:03<00:50,  1.86it/s][A
Epoch 1:   7%|▋         | 7/100 [00:03<00:44,  2.08it/s][A
Epoch 1:   8%|▊         | 8/100 [00:04<00:42,  2.16it/s][A
Epoch 1:   9%|▉         | 9/100 [00:04<00:38,  2.38it/s][A
Epoch 1:  10%|█         | 10/100 [00:04<00:36,  2.49it/s][A
Epoch 1:  11%|█         | 11/100 [00:05<00:32,  2.75it/s][A
Epoch 1:  12%|█▏        | 12/100 [00:05<00:30,  2.88it/s][A
Epoc

Dataset-Tuned MolT5 Recommendation: The molecule is an organosilicon compound that is dimethylsilane in which one of the hydrogens attached to the silicon is replaced by a p-(methylthio)phenyl group, while the other is substituted by an ethylidene group. It is prone to cause immediate, severe inflammation of internal organs, including those associated with irritable bowel syndrome. Cilastatin is also used in the treatment of rheumatoid arthritis. it shows antitussive activity against primary and inflammation. Additionally it can be used as praziquantel for the alleviation of postoperative pain. Administered as the racemate, only the (S)-enantiomer is active.


<div>
    <h3 align="left"><font color="red"> R2.1 </font></h3>
</div>

In [19]:
# Generate Morgan fingerprint for query and create graph data
def get_morgan_fingerprint_graph(smiles, radius=2, nBits=2048):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
        node_feat = torch.FloatTensor(list(fp)).unsqueeze(0)  # Convert to tensor with batch dim
        edge_index = torch.tensor([[0], [0]], dtype=torch.long)  # Simple graph structure
        return Data(x=node_feat, edge_index=edge_index)
    except:
        return None

# Similarity search using HNSW with GIN embeddings
def search_similar_compounds(smiles, gin_model, index, k=5):
    graph_data = get_morgan_fingerprint_graph(smiles)
    if graph_data is None:
        return None
    graph_data = graph_data.to(device)
    with torch.no_grad():
        embedding = gin_model(graph_data.x, graph_data.edge_index, torch.zeros(1, dtype=torch.long).to(device))
        print(f"Embedding shape before reshape: {embedding.shape}")  # Debug shape
        # Reshape to 2D and convert to numpy
        embedding = embedding.squeeze().cpu().numpy()  # Remove batch dim
        query = embedding[np.newaxis, :]  # Add batch dimension for FAISS
        print(f"Query shape after reshape: {query.shape}")  # Debug shape
        if query.shape[1] != index.d:
            raise ValueError(f"Query dimension ({query.shape[1]}) does not match index dimension ({index.d})")
        distances, indices = index.search(query, k)  # Search with reshaped query
    return df.iloc[indices[0]]['standard_smiles'].values

# Fine-tune LLM with HNSW-derived compounds
def fine_tune_with_hnsw(model, tokenizer, smiles_list, similar_smiles, epochs=1, batch_size=1):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    scaler = torch.cuda.amp.GradScaler()  # Enable mixed precision
    # Determine model type using config
    config = model.config
    print(f"Model config class: {config.__class__.__name__}")  # Debug model type
    if isinstance(config, PretrainedConfig) and hasattr(config, 'model_type'):
        model_type = config.model_type
        if model_type in ['biogpt', 'gpt2', 'llama']:  # Causal LM examples
            model_type = 'causal'
        elif model_type in ['t5', 'molt5']:  # Seq2Seq examples
            model_type = 'seq2seq'
        elif model_type in ['bert', 'roberta', 'chemberta']:  # Classification examples
            model_type = 'classification'
        else:
            raise ValueError(f"Unsupported model type: {model_type}")
    else:
        raise ValueError("Unable to determine model type from configuration")

    # Create training examples combining dataset and HNSW similar compounds
    templates = [
        "This compound, enhanced by similar structures, may exhibit anti-inflammatory properties.",
        "A novel structure for drug development with possible antimicrobial effects based on similar compounds.",
        "This chemical, informed by similar molecules, could be a new lead for cancer therapy research."
    ]
    train_data = [f"{smiles}\t{templates[i % len(templates)]}" for i, smiles in enumerate(similar_smiles)]

    for epoch in range(epochs):
        np.random.shuffle(train_data)  # Shuffle to improve learning
        optimizer.zero_grad()
        for i in tqdm(range(0, len(train_data), batch_size), desc=f"HNSW Tuning Epoch {epoch+1}"):
            batch = train_data[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
            
            with torch.cuda.amp.autocast():
                if model_type == 'causal':
                    outputs = model(**inputs, labels=inputs['input_ids'])
                elif model_type == 'seq2seq':
                    decoder_input_ids = inputs['input_ids'].clone()
                    decoder_input_ids[:, 1:] = decoder_input_ids[:, :-1].clone()  # Shift for teacher forcing
                    decoder_input_ids[:, 0] = tokenizer.pad_token_id  # Start with pad token
                    outputs = model(**inputs, decoder_input_ids=decoder_input_ids, labels=inputs['input_ids'])
                elif model_type == 'classification':
                    labels = torch.zeros(len(batch), dtype=torch.long).to(device)  # Dummy labels
                    outputs = model(**inputs, labels=labels)
                loss = outputs.loss  # No accumulation for now, adjust if needed

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
    model.eval()

# Generate recommendation using HNSW-tuned LLM
def generate_hnsw_tuned_recommendation(model, tokenizer):
    prompt = f"Generate a novel chemical compound for drug development. Provide a SMILES string and its potential therapeutic application."
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    
    if hasattr(model.config, 'model_type') and model.config.model_type in ['biogpt', 'gpt2', 'llama']:
        outputs = model.generate(
            **inputs,
            max_length=200,
            num_return_sequences=1,
            do_sample=True,
            top_k=40,
            top_p=0.92,
            temperature=0.8,
            no_repeat_ngram_size=2
        )
    elif hasattr(model.config, 'model_type') and model.config.model_type in ['t5', 'molt5']:
        outputs = model.generate(
            **inputs,
            max_length=200,
            num_return_sequences=1,
            do_sample=True,
            top_k=40,
            top_p=0.92,
            temperature=0.8,
            no_repeat_ngram_size=2,
            decoder_start_token_id=tokenizer.pad_token_id
        )
    elif hasattr(model.config, 'model_type') and model.config.model_type in ['bert', 'roberta', 'chemberta']:
        return "ChemBERTa is a classification model and cannot generate recommendations directly."
    else:
        raise ValueError(f"Unsupported model type for generation: {model.config.model_type if hasattr(model.config, 'model_type') else 'Unknown'}")
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

print("HNSW TUNING COMPLETE")

HNSW TUNING COMPLETE


<div>
    <h3 align="left"><font color="red"> R2.4 </font></h3>
</div>

In [20]:
# HNSW-Tuned recommendation
query_smiles = df['standard_smiles'].iloc[0]  # Example query
similar_smiles = search_similar_compounds(query_smiles, gin_model, index)
hnsw_tuned_rec = None
if similar_smiles is not None:
    print("Similar Compounds:")
    for smi in similar_smiles:
        print(smi)
    # Re-tune LLM with HNSW-derived compounds
    fine_tune_with_hnsw(llm_configs[active_llm]['model'], llm_configs[active_llm]['tokenizer'], df['standard_smiles'].tolist()[:100], similar_smiles)
    hnsw_tuned_rec = generate_hnsw_tuned_recommendation(
        llm_configs[active_llm]['model'], llm_configs[active_llm]['tokenizer']
    )
    print(f"HNSW-Tuned {active_llm} Recommendation: {hnsw_tuned_rec}")
else:
    print("HNSW search failed due to invalid query SMILES or index mismatch")

  scaler = torch.cuda.amp.GradScaler()  # Enable mixed precision


Embedding shape before reshape: torch.Size([1, 256])
Query shape after reshape: (1, 256)
Similar Compounds:
O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1
NC(=O)c1cnc2ccccc2c1
CC(NC(=O)c1c[nH]c2ncc(C3CC3)nc12)C1CCCCC1
COCc1ccc(O)c2ncccc12
O=C1C2C3C=CC(C4C=CC43)C2C(=O)N1CCCCN1CCN(c2ncccn2)CC1
Model config class: T5Config


  with torch.cuda.amp.autocast():
HNSW Tuning Epoch 1: 100%|██████████| 5/5 [00:01<00:00,  3.43it/s]


HNSW-Tuned MolT5 Recommendation: The molecule is an organosilicon compound that is dimethylsilane in which one of the hydrogens attached to the silicon is replaced by a 1,2,4-triazol-1-yl group. It is metabolite of diazinon.


<div>
    <h4 align="left"><font color="green"> Saving Recommendations </font></h4>
</div>

In [21]:
with open('/kaggle/working/recommendations.txt', 'w') as f:
    f.write(f"Dataset-Tuned {active_llm} Recommendation: {dataset_tuned_rec}\n")
    f.write(f"HNSW-Tuned {active_llm} Recommendation: {hnsw_tuned_rec}\n")
print("Recommendations saved to /kaggle/working/recommendations.txt")

Recommendations saved to /kaggle/working/recommendations.txt


<div>
    <h2 align="center"><font color="purple"> Deliverable 3 Code </font></h2>
</div>

In [30]:
import torch
import pandas as pd
import numpy as np
import ast
import re
from rdkit import Chem
from rdkit.Chem import AllChem
import faiss
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.data import Data
import torch.nn as nn
from langchain.prompts import ChatPromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
import gc

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load preprocessed data
df = pd.read_csv('/kaggle/working/preprocessed_data_with_embeddings.csv')
print("Loaded preprocessed data")

# Convert gnn_embedding strings to arrays
def parse_embedding(embedding_str):
    try:
        return np.array(ast.literal_eval(embedding_str), dtype=np.float32)
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing embedding: {e}")
        return np.array([])

df['gnn_embedding'] = df['gnn_embedding'].apply(parse_embedding)
df = df[df['gnn_embedding'].apply(lambda x: len(x) > 0)]
print("Converted gnn_embedding strings to arrays")

# Load GNN model
class FingerprintGNN(nn.Module):
    def __init__(self, input_dim=2048, hidden_dim=512, output_dim=256):
        super().__init__()
        self.fp_to_node = nn.Linear(input_dim, hidden_dim)
        self.conv1 = GINConv(nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)))
        self.conv2 = GINConv(nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim)))
        self.lin = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, edge_index, batch):
        x = self.fp_to_node(x)
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return self.lin(global_add_pool(x, batch))

gin_model = FingerprintGNN().to(device)
gin_model.load_state_dict(torch.load('/kaggle/working/gin_model.pth'))
gin_model.eval()
print("Loaded GNN model")

# Load HNSW index
embedding_matrix = np.stack(df['gnn_embedding'].values)
index = faiss.read_index('/kaggle/working/gnn_hnsw_index.faiss')
print("Loaded HNSW index")

# Generate graph data from SMILES
def get_morgan_fingerprint_graph(smiles, radius=2, nBits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    node_feat = torch.FloatTensor(list(fp)).unsqueeze(0)
    edge_index = torch.tensor([[0], [0]], dtype=torch.long)
    return Data(x=node_feat, edge_index=edge_index)

# Validate SMILES and check novelty
def validate_smiles(smiles, context_smiles, dataset_smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False, "Invalid SMILES string"
        if smiles in context_smiles:
            return False, "Generated SMILES is not novel (matches context)"
        if smiles in dataset_smiles:
            return False, "Generated SMILES exists in dataset"
        return True, "Valid and novel SMILES"
    except Exception as e:
        return False, f"Error processing SMILES: {str(e)}"

# RAG integration function
def rag_recommendation(llm_config, query_smiles, top_k=5, dataset_smiles=None):
    graph_data = get_morgan_fingerprint_graph(query_smiles)
    if graph_data is None:
        return "SMILES: Invalid Application: None\nModel output issue: Invalid query SMILES"
    graph_data = graph_data.to(device)
    with torch.no_grad():
        query_embedding = gin_model(graph_data.x, graph_data.edge_index, torch.zeros(1, dtype=torch.long).to(device)).cpu().numpy()
    print(f"Query embedding shape: {query_embedding.shape}")
    if query_embedding.shape[1] != index.d:
        raise ValueError(f"Query dimension ({query_embedding.shape[1]}) does not match index dimension ({index.d})")
    query_embedding = query_embedding.reshape(1, -1).astype(np.float32)
    faiss.normalize_L2(query_embedding)
    
    distances, indices = index.search(query_embedding, top_k)
    similar_smiles = df.iloc[indices[0]]['standard_smiles'].values
    context = "\n".join([f"Compound: {smi}" for smi in similar_smiles])
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", "Output ONLY a novel, valid SMILES string and one therapeutic application in this format: SMILES: <smiles> Application: <application>. Do NOT repeat the prompt, include descriptions, extra text, or invalid SMILES. The SMILES must be valid and distinct from context compounds. Examples: SMILES: c1cc(c(c(c1)F)N)NC(=O)c2cnc(s2) Application: Potential use in treating Alzheimer’s disease; SMILES: c1cc(c(c(c1)OC)N)NC(=O)c2cnc(o2) Application: Potential use in treating HIV; SMILES: c1ccccc1C(=O)N Application: Potential use in treating epilepsy"),
        ("user", "Context compounds:\n{context}\nOutput: SMILES: <smiles> Application: <application>")
    ])
    
    tokenizer = llm_config['tokenizer']
    model = llm_config['model']
    prompt = prompt_template.format(context=context)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=384).to(device)
    
    print(f"Model type: {type(model)}")
    print(f"Decoded prompt: {tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)}")
    
    try:
        with torch.no_grad():
            if isinstance(model, AutoModelForCausalLM):
                print("Detected as Causal LM")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=200,
                    pad_token_id=tokenizer.pad_token_id,
                    do_sample=True,
                    temperature=0.3,
                    top_p=0.9,
                    top_k=50,
                    no_repeat_ngram_size=2
                )
            elif isinstance(model, AutoModelForSeq2SeqLM):
                print("Detected as Seq2Seq LM")
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=200,
                    pad_token_id=tokenizer.pad_token_id,
                    do_sample=True,
                    temperature=0.5,
                    top_p=0.9,
                    top_k=40,
                    no_repeat_ngram_size=2,
                    decoder_start_token_id=0
                )
            else:
                raise ValueError(f"Unsupported model type: {type(model)}")
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Raw model output: {response}")
        
        # Flexible parsing
        smiles_match = re.search(r'(?:SMILES?|SMID:)\s*([^\n;]+)', response, re.IGNORECASE)
        app_match = re.search(r'Application:\s*([^\n;]+)', response, re.IGNORECASE)
        
        if smiles_match and app_match:
            smiles = smiles_match.group(1).strip()
            application = app_match.group(1).strip()
            is_valid, message = validate_smiles(smiles, similar_smiles, dataset_smiles)
            if is_valid:
                print(f"Valid output: SMILES: {smiles} Application: {application}")
                return f"SMILES: {smiles} Application: {application}"
            else:
                print(f"Validation failed: {message}")
                return f"SMILES: Invalid Application: None\nModel output issue: {message}"
        else:
            error_msg = f"Parsing failed: Missing {'SMILES' if not smiles_match else ''}{' and ' if not smiles_match and not app_match else ''}{'Application' if not app_match else ''}"
            print(error_msg)
            return f"SMILES: Invalid Application: None\nModel output issue: {error_msg}"
    
    except Exception as e:
        error_msg = f"Generation failed: {str(e)}"
        print(error_msg)
        return f"SMILES: Invalid Application: None\nModel output issue: {error_msg}"

# LLM configurations
llm_configs = {
    'BioGPT': {
        'model_name': 'microsoft/biogpt',
        'tokenizer': AutoTokenizer.from_pretrained('microsoft/biogpt'),
        'model': AutoModelForCausalLM.from_pretrained('microsoft/biogpt', torch_dtype=torch.float16).to(device)
    },
    'MolT5': {
        'model_name': 'laituan245/molt5-large-smiles2caption',
        'tokenizer': AutoTokenizer.from_pretrained('laituan245/molt5-large-smiles2caption'),
        'model': AutoModelForSeq2SeqLM.from_pretrained('laituan245/molt5-large-smiles2caption', torch_dtype=torch.float16).to(device)
    },
    'T5-small': {
        'model_name': 'google/t5-v1_1-small',
        'tokenizer': AutoTokenizer.from_pretrained('google/t5-v1_1-small'),
        'model': AutoModelForSeq2SeqLM.from_pretrained('google/t5-v1_1-small', torch_dtype=torch.float16).to(device)
    }
}

# Execute models
for llm_name in llm_configs:
    print(f"Processing {llm_name} with RAG:...")
    query_smiles = df['standard_smiles'].iloc[0]
    recommendation = rag_recommendation(llm_configs[llm_name], query_smiles, dataset_smiles=dataset_smiles)
    print(f"{llm_name} RAG Recommendation:\n{recommendation}\n\n")
    # Clear memory
    del llm_configs[llm_name]['model']
    del llm_configs[llm_name]['tokenizer']
    torch.cuda.empty_cache()
    gc.collect()

print("Deliverable 3 code execution completed")

Using device: cuda
Loaded preprocessed data
Converted gnn_embedding strings to arrays
Loaded GNN model
Loaded HNSW index


tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Processing BioGPT with RAG:...
Query embedding shape: (1, 256)
Model type: <class 'transformers.models.biogpt.modeling_biogpt.BioGptForCausalLM'>
Decoded prompt: System: Output ONLY a novel, valid SMILES string and one therapeutic application in this format: SMILES: < smiles > Application: < application >. Do NOT repeat the prompt, include descriptions, extra text, or invalid SMILES. The SMILES must be valid and distinct from context compounds. Examples: SMILES: c1cc (c (c (c1) F) N) NC (= O) c2cnc (s2) Application: Potential use in treating Alzheimer s disease; SMILES: c1cc (c (c (c1) OC) N) NC (= O) c2cnc (o2) Application: Potential use in treating HIV; SMILES: c1ccccc1C (= O) N Application: Potential use in treating epilepsy Human: Context compounds: Compound: O = S (= O) (Nc1cccc (-c2cnc3ccccc3n2) c1) c1cccs1 Compound: NC (= O) c1cnc2ccccc2c1 Compound: CC (NC (= O) c1c [nH] c2ncc (C3CC3) nc12) C1CCCCC1 Compound: COCc1ccc (O) c2ncccc12 Compound: O = C1C2C3C = CC (C4C = CC43) C2C (= 

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Processing T5-small with RAG:...
Query embedding shape: (1, 256)
Model type: <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>
Decoded prompt: System: Output ONLY a novel, valid SMILES string and one therapeutic application in this format: SMILES: smiles> Application: application>. Do NOT repeat the prompt, include descriptions, extra text, or invalid SMILES. The SMILES must be valid and distinct from context compounds. Examples: SMILES: c1cc(c(c(c1)F)N)NC(=O)c2cnc(s2) Application: Potential use in treating Alzheimer’s disease; SMILES: c1cc(c(c(c1)OC)N)NC(=O)c2cnc(o2) Application: Potential use in treating HIV; SMILES: c1ccccc1C(=O)N Application: Potential use in treating epilepsy Human: Context compounds: Compound: O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1 Compound: NC(=O)c1cnc2ccccc2c1 Compound: CC(NC(=O)c1c[nH]c2ncc(C3CC3)nc12)C1CCCCC1 Compound: COCc1ccc(O)c2ncccc12 Compound: O=C1C2C3C=CC(C4C=CC43)C2C(=O)N1CCCCN1CCN(c2ncccn2)CC1 Output: SMILES: smiles> Applica