In [16]:
pip install rdkit-pypi hnswlib torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from transformers import pipeline
import hnswlib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GINConv, global_add_pool

# ===========================================
# STEP 1: Data Loading and Preprocessing
# ===========================================
print("Loading and preprocessing data...")

# Load dataset
df = pd.read_csv('/kaggle/input/smiles/SMILES_Big_Data_Set.csv')

# Standardize molecules
def standardize_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except:
        return None

df['standard_smiles'] = df['SMILES'].apply(standardize_smiles)
df = df.dropna(subset=['standard_smiles']).drop_duplicates(subset=['standard_smiles'])
df['mol'] = df['standard_smiles'].apply(Chem.MolFromSmiles)

# ===========================================
# STEP 2: Fingerprint Generation (Morgan only)
# ===========================================
print("\nGenerating Morgan fingerprints...")

def generate_morgan_fingerprint(mol, radius=2, n_bits=2048):
    if mol is None:
        return np.zeros(n_bits)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

df['morgan_fp'] = df['mol'].apply(lambda x: generate_morgan_fingerprint(x))
fp_matrix = np.stack(df['morgan_fp'].values)
print(f"Fingerprint matrix shape: {fp_matrix.shape}")

# ===========================================
# STEP 3: GNN for Fingerprint Embedding
# ===========================================
print("\nBuilding GNN for fingerprint embedding...")

class FingerprintGNN(nn.Module):
    def __init__(self, input_dim=2048, hidden_dim=512, output_dim=256):
        super().__init__()
        # Convert fingerprint to graph (artificial topology)
        self.fp_to_node = nn.Linear(input_dim, hidden_dim)
        self.conv1 = GINConv(nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        ))
        self.conv2 = GINConv(nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        ))
        self.lin = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # Create artificial graph structure
        batch_size = x.shape[0]
        x = self.fp_to_node(x)
        
        # Create fully connected graph for each sample
        edge_index = []
        for i in range(batch_size):
            # Connect all nodes to all other nodes (fully connected)
            for j in range(batch_size):
                if i != j:
                    edge_index.append([i, j])
        
        if len(edge_index) == 0:
            edge_index = torch.zeros((2, 0), dtype=torch.long)
        else:
            edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        
        # Batch index
        batch = torch.arange(batch_size, dtype=torch.long)
        
        # GNN processing
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        x = global_add_pool(x, batch)
        return self.lin(x)

# Initialize and train GNN (simplified training loop)
gnn_model = FingerprintGNN()
gnn_model.eval()

def fp_to_embedding(fingerprint):
    with torch.no_grad():
        tensor = torch.FloatTensor(fingerprint).unsqueeze(0)
        return gnn_model(tensor).squeeze().numpy()

print("Generating GNN embeddings from fingerprints...")
df['gnn_embedding'] = [fp_to_embedding(fp) for fp in df['morgan_fp']]
embedding_matrix = np.stack(df['gnn_embedding'].values)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Loading and preprocessing data...

Generating Morgan fingerprints...
Fingerprint matrix shape: (15872, 2048)

Building GNN for fingerprint embedding...
Generating GNN embeddings from fingerprints...
Embedding matrix shape: (15872, 256)


In [18]:

# ===========================================
# STEP 4: HNSW Index Construction
# ===========================================
print("\nBuilding HNSW index...")

dim = embedding_matrix.shape[1]
num_elements = embedding_matrix.shape[0]

hnsw_index = hnswlib.Index(space='cosine', dim=dim)
hnsw_index.init_index(max_elements=num_elements, ef_construction=200, M=16)
hnsw_index.add_items(embedding_matrix)
hnsw_index.set_ef(50)
hnsw_index.save_index('compound_index.hnsw')



Building HNSW index...


In [None]:

# ===========================================
# STEP 5: Enhanced Recommendation System
# ===========================================
print("\nBuilding recommendation system with LLM enhancement...")

class CompoundRecommender:
    def __init__(self, df, hnsw_index):
        self.df = df
        self.hnsw_index = hnsw_index
        self.llms = {}
        self._init_llms()
        
    def _init_llms(self):
        # Initialize LLMs with error handling
        if not self.llms:
            self.llms = {}
            try:
                self.llms['deepseek'] = pipeline(
                    "text-generation", 
                    model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
                    device="cuda" if torch.cuda.is_available() else "cpu"
                )
            except Exception as e:
                print(f"Could not load DeepSeek: {e}")

            try:
                self.llms['llama'] = pipeline(
                    "text-generation", 
                    model="meta-llama/Llama-3.2-1B",
                    device="cuda" if torch.cuda.is_available() else "cpu"
                )
            except Exception as e:
                print(f"Could not load DeepSeek: {e}")
            
            try:
                self.llms['phi3'] = pipeline(
                    "text-generation", 
                    model="microsoft/Phi-3-mini-4k-instruct",
                    device="cuda" if torch.cuda.is_available() else "cpu"
                )
            except Exception as e:
                print(f"Could not load Phi-3: {e}")
    
    def _llm_rerank(self, query_smiles, candidates):
        """Use available LLMs to rerank candidates"""
        if not self.llms:
            print("abc")
            return candidates[:5]  # Fallback to top 5 if no LLMs loaded
            
        prompt = f"""Rank these chemical compounds by similarity to {query_smiles}:
        {chr(10).join([f"{i+1}. {c['standard_smiles']}" for i, c in enumerate(candidates)])}
        Return ONLY the numbers in order of most similar to least."""
        
        try:
            if 'deepseek' in self.llms:
                result = self.llms['deepseek'](prompt, max_new_tokens=50)[0]['generated_text']
                print(result)
                # Parse result and reorder (simplified)
                return candidates[:5]  # Placeholder
            elif 'phi3' in self.llms:
                # Similar implementation for Phi-3
                return candidates[:5]
        except:
            return candidates[:5]
    
    def recommend(self, smiles, k=5):
        """Get recommendations for a query SMILES"""
        try:
            # Generate fingerprint and embedding
            mol = Chem.MolFromSmiles(smiles)
            if not mol:
                return []
                
            fp = generate_morgan_fingerprint(mol)
            embedding = fp_to_embedding(fp)
            
            # Get initial candidates
            labels, distances = self.hnsw_index.knn_query(embedding, k=k*3)
            candidates = [self.df.iloc[idx].to_dict() for idx in labels[0]]
            
            # LLM reranking
            return self._llm_rerank(smiles, candidates)[:k]
        except Exception as e:
            print(f"Recommendation error: {e}")
            return []

# Initialize recommender
recommender = CompoundRecommender(df, hnsw_index)

# ===========================================
# Example Usage
# ===========================================
print("\nExample recommendation for caffeine:")
results = recommender.recommend("CN1C=NC2=C1C(=O)N(C(=O)N2C)C")
for i, res in enumerate(results, 1):
    print(f"{i}. {res['standard_smiles']} (pIC50: {res['pIC50']})")


Building recommendation system with LLM enhancement...
Could not load DeepSeek: deepseek-ai/deepseek-llm-7b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]


Example recommendation for caffeine:
1. Cn1c(=O)c2c(ncn2C)n(C)c1=O (pIC50: nan)
2. Cc1nonc1C (pIC50: 0.21)
3. Cc1nc(Cl)c(C)nc1Cl (pIC50: 0.0)
4. C[Si](C)(C)C (pIC50: 0.01)
5. Cc1nc(C)c(C)nc1C (pIC50: 0.02)


In [21]:
import numpy as np
from rdkit import DataStructs
from rdkit.Chem import AllChem
from sklearn.metrics import precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

class RecommenderEvaluator:
    def __init__(self, recommender, df):
        """
        Initialize evaluator with:
        - recommender: Your CompoundRecommender instance
        - df: DataFrame containing all compounds
        """
        self.recommender = recommender
        self.df = df
        self.test_set = self._create_test_set()
        
    def _create_test_set(self, sample_size=100):
        """Create a balanced test set of valid molecules"""
        valid_smiles = [s for s in self.df['standard_smiles'] if Chem.MolFromSmiles(s)]
        return np.random.choice(valid_smiles, min(sample_size, len(valid_smiles)), replace=False)
    
    def _get_ground_truth(self, query_smiles, k=5):
        """Calculate ground truth using Tanimoto similarity"""
        query_mol = Chem.MolFromSmiles(query_smiles)
        query_fp = AllChem.GetMorganFingerprint(query_mol, 2)
        
        similarities = []
        for _, row in self.df.iterrows():
            mol = Chem.MolFromSmiles(row['standard_smiles'])
            if mol:
                fp = AllChem.GetMorganFingerprint(mol, 2)
                sim = DataStructs.TanimotoSimilarity(query_fp, fp)
                similarities.append((row['standard_smiles'], sim))
        
        # Sort by similarity and take top k
        similarities.sort(key=lambda x: x[1], reverse=True)
        return [s[0] for s in similarities[:k]]
    
    def evaluate_recommendations(self, k=5):
        """
        Evaluate recommendation quality using:
        - Hit Rate @ k
        - Precision @ k
        - Recall @ k
        - Mean Reciprocal Rank (MRR)
        - Coverage
        """
        hit_rates = []
        precisions = []
        recalls = []
        reciprocal_ranks = []
        recommended_smiles = set()
        
        for query_smiles in tqdm(self.test_set, desc="Evaluating"):
            # Get recommendations
            recommendations = [r['standard_smiles'] for r in self.recommender.recommend(query_smiles, k=k)]
            recommended_smiles.update(recommendations)
            
            # Get ground truth
            ground_truth = self._get_ground_truth(query_smiles, k=k)
            
            # Calculate metrics
            hits = len(set(recommendations) & set(ground_truth))
            hit_rates.append(hits / k)
            
            y_true = [1 if smi in ground_truth else 0 for smi in recommendations]
            y_pred = [1] * len(y_true)  # All recommendations are predicted positives
            
            precisions.append(precision_score(y_true, y_pred, zero_division=0))
            recalls.append(recall_score(y_true, y_pred, zero_division=0))
            
            # Calculate MRR
            for rank, smi in enumerate(recommendations, 1):
                if smi in ground_truth:
                    reciprocal_ranks.append(1 / rank)
                    break
            else:
                reciprocal_ranks.append(0)
        
        # Calculate coverage
        coverage = len(recommended_smiles) / len(self.df)
        
        return {
            'hit_rate@k': np.mean(hit_rates),
            'precision@k': np.mean(precisions),
            'recall@k': np.mean(recalls),
            'mrr': np.mean(reciprocal_ranks),
            'coverage': coverage
        }
    
    def evaluate_embedding_quality(self):
        """Evaluate the GNN embedding space quality"""
        distances = []
        similarities = []
        
        # Sample pairs of molecules
        sample_size = min(500, len(self.df))
        sampled_df = self.df.sample(sample_size)
        
        for _, row in tqdm(sampled_df.iterrows(), total=sample_size, desc="Evaluating embeddings"):
            mol = Chem.MolFromSmiles(row['standard_smiles'])
            if not mol:
                continue
                
            # Get nearest neighbor in embedding space
            emb = row['gnn_embedding'].reshape(1, -1)
            labels, _ = self.recommender.hnsw_index.knn_query(emb, k=2)
            neighbor_smiles = self.df.iloc[labels[0][1]]['standard_smiles']  # Skip self
            
            # Calculate chemical similarity
            neighbor_mol = Chem.MolFromSmiles(neighbor_smiles)
            if neighbor_mol:
                fp1 = AllChem.GetMorganFingerprint(mol, 2)
                fp2 = AllChem.GetMorganFingerprint(neighbor_mol, 2)
                similarities.append(DataStructs.TanimotoSimilarity(fp1, fp2))
        
        return {
            'mean_similarity': np.mean(similarities),
            'similarity_distribution': similarities
        }
    

# Example Usage
if __name__ == "__main__":
    # Initialize with your recommender and dataframe
    evaluator = RecommenderEvaluator(recommender, df)
    
    # Evaluate recommendation quality
    print("\nEvaluating recommendation quality...")
    rec_metrics = evaluator.evaluate_recommendations(k=5)
    print("\nRecommendation Metrics:")
    for k, v in rec_metrics.items():
        print(f"{k}: {v:.3f}")
    
    # Evaluate embedding quality
    print("\nEvaluating embedding quality...")
    emb_metrics = evaluator.evaluate_embedding_quality()
    print(f"\nMean chemical similarity of neighbors: {emb_metrics['mean_similarity']:.3f}")
    


Evaluating recommendation quality...


Evaluating: 100%|██████████| 100/100 [07:26<00:00,  4.46s/it]



Recommendation Metrics:
hit_rate@k: 0.590
precision@k: 0.590
recall@k: 0.990
mrr: 0.990
coverage: 0.031

Evaluating embedding quality...


Evaluating embeddings: 100%|██████████| 500/500 [00:00<00:00, 1232.22it/s]


Mean chemical similarity of neighbors: 0.623



