In [1]:
import requests
import time
import pandas as pd
from typing import List, Dict
import logging
from rdkit import Chem

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CompoundCrawler:
    def __init__(self):
        self.pubchem_base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.chembl_base_url = "https://www.ebi.ac.uk/chembl/api/data"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Python CompoundCrawler/1.0 (responsible data mining)',
            'Accept': 'application/json'
        })

    def get_pubchem_toxicity(self, cid: str) -> Dict:
        """
        Retrieve toxicity data from PubChem
        """
        try:
            # Get bioassay data related to toxicity
            url = f"{self.pubchem_base_url}/compound/cid/{cid}/assaysummary/JSON"
            response = self.session.get(url)
            response.raise_for_status()
            
            tox_data = {
                'LD50': [],
                'LC50': [],
                'EC50': [],
                'IC50': [],
                'toxicity_studies': []
            }
            
            if response.ok:
                data = response.json()
                if 'AssaySummaries' in data:
                    for assay in data['AssaySummaries']:
                        if any(term in assay.get('Name', '').lower() for term in ['toxic', 'ld50', 'lc50', 'ec50', 'ic50']):
                            organism = assay.get('OrganismName', '')
                            name = assay.get('Name', '')
                            value = assay.get('Value', '')
                            
                            study = {
                                'organism': organism,
                                'assay_name': name,
                                'value': value
                            }
                            
                            if 'ld50' in name.lower():
                                tox_data['LD50'].append(study)
                            elif 'lc50' in name.lower():
                                tox_data['LC50'].append(study)
                            elif 'ec50' in name.lower():
                                tox_data['EC50'].append(study)
                            elif 'ic50' in name.lower():
                                tox_data['IC50'].append(study)
                            else:
                                tox_data['toxicity_studies'].append(study)
            
            return tox_data
            
        except Exception as e:
            logger.error(f"PubChem toxicity data error: {e}")
            return {}

    def get_chembl_toxicity(self, chembl_id: str) -> Dict:
        """
        Retrieve toxicity data from ChEMBL
        """
        try:
            # Get activities data
            url = f"{self.chembl_base_url}/activity"
            params = {
                'molecule_chembl_id': chembl_id,
                'type': 'IC50,EC50,LC50,LD50,TOX'
            }
            
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            tox_data = {
                'activities': []
            }
            
            if response.ok:
                data = response.json()
                for activity in data.get('activities', []):
                    study = {
                        'assay_type': activity.get('standard_type', ''),
                        'organism': activity.get('assay_organism', ''),
                        'target_name': activity.get('target_pref_name', ''),
                        'value': activity.get('standard_value', ''),
                        'units': activity.get('standard_units', ''),
                        'activity_comment': activity.get('activity_comment', '')
                    }
                    tox_data['activities'].append(study)
            
            return tox_data
            
        except Exception as e:
            logger.error(f"ChEMBL toxicity data error: {e}")
            return {}

    def get_pubchem_info(self, smiles: str) -> Dict:
        """
        Retrieve compound information from PubChem using SMILES
        """
        try:
            encoded_smiles = requests.utils.quote(smiles)
            url = f"{self.pubchem_base_url}/compound/smiles/{encoded_smiles}/property/IUPACName,MolecularFormula,ExactMass,CanonicalSMILES,Title/JSON"
            
            response = self.session.get(url)
            response.raise_for_status()
            
            data = response.json()
            if 'PropertyTable' in data and 'Properties' in data['PropertyTable']:
                properties = data['PropertyTable']['Properties'][0]
                
                # Get CID for toxicity data
                cid = properties.get('CID')
                if cid:
                    # Get toxicity data
                    tox_data = self.get_pubchem_toxicity(cid)
                    properties['toxicity_data'] = tox_data
                    
                    # Get description
                    try:
                        desc_url = f"{self.pubchem_base_url}/compound/cid/{cid}/description/JSON"
                        desc_response = self.session.get(desc_url)
                        if desc_response.ok:
                            desc_data = desc_response.json()
                            if 'InformationList' in desc_data and 'Information' in desc_data['InformationList']:
                                properties['Description'] = desc_data['InformationList']['Information'][0].get('Description', '')
                    except Exception as e:
                        logger.warning(f"Could not fetch description: {e}")
                        properties['Description'] = ''
                
                return properties
            
            return {'error': 'No PubChem data found'}
            
        except requests.exceptions.RequestException as e:
            logger.error(f"PubChem API error: {e}")
            return {'error': str(e)}
        
        finally:
            time.sleep(0.3)

    def get_chembl_info(self, smiles: str) -> Dict:
        """
        Retrieve compound information from ChEMBL using SMILES
        """
        try:
            url = f"{self.chembl_base_url}/molecule"
            params = {
                'molecule_structures__canonical_smiles__exact': smiles
            }
            
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            
            if not data.get('molecules'):
                params = {
                    'molecule_structures__canonical_smiles__flexmatch': smiles
                }
                response = self.session.get(url, params=params)
                response.raise_for_status()
                data = response.json()
            
            if not data.get('molecules'):
                return {'error': 'No ChEMBL data found'}
                
            molecule = data['molecules'][0]
            
            result = {
                'chembl_id': molecule.get('molecule_chembl_id'),
                'pref_name': molecule.get('pref_name'),
                'molecule_type': molecule.get('molecule_type'),
                'max_phase': molecule.get('max_phase'),
                'molecular_formula': molecule.get('molecule_properties', {}).get('full_molformula'),
                'molecular_weight': molecule.get('molecule_properties', {}).get('full_mwt'),
            }
            
            # Get toxicity data
            if result['chembl_id']:
                tox_data = self.get_chembl_toxicity(result['chembl_id'])
                result['toxicity_data'] = tox_data
            
            # Get mechanism data
            try:
                if result['chembl_id']:
                    activity_url = f"{self.chembl_base_url}/mechanism"
                    params = {'molecule_chembl_id': result['chembl_id']}
                    activity_response = self.session.get(activity_url, params=params)
                    if activity_response.ok:
                        activity_data = activity_response.json()
                        if activity_data.get('mechanisms'):
                            result['mechanism_of_action'] = activity_data['mechanisms'][0].get('mechanism_of_action')
                            result['target_name'] = activity_data['mechanisms'][0].get('target_name')
            except Exception as e:
                logger.warning(f"Could not fetch mechanism data: {e}")
            
            return result
            
        except requests.exceptions.RequestException as e:
            logger.error(f"ChEMBL API error: {e}")
            return {'error': str(e)}
        
        finally:
            time.sleep(0.3)

    def validate_smiles(self, smiles: str) -> bool:
        """
        Validate SMILES string using RDKit
        """
        try:
            mol = Chem.MolFromSmiles(smiles)
            return mol is not None
        except Exception as e:
            logger.error(f"SMILES validation error: {e}")
            return False

def format_toxicity_data(tox_data: Dict) -> Dict:
    """
    Format toxicity data for CSV output
    """
    formatted = {}
    
    # Format PubChem toxicity data
    if 'LD50' in tox_data:
        formatted['LD50_studies'] = len(tox_data['LD50'])
        for i, study in enumerate(tox_data['LD50'][:3]):  # Take first 3 studies
            formatted[f'LD50_study_{i+1}'] = f"{study['organism']}: {study['value']}"
    
    if 'LC50' in tox_data:
        formatted['LC50_studies'] = len(tox_data['LC50'])
        for i, study in enumerate(tox_data['LC50'][:3]):
            formatted[f'LC50_study_{i+1}'] = f"{study['organism']}: {study['value']}"
    
    if 'toxicity_studies' in tox_data:
        formatted['other_tox_studies'] = len(tox_data['toxicity_studies'])
        for i, study in enumerate(tox_data['toxicity_studies'][:3]):
            formatted[f'tox_study_{i+1}'] = f"{study['organism']}: {study['assay_name']}"
    
    return formatted

def process_smiles_list(smiles_list: List[str], output_file: str = 'compound_info.csv'):
    """
    Process a list of SMILES strings and save results to CSV
    """
    crawler = CompoundCrawler()
    results = []
    
    total = len(smiles_list)
    for idx, smiles in enumerate(smiles_list, 1):
        logger.info(f"Processing compound {idx}/{total}: {smiles}")
        
        if not crawler.validate_smiles(smiles):
            logger.warning(f"Invalid SMILES: {smiles}")
            continue
        
        pubchem_data = crawler.get_pubchem_info(smiles)
        chembl_data = crawler.get_chembl_info(smiles)
        
        # Format toxicity data
        pubchem_tox = format_toxicity_data(pubchem_data.get('toxicity_data', {}))
        chembl_tox = {}
        if 'toxicity_data' in chembl_data and 'activities' in chembl_data['toxicity_data']:
            for i, activity in enumerate(chembl_data['toxicity_data']['activities'][:5]):
                chembl_tox[f'chembl_tox_{i+1}'] = (
                    f"{activity['assay_type']} - {activity['organism']}: "
                    f"{activity['value']} {activity['units']}"
                )
        
        result = {
            'SMILES': smiles,
            'PubChem_IUPAC_Name': pubchem_data.get('IUPACName', ''),
            'PubChem_Formula': pubchem_data.get('MolecularFormula', ''),
            'PubChem_Mass': pubchem_data.get('ExactMass', ''),
            'PubChem_Title': pubchem_data.get('Title', ''),
            'PubChem_Description': pubchem_data.get('Description', ''),
            'ChEMBL_ID': chembl_data.get('chembl_id', ''),
            'ChEMBL_Name': chembl_data.get('pref_name', ''),
            'ChEMBL_Formula': chembl_data.get('molecular_formula', ''),
            'ChEMBL_Weight': chembl_data.get('molecular_weight', ''),
            'ChEMBL_Type': chembl_data.get('molecule_type', ''),
            'ChEMBL_Phase': chembl_data.get('max_phase', ''),
            'ChEMBL_Mechanism': chembl_data.get('mechanism_of_action', ''),
            'ChEMBL_Target': chembl_data.get('target_name', ''),
            **pubchem_tox,  # Add PubChem toxicity data
            **chembl_tox    # Add ChEMBL toxicity data
        }
        
        results.append(result)
        
        # Save intermediate results every 10 compounds
        if idx % 10 == 0:
            pd.DataFrame(results).to_csv(output_file, index=False)
            logger.info(f"Intermediate results saved to {output_file}")
    
    # Final save
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    logger.info(f"Final results saved to {output_file}")
    return df

# Example usage
if __name__ == "__main__":
    smiles_list = [
        # "CC(=O)OC1=CC=CC=C1C(=O)O",  # Aspirin
        # "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"  # Caffeine
        "C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N",
        "C1C(C(C(C(C1N)OC2C(C(C(C(O2)CN)O)O)N)OC3C(C(C(O3)CO)OC4C(C(C(C(O4)CN)O)O)N)O)O)N"
    ]
    
    df = process_smiles_list(smiles_list, 'compound_info_with_toxicity.csv')

INFO:__main__:Processing compound 1/2: C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N
INFO:__main__:Processing compound 2/2: C1C(C(C(C(C1N)OC2C(C(C(C(O2)CN)O)O)N)OC3C(C(C(O3)CO)OC4C(C(C(C(O4)CN)O)O)N)O)O)N
INFO:__main__:Final results saved to compound_info_with_toxicity.csv


In [2]:
df

Unnamed: 0,SMILES,PubChem_IUPAC_Name,PubChem_Formula,PubChem_Mass,PubChem_Title,PubChem_Description,ChEMBL_ID,ChEMBL_Name,ChEMBL_Formula,ChEMBL_Weight,ChEMBL_Type,ChEMBL_Phase,ChEMBL_Mechanism,ChEMBL_Target,LD50_studies,LC50_studies,other_tox_studies
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,"7-[(1S,5R)-6-[[(2S)-2-[[(2S)-2-aminopropanoyl]...",C26H25F3N6O5,558.18385241,Alatrofloxacin,,CHEMBL1200498,ALATROFLOXACIN MESYLATE,C27H29F3N6O8S,654.62,Small molecule,4.0,Topoisomerase IV inhibitor,,0,0,0
1,C1C(C(C(C(C1N)OC2C(C(C(C(O2)CN)O)O)N)OC3C(C(C(...,"5-amino-2-(aminomethyl)-6-[4,6-diamino-2-[4-[3...",C23H46N6O13,614.31228554,"4,6-Diamino-2-{[3-o-(2,6-diamino-2,6-dideoxyhe...",,CHEMBL266347,,C23H46N6O13,614.65,Small molecule,,,,0,0,0


In [3]:
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
pd_train

Unnamed: 0,Smiles,Liver
0,S=C=Nc1c2c(ccc1)cccc2,Hepatotoxicity
1,c1(c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[...,Hepatotoxicity
2,c1(c(cc(cc1)[N+](=O)[O-])[N+](=O)[O-])O,Hepatotoxicity
3,O(CCO)CC,Hepatotoxicity
4,Oc1cc2c(cc1)cccc2,Hepatotoxicity
...,...,...
1236,N1C(=NN=N1)c1ccccc1c1ccc(cc1)CN([C@H](C(=O)O)C...,Hepatotoxicity
1237,n1cc(c(c(c1C)O)CO)CO,NonHepatotoxicity
1238,O1[C@H](C(=C(C1=O)O)O)[C@@H](CO)O,NonHepatotoxicity
1239,N(c1ccccc1)C(=O)CCCCCCC(=O)NO,NonHepatotoxicity


In [None]:
%%time
smiles_list = pd_train["Smiles"].tolist()
pd_train_metadata = process_smiles_list(smiles_list, 'train_compound_info_with_toxicity.csv')
print(pd_train_metadata.shape)
pd_train_metadata.head()

In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import pickle
from typing import List, Dict, Tuple
import logging
import json
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ToxicityEmbedding:
    def __init__(self, api_key: str):
        """
        Initialize the embedding creator with OpenAI API key
        """
        self.client = OpenAI(api_key=api_key)
        self.model = "text-embedding-ada-002"
        
    def create_compound_text(self, row: pd.Series) -> str:
        """
        Create a text representation of compound information
        """
        text_parts = []
        
        # Basic information
        if row.get('PubChem_IUPAC_Name'):
            text_parts.append(f"IUPAC Name: {row['PubChem_IUPAC_Name']}")
        if row.get('PubChem_Description'):
            text_parts.append(f"Description: {row['PubChem_Description']}")
        if row.get('ChEMBL_Mechanism'):
            text_parts.append(f"Mechanism: {row['ChEMBL_Mechanism']}")
        if row.get('ChEMBL_Target'):
            text_parts.append(f"Target: {row['ChEMBL_Target']}")
            
        # Toxicity information
        tox_parts = []
        # Add LD50 studies
        for col in row.index:
            if 'LD50_study' in col and pd.notna(row[col]):
                tox_parts.append(f"LD50: {row[col]}")
            elif 'LC50_study' in col and pd.notna(row[col]):
                tox_parts.append(f"LC50: {row[col]}")
            elif 'tox_study' in col and pd.notna(row[col]):
                tox_parts.append(f"Toxicity: {row[col]}")
            elif 'chembl_tox' in col and pd.notna(row[col]):
                tox_parts.append(f"ChEMBL toxicity: {row[col]}")
        
        if tox_parts:
            text_parts.append("Toxicity Data: " + "; ".join(tox_parts))
        
        return " ".join(text_parts)
    
    def get_embedding(self, text: str) -> List[float]:
        """
        Get embedding from OpenAI API
        """
        try:
            response = self.client.embeddings.create(
                model=self.model,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            logger.error(f"Error getting embedding: {e}")
            raise

    def process_compounds(self, 
                         data_path: str, 
                         labels_path: str,
                         output_path: str = 'compound_embeddings.pkl') -> Tuple[np.ndarray, np.ndarray]:
        """
        Process compounds and create embeddings
        """
        # Load data
        df = pd.read_csv(data_path)
        labels_df = pd.read_csv(labels_path)
        
        # Merge with labels
        df = df.merge(labels_df, on='SMILES', how='inner')
        
        embeddings = []
        texts = []
        labels = []
        
        # Process each compound
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing compounds"):
            try:
                # Create text representation
                text = self.create_compound_text(row)
                texts.append(text)
                
                # Get embedding
                embedding = self.get_embedding(text)
                embeddings.append(embedding)
                
                # Store label
                labels.append(row['label'])
                
            except Exception as e:
                logger.error(f"Error processing compound {idx}: {e}")
                continue
        
        # Convert to numpy arrays
        X = np.array(embeddings)
        y = np.array(labels)
        
        # Save the processed data
        with open(output_path, 'wb') as f:
            pickle.dump({
                'embeddings': X,
                'labels': y,
                'texts': texts
            }, f)
        
        return X, y

def train_classifier(X: np.ndarray, 
                    y: np.ndarray, 
                    model_type: str = 'lgb',
                    test_size: float = 0.2,
                    random_state: int = 42):
    """
    Train a classifier on the embeddings
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train model
    if model_type == 'lgb':
        import lightgbm as lgb
        model = lgb.LGBMClassifier(random_state=random_state)
    elif model_type == 'xgb':
        import xgboost as xgb
        model = xgb.XGBClassifier(random_state=random_state)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    
    # Train
    model.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test_scaled)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Save model and scaler
    with open(f'toxicity_{model_type}_model.pkl', 'wb') as f:
        pickle.dump({
            'model': model,
            'scaler': scaler
        }, f)
    
    return model, scaler

def predict_new_compounds(model_path: str,
                        embedder: ToxicityEmbedding,
                        new_data_path: str):
    """
    Predict toxicity for new compounds
    """
    # Load model and scaler
    with open(model_path, 'rb') as f:
        model_dict = pickle.load(f)
    model = model_dict['model']
    scaler = model_dict['scaler']
    
    # Load new data
    df = pd.read_csv(new_data_path)
    
    predictions = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Predicting"):
        try:
            # Create text and embedding
            text = embedder.create_compound_text(row)
            embedding = embedder.get_embedding(text)
            
            # Scale and predict
            X = scaler.transform([embedding])
            pred = model.predict(X)[0]
            prob = model.predict_proba(X)[0]
            
            predictions.append({
                'SMILES': row['SMILES'],
                'prediction': int(pred),
                'probability': float(prob[1])
            })
            
        except Exception as e:
            logger.error(f"Error predicting compound {idx}: {e}")
            continue
    
    # Save predictions
    pred_df = pd.DataFrame(predictions)
    pred_df.to_csv('predictions.csv', index=False)
    return pred_df

# Example usage
if __name__ == "__main__":
    # Initialize with your OpenAI API key
    OPENAI_API_KEY = ""
    
    # Create embedder
    embedder = ToxicityEmbedding(OPENAI_API_KEY)
    
    # Process compounds
    X, y = embedder.process_compounds(
        data_path='compound_info_with_toxicity.csv',
        labels_path='compound_labels.csv'  # CSV with SMILES and label columns
    )
    
    # Train classifier
    model, scaler = train_classifier(X, y, model_type='lgb')
    
    # Example of predicting new compounds
    predictions = predict_new_compounds(
        model_path='toxicity_lgb_model.pkl',
        embedder=embedder,
        new_data_path='new_compounds.csv'
    )