In [24]:
import requests
import time
import pandas as pd
from typing import List, Dict
import logging
from rdkit import Chem
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from queue import Queue
import itertools
from datetime import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Data crawling

In [25]:
class RateLimiter:
    def __init__(self, calls_per_second: float):
        self.delay = 1.0 / calls_per_second
        self.last_call = {}
        self.locks = {}
    
    def wait(self, key: str):
        if key not in self.locks:
            self.locks[key] = Lock()
            self.last_call[key] = 0
            
        with self.locks[key]:
            now = time.time()
            time_passed = now - self.last_call[key]
            if time_passed < self.delay:
                time.sleep(self.delay - time_passed)
            self.last_call[key] = time.time()

In [26]:
class CompoundCrawler:
    def __init__(self, max_workers: int = 4):
        self.pubchem_base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.chembl_base_url = "https://www.ebi.ac.uk/chembl/api/data"
        self.max_workers = max_workers
        self.rate_limiter = RateLimiter(calls_per_second=3)  # 3 requests per second
        self.results_queue = Queue()
        self.session = self._create_session()

    def _create_session(self):
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Python CompoundCrawler/1.0 (responsible data mining)',
            'Accept': 'application/json'
        })
        return session

    def _make_request(self, url: str, params: Dict = None, api_type: str = 'pubchem'):
        """Make a rate-limited API request"""
        self.rate_limiter.wait(api_type)
        try:
            response = self.session.get(url, params=params)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"API request error ({url}): {e}")
            return None

    def process_compound(self, smiles: str, idx: int, total: int) -> Dict:
        """Process a single compound with both PubChem and ChEMBL data"""
        try:
            logger.info(f"Processing compound {idx}/{total}: {smiles}")
            
            if not self.validate_smiles(smiles):
                logger.warning(f"Invalid SMILES: {smiles}")
                return None
            
            pubchem_data = self.get_pubchem_info(smiles)
            chembl_data = self.get_chembl_info(smiles)
            
            # Format toxicity data
            pubchem_tox = format_toxicity_data(pubchem_data.get('toxicity_data', {}))
            chembl_tox = {}
            if 'toxicity_data' in chembl_data and 'activities' in chembl_data['toxicity_data']:
                for i, activity in enumerate(chembl_data['toxicity_data']['activities'][:5]):
                    chembl_tox[f'chembl_tox_{i+1}'] = (
                        f"{activity['assay_type']} - {activity['organism']}: "
                        f"{activity['value']} {activity['units']}"
                    )
            
            result = {
                'SMILES': smiles,
                'PubChem_IUPAC_Name': pubchem_data.get('IUPACName', ''),
                'PubChem_Formula': pubchem_data.get('MolecularFormula', ''),
                'PubChem_Mass': pubchem_data.get('ExactMass', ''),
                'PubChem_Title': pubchem_data.get('Title', ''),
                'PubChem_Description': pubchem_data.get('Description', ''),
                'ChEMBL_ID': chembl_data.get('chembl_id', ''),
                'ChEMBL_Name': chembl_data.get('pref_name', ''),
                'ChEMBL_Formula': chembl_data.get('molecular_formula', ''),
                'ChEMBL_Weight': chembl_data.get('molecular_weight', ''),
                'ChEMBL_Type': chembl_data.get('molecule_type', ''),
                'ChEMBL_Phase': chembl_data.get('max_phase', ''),
                'ChEMBL_Mechanism': chembl_data.get('mechanism_of_action', ''),
                'ChEMBL_Target': chembl_data.get('target_name', ''),
                **pubchem_tox,
                **chembl_tox
            }
            
            return result
        
        except Exception as e:
            logger.error(f"Error processing compound {smiles}: {e}")
            return None
        
    def validate_smiles(self, smiles: str) -> bool:
        """
        Validate SMILES string using RDKit
        """
        try:
            mol = Chem.MolFromSmiles(smiles)
            return mol is not None
        except Exception as e:
            logger.error(f"SMILES validation error: {e}")
            return False
        
    def get_pubchem_info(self, smiles: str) -> Dict:
        """
        Retrieve compound information from PubChem using SMILES
        """
        try:
            encoded_smiles = requests.utils.quote(smiles)
            url = f"{self.pubchem_base_url}/compound/smiles/{encoded_smiles}/property/IUPACName,MolecularFormula,ExactMass,CanonicalSMILES,Title/JSON"
            
            response = self.session.get(url)
            response.raise_for_status()
            
            data = response.json()
            if 'PropertyTable' in data and 'Properties' in data['PropertyTable']:
                properties = data['PropertyTable']['Properties'][0]
                
                # Get CID for toxicity data
                cid = properties.get('CID')
                if cid:
                    # Get toxicity data
                    tox_data = self.get_pubchem_toxicity(cid)
                    properties['toxicity_data'] = tox_data
                    
                    # Get description
                    try:
                        desc_url = f"{self.pubchem_base_url}/compound/cid/{cid}/description/JSON"
                        desc_response = self.session.get(desc_url)
                        if desc_response.ok:
                            desc_data = desc_response.json()
                            if 'InformationList' in desc_data and 'Information' in desc_data['InformationList']:
                                properties['Description'] = desc_data['InformationList']['Information'][0].get('Description', '')
                    except Exception as e:
                        logger.warning(f"Could not fetch description: {e}")
                        properties['Description'] = ''
                
                return properties
            
            return {'error': 'No PubChem data found'}
            
        except requests.exceptions.RequestException as e:
            logger.error(f"PubChem API error: {e}")
            return {'error': str(e)}
        
        finally:
            time.sleep(0.3)

    def get_pubchem_toxicity(self, cid: str) -> Dict:
        """
        Retrieve toxicity data from PubChem
        """
        try:
            # Get bioassay data related to toxicity
            url = f"{self.pubchem_base_url}/compound/cid/{cid}/assaysummary/JSON"
            response = self.session.get(url)
            response.raise_for_status()
            
            tox_data = {
                'LD50': [],
                'LC50': [],
                'EC50': [],
                'IC50': [],
                'toxicity_studies': []
            }
            
            if response.ok:
                data = response.json()
                if 'AssaySummaries' in data:
                    for assay in data['AssaySummaries']:
                        if any(term in assay.get('Name', '').lower() for term in ['toxic', 'ld50', 'lc50', 'ec50', 'ic50']):
                            organism = assay.get('OrganismName', '')
                            name = assay.get('Name', '')
                            value = assay.get('Value', '')
                            
                            study = {
                                'organism': organism,
                                'assay_name': name,
                                'value': value
                            }
                            
                            if 'ld50' in name.lower():
                                tox_data['LD50'].append(study)
                            elif 'lc50' in name.lower():
                                tox_data['LC50'].append(study)
                            elif 'ec50' in name.lower():
                                tox_data['EC50'].append(study)
                            elif 'ic50' in name.lower():
                                tox_data['IC50'].append(study)
                            else:
                                tox_data['toxicity_studies'].append(study)
            
            return tox_data
            
        except Exception as e:
            logger.error(f"PubChem toxicity data error: {e}")
            return {}
        
    def get_chembl_toxicity(self, chembl_id: str) -> Dict:
        """
        Retrieve toxicity data from ChEMBL
        """
        try:
            # Get activities data
            url = f"{self.chembl_base_url}/activity"
            params = {
                'molecule_chembl_id': chembl_id,
                'type': 'IC50,EC50,LC50,LD50,TOX'
            }
            
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            tox_data = {
                'activities': []
            }
            
            if response.ok:
                data = response.json()
                for activity in data.get('activities', []):
                    study = {
                        'assay_type': activity.get('standard_type', ''),
                        'organism': activity.get('assay_organism', ''),
                        'target_name': activity.get('target_pref_name', ''),
                        'value': activity.get('standard_value', ''),
                        'units': activity.get('standard_units', ''),
                        'activity_comment': activity.get('activity_comment', '')
                    }
                    tox_data['activities'].append(study)
            
            return tox_data
            
        except Exception as e:
            logger.error(f"ChEMBL toxicity data error: {e}")
            return {}
        
    def get_chembl_info(self, smiles: str) -> Dict:
        """
        Retrieve compound information from ChEMBL using SMILES
        """
        try:
            url = f"{self.chembl_base_url}/molecule"
            params = {
                'molecule_structures__canonical_smiles__exact': smiles
            }
            
            response = self.session.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            
            if not data.get('molecules'):
                params = {
                    'molecule_structures__canonical_smiles__flexmatch': smiles
                }
                response = self.session.get(url, params=params)
                response.raise_for_status()
                data = response.json()
            
            if not data.get('molecules'):
                return {'error': 'No ChEMBL data found'}
                
            molecule = data['molecules'][0]
            
            result = {
                'chembl_id': molecule.get('molecule_chembl_id'),
                'pref_name': molecule.get('pref_name'),
                'molecule_type': molecule.get('molecule_type'),
                'max_phase': molecule.get('max_phase'),
                'molecular_formula': molecule.get('molecule_properties', {}).get('full_molformula'),
                'molecular_weight': molecule.get('molecule_properties', {}).get('full_mwt'),
            }
            
            # Get toxicity data
            if result['chembl_id']:
                tox_data = self.get_chembl_toxicity(result['chembl_id'])
                result['toxicity_data'] = tox_data
            
            # Get mechanism data
            try:
                if result['chembl_id']:
                    activity_url = f"{self.chembl_base_url}/mechanism"
                    params = {'molecule_chembl_id': result['chembl_id']}
                    activity_response = self.session.get(activity_url, params=params)
                    if activity_response.ok:
                        activity_data = activity_response.json()
                        if activity_data.get('mechanisms'):
                            result['mechanism_of_action'] = activity_data['mechanisms'][0].get('mechanism_of_action')
                            result['target_name'] = activity_data['mechanisms'][0].get('target_name')
            except Exception as e:
                logger.warning(f"Could not fetch mechanism data: {e}")
            
            return result
            
        except requests.exceptions.RequestException as e:
            logger.error(f"ChEMBL API error: {e}")
            return {'error': str(e)}
        
        finally:
            time.sleep(0.3)

In [27]:
def format_toxicity_data(tox_data: Dict) -> Dict:
    """
    Format toxicity data for CSV output
    """
    formatted = {}
    
    # Format PubChem toxicity data
    if 'LD50' in tox_data:
        formatted['LD50_studies'] = len(tox_data['LD50'])
        for i, study in enumerate(tox_data['LD50'][:3]):  # Take first 3 studies
            formatted[f'LD50_study_{i+1}'] = f"{study['organism']}: {study['value']}"
    
    if 'LC50' in tox_data:
        formatted['LC50_studies'] = len(tox_data['LC50'])
        for i, study in enumerate(tox_data['LC50'][:3]):
            formatted[f'LC50_study_{i+1}'] = f"{study['organism']}: {study['value']}"
    
    if 'toxicity_studies' in tox_data:
        formatted['other_tox_studies'] = len(tox_data['toxicity_studies'])
        for i, study in enumerate(tox_data['toxicity_studies'][:3]):
            formatted[f'tox_study_{i+1}'] = f"{study['organism']}: {study['assay_name']}"
    
    return formatted

def process_smiles_list(smiles_list: List[str], output_file: str = 'compound_info.csv', max_workers: int = 8):
    """
    Process a list of SMILES strings using multiple threads and save results to CSV
    """
    crawler = CompoundCrawler(max_workers=max_workers)
    results = []
    save_lock = Lock()
    total = len(smiles_list)
    
    def save_results(current_results: List[Dict], final: bool = False):
        """Save intermediate results to CSV"""
        if not current_results:
            return
            
        with save_lock:
            df = pd.DataFrame(current_results)
            mode = 'w' if final or not final else 'a'
            header = True if final or not final else False
            df.to_csv(output_file, mode=mode, header=header, index=False)
            logger.info(f"{'Final' if final else 'Intermediate'} results saved to {output_file}")

    def process_batch(batch: List[tuple]):
        """Process a batch of SMILES strings"""
        batch_results = []
        for idx, smiles in batch:
            result = crawler.process_compound(smiles, idx, total)
            if result:
                batch_results.append(result)
        return batch_results

    try:
        # Create batches of SMILES strings
        batch_size = 10
        smiles_with_index = list(enumerate(smiles_list, 1))
        batches = [smiles_with_index[i:i + batch_size] 
                  for i in range(0, len(smiles_with_index), batch_size)]

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_batch = {executor.submit(process_batch, batch): batch 
                             for batch in batches}

            for future in as_completed(future_to_batch):
                batch_results = future.result()
                if batch_results:
                    results.extend(batch_results)
                    
                    # Save intermediate results every batch_size compounds
                    if len(results) % batch_size == 0:
                        save_results(results, final=False)

        # Save final results
        save_results(results, final=True)
        return pd.DataFrame(results)

    except Exception as e:
        logger.error(f"Error in batch processing: {e}")
        # Try to save any results we have
        if results:
            save_results(results, final=True)
        return pd.DataFrame(results)

In [28]:
smiles_list = [
    "C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N",
    "C1C(C(C(C(C1N)OC2C(C(C(C(O2)CN)O)O)N)OC3C(C(C(O3)CO)OC4C(C(C(C(O4)CN)O)O)N)O)O)N"
]

start_time = time.time()
df = process_smiles_list(smiles_list, 'compound_info_with_toxicity.csv', max_workers=4)
end_time = time.time()

logger.info(f"Processing completed in {end_time - start_time:.2f} seconds")

INFO:__main__:Processing compound 1/2: C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N
INFO:__main__:Processing compound 2/2: C1C(C(C(C(C1N)OC2C(C(C(C(O2)CN)O)O)N)OC3C(C(C(O3)CO)OC4C(C(C(C(O4)CN)O)O)N)O)O)N
INFO:__main__:Final results saved to compound_info_with_toxicity.csv
INFO:__main__:Processing completed in 8.97 seconds


In [29]:
df

Unnamed: 0,SMILES,PubChem_IUPAC_Name,PubChem_Formula,PubChem_Mass,PubChem_Title,PubChem_Description,ChEMBL_ID,ChEMBL_Name,ChEMBL_Formula,ChEMBL_Weight,ChEMBL_Type,ChEMBL_Phase,ChEMBL_Mechanism,ChEMBL_Target,LD50_studies,LC50_studies,other_tox_studies
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,"7-[(1S,5R)-6-[[(2S)-2-[[(2S)-2-aminopropanoyl]...",C26H25F3N6O5,558.18385241,Alatrofloxacin,,CHEMBL1200498,ALATROFLOXACIN MESYLATE,C27H29F3N6O8S,654.62,Small molecule,4.0,Topoisomerase IV inhibitor,,0,0,0
1,C1C(C(C(C(C1N)OC2C(C(C(C(O2)CN)O)O)N)OC3C(C(C(...,"5-amino-2-(aminomethyl)-6-[4,6-diamino-2-[4-[3...",C23H46N6O13,614.31228554,"4,6-Diamino-2-{[3-o-(2,6-diamino-2,6-dideoxyhe...",,CHEMBL266347,,C23H46N6O13,614.65,Small molecule,,,,0,0,0


## Training data

In [31]:
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
pd_train.head()

Unnamed: 0,Smiles,Liver
0,S=C=Nc1c2c(ccc1)cccc2,Hepatotoxicity
1,c1(c(cc(cc1[N+](=O)[O-])[N+](=O)[O-])[N+](=O)[...,Hepatotoxicity
2,c1(c(cc(cc1)[N+](=O)[O-])[N+](=O)[O-])O,Hepatotoxicity
3,O(CCO)CC,Hepatotoxicity
4,Oc1cc2c(cc1)cccc2,Hepatotoxicity


In [35]:
%%time
smiles_list = pd_train["Smiles"].tolist()
pd_train_metadata = process_smiles_list(smiles_list, 'train_compound_info_with_toxicity.csv', max_workers=8)
print(pd_train_metadata.shape)
pd_train_metadata.head()

INFO:__main__:Processing compound 1/1241: S=C=Nc1c2c(ccc1)cccc2
INFO:__main__:Processing compound 11/1241: OC(C)(C)C#N
INFO:__main__:Processing compound 21/1241: O[C@@H](CNC(C)(C)C)c1cc(c(O)cc1)CO
INFO:__main__:Processing compound 31/1241: Oc1cc([C@]2([C@H](CN(CC2)C[C@H](Cc2ccccc2)C(=O)NCC(=O)O)C)C)ccc1
INFO:__main__:Processing compound 41/1241: S=C(N/N=C/c1ccc(NC(=O)C)cc1)N
INFO:__main__:Processing compound 51/1241: Clc1c2CN3C(=NC(=O)C3)Nc2ccc1Cl
INFO:__main__:Processing compound 61/1241: O[C@@H](CNCCCCc1ccc(O)cc1)c1cc(O)c(O)cc1
INFO:__main__:Processing compound 71/1241: N1(CCC(=C2c3c(CCc4c2nccc4)cccc3)CC1)C
ERROR:__main__:PubChem API error: 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/N1%28CCC%28%3DC2c3c%28CCc4c2nccc4%29cccc3%29CC1%29C/property/IUPACName,MolecularFormula,ExactMass,CanonicalSMILES,Title/JSON
ERROR:__main__:PubChem API error: 400 Client Error: PUGREST.BadRequest for url: https://pubchem.ncbi.nlm.nih.gov/rest/pu

(1241, 17)
CPU times: user 35.7 s, sys: 6.06 s, total: 41.8 s
Wall time: 12min 48s


Unnamed: 0,SMILES,PubChem_IUPAC_Name,PubChem_Formula,PubChem_Mass,PubChem_Title,PubChem_Description,ChEMBL_ID,ChEMBL_Name,ChEMBL_Formula,ChEMBL_Weight,ChEMBL_Type,ChEMBL_Phase,ChEMBL_Mechanism,ChEMBL_Target,LD50_studies,LC50_studies,other_tox_studies
0,O[C@@H](CNC(C)(C)C)c1cc(c(O)cc1)CO,4-[(1R)-2-(tert-butylamino)-1-hydroxyethyl]-2-...,C13H21NO3,239.15214353,Levalbuterol,,CHEMBL714,ALBUTEROL,C13H21NO3,239.31,Small molecule,4.0,Beta-2 adrenergic receptor agonist,,0.0,0.0,0.0
1,Cl[C@H]1[C@H]2[C@H]3[C@@]([C@](O)([C@@H](C3)C)...,"(7R,8S,9S,10R,11S,13S,14S,16R,17R)-7-chloro-11...",C22H29ClO5,408.1703517,Alclometasone,,CHEMBL1201361,ALCLOMETASONE,C22H29ClO5,408.92,Small molecule,,,,0.0,0.0,0.0
2,Cl[C@@]12[C@H]3[C@@H]([C@@](Cl)(C1(Cl)Cl)C(=C2...,"(1S,2S,3S,6R,7R,8R)-1,8,9,10,11,11-hexachlorot...",C12H8Cl6,363.872766,Aldrin,,CHEMBL195953,ALDRIN,C12H8Cl6,364.91,Small molecule,,,,0.0,0.0,0.0
3,O1[C@@H](CCC1)C(=O)NCCCN(c1nc2c(c(n1)N)cc(OC)c...,,,,,,CHEMBL709,ALFUZOSIN,C19H27N5O4,389.46,Small molecule,4.0,,,,,
4,OC(=O)/C=C(/C=C/C=C(\C=C\C1=C(CCCC1(C)C)C)/C)\C,,,,,,CHEMBL38,TRETINOIN,C20H28O2,300.44,Small molecule,4.0,Retinoic acid receptor agonist,,,,


In [33]:
pd_test = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test.head()

Unnamed: 0,Smiles,Liver
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,Hepatotoxicity
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,Hepatotoxicity
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,Hepatotoxicity


In [34]:
%%time
smiles_list = pd_test["Smiles"].tolist()
pd_test_metadata = process_smiles_list(smiles_list, 'test_compound_info_with_toxicity.csv', max_workers=8)
print(pd_test_metadata.shape)
pd_test_metadata.head()

INFO:__main__:Processing compound 1/286: C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N
INFO:__main__:Processing compound 11/286: CN1C(S(=O)(=O)CCC1=O)C2=CC=C(C=C2)Cl
INFO:__main__:Processing compound 21/286: C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C(=O)COC4=CC=C(C=C4)Cl
INFO:__main__:Processing compound 31/286: CS(=O)(=O)NC1=C(C=C(C=C1)[N+](=O)[O-])OC2=CC=CC=C2
INFO:__main__:Processing compound 41/286: C1=CC(=CC=C1N)S(=O)(=O)NC(=O)N
INFO:__main__:Processing compound 51/286: C1[C@@H]2[C@@H](C2N)CN1C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F
INFO:__main__:Processing compound 61/286: C1=CN=CC=C1C(=O)NN
INFO:__main__:Processing compound 71/286: CCCC1=CC(=O)NC(=S)N1
ERROR:__main__:PubChem toxicity data error: 503 Server Error: PUGREST.ServerBusy for url: https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/62959/assaysummary/JSON
ERROR:__main__:PubChem toxicity data error: 503 Server Error: PUGREST.ServerBusy for url: https://pubche

(286, 17)
CPU times: user 9.38 s, sys: 1.64 s, total: 11 s
Wall time: 3min 22s


Unnamed: 0,SMILES,PubChem_IUPAC_Name,PubChem_Formula,PubChem_Mass,PubChem_Title,PubChem_Description,ChEMBL_ID,ChEMBL_Name,ChEMBL_Formula,ChEMBL_Weight,ChEMBL_Type,ChEMBL_Phase,ChEMBL_Mechanism,ChEMBL_Target,LD50_studies,LC50_studies,other_tox_studies
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,"7-[(1S,5R)-6-[[(2S)-2-[[(2S)-2-aminopropanoyl]...",C26H25F3N6O5,558.18385241,Alatrofloxacin,,CHEMBL1200498,ALATROFLOXACIN MESYLATE,C27H29F3N6O8S,654.62,Small molecule,4.0,Topoisomerase IV inhibitor,,,,
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,2-(3-chloro-4-prop-2-enoxyphenyl)acetic acid,C11H11ClO3,226.0396719,Alclofenac,,CHEMBL94081,ALCLOFENAC,C11H11ClO3,226.66,Small molecule,4.0,Cyclooxygenase-1 inhibitor,,0.0,0.0,0.0
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,"2-[6-chloro-2-(4-chlorophenyl)imidazo[1,2-a]py...",C21H23Cl2N3O,403.1218178,Alpidem,,CHEMBL54349,ALPIDEM,C21H23Cl2N3O,404.34,Small molecule,4.0,Translocator protein partial agonist,,0.0,0.0,0.0
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,"7-(2-tricyclo[9.4.0.03,8]pentadeca-1(15),3,5,7...",C22H27NO2,337.204179104,Amineptine,,CHEMBL418995,AMINEPTINE,C22H27NO2,337.46,Small molecule,4.0,Dopamine D1 and D2 receptor antagonist,,0.0,0.0,0.0
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,2-(1-benzylindazol-3-yl)oxyacetic acid,C16H14N2O3,282.10044231,Bendazac,,CHEMBL1089221,BENDAZAC,C16H14N2O3,282.3,Small molecule,4.0,Xanthine dehydrogenase inhibitor,,0.0,0.0,0.0


# OpenAI embedding

In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import pickle
from typing import List, Dict, Tuple
import logging
import json
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ToxicityEmbedding:
    def __init__(self, api_key: str):
        """
        Initialize the embedding creator with OpenAI API key
        """
        self.client = OpenAI(api_key=api_key)
        self.model = "text-embedding-ada-002"
        
    def create_compound_text(self, row: pd.Series) -> str:
        """
        Create a text representation of compound information
        """
        text_parts = []
        
        # Basic information
        if row.get('PubChem_IUPAC_Name'):
            text_parts.append(f"IUPAC Name: {row['PubChem_IUPAC_Name']}")
        if row.get('PubChem_Description'):
            text_parts.append(f"Description: {row['PubChem_Description']}")
        if row.get('ChEMBL_Mechanism'):
            text_parts.append(f"Mechanism: {row['ChEMBL_Mechanism']}")
        if row.get('ChEMBL_Target'):
            text_parts.append(f"Target: {row['ChEMBL_Target']}")
            
        # Toxicity information
        tox_parts = []
        # Add LD50 studies
        for col in row.index:
            if 'LD50_study' in col and pd.notna(row[col]):
                tox_parts.append(f"LD50: {row[col]}")
            elif 'LC50_study' in col and pd.notna(row[col]):
                tox_parts.append(f"LC50: {row[col]}")
            elif 'tox_study' in col and pd.notna(row[col]):
                tox_parts.append(f"Toxicity: {row[col]}")
            elif 'chembl_tox' in col and pd.notna(row[col]):
                tox_parts.append(f"ChEMBL toxicity: {row[col]}")
        
        if tox_parts:
            text_parts.append("Toxicity Data: " + "; ".join(tox_parts))
        
        return " ".join(text_parts)
    
    def get_embedding(self, text: str) -> List[float]:
        """
        Get embedding from OpenAI API
        """
        try:
            response = self.client.embeddings.create(
                model=self.model,
                input=text
            )
            return response.data[0].embedding
        except Exception as e:
            logger.error(f"Error getting embedding: {e}")
            raise

    def process_compounds(self, 
                         data_path: str, 
                         labels_path: str,
                         output_path: str = 'compound_embeddings.pkl') -> Tuple[np.ndarray, np.ndarray]:
        """
        Process compounds and create embeddings
        """
        # Load data
        df = pd.read_csv(data_path)
        labels_df = pd.read_csv(labels_path)
        
        # Merge with labels
        df = df.merge(labels_df, on='SMILES', how='inner')
        
        embeddings = []
        texts = []
        labels = []
        
        # Process each compound
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing compounds"):
            try:
                # Create text representation
                text = self.create_compound_text(row)
                texts.append(text)
                
                # Get embedding
                embedding = self.get_embedding(text)
                embeddings.append(embedding)
                
                # Store label
                labels.append(row['label'])
                
            except Exception as e:
                logger.error(f"Error processing compound {idx}: {e}")
                continue
        
        # Convert to numpy arrays
        X = np.array(embeddings)
        y = np.array(labels)
        
        # Save the processed data
        with open(output_path, 'wb') as f:
            pickle.dump({
                'embeddings': X,
                'labels': y,
                'texts': texts
            }, f)
        
        return X, y

def train_classifier(X: np.ndarray, 
                    y: np.ndarray, 
                    model_type: str = 'lgb',
                    test_size: float = 0.2,
                    random_state: int = 42):
    """
    Train a classifier on the embeddings
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train model
    if model_type == 'lgb':
        import lightgbm as lgb
        model = lgb.LGBMClassifier(random_state=random_state)
    elif model_type == 'xgb':
        import xgboost as xgb
        model = xgb.XGBClassifier(random_state=random_state)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    
    # Train
    model.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test_scaled)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Save model and scaler
    with open(f'toxicity_{model_type}_model.pkl', 'wb') as f:
        pickle.dump({
            'model': model,
            'scaler': scaler
        }, f)
    
    return model, scaler

def predict_new_compounds(model_path: str,
                        embedder: ToxicityEmbedding,
                        new_data_path: str):
    """
    Predict toxicity for new compounds
    """
    # Load model and scaler
    with open(model_path, 'rb') as f:
        model_dict = pickle.load(f)
    model = model_dict['model']
    scaler = model_dict['scaler']
    
    # Load new data
    df = pd.read_csv(new_data_path)
    
    predictions = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Predicting"):
        try:
            # Create text and embedding
            text = embedder.create_compound_text(row)
            embedding = embedder.get_embedding(text)
            
            # Scale and predict
            X = scaler.transform([embedding])
            pred = model.predict(X)[0]
            prob = model.predict_proba(X)[0]
            
            predictions.append({
                'SMILES': row['SMILES'],
                'prediction': int(pred),
                'probability': float(prob[1])
            })
            
        except Exception as e:
            logger.error(f"Error predicting compound {idx}: {e}")
            continue
    
    # Save predictions
    pred_df = pd.DataFrame(predictions)
    pred_df.to_csv('predictions.csv', index=False)
    return pred_df

# Example usage
if __name__ == "__main__":
    # Initialize with your OpenAI API key
    OPENAI_API_KEY = ""
    
    # Create embedder
    embedder = ToxicityEmbedding(OPENAI_API_KEY)
    
    # Process compounds
    X, y = embedder.process_compounds(
        data_path='compound_info_with_toxicity.csv',
        labels_path='compound_labels.csv'  # CSV with SMILES and label columns
    )
    
    # Train classifier
    model, scaler = train_classifier(X, y, model_type='lgb')
    
    # Example of predicting new compounds
    predictions = predict_new_compounds(
        model_path='toxicity_lgb_model.pkl',
        embedder=embedder,
        new_data_path='new_compounds.csv'
    )