In [None]:
import requests
import time
import json
from typing import List, Dict, Optional
import csv
import pandas as pd

In [None]:
import os

# API token for CAS Common Chemistry. The CAS_API_KEY environment variable
# takes precedence so the token can be supplied without editing this file.
# NOTE(review): the literal fallback is a credential committed to source; it
# is kept only so existing runs keep working. Rotate it and drop the fallback.
API_KEY = os.environ.get("CAS_API_KEY", "GA0VdI73pl7Zy1hzH9Pvo454ALhUtlkk8crdhoPx")

In [None]:
class CASCommonChemistryAPI:
    """Client for CAS Common Chemistry API to map CAS RNs to SMILES strings.

    The client owns a ``requests.Session``. Call :meth:`close` when done, or
    use the instance as a context manager (``with CASCommonChemistryAPI(key)
    as client: ...``) to release the underlying connections.
    """

    BASE_URL = "https://commonchemistry.cas.org/api"

    # Column order of the CSV written by batch_cas_to_smiles; also the key
    # order of every result dictionary it returns.
    _FIELDNAMES = (
        "cas_rn",
        "name",
        "canonical_smiles",
        "molecular_formula",
        "molecular_mass",
        "inchi",
        "inchikey",
        "status",
    )

    def __init__(self, api_key: str):
        """
        Initialize the API client.

        Args:
            api_key: Your CAS Common Chemistry API token
        """
        self.api_key = api_key
        self.headers = {"X-Api-Key": api_key, "Content-Type": "application/json"}
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def __enter__(self):
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the session on leaving the ``with`` block; never suppresses errors."""
        self.close()

    def get_detail(self, cas_rn: str) -> Optional[Dict]:
        """
        Get detailed information for a single CAS RN.

        Every failure (network error, non-200 status, undecodable body) is
        reported with ``print`` and turned into ``None``; nothing is raised
        for those cases.

        Args:
            cas_rn: CAS Registry Number (with or without dashes)

        Returns:
            Dictionary containing substance details, or None if the lookup
            failed or the substance was not found
        """
        url = f"{self.BASE_URL}/detail"
        params = {"cas_rn": cas_rn}

        try:
            response = self.session.get(url, params=params, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {cas_rn}: {e}")
            return None

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                # A 200 with a non-JSON body. Catching ValueError covers both
                # the decode error raised by newer requests releases and the
                # plain JSON decode errors raised by older ones.
                print(f"Request failed for {cas_rn}: {e}")
                return None

        if response.status_code == 404:
            print(f"CAS RN {cas_rn} not found in database")
            return None

        print(f"Error for {cas_rn}: Status {response.status_code}")
        return None

    def cas_to_smiles(self, cas_rn: str) -> Optional[str]:
        """
        Convert a single CAS RN to SMILES string.

        Args:
            cas_rn: CAS Registry Number

        Returns:
            The ``canonicalSmile`` field of the detail record, falling back to
            ``smile`` when that is missing or empty; None if the lookup failed
        """
        detail = self.get_detail(cas_rn)
        if detail:
            return detail.get("canonicalSmile") or detail.get("smile")
        return None

    @staticmethod
    def _build_row(cas_rn: str, detail: Optional[Dict]) -> Dict:
        """Turn one lookup result into a CSV/JSON row keyed by ``_FIELDNAMES``."""
        if detail:
            return {
                "cas_rn": cas_rn,
                "name": detail.get("name", ""),
                "canonical_smiles": detail.get("canonicalSmile")
                or detail.get("smile", ""),
                "molecular_formula": detail.get("molecularFormula", ""),
                "molecular_mass": detail.get("molecularMass", ""),
                "inchi": detail.get("inchi", ""),
                "inchikey": detail.get("inchiKey", ""),
                "status": "success",
            }

        # Blank row in the same column order, marked as a miss.
        row = dict.fromkeys(CASCommonChemistryAPI._FIELDNAMES, "")
        row["cas_rn"] = cas_rn
        row["status"] = "not_found"
        return row

    def batch_cas_to_smiles(
        self,
        cas_list: List[str],
        output_file: str = "cas_smiles_mapping.csv",
        delay: float = 0.1,
        checkpoint_interval: int = 100,
    ) -> List[Dict]:
        """
        Convert a list of CAS RNs to SMILES strings with progress tracking.

        Each row is written to ``output_file`` as soon as its lookup returns.
        A row's ``status`` is ``"success"`` when a detail record came back and
        ``"not_found"`` otherwise; because :meth:`get_detail` returns None for
        every kind of failure, ``"not_found"`` also covers request errors.

        Args:
            cas_list: List of CAS Registry Numbers
            output_file: Path to output CSV file (overwritten if it exists)
            delay: Delay between API calls in seconds (to respect rate
                limits); zero or negative disables the pause
            checkpoint_interval: Flush the CSV to disk every N molecules;
                zero or negative disables periodic flushing

        Returns:
            List of dictionaries with CAS RN, SMILES, and other data, in the
            same order as ``cas_list``
        """
        results = []
        total = len(cas_list)
        # Running tally so progress lines do not rescan all results.
        success_count = 0

        print(f"Starting batch conversion of {total} CAS RNs...")
        print(f"Results will be saved to: {output_file}")

        with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self._FIELDNAMES)
            writer.writeheader()

            for idx, cas_rn in enumerate(cas_list, 1):
                result = self._build_row(cas_rn, self.get_detail(cas_rn))
                if result["status"] == "success":
                    success_count += 1

                results.append(result)
                writer.writerow(result)

                # Progress update
                if idx % 10 == 0 or idx == total:
                    print(
                        f"Progress: {idx}/{total} ({idx / total * 100:.1f}%) - "
                        f"Successful: {success_count} ({success_count / idx * 100:.1f}%)"
                    )

                # Checkpoint save; the guard keeps an interval of 0 from
                # dividing by zero.
                if checkpoint_interval > 0 and idx % checkpoint_interval == 0:
                    csvfile.flush()
                    print(f"Checkpoint: Progress saved at {idx} molecules")

                # Rate limiting; no pause after the last item, and
                # time.sleep rejects negative values.
                if idx < total and delay > 0:
                    time.sleep(delay)

        print("\nBatch conversion complete!")
        print(f"Total processed: {total}")
        print(f"Successful: {success_count}")
        print(f"Not found: {len(results) - success_count}")
        print(f"Results saved to: {output_file}")

        return results

In [None]:
# Load the compound table, then pull the "CAS Number" column out as an array
# of registry numbers for the batch lookup.
df = pd.read_csv("./compounds.csv")
CAS = df.loc[:, "CAS Number"].values
CAS  # bare expression: shows the extracted values as the cell output

array(['18979-61-8', '5979-01-1', '1429651-50-2', ..., '1173239-39-8',
       '410536-97-9', '56715-13-0'], shape=(12355,), dtype=object)

In [None]:
# Build the client from the module-level API key.
client = CASCommonChemistryAPI(API_KEY)

# Look up every CAS RN; rows are written to the CSV as they come back.
print("Starting batch query...")
results = client.batch_cas_to_smiles(
    CAS,
    checkpoint_interval=100,
    delay=0.02,  # pause 20 ms between consecutive requests
    output_file="cas_smiles_mapping.csv",
)

# Keep a JSON copy of the same rows next to the CSV.
with open("cas_smiles_mapping.json", "w") as f:
    f.write(json.dumps(results, indent=2))
print("\nResults also saved to cas_smiles_mapping.json")

Starting batch query...
Starting batch conversion of 12355 CAS RNs...
Results will be saved to: cas_smiles_mapping.csv
CAS RN 1429651-50-2 not found in database
CAS RN 2185857-97-8 not found in database
CAS RN 265646-85-3 not found in database
CAS RN 1380341-99-0 not found in database
CAS RN 164658-13-3 not found in database
CAS RN 2309668-15-1 not found in database
Progress: 10/12355 (0.1%) - Successful: 4 (40.0%)
CAS RN 497223-28-6 not found in database
CAS RN 2235358-74-2 not found in database
Progress: 20/12355 (0.2%) - Successful: 12 (60.0%)
CAS RN 79183-19-0 not found in database
CAS RN 120173-57-1 not found in database
CAS RN 620113-73-7 not found in database
CAS RN 443798-47-8 not found in database
CAS RN 2093393-05-4 not found in database
Progress: 30/12355 (0.2%) - Successful: 17 (56.7%)
CAS RN 1062648-63-8 not found in database
CAS RN 16373-93-6 not found in database
CAS RN 383392-66-3 not found in database
CAS RN 1247819-59-5 not found in database
Progress: 40/12355 (0.3%) 