# **1. Import Libraries**

In [1]:
import os
import pandas as pd
from pubchempy import get_compounds, NotFoundError, PubChemHTTPError
import re
import time
import requests

# **2. Define Data Directory and Output File**

In [2]:
data_dir = "./PhotochemCAD/Common Compounds/"
output_file = "./data/molecular_data.csv"

# **3. Function to Extract Data from .abs.txt Files**

In [3]:
def extract_absorption_data(filepath):
    """
    Extracts absorption maxima and wavelength from .abs.txt files.

    Args:
        filepath: Path to the .abs.txt file.

    Returns:
        A tuple containing:
            - wavelength_max: Wavelength at which maximum absorption occurs.
            - absorption_max: Maximum absorption value.
            - None: if there is an error.
    """
    try:
        with open(filepath, "r") as f:
            lines = f.readlines()

        # Skip header lines (find the line starting with "Wavelength")
        data_start_index = 0
        for i, line in enumerate(lines):
            if line.strip().startswith("Wavelength"):
                data_start_index = i + 1
                break

        wavelengths = []
        absorptions = []
        for line in lines[data_start_index:]:
            parts = line.strip().split()
            if len(parts) == 2:
                wavelengths.append(float(parts[0]))
                absorptions.append(float(parts[1]))

        if not wavelengths or not absorptions:
            return None, None

        absorption_max = max(absorptions)
        wavelength_max = wavelengths[absorptions.index(absorption_max)]

        return wavelength_max, absorption_max

    except (FileNotFoundError, ValueError, IndexError) as e:
        print(f"Error processing {filepath}: {e}")
        return None, None

# **4. Function to Get SMILES from PubChem**

In [4]:
def get_smiles_from_pubchem(identifier, identifier_type="name"):
    """
    Retrieves SMILES string from PubChem using either name, CAS or CID, with retries.

    Args:
        identifier: Name, CAS, or CID of the molecule.
        identifier_type: Type of identifier ("name", "cas", or "cid").

    Returns:
        SMILES string if found, otherwise None.
    """

    max_retries = 3
    retry_delay = 2

    identifier = identifier.replace("_", " ")

    if identifier_type == "cid":
        if not identifier.isdigit():
            print(
                f"Invalid CID format for identifier: {identifier}. CID must be numeric."
            )
            return None

    for attempt in range(max_retries):
        try:
            if identifier_type == "cid":
                url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{identifier}/property/IsomericSMILES/txt"
                response = requests.get(url)
                if response.status_code == 200:
                    return response.text.strip()
                else:
                    print(
                        f"Failed to retrieve data for CID '{identifier}' with status code: {response.status_code}"
                    )
                    return None
            elif identifier_type == "cas" or identifier_type == "name": 
                url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{identifier}/property/IsomericSMILES/txt"
                response = requests.get(url)
                if response.status_code == 200:
                    return response.text.strip()
                else:
                    print(
                        f"Failed to retrieve data for synonym '{identifier}' with status code: {response.status_code}"
                    )
                    return None
        except NotFoundError:
            return None
        except PubChemHTTPError as e:
            if e.msg == "Status: 503":
                print(
                    f"PubChem service unavailable, retrying in {retry_delay} seconds... (Attempt {attempt + 1})"
                )
                time.sleep(retry_delay)
            else:
                print(
                    f"PubChem HTTP Error for {identifier_type} '{identifier}': {e.msg}"
                )
                return None
        except Exception as e:
            print(
                f"An unexpected error occurred for {identifier_type} '{identifier}': {e}"
            )
            return None

    print(
        f"Failed to retrieve SMILES for {identifier_type} '{identifier}' after multiple attempts."
    )
    return None

# **5. Main Processing Loop**

In [5]:
data = []
processed_molecules = set()

for filename in os.listdir(data_dir):
    if filename.endswith(".abs.txt"):
        filepath = os.path.join(data_dir, filename)

        match = re.match(r"([A-Z]\d+)_((\d+-)+\d+)_(.+?)\.abs\.txt", filename)
        if not match:
            print(f"Skipping file {filename} due to invalid format.")
            continue

        molecule_code, molecule_id, molecule_name = (
            match.group(1),
            match.group(2),
            match.group(4),
        )

        if molecule_name in processed_molecules:
            continue
        processed_molecules.add(molecule_name)

        # Try getting SMILES by CAS first
        smiles = get_smiles_from_pubchem(molecule_id, "cas")

        # If CAS search fails, try with name
        if smiles is None:
            print(f"Trying with name for {molecule_name}")
            smiles = get_smiles_from_pubchem(molecule_name, "name")

        # If name search also fails, use a placeholder
        if smiles is None:
            print(
                f"Could not retrieve SMILES for {molecule_name} (CAS: {molecule_id}). Using a placeholder."
            )
            smiles = "N/A"  # Or any other suitable placeholder

        # Ensure SMILES is treated as a list
        smiles_list = [s.strip() for s in smiles.splitlines() if s.strip()]

        wavelength_max, absorption_max = extract_absorption_data(filepath)

        if wavelength_max is not None and absorption_max is not None:
            # Create multiple entries for multiple SMILES strings
            for smile in smiles_list:
                data.append(
                    {
                        "Molecule Code": molecule_code,
                        "Molecule CAS": molecule_id,
                        "Molecule Name": molecule_name.replace("_", " "),
                        "SMILES": smile,
                        "Absorption Maxima": absorption_max,
                        "Wavelength": wavelength_max,
                    }
                )

Failed to retrieve data for synonym '7659-95-2' with status code: 404
Trying with name for Betanin
Failed to retrieve data for synonym '118762-53-1' with status code: 404
Trying with name for 5-Phenyldipyrrin
Failed to retrieve data for synonym '5-Phenyldipyrrin' with status code: 404
Could not retrieve SMILES for 5-Phenyldipyrrin (CAS: 118762-53-1). Using a placeholder.
Failed to retrieve data for synonym '5522-66-7' with status code: 404
Trying with name for Protoporphyrin_IX_dimethyl_ester
Skipping file P04_N,N'-Difluoroboryl-1,9-dimethyl-5-(4-iodophenyl)dipyrrin.abs.txt due to invalid format.
Skipping file Q26_PdTBP(CO2Bu).abs.txt due to invalid format.
Skipping file T14_CuCOxo-1.abs.txt due to invalid format.
Skipping file Q20_ZnTMP+.abs.txt due to invalid format.
Skipping file P06_Bis(5-phenyldipyrrinato)zinc.abs.txt due to invalid format.
Skipping file Q30_ZnTCPH(CO2Me)Ph.abs.txt due to invalid format.
Skipping file Q03_ZnP.abs.txt due to invalid format.
Skipping file T10_CuC-1.

In [6]:
df = pd.DataFrame(data)
df.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")

Data saved to ./data/molecular_data.csv
