# DO NOT RUN

## I ran this on my jupyternotbook server since the run time is too long for github codespaces


In [28]:
import pandas as pd

# Load the CSV file (make sure it's in your working directory or give full path)
df = pd.read_csv(r"C:\Users\julia\OneDrive\Desktop\MBBE 447\CHEMICALS.csv")

# Use the 'CHEM' column for PubChem lookup
names = df['CHEM'].astype(str).str.strip().tolist()

# Clean: remove blanks and duplicates
names = [n for n in names if n]  # drop empty strings
unique_names = list(dict.fromkeys(names))  # preserves order

print(f"Loaded {len(unique_names)} unique chemical names from 'CHEM' column.")

Loaded 29585 unique chemical names from 'CHEM' column.


In [1]:
import pandas as pd

In [None]:
import requests
import time

# Set up a requests session for performance
session = requests.Session()
results = []
not_found_count = 0

for i, name in enumerate(unique_names):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{requests.utils.requote_uri(name)}/cids/JSON"
    
    try:
        response = session.get(url)
    except requests.exceptions.RequestException as e:
        print(f"[ERROR] Request failed for '{name}': {e}")
        results.append((name, None))
        not_found_count += 1
        continue

    if response.status_code != 200:
        results.append((name, None))
        not_found_count += 1
    else:
        try:
            data = response.json()
            cid = data.get('IdentifierList', {}).get('CID', [None])[0]
            results.append((name, cid))
        except Exception as e:
            print(f"[ERROR] Could not parse JSON for '{name}': {e}")
            results.append((name, None))
            not_found_count += 1

    # Throttle requests to ~5 per second
    if i % 5 == 0:
        time.sleep(1)

print(f"Done. Found CIDs for {len(results) - not_found_count} names. Not found: {not_found_count}")

In [29]:
# Save results to CSV
result_df = pd.DataFrame(results, columns=["Chemical_Name", "PubChem_CID"])
result_df.to_csv("chemical_name_to_cid.csv", index=False)

# Quick preview
result_df.head()

Unnamed: 0,Chemical_Name,PubChem_CID


In [14]:
# Load it as a CSV with one column
df_raw = pd.read_csv(r"C:\Users\julia\OneDrive\Desktop\MBBE 447\chemical_name_to_cid.xls", header=None, names=["Raw"])

# Split on the comma into two new columns
df_cid_map = df_raw["Raw"].str.split(",", n=1, expand=True)
df_cid_map.columns = ["Chemical_Name", "PubChem_CID"]

# Clean up any leading/trailing spaces
df_cid_map["Chemical_Name"] = df_cid_map["Chemical_Name"].str.strip()
df_cid_map["PubChem_CID"] = pd.to_numeric(df_cid_map["PubChem_CID"], errors='coerce')  # convert to float/int
df_cid_map = df_cid_map.dropna(subset=["PubChem_CID"])
df_cid_map["PubChem_CID"] = df_cid_map["PubChem_CID"].astype(int)

df_cid_map.head()



Chemical_Name    (+)-3-THUJONE(+)-ALPHA-CYPERONE(+)-ALPHA-PINEN...
PubChem_CID                                           176104132513
dtype: object

In [None]:
import requests, time

def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,LogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('LogP'),
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            return None
    except:
        return None

# Get properties for all CIDs
cids = df_cid_map["PubChem_CID"].tolist()
results = []

for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    if i % 5 == 0:
        time.sleep(1)  # avoid overloading the server

df_props = pd.DataFrame(results)
df_props.head()

row_sum = df_props.sum(axis=1)


In [16]:
row_sum = df_props.sum(axis=1)

  row_sum = df_props.sum(axis=1)


In [17]:
#Not run yet, cell above is still processing

In [4]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# ⚠️ Only take a subset of 1000 for now
subset_size = 1000
cids = df_cid_map["PubChem_CID"].tolist()[:subset_size]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}.csv", index=False)
        print(f"✔️ Checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_subset_complete.csv", index=False)
print("✅ All done! Final subset saved.")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ All done! Final subset saved.


In [5]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 1000
end = 2000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch2.csv", index=False)
        print(f"✔️ Batch 2 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch2_complete.csv", index=False)
print("✅ Batch 2 done! Saved to pubchem_props_batch2_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 2 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 2 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 2 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 2 done! Saved to pubchem_props_batch2_complete.csv


In [6]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 2000
end = 3000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch3.csv", index=False)
        print(f"✔️ Batch 3 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch3_complete.csv", index=False)
print("✅ Batch 3 done! Saved to pubchem_props_batch3_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 3 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 3 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 3 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 3 done! Saved to pubchem_props_batch3_complete.csv


In [7]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 3000
end = 4000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch4.csv", index=False)
        print(f"✔️ Batch 4 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch4_complete.csv", index=False)
print("✅ Batch 4 done! Saved to pubchem_props_batch4_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 4 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 4 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 4 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 4 done! Saved to pubchem_props_batch4_complete.csv


In [8]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 4000
end = 5000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch5.csv", index=False)
        print(f"✔️ Batch 5 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch5_complete.csv", index=False)
print("✅ Batch 5 done! Saved to pubchem_props_batch5_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 5 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 5 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 5 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 5 done! Saved to pubchem_props_batch5_complete.csv


In [9]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 5000
end = 6000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch6.csv", index=False)
        print(f"✔️ Batch 6 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch6_complete.csv", index=False)
print("✅ Batch 6 done! Saved to pubchem_props_batch6_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 6 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 6 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 6 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 6 done! Saved to pubchem_props_batch6_complete.csv


In [10]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 6000
end = 7000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch7.csv", index=False)
        print(f"✔️ Batch 7 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch7_complete.csv", index=False)
print("✅ Batch 7 done! Saved to pubchem_props_batch7_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 7 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 7 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 7 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 7 done! Saved to pubchem_props_batch7_complete.csv


In [11]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 7000
end = 8000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch8.csv", index=False)
        print(f"✔️ Batch 8 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch8_complete.csv", index=False)
print("✅ Batch 8 done! Saved to pubchem_props_batch8_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 8 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 8 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 8 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 8 done! Saved to pubchem_props_batch8_complete.csv


In [12]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 8000
end = 9000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch9.csv", index=False)
        print(f"✔️ Batch 9 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch9_complete.csv", index=False)
print("✅ Batch 9 done! Saved to pubchem_props_batch9_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 9 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 9 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 9 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 9 done! Saved to pubchem_props_batch9_complete.csv


In [13]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 9000
end = 10000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch10.csv", index=False)
        print(f"✔️ Batch 10 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch10_complete.csv", index=False)
print("✅ Batch 10 done! Saved to pubchem_props_batch10_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 10 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 10 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 10 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 10 done! Saved to pubchem_props_batch10_complete.csv


In [15]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 10000
end = 11000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch11.csv", index=False)
        print(f"✔️ Batch 11 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch11_complete.csv", index=False)
print("✅ Batch 11 done! Saved to pubchem_props_batch11_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 11 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 11 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 11 checkpoint saved at 750 entries
...processed 800 compounds
...processed 900 compounds
✅ Batch 11 done! Saved to pubchem_props_batch11_complete.csv


In [18]:
import requests, time
import pandas as pd

# Function to retrieve properties for one CID
def get_pubchem_properties(cid):
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/CanonicalSMILES,MolecularWeight,XLogP,HBondDonorCount,HBondAcceptorCount/JSON"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            props = response.json()['PropertyTable']['Properties'][0]
            return {
                'CID': cid,
                'SMILES': props.get('CanonicalSMILES'),
                'MolWt': props.get('MolecularWeight'),
                'LogP': props.get('XLogP'),  # <- Use XLogP here
                'HDonors': props.get('HBondDonorCount'),
                'HAcceptors': props.get('HBondAcceptorCount')
            }
        else:
            print(f"[SKIP] CID {cid} – status {response.status_code}")
            return None
    except Exception as e:
        print(f"[ERROR] CID {cid} – {e}")
        return None

# next 1000
start = 11000
end = 13000
cids = df_cid_map["PubChem_CID"].tolist()[start:end]

# Store results
results = []

# Loop with checkpointing
for i, cid in enumerate(cids):
    props = get_pubchem_properties(cid)
    if props:
        results.append(props)
    
    # Print progress every 100
    if i % 100 == 0:
        print(f"...processed {i} compounds")

    # Save checkpoint every 250
    if i % 250 == 0 and i > 0:
        df_temp = pd.DataFrame(results)
        df_temp.to_csv(f"pubchem_props_checkpoint_{i}_batch12.csv", index=False)
        print(f"✔️ Batch 12 checkpoint saved at {i} entries")

    # Throttle: 5 requests per second
    if i % 5 == 0:
        time.sleep(1)

# Final save
df_props = pd.DataFrame(results)
df_props.to_csv("pubchem_props_batch12_complete.csv", index=False)
print("✅ Batch 12 done! Saved to pubchem_props_batch12_complete.csv")

...processed 0 compounds
...processed 100 compounds
...processed 200 compounds
✔️ Batch 12 checkpoint saved at 250 entries
...processed 300 compounds
...processed 400 compounds
...processed 500 compounds
✔️ Batch 12 checkpoint saved at 500 entries
...processed 600 compounds
...processed 700 compounds
✔️ Batch 12 checkpoint saved at 750 entries
...processed 800 compounds
✅ Batch 12 done! Saved to pubchem_props_batch12_complete.csv


In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def generate_rdkit_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {'TPSA': None, 'NumRotatableBonds': None, 'RingCount': None}
    return {
        'TPSA': Descriptors.TPSA(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'RingCount': Descriptors.RingCount(mol)
    }

# Apply to SMILES in df_props
rdkit_features = df_props['SMILES'].apply(generate_rdkit_descriptors)
rdkit_df = pd.DataFrame(rdkit_features.tolist())

# Merge descriptors with df_props
df_props_rdkit = pd.concat([df_props, rdkit_df], axis=1)
df_props_rdkit.head()

In [None]:
# Merge on PubChem CID
final_df = pd.merge(df_cid_map, df_props_rdkit, left_on="PubChem_CID", right_on="CID", how="inner")

# Clean up
final_df.drop(columns=["CID"], inplace=True)

# Save the final output
final_df.to_csv("final_chemical_data_with_descriptors.csv", index=False)

# Preview
final_df.head()
