In [1]:
import pandas as pd
import requests
from Bio import Entrez
import dotenv

# Contact address passed along with the NCBI request (used as params['email'] below).
# Read from ../.env so the address is not hardcoded in the notebook.
Entrez.email = dotenv.get_key("../.env", "NCBI_EMAIL")

In [2]:
# Load the raw MolGlueDB export and derive a bare DOI column from the source URL.
df = pd.read_csv("raw/exported-molgluedb.csv")
# Strip the resolver prefix only where it is actually present. A fixed-length
# slice would silently mangle any URL that does not start with exactly
# "https://doi.org/" and would raise on missing values.
df['DOI'] = df['SourceAddress_Website'].str.replace(
    r"^https?://(?:dx\.)?doi\.org/", "", regex=True
)
# Non-mutating drop keeps the cell idempotent when it is re-run.
df = df.drop(columns="SourceAddress_Website")
df.head()

Unnamed: 0,DATAID,SMILES,Name,IUPAC,StdInChl,StdInChIKey,Pharmacophore,Core,ResearchStage,TherapeuticUsage,...,Rings,AliphaticRings,AromaticRings,AliphaticHeteroRings,AromaticHeteroRings,HeavyAtoms,HeteroAtoms,SpiroAtoms,BridgeheadAtoms,DOI
0,1,[2H]C([2H])([2H])OC1=CC=C2C=C(C(=O)NC3=CC=C4C(...,36,"N-[2-(2,6-dioxopiperidin-3-yl)-1-oxo-3H-isoind...",InChI=1S/C25H21N3O5/c1-33-19-6-4-14-10-16(3-2-...,JGWFFKIYBDZDFI-FIBGUPNXSA-N,Glutarimide,LenalidomideType,Discovery,,...,5,2,3,2,0,33,8,0,0,10.1021/acs.jmedchem.3c01736
1,90,CC(C)(C)CC(C)(C)NC1=C(C2=CC=C3C(=O)N(C4CCC(=O)...,Compound 12,"2-(2,6-dioxopiperidin-3-yl)-5-[3-(2,4,4-trimet...","InChI=1S/C28H31N5O4/c1-27(2,3)15-28(4,5)31-23-...",CGKKMJUNHRRYMQ-UHFFFAOYSA-N,Glutarimide,ThalidomideType,Discovery,,...,5,2,3,2,2,37,9,0,0,10.1021/jacs.4c06127
2,91,CC(C)(C)CC(C)(C)NC1=C(C2=CC=C3C(=O)N(C4CCC(=O)...,Compound 11,"3-[3-oxo-6-[3-(2,4,4-trimethylpentan-2-ylamino...","InChI=1S/C28H33N5O3/c1-27(2,3)16-28(4,5)31-24-...",URPATGFIHKJSDH-UHFFFAOYSA-N,Glutarimide,LenalidomideType,Discovery,,...,5,2,3,2,2,36,8,0,0,10.1021/jacs.4c06127
3,98,CC(C)(C)NC1=C(C2=CC=C3C(=O)N(C4CCC(=O)NC4=O)C(...,Compound 4,"5-[3-(tert-butylamino)imidazo[1,2-a]pyridin-2-...","InChI=1S/C24H23N5O4/c1-24(2,3)27-20-19(25-17-6...",IEPVLXVJCUHDAW-UHFFFAOYSA-N,Glutarimide,ThalidomideType,Discovery,,...,5,2,3,2,2,33,9,0,0,10.1021/jacs.4c06127
4,99,CC(C)(C)NC1=C(C2=CC=C3C(=O)N(C4CCC(=O)NC4=O)CC...,Compound 1,"3-[6-[3-(tert-butylamino)imidazo[1,2-a]pyridin...","InChI=1S/C24H25N5O3/c1-24(2,3)27-21-20(25-18-6...",IFLAYQIAAFCPNM-UHFFFAOYSA-N,Glutarimide,LenalidomideType,Discovery,,...,5,2,3,2,2,32,8,0,0,10.1021/jacs.4c06127


In [3]:
# Fallback input: if the PMC searches aren't yielding enough results, go by DOI.
# Keep the first occurrence of every DOI (original row labels are preserved).
uniqueDOIs = df.loc[~df['DOI'].duplicated(), 'DOI']
uniqueDOIs.to_csv('dedup_DOI-molgluedb.csv', index=False)

[PubMed Central ID Converter API](https://pmc.ncbi.nlm.nih.gov/tools/id-converter-api/)

In [4]:
# Comma-separated DOI list, used as the 'ids' request parameter below.
dois = ",".join(uniqueDOIs.tolist())
# Show the joined string and its comma count (one less than the number of DOIs).
print(dois, dois.count(","), sep="\n")

10.1021/acs.jmedchem.3c01736,10.1021/jacs.4c06127,10.1101/2024.10.01.616159,10.1021/acs.jmedchem.4c02415,10.1021/jacsau.4c00762,10.1038/s41467-024-44698-1,10.1021/acsmedchemlett.4c00250,10.1186/s13045-024-01592-z,10.26508/lsa.202000804,10.1021/acs.jmedchem.1c01832,10.1016/j.ccell.2023.02.010,10.1016/j.bmcl.2025.130193,10.1038/s41467-023-40385-9,10.1002/cbic.202300351,10.1038/nature14610,10.1021/acsmedchemlett.4c00297,10.1007/s00109-020-01943-6,10.1016/j.bmcl.2025.130263
17


In [5]:
# Endpoint and query parameters for the ID converter request
url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

params = dict(
    tool='gamma',
    email=Entrez.email,
    ids=dois,       # comma-separated DOIs built in the previous cell
    format='json',  # or 'xml'
)

In [6]:
def process_record(record):
  """Extract the (pmcid, doi) pair from one converter record.

  Args:
    record: dict-like entry from the response's "records" list.

  Returns:
    A ``(pmcid, doi)`` tuple. When the record has ``status == 'error'``
    the pmcid slot holds the single-space sentinel ``" "`` (filtered out
    downstream); otherwise it is ``record.get("pmcid")``, which may be None.
  """
  doi = record.get("doi")
  lookup_failed = record.get('status') == 'error'
  # raise RuntimeError(record.get('errmsg'))  # stricter alternative, intentionally disabled
  pmcid = " " if lookup_failed else record.get("pmcid")
  return pmcid, doi

In [7]:
# Make the request.
# Reset `data` first so a failed run can never leave a stale response from an
# earlier execution of this cell for the following cells to consume.
data = None
try:
    response = requests.get(url, params=params, timeout=10)

    # Anything other than HTTP 200 is treated as a failure: report it and stop,
    # because the next cells cannot run without `data`.
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        raise RuntimeError(f"ID converter request failed with HTTP {response.status_code}")

    data = response.json()
    print("Success!")
    print(f"Status: {data.get('status', 'unknown')}")
    print(f"Response: {data}")

except requests.exceptions.RequestException as e:
    # Keep the short message, then re-raise so the failure is not silently swallowed.
    print(f"Request failed: {e}")
    raise


Success!
Status: ok


In [8]:
# Pair every returned record with the DOI requested at the same position.
records = data.get("records") or []
# zip() stops at the shorter input, so a short (or missing) record list would
# otherwise be truncated silently; fail with a clear message instead.
if len(records) != len(uniqueDOIs):
  raise ValueError(f"expected {len(uniqueDOIs)} records, got {len(records)}")

pmcids = []
for i, (record, requested_doi) in enumerate(zip(records, uniqueDOIs)):
  print(i, record)
  pmcid, doi = process_record(record)
  # Records are expected to come back in request order; verify before trusting the pairing.
  assert doi == requested_doi, f"{doi} != {requested_doi}"
  pmcids.append(pmcid)
  

0 {'doi': '10.1021/acs.jmedchem.3c01736', 'pmcid': 'PMC11302056', 'pmid': 38085607, 'requested-id': '10.1021/acs.jmedchem.3c01736'}
1 {'doi': '10.1021/jacs.4c06127', 'pmcid': 'PMC11800961', 'pmid': 39499896, 'requested-id': '10.1021/jacs.4c06127'}
2 {'doi': '10.1101/2024.10.01.616159', 'requested-id': '10.1101/2024.10.01.616159', 'status': 'error', 'errmsg': 'Identifier not found in PMC'}
3 {'doi': '10.1021/acs.jmedchem.4c02415', 'requested-id': '10.1021/acs.jmedchem.4c02415', 'status': 'error', 'errmsg': 'Identifier not found in PMC'}
4 {'doi': '10.1021/jacsau.4c00762', 'pmcid': 'PMC11600170', 'pmid': 39610741, 'requested-id': '10.1021/jacsau.4c00762'}
5 {'doi': '10.1038/s41467-024-44698-1', 'pmcid': 'PMC10791743', 'pmid': 38228616, 'requested-id': '10.1038/s41467-024-44698-1'}
6 {'doi': '10.1021/acsmedchemlett.4c00250', 'pmcid': 'PMC11472389', 'pmid': 39411539, 'requested-id': '10.1021/acsmedchemlett.4c00250'}
7 {'doi': '10.1186/s13045-024-01592-z', 'pmcid': 'PMC11367868', 'pmid': 39

In [9]:
# Build the DOI -> PMCID table, keeping only rows whose PMCID is not the
# single-space sentinel that process_record returns for error records.
df = pd.DataFrame({"DOI": uniqueDOIs, "PMCID": pmcids}).loc[
  lambda frame: frame['PMCID'] != " "
]
df.to_csv("ground_truth-molgluedb.csv", index=False)