In [1]:
import pandas as pd
import requests
from Bio import Entrez
import dotenv

# Contact e-mail for NCBI requests, read from the NCBI_EMAIL entry of ../.env.
# It is also sent as the `email` parameter of the ID-converter call below.
Entrez.email = dotenv.get_key("../.env", "NCBI_EMAIL")

In [2]:
# Every entry of the "Papers" column is expected to be a DOI URL with this prefix.
doi_url_prefix = "https://doi.org/"

papers_raw = pd.read_csv("raw/CK1a_papers_compounds.csv")

# Cutting off a fixed number of characters would silently mangle any URL that
# is written differently (e.g. http:// or dx.doi.org), so check the prefix first.
assert papers_raw['Papers'].str.startswith(doi_url_prefix, na=False).all(), \
    "Every 'Papers' entry must start with https://doi.org/"

# Swap the URL column for the bare DOI without mutating the frame that was loaded.
df = (
    papers_raw
    .assign(DOI=papers_raw['Papers'].str[len(doi_url_prefix):])
    .drop(columns="Papers")
)
df.head()

Unnamed: 0,Year,compounds,DOI
0,2023,1,10.1002/cbic.202300351
1,2023,1,10.1002/cbic.202300351
2,2025,1,10.1016/j.bmcl.2025.130193
3,2025,28,10.1016/j.bmcl.2025.130193
4,2023,3,10.1016/j.ccell.2023.02.010


[PubMed Central ID Converter API](https://pmc.ncbi.nlm.nih.gov/tools/id-converter-api/)

In [3]:
def handle_year(group):
    """Return the one year shared by all rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``Year`` column.

    Returns
    -------
    The first entry of the ``Year`` column.

    Raises
    ------
    RuntimeError
        If ``Year`` does not contain exactly one distinct value
        (as counted by ``nunique``).
    """
    years = group['Year']
    if years.nunique() != 1:
        raise RuntimeError("Years don't match")
    # Only one distinct year is present, so the first entry stands for all rows.
    return years.iloc[0]

def _summarise_doi(doi_rows):
    """Collapse the rows of one DOI into its compound total and its year."""
    return pd.Series({
        'compounds': doi_rows['compounds'].sum(),
        'Year': handle_year(doi_rows),
    })


# One row per DOI; handle_year raises if a DOI's rows carry different years.
merged_df = df.groupby('DOI').apply(_summarise_doi).reset_index()

merged_df

Unnamed: 0,DOI,compounds,Year
0,10.1002/cbic.202300351,2,2023
1,10.1016/j.bmcl.2025.130193,29,2025
2,10.1016/j.ccell.2023.02.010,5,2023
3,10.1021/acs.jmedchem.3c01736,46,2023
4,10.1021/acs.jmedchem.4c02415,24,2023
5,10.1021/acsmedchemlett.4c00250,20,2024
6,10.1021/acsmedchemlett.4c00297,8,2024
7,10.1021/jacs.4c06127,10,2024
8,10.1021/jacsau.4c00762,19,2024
9,10.1038/nature14610,1,2015


In [4]:
# From here on `df` names the per-DOI aggregate (the same object as `merged_df`).
df = merged_df

In [5]:
# Comma-separated DOI string, the form passed as the `ids` request parameter.
doi_values = list(df['DOI'])
dois = ",".join(doi_values)
print(dois)
# Separator count: one fewer than the number of DOIs, provided no DOI
# itself contains a comma.
print(dois.count(","))

10.1002/cbic.202300351,10.1016/j.bmcl.2025.130193,10.1016/j.ccell.2023.02.010,10.1021/acs.jmedchem.3c01736,10.1021/acs.jmedchem.4c02415,10.1021/acsmedchemlett.4c00250,10.1021/acsmedchemlett.4c00297,10.1021/jacs.4c06127,10.1021/jacsau.4c00762,10.1038/nature14610,10.1038/s41467-023-40385-9,10.1038/s41467-024-44698-1,10.1101/2024.10.01.616159,10.1186/s13045-024-01592-z,10.26508/lsa.202000804
14


In [6]:
# Endpoint and query parameters for the ID-converter request.
url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

params = dict(
    tool='gamma',
    email=Entrez.email,
    ids=dois,
    format='json',  # or 'xml'
)

In [7]:
def process_record(record):
    """Extract the ``(pmcid, doi)`` pair from one ID-converter record.

    Parameters
    ----------
    record : dict
        One entry of the response's ``records`` list.

    Returns
    -------
    tuple
        ``(pmcid, doi)``. ``pmcid`` is the single-space placeholder ``" "``
        whenever the record offers no usable PMCID: either it is flagged
        with ``status == 'error'`` or its ``pmcid`` field is absent or empty.
        ``doi`` is the record's ``doi`` field, or ``None`` if it has none.
    """
    doi = record.get("doi")
    pmcid = record.get("pmcid")

    # A record without an error status can still lack a PMCID. Returning None
    # for it would slip past a later comparison against the " " placeholder,
    # so both cases are mapped to the same placeholder.
    if record.get('status') == 'error' or not pmcid:
        return " ", doi

    return pmcid, doi

In [8]:
# Make the request.
# Any payload left over from an earlier run is dropped first, so a failed
# request can never be paired with stale records in the cells below.
data = None

try:
    response = requests.get(url, params=params, timeout=10)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        # The following cells need `data`; stop here instead of letting them
        # fail later with a less obvious error.
        raise RuntimeError(f"ID converter returned HTTP {response.status_code}")

    data = response.json()
    print("Success!")
    print(f"Status: {data.get('status', 'unknown')}")
    print(f"Response: {data}")

except requests.exceptions.RequestException as e:
    print(f"Request failed: {e}")
    # Re-raise so the notebook run halts at the cell that actually failed.
    raise


Success!
Status: ok


In [9]:
records = data.get("records") or []

# zip() stops at the shorter input, so a response with fewer records than
# requested DOIs would otherwise go unnoticed in the loop below.
assert len(records) == len(df), (
    f"Expected {len(df)} records, got {len(records)}"
)

# PMCIDs collected in row order so they can be attached to `df` as a column.
pmcids = []
for i, (record, expected_doi) in enumerate(zip(records, df['DOI'])):
    print(i, record)
    pmcid, doi = process_record(record)
    # Records are matched to rows purely by position, so the order must agree.
    assert doi == expected_doi, (
        f"Record {i} is for {doi!r}, expected {expected_doi!r}"
    )
    pmcids.append(pmcid)
  

0 {'doi': '10.1002/cbic.202300351', 'requested-id': '10.1002/cbic.202300351', 'status': 'error', 'errmsg': 'Identifier not found in PMC'}
1 {'doi': '10.1016/j.bmcl.2025.130193', 'requested-id': '10.1016/j.bmcl.2025.130193', 'status': 'error', 'errmsg': 'Identifier not found in PMC'}
2 {'doi': '10.1016/j.ccell.2023.02.010', 'pmcid': 'PMC10466730', 'pmid': 36898380, 'requested-id': '10.1016/j.ccell.2023.02.010'}
3 {'doi': '10.1021/acs.jmedchem.3c01736', 'pmcid': 'PMC11302056', 'pmid': 38085607, 'requested-id': '10.1021/acs.jmedchem.3c01736'}
4 {'doi': '10.1021/acs.jmedchem.4c02415', 'requested-id': '10.1021/acs.jmedchem.4c02415', 'status': 'error', 'errmsg': 'Identifier not found in PMC'}
5 {'doi': '10.1021/acsmedchemlett.4c00250', 'pmcid': 'PMC11472389', 'pmid': 39411539, 'requested-id': '10.1021/acsmedchemlett.4c00250'}
6 {'doi': '10.1021/acsmedchemlett.4c00297', 'pmcid': 'PMC11403733', 'pmid': 39291008, 'requested-id': '10.1021/acsmedchemlett.4c00297'}
7 {'doi': '10.1021/jacs.4c06127'

In [10]:
# Plain-text dump of the current `df` for reference.
print(df)

                               DOI  compounds  Year
0           10.1002/cbic.202300351          2  2023
1       10.1016/j.bmcl.2025.130193         29  2025
2      10.1016/j.ccell.2023.02.010          5  2023
3     10.1021/acs.jmedchem.3c01736         46  2023
4     10.1021/acs.jmedchem.4c02415         24  2023
5   10.1021/acsmedchemlett.4c00250         20  2024
6   10.1021/acsmedchemlett.4c00297          8  2024
7             10.1021/jacs.4c06127         10  2024
8           10.1021/jacsau.4c00762         19  2024
9              10.1038/nature14610          1  2015
10      10.1038/s41467-023-40385-9          5  2023
11      10.1038/s41467-024-44698-1          3  2024
12       10.1101/2024.10.01.616159         29  2024
13      10.1186/s13045-024-01592-z          3  2024
14          10.26508/lsa.202000804          3  2020


In [11]:
# Build the frame from `merged_df` instead of adding the column to `df` in
# place: `df` is narrowed by the filter in a later cell, so an in-place
# assignment would fail with a length mismatch when this cell is re-run.
df = merged_df.assign(PMCID=pmcids)

In [12]:
# Keep only rows that carry a real PMCID. Missing values (None/NaN) are
# treated like the blank " " placeholder so that neither reaches the output.
has_pmcid = df['PMCID'].fillna("").str.strip().ne("")
df = df[has_pmcid]
df

Unnamed: 0,DOI,compounds,Year,PMCID
2,10.1016/j.ccell.2023.02.010,5,2023,PMC10466730
3,10.1021/acs.jmedchem.3c01736,46,2023,PMC11302056
5,10.1021/acsmedchemlett.4c00250,20,2024,PMC11472389
6,10.1021/acsmedchemlett.4c00297,8,2024,PMC11403733
7,10.1021/jacs.4c06127,10,2024,PMC11800961
8,10.1021/jacsau.4c00762,19,2024,PMC11600170
9,10.1038/nature14610,1,2015,PMC4853910
10,10.1038/s41467-023-40385-9,5,2023,PMC10439208
11,10.1038/s41467-024-44698-1,3,2024,PMC10791743
13,10.1186/s13045-024-01592-z,3,2024,PMC11367868


In [13]:
# Write the filtered table to disk; index=False leaves the row index out of the file.
df.to_csv("doi-to-n-compounds-ground_truth.csv", index=False)