In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import re
import time

def fetch_pubmed_ids(author, retmax=200):
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed',
        'term': f'{author}[Author]',
        'retmax': retmax,
        'retmode': 'xml'
    }
    response = requests.get(url, params=params)
    root = ET.fromstring(response.content)
    ids = [id_elem.text for id_elem in root.findall('./IdList/Id')]
    return ids

def fetch_pubmed_records(id_list):
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if not id_list:
        return b''  # return empty bytes if no IDs
    ids = ','.join(id_list)
    params = {
        'db': 'pubmed',
        'id': ids,
        'retmode': 'xml'
    }
    response = requests.get(url, params=params)
    return response.content

def parse_pubmed_xml(xml_data):
    if not xml_data:
        return []
    root = ET.fromstring(xml_data)
    records = []
    for article in root.findall('.//PubmedArticle'):
        try:
            article_title = article.findtext('.//ArticleTitle')
            journal = article.findtext('.//Journal/Title')

            if len(article_title) < 4:
                article_title = pd.NA  # you used np.nan, but pandas NA is cleaner in newer versions

            authors = []
            affiliations = set()

            for author in article.findall('.//AuthorList/Author'):
                last = author.findtext('LastName')
                fore = author.findtext('ForeName')
                if last and fore:
                    authors.append(f"{last} {fore[0]}.")
                elif last:
                    authors.append(last)

                # Affiliation (can be multiple)
                aff_list = author.findall('.//AffiliationInfo/Affiliation')
                for aff in aff_list:
                    if aff is not None and aff.text:
                        affiliations.add(aff.text.strip())

            authors = ', '.join(authors)
            affiliation_str = '; '.join(affiliations) if affiliations else None

            pub_date_elem = article.find('.//Journal/JournalIssue/PubDate')
            pub_date_str = None
            year = None
            if pub_date_elem is not None:
                year = pub_date_elem.findtext('Year')
                medline_date = pub_date_elem.findtext('MedlineDate')
                month = pub_date_elem.findtext('Month')
                day = pub_date_elem.findtext('Day')
                if year:
                    pub_date_str = year
                    if month:
                        pub_date_str += f"-{month}"
                    if day:
                        pub_date_str += f"-{day}"
                elif medline_date:
                    pub_date_str = medline_date

            doi = None
            for article_id in article.findall('.//ArticleIdList/ArticleId'):
                if article_id.attrib.get('IdType') == 'doi':
                    doi = article_id.text
                    break

            records.append({
                'Title': article_title,
                'Journal': journal,
                'Authors': authors,
                'Affiliations': affiliation_str,
                'Year': year,
                'DOI': doi,
                'DocumentType': 'Article',
                'PublicationDate': pub_date_str
            })
        except Exception:
            continue

    # Convert PublicationDate strings to pandas datetime, coercing errors
    for r in records:
        date_str = r['PublicationDate']
        if date_str:
            r['PublicationDate'] = pd.to_datetime(date_str, errors='coerce')
        else:
            r['PublicationDate'] = pd.NaT
    return records



# List of authors to process
authors_list = [
    "Vivek Muthurangu",
    "Jennifer Steeden",
    "Daniel Knight",
    "Michael Quail"
]

# all_authors = [
#     "Yao",
#     "Muthurangu",
#     "Steeden",
#     "Knight",
#     'Quail',
#     'Jiang',
#     'Yong',
#     'Wrobel',
#     'Pascale',
#     'Montalt',
#     'Jaubert',
#     'Baker',
#     'Raman',
#     'Campbell'
# ]

all_records = []

for author in authors_list:
    print(f"Processing author: {author}")
    ids = fetch_pubmed_ids(author, retmax=200)
    xml_data = fetch_pubmed_records(ids)
    records = parse_pubmed_xml(xml_data)
    all_records.extend(records)
    time.sleep(0.5)  # polite pause to avoid hitting API limits

# Convert all records to a DataFrame



df = pd.DataFrame(all_records).drop_duplicates('Title').sort_values('PublicationDate', ascending=False)

# Build a regex that matches whole phrases (case-insensitive), not substrings
affiliations = [
    "UCL",
    "University College London",
    "Great Ormond Street",
    "Royal Free"
]

# Create pattern using lookarounds to enforce matching whole phrases
affil_pattern = r'(?i)(?<!\w)(' + '|'.join(re.escape(a) for a in affiliations) + r')(?!\w)'

# Filter
df = df[df['Affiliations'].astype(str).str.contains(affil_pattern, na=False)].dropna()

df.to_json('data/pubs.json', orient='records')

Processing author: Rebecca Baker
A Phenotypic Study of CRB1 Retinopathy Secondary to the Variant p.(Pro836Thr) Prevalent in Those of Black African Ancestry.
Polyacrylamide Gel Calibration Phantoms for Quantification in Sodium MRI.
ISCEV standard full-field ERG reference limits from 407 healthy subjects, derived from transference and validation of reference data between electrode types and centres.
An assessment of the European Patient Summary for clinical research: a case study in cardiology.
Using artificial intelligence and predictive modelling to enable learning healthcare systems (LHS) for pandemic preparedness.
Rapid 2D 
Pain management research from the NIH HEAL Initiative.
2D sodium MRI of the human calf using half-sinc excitation pulses and compressed sensing.
Patient care, integration and collaboration of physician associates in multiprofessional teams: A mixed methods study.
Investigating changes in blood-cerebrospinal fluid barrier function in a rat model of chronic hyperten

  df = df[df['Affiliations'].astype(str).str.contains(affil_pattern, na=False)].dropna()


OSError: Cannot save file into a non-existent directory: 'data'

In [28]:
df

Unnamed: 0,Title,Journal,Authors,Affiliations,Year,DOI,DocumentType,PublicationDate
0,A Phenotypic Study of CRB1 Retinopathy Seconda...,Investigative ophthalmology & visual science,"Wong W., Robson A., Baker R., Arno G., Van Aer...","Moorfields Eye Hospital NHS Foundation Trust, ...",2025,10.1167/iovs.66.9.3,Article,2025-07-01
1,Polyacrylamide Gel Calibration Phantoms for Qu...,NMR in biomedicine,"Rot S., Oliver-Taylor A., Baker R., Steeden J....","Department of Brain & Behavioural Sciences, Un...",2025,10.1002/nbm.70056,Article,2025-06-01
2,ISCEV standard full-field ERG reference limits...,Documenta ophthalmologica. Advances in ophthal...,"Baker R., Leo S., Clowes W., Chow I., Jiang X....","UCL Institute of Ophthalmology, London, UK.; M...",2025,10.1007/s10633-025-10009-2,Article,2025-04-01
5,Rapid 2D,Magnetic resonance imaging,"Baker R., Muthurangu V., Rega M., Walsh S., St...","UCL Centre for Medical Imaging, University Col...",2024,10.1016/j.mri.2024.04.027,Article,2024-07-01
7,2D sodium MRI of the human calf using half-sin...,Magnetic resonance in medicine,"Baker R., Muthurangu V., Rega M., Montalt-Tord...",UCL Centre for Translational Cardiovascular Im...,2024,10.1002/mrm.29841,Article,2024-01-01
12,Image-Guided Magnetic Thermoseed Navigation an...,"Advanced science (Weinheim, Baden-Wurttemberg,...","Baker R., Payne C., Yu Y., Mohseni M., Connell...",Division of Surgery and Interventional Science...,2022,10.1002/advs.202105333,Article,2022-04-01
13,Remote and Selective Control of Astrocytes by ...,"Advanced science (Weinheim, Baden-Wurttemberg,...","Yu Y., Payne C., Marina N., Korsak A., Souther...","Department of Neuromuscular Diseases, Queen Sq...",2022,10.1002/advs.202104194,Article,2022-02-01
9,Investigating changes in blood-cerebrospinal f...,Frontiers in molecular neuroscience,"Perera C., Tolomeo D., Baker R., Ohene Y., Kor...","Dementia Research Centre, UCL Queen Square Ins...",2022,10.3389/fnmol.2022.964632,Article,2022-01-01


In [26]:
df[df['Title'].str.contains('rapid', case=False, na=False)]

Unnamed: 0,Title,Journal,Authors,Affiliations,Year,DOI,DocumentType,PublicationDate
14,Rapid 2D,Magnetic resonance imaging,"Baker R., Muthurangu V., Rega M., Walsh S., St...","UCL Centre for Medical Imaging, University Col...",2024,10.1016/j.mri.2024.04.027,Article,2024-07-01
11,Image2Flow: A proof-of-concept hybrid image an...,PLoS computational biology,"Yao T., Pajaziti E., Quail M., Schievano S., S...","Institute of Cardiovascular Science, Universit...",2024,10.1371/journal.pcbi.1012231,Article,2024-06-01
28,Rapid desensitization through immunoadsorption...,Perfusion,"Issitt R., Cudworth E., Cortina-Borja M., Gupt...","Department of Paediatric Cardiology, Institute...",2024,10.1177/02676591221151035,Article,2024-04-01
56,Rapid 3D whole-heart cine imaging using golden...,Magnetic resonance imaging,"Montalt-Tordera J., Kowalik G., Gotschy A., St...","Great Ormond Street Hospital, London, UK; Inst...",2020,10.1016/j.mri.2020.06.008,Article,2020-10-01
54,Rapid whole-heart CMR with single volume super...,Journal of cardiovascular magnetic resonance :...,"Steeden J., Quail M., Gotschy A., Mortensen K....","UCL Centre for Cardiovascular Imaging, Institu...",2020,10.1186/s12968-020-00651-x,Article,2020-08-03
93,Rapid breath-hold assessment of myocardial vel...,Journal of magnetic resonance imaging : JMRI,"Kowalik G., Muthurangu V., Khushnood A., Steed...","UCL Centre for Cardiovascular Imaging, Univers...",2016,10.1002/jmri.25218,Article,2016-10-01
94,Comprehensive assessment of the global and reg...,"American journal of physiology. Regulatory, in...","Hauser J., Muthurangu V., Steeden J., Taylor A...","University College London, Institute of Cardio...",2016,10.1152/ajpregu.00454.2015,Article,2016-03-15
129,Rapid flow assessment of congenital heart dise...,Radiology,"Steeden J., Atkinson D., Hansen M., Taylor A.,...","Centre for Medical Image Computing, UCL Depart...",2011,10.1148/radiol.11101844,Article,2011-07-01
