In [83]:
import os
import time

import pandas as pd
import requests

# Scopus / Elsevier API configuration.
#
# The API key is a credential and must not live in the notebook: it is read from
# the ELSEVIER_API_KEY environment variable instead. Any key that was previously
# committed in this cell should be treated as leaked and revoked.
API_KEY = os.environ.get("ELSEVIER_API_KEY", "")
if not API_KEY:
    print("Warning: ELSEVIER_API_KEY is not set; Elsevier API requests will likely be rejected.")

AUTHOR_URL = "https://api.elsevier.com/content/search/scopus"
DOI_URL = "https://api.elsevier.com/content/article/doi/"

# Headers sent with every Elsevier request: the key plus a JSON response format.
headers = {
    "X-ELS-APIKey": API_KEY,
    "Accept": "application/json"
}

# Scopus author identifiers keyed by a short local alias.
author_ids = {'vivek': '7801671480'}
AUTHOR_ID = author_ids['vivek']

# Search parameters: all documents for the author, first page of 25 results.
params = {
    "query": f"AU-ID({AUTHOR_ID})",
    "start": 0,
    "count": 25
}

# Fetch the author's documents from the Scopus search endpoint and, for each
# document with a DOI, look up the creator names via the article endpoint.
# Only the single page described by `params` is requested.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the kernel

all_results = []

response = requests.get(AUTHOR_URL, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
# Surface auth/quota errors instead of silently building an empty table.
response.raise_for_status()
data = response.json()

for entry in data.get("search-results", {}).get("entry", []):
    title = entry.get("dc:title", "No title")
    pub_date = entry.get("prism:coverDate", "")  # full date e.g. "2023-07-10"
    journal = entry.get("prism:publicationName", "No journal")
    doi = entry.get("prism:doi", "")

    # Default authors list if DOI missing or request fails
    authors = []

    if doi:
        try:
            # DOI_URL already ends with "/", so strip it before joining to avoid
            # a "//" in the request path.
            doi_response = requests.get(
                f"{DOI_URL.rstrip('/')}/{doi}",
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
            doi_response.raise_for_status()
            data2 = doi_response.json()
        except (requests.RequestException, ValueError):
            # Network error, HTTP error, or non-JSON body: keep the default.
            data2 = {}

        authors_raw = data2.get('full-text-retrieval-response', {}) \
                        .get('coredata', {}) \
                        .get('dc:creator', []) or []
        # NOTE(review): a lone creator may arrive as a single object rather than
        # a list — handled defensively here; TODO confirm against the API schema.
        if isinstance(authors_raw, dict):
            authors_raw = [authors_raw]
        authors = [
            author.get('$', '')
            for author in authors_raw
            if isinstance(author, dict) and author.get('$')
        ]

        time.sleep(0.5)  # To respect rate limits

    all_results.append({
        "Title": title,
        "Publication Date": pub_date,
        "Journal": journal,
        "Authors": ", ".join(authors) if authors else "N/A"
    })

df = pd.DataFrame(all_results)


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_html(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # mimic a browser request
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors
    return response.text

def extract_articles_from_html(html_content):
    """Scrape article cards out of a rendered page into a DataFrame.

    Each card is a ``div`` carrying the article-container class pair. For every
    card the type, title, three metadata spans (date, journal, volume/pages),
    author names and the optional DOI anchor are read. A card that lacks any
    required element is reported on stdout and skipped.

    Args:
        html_content: HTML markup of the page.

    Returns:
        A DataFrame with one row per successfully parsed card and the columns
        Type, Title, Date, Journal, Volume/Pages, Authors, DOI, DOI Link.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    rows = []

    for card in parsed.find_all('div', class_='EipsWloU9zQSqU9BLfCH R6oJNgDp_xoNNaLsSD09'):
        try:
            kind = card.find('span', class_='QJT0jiW54ACd2d_5yV_N').get_text(strip=True)
            heading = card.find('div', class_='z1XL_MT5ixPMOKQZBEQG').get_text(strip=True)

            # The metadata row holds date, journal and volume/pages, in that order.
            meta = card.find('div', class_='f7miQZORbbS3b3LIuM4g').find_all('span')
            published, venue, locator = [meta[i].get_text(strip=True) for i in range(3)]

            name_spans = card.find('div', class_='M1uzatbE1hJqwcz7SQrK').find_all(
                'span', class_='yfHz2MBO3Bj6yVf6dwAW'
            )
            byline = ', '.join(span.get_text(strip=True) for span in name_spans)

            # The DOI anchor is optional; its text is prefixed with the label "DOI".
            doi_anchor = card.find('a', title='DOI')
            if doi_anchor:
                doi_text = doi_anchor.get_text(strip=True).replace('DOI', '').strip()
                doi_href = doi_anchor['href']
            else:
                doi_text = None
                doi_href = None

            row = {
                'Type': kind,
                'Title': heading,
                'Date': published,
                'Journal': venue,
                'Volume/Pages': locator,
                'Authors': byline,
                'DOI': doi_text,
                'DOI Link': doi_href
            }
            rows.append(row)
        except Exception as e:
            print(f"Error parsing article: {e}")

    return pd.DataFrame(rows)

# Example usage: download a page, scrape its article cards, and print the table.
# NOTE(review): `url` is not assigned anywhere in the visible notebook, so this
# cell only runs if the kernel already holds it — TODO define it explicitly.
html = fetch_html(url)
df = extract_articles_from_html(html)
print(df)


Empty DataFrame
Columns: []
Index: []


In [8]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

def fetch_pubmed_ids(author, retmax=100, timeout=30):
    """Search PubMed for an author and return the matching record IDs.

    Args:
        author: Author name; it is sent as ``"<author>[Author]"``.
        retmax: Maximum number of IDs to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ID strings taken from the ``IdList/Id`` elements of the
        esearch response (empty when nothing matched).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        xml.etree.ElementTree.ParseError: If the response body is not XML.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed',
        'term': f'{author}[Author]',
        'retmax': retmax,
        'retmode': 'xml'
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail with a clear HTTP error rather than an XML parse error on an error page.
    response.raise_for_status()
    root = ET.fromstring(response.content)
    return [id_elem.text for id_elem in root.findall('./IdList/Id')]

def fetch_pubmed_records(id_list, timeout=30):
    """Download the PubMed records for a list of IDs as raw XML.

    Args:
        id_list: PubMed ID strings; they are sent comma-joined in one request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as bytes. For an empty ``id_list`` no request is made
        and an empty, well-formed ``PubmedArticleSet`` document is returned so
        that an XML parser downstream still succeeds and finds no articles.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if not id_list:
        # Nothing to fetch: skip the request entirely.
        return b'<PubmedArticleSet></PubmedArticleSet>'
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    params = {
        'db': 'pubmed',
        'id': ','.join(id_list),
        'retmode': 'xml'
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.content

def parse_pubmed_xml(xml_data):
    """Turn a PubMed efetch XML payload into a list of record dicts.

    Every ``PubmedArticle`` element yields a dict with the keys ``Title``,
    ``Journal``, ``Authors`` (list of names) and ``PublicationDate`` (string or
    None). An article whose extraction raises is skipped.
    """

    def _display_name(author_node):
        # "Fore Last" when both parts exist, the bare surname otherwise;
        # authors without a surname are left out.
        surname = author_node.findtext('LastName')
        if not surname:
            return None
        given = author_node.findtext('ForeName')
        return f"{given} {surname}" if given else surname

    def _publication_date(date_node):
        # Year[-Month[-Day]] when a year is present, else the MedlineDate text.
        if date_node is None:
            return None
        year = date_node.findtext('Year')
        fallback = date_node.findtext('MedlineDate')
        month = date_node.findtext('Month')
        day = date_node.findtext('Day')
        if year:
            return '-'.join(part for part in (year, month, day) if part)
        return fallback if fallback else None

    parsed = []
    for entry in ET.fromstring(xml_data).findall('.//PubmedArticle'):
        try:
            names = (_display_name(node) for node in entry.findall('.//AuthorList/Author'))
            parsed.append({
                'Title': entry.findtext('.//ArticleTitle'),
                'Journal': entry.findtext('.//Journal/Title'),
                'Authors': [name for name in names if name is not None],
                'PublicationDate': _publication_date(
                    entry.find('.//Journal/JournalIssue/PubDate')
                )
            })
        except Exception:
            # Skip article if data extraction fails
            continue
    return parsed

# Example usage: search PubMed for one author (up to 200 IDs), download the
# matching records in a single efetch request, parse them, and tabulate.
author_name = "Vivek Muthurangu"
ids = fetch_pubmed_ids(author_name, retmax=200)
xml_data = fetch_pubmed_records(ids)
records = parse_pubmed_xml(xml_data)

# One row per parsed article; columns follow the keys of the record dicts.
df = pd.DataFrame(records)

In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import time

def fetch_pubmed_ids(author, retmax=200, timeout=30):
    """Return the PubMed IDs of records whose author field matches ``author``.

    Args:
        author: Author name, queried as ``"<author>[Author]"``.
        retmax: Upper bound on the number of IDs requested.
        timeout: Seconds to wait for the esearch endpoint before giving up.

    Returns:
        List of ID strings from the response's ``IdList``; empty if no match.

    Raises:
        requests.HTTPError: On an HTTP error status (checked before parsing, so
            an error page is not mistaken for a malformed result).
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {
        'db': 'pubmed',
        'term': f'{author}[Author]',
        'retmax': retmax,
        'retmode': 'xml'
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    root = ET.fromstring(response.content)
    ids = [id_elem.text for id_elem in root.findall('./IdList/Id')]
    return ids

def fetch_pubmed_records(id_list, timeout=30):
    """Fetch the full PubMed XML for the given record IDs.

    Args:
        id_list: PubMed ID strings, sent comma-joined in a single request.
        timeout: Seconds to wait for the efetch endpoint before giving up.

    Returns:
        The raw response body as bytes, or ``b''`` when ``id_list`` is empty
        (no request is made in that case).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    if not id_list:
        return b''  # return empty bytes if no IDs
    ids = ','.join(id_list)
    params = {
        'db': 'pubmed',
        'id': ids,
        'retmode': 'xml'
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Do not hand an HTTP error page to the XML parser downstream.
    response.raise_for_status()
    return response.content

def parse_pubmed_xml(xml_data):
    """Parse a PubMed efetch XML payload into a list of publication records.

    Args:
        xml_data: Raw XML (bytes or str). An empty value yields an empty list.

    Returns:
        A list with one dict per ``PubmedArticle``, with the keys:

        * ``Title`` - article title, or NaN when missing or shorter than 4
          characters.
        * ``Journal`` - journal title, or None.
        * ``Authors`` - comma-separated ``"Last F."`` names (surname only when
          no forename is given).
        * ``Year`` - publication year as a string, or None when the article has
          no usable publication date.
        * ``DOI`` - the article's own DOI, or None.
        * ``DocumentType`` - always ``'Article'``.
        * ``PublicationDate`` - ``pandas.Timestamp``, or ``NaT`` when the date
          is absent or cannot be parsed.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed.
    """
    if not xml_data:
        return []
    root = ET.fromstring(xml_data)
    records = []
    for article in root.findall('.//PubmedArticle'):
        try:
            article_title = article.findtext('.//ArticleTitle')
            journal = article.findtext('.//Journal/Title')

            # Missing or implausibly short titles become NaN. A plain float NaN
            # is used so the function does not depend on numpy being imported.
            if article_title is None or len(article_title) < 4:
                article_title = float('nan')

            authors = []
            for author in article.findall('.//AuthorList/Author'):
                last = author.findtext('LastName')
                fore = author.findtext('ForeName')
                if last and fore:
                    authors.append(f"{last} {fore[0]}.")
                elif last:
                    authors.append(last)
            authors = ', '.join(authors)

            # Reset per article so a record without a PubDate can never inherit
            # the year of the previous one.
            year = None
            pub_date_str = None
            pub_date_elem = article.find('.//Journal/JournalIssue/PubDate')
            if pub_date_elem is not None:
                year = pub_date_elem.findtext('Year') or None
                medline_date = pub_date_elem.findtext('MedlineDate')
                month = pub_date_elem.findtext('Month')
                day = pub_date_elem.findtext('Day')
                if year:
                    pub_date_str = '-'.join(part for part in (year, month, day) if part)
                elif medline_date:
                    # medline_date can be a range or just a year, e.g., "1998 Jan-Feb"
                    pub_date_str = medline_date
                    # Recover the year when the free-text date starts with one.
                    if medline_date[:4].isdigit():
                        year = medline_date[:4]

            # Only the article's own identifier list is consulted; a recursive
            # search would also match the ArticleIdList of cited references.
            doi = next(
                (
                    article_id.text
                    for article_id in article.findall('PubmedData/ArticleIdList/ArticleId')
                    if article_id.attrib.get('IdType') == 'doi'
                ),
                None,
            )

            records.append({
                'Title': article_title,
                'Journal': journal,
                'Authors': authors,
                'Year': year,
                'DOI': doi,
                'DocumentType': 'Article',
                # Pandas parses abbreviated months; anything unparseable is NaT.
                'PublicationDate': (
                    pd.to_datetime(pub_date_str, errors='coerce') if pub_date_str else pd.NaT
                )
            })
        except Exception as exc:
            # Best effort: one bad record must not abort the batch, but report it.
            print(f"Skipping PubMed record: {exc!r}")
            continue
    return records


# List of authors to process
authors_list = [
    "Vivek Muthurangu",
    "Jennifer Steeden",
    "Daniel Knight",
    "Michael Quail"
]

# Alternative surname-only list, currently disabled.
# all_authors = [
#     "Yao",
#     "Muthurangu",
#     "Steeden",
#     "Knight",
#     'Quail',
#     'Jiang',
#     'Yong',
#     'Wrobel',
#     'Pascale',
#     'Montalt',
#     'Jaubert',
#     'Baker',
#     'Raman',
#     'Campbell'
# ]

# Records from every author are pooled into one flat list.
all_records = []

# For each author: search (up to 200 IDs), fetch the XML, parse, accumulate.
for author in authors_list:
    print(f"Processing author: {author}")
    ids = fetch_pubmed_ids(author, retmax=200)
    xml_data = fetch_pubmed_records(ids)
    records = parse_pubmed_xml(xml_data)
    all_records.extend(records)
    time.sleep(0.5)  # polite pause to avoid hitting API limits

# Convert all records to a DataFrame
# Rows sharing a Title are collapsed to their first occurrence, then the table
# is ordered newest-first by PublicationDate and written to pubs.json as a list
# of row objects.
df = pd.DataFrame(all_records).drop_duplicates('Title').sort_values('PublicationDate', ascending=False)
df.to_json('pubs.json', orient='records')


Processing author: Vivek Muthurangu
Processing author: Jennifer Steeden
Processing author: Daniel Knight
Processing author: Michael Quail


In [2]:
# Display the publications table as the cell's output.
# NOTE(review): the captured output has an "Unnamed: 0" column, which suggests
# this frame was loaded from a saved CSV rather than built in this session —
# TODO confirm which `df` this cell is meant to show.
df

Unnamed: 0,Title,Journal,Authors,Year,DOI,DocumentType,PublicationDate
1,A lumped parameter model of the coronary circu...,Journal of biomechanics,"Yong E., Latief J., Wang Y., Erlinge D., Dahlg...",2025,10.1016/j.jbiomech.2025.112679,Article,2025-08-01
217,Transthyretin amyloid cardiomyopathy: natural ...,European heart journal,"Patel R., Ioannou A., Sheikh A., Razvi Y., Man...",2025,10.1093/eurheartj/ehaf412,Article,2025-07-11
0,Defining Diastolic Dysfunction Post-Fontan: Th...,American heart journal,"Alsaied T., Li R., Grant H., Schiff M., Li Y.,...",2025,10.1016/j.ahj.2025.07.007,Article,2025-07-10
369,Strategies to enhance recruitment of female pa...,Heart (British Cardiac Society),"Kunadian V., Pompei G., Dasgupta I., Swift P.,...",2025,10.1136/heartjnl-2024-325545,Article,2025-06-01
157,Polyacrylamide Gel Calibration Phantoms for Qu...,NMR in biomedicine,"Rot S., Oliver-Taylor A., Baker R., Steeden J....",2025,10.1002/nbm.70056,Article,2025-06-01
...,...,...,...,...,...,...,...
153,Visualization and tracking of an inflatable ba...,Magnetic resonance in medicine,"Miquel M., Hegde S., Muthurangu V., Corcoran B...",2004,10.1002/mrm.20041,Article,2004-05-01
368,Health care screening for men who have sex wit...,American family physician,Knight D.,2004,,Article,2004-05-01
155,Cardiac catheterisation guided by MRI in child...,"Lancet (London, England)","Razavi R., Hill D., Keevil S., Miquel M., Muth...",2003,10.1016/S0140-6736(03)14956-2,Article,2003-12-06
154,Three-dimensional magnetic resonance imaging o...,Cardiology in the young,"Razavi R., Hill D., Muthurangu V., Miquel M., ...",2003,10.1017/s1047951103000957,Article,2003-10-01
