In [34]:
!pip install Bio



In [35]:
import time
from datetime import datetime

import requests
import pandas as pd
import numpy as np

from Bio import Entrez, SeqIO

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [36]:
# Set the contact address on Biopython's Entrez module before any of the
# Entrez calls defined below are made.
# NOTE(review): this is a personal address committed with the notebook —
# consider reading it from an environment variable instead.
Entrez.email = "omar.atwaa16@gmail.com"

In [37]:
def search_pubmed(query: str, max_results: int = 10):
    """Search PubMed and return the matching PMIDs.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        max_results: Upper bound on returned ids (``retmax``).

    Returns:
        The ``IdList`` entry of the parsed search result, or an empty list
        when the result has no ``IdList``.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; errors are not
        swallowed here.
    """
    print(f"Searching PubMed for: '{query}'")

    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=max_results,
        sort="relevance"
    )

    # Close the handle even when parsing fails, so a bad response does not
    # leak the underlying connection.
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()

    pmids = record.get("IdList", [])
    print(f"Found {len(pmids)} articles")
    return pmids

In [38]:
def _join_abstract(article_data) -> str:
    """Return the article's abstract as one string ('' when there is none)."""
    if 'Abstract' not in article_data or 'AbstractText' not in article_data['Abstract']:
        return ""

    abstract_parts = article_data['Abstract']['AbstractText']
    if isinstance(abstract_parts, list):
        return " ".join(str(part) for part in abstract_parts)
    return str(abstract_parts)


def _format_authors(article_data) -> list:
    """Return one display name per usable entry of the article's AuthorList."""
    names = []
    for author in article_data.get('AuthorList', []):
        last = author.get('LastName', '')
        initials = author.get('Initials', '')
        full = f"{last} {initials}".strip()
        if not full:
            # Entries without LastName/Initials used to be dropped; fall back
            # to CollectiveName (PubMed's field for group authors) to keep them.
            full = str(author.get('CollectiveName', '')).strip()
        if full:
            names.append(full)
    return names


def _flatten_keywords(medline) -> list:
    """Return every keyword from all KeywordList groups as plain strings."""
    return [
        str(kw)
        for kw_list in medline.get('KeywordList', [])
        for kw in kw_list
    ]


def fetch_article_details(pmid: str):
    """Fetch one PubMed record and reduce it to a flat dict.

    Args:
        pmid: PubMed id passed to ``Entrez.efetch``.

    Returns:
        A dict with the keys ``pmid``, ``title``, ``abstract``, ``authors``
        (list of names), ``journal``, ``pub_date`` (the record's ``PubDate``
        mapping, ``{}`` when missing) and ``keywords`` (list of strings).
        ``None`` when anything goes wrong; the error is printed, not raised.
    """
    try:
        handle = Entrez.efetch(
            db="pubmed",
            id=pmid,
            rettype="xml",
            retmode="xml"
        )

        # Close the handle even when parsing fails.
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        found = record['PubmedArticle']
        if not found:
            # Report an empty result explicitly instead of letting the
            # indexing below fail with "list index out of range".
            print(f"Error fetching PMID {pmid}: no PubmedArticle record returned")
            return None

        medline = found[0]['MedlineCitation']
        article_data = medline['Article']
        journal = article_data.get('Journal', {})

        return {
            'pmid': pmid,
            'title': article_data.get('ArticleTitle', ''),
            'abstract': _join_abstract(article_data),
            'authors': _format_authors(article_data),
            'journal': journal.get('Title', ''),
            'pub_date': journal.get('JournalIssue', {}).get('PubDate', {}),
            'keywords': _flatten_keywords(medline)
        }

    except Exception as e:
        # Best effort: callers treat None as "skip this article".
        print(f"Error fetching PMID {pmid}: {e}")
        return None

In [39]:
def fetch_multiple_articles(pmids, delay: float = 0.4):
    """Fetch details for each PMID in turn, pausing ``delay`` seconds after each.

    Args:
        pmids: Sized sequence of PubMed ids.
        delay: Seconds to sleep after every fetch, successful or not.

    Returns:
        List of the truthy results of ``fetch_article_details``, in input
        order; falsy results (e.g. ``None``) are left out.
    """
    collected = []

    for position, pmid in enumerate(pmids, start=1):
        print(f"Fetching article {position}/{len(pmids)}: PMID {pmid}")

        fetched = fetch_article_details(pmid)
        # Keep only articles that were actually retrieved.
        collected.extend([fetched] if fetched else [])

        # Pause after every request, including failed ones.
        time.sleep(delay)

    return collected

In [40]:

def save_articles_to_csv(query: str,
                         max_results: int = 50,
                         filename: str = "pubmed_results.csv") -> pd.DataFrame:
    """Search PubMed, fetch the hits and write them to a CSV file.

    Args:
        query: Search term handed to ``search_pubmed``.
        max_results: Maximum number of PMIDs to request.
        filename: Path of the CSV file to write (UTF-8, no index column).

    Returns:
        The DataFrame that was written; when an ``authors`` column exists its
        list values are joined with ``'; '`` and other values are stringified.
    """
    def _authors_as_text(value):
        # Lists become "A; B; C"; anything else is stringified.
        if isinstance(value, list):
            return '; '.join(value)
        return str(value)

    frame = pd.DataFrame(
        fetch_multiple_articles(search_pubmed(query, max_results))
    )

    # An empty result set yields a frame without columns, hence the guard.
    if 'authors' in frame.columns:
        frame['authors'] = frame['authors'].apply(_authors_as_text)

    frame.to_csv(filename, index=False, encoding='utf-8')
    print(f"\nSaved {len(frame)} articles to {filename}")
    return frame


In [41]:
if __name__ == "__main__":
    # Demo: run one small PubMed query end to end and preview the result.
    print("\n=== PubMed basic search + CSV ===")

    df_pubmed = save_articles_to_csv(
        "cancer immunotherapy",
        max_results=10,
        filename="cancer_immunotherapy_demo.csv",
    )

    # Plain-text preview of the first three rows.
    print(df_pubmed.head(3))


=== PubMed basic search + CSV ===
Searching PubMed for: 'cancer immunotherapy'
Found 10 articles
Fetching article 1/10: PMID 31311655
Fetching article 2/10: PMID 29860986
Fetching article 3/10: PMID 39361750
Fetching article 4/10: PMID 33282961
Fetching article 5/10: PMID 32259782
Fetching article 6/10: PMID 30309862
Fetching article 7/10: PMID 36290818
Fetching article 8/10: PMID 37232801
Fetching article 9/10: PMID 34403771
Fetching article 10/10: PMID 40847226

Saved 10 articles to cancer_immunotherapy_demo.csv
       pmid                                              title  \
0  31311655    Integrative Approaches to Cancer Immunotherapy.   
1  29860986  Immunotherapy and Prevention of Pancreatic Can...   
2  39361750                Cancer immunotherapy by γδ T cells.   

                                            abstract  \
0  Cancer immunotherapy aims to arm patients with...   
1  Pancreatic cancer is the third-leading cause o...   
2  The premise of cancer immunotherapy is that