In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.83.tar.gz (19.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 19.4/19.4 MB 7.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: biopython
  Building wheel for biopython (setup.py) ... done
  Created wheel for biopython: filename=biopython-1.83-cp38-cp38-linux_aarch64.whl size=2692682 sha256=ad8c0e646b7606b226cf7f80eadf76028349f8c5f7b9361c69dd2ccd18530a6a
  Stored in directory: /root/.cache/pip/wheels/b7/71/b8/ddd94db6bfab84bc0015d99df08b9d481ec177631e1f03c815
Successfully built biopython
Installing collected packages: biopython
Successfully installed biopython-1.83

In [2]:
import os

from Bio import Entrez

# Contact address sent with every NCBI Entrez request; it should be a real
# address. Set the NCBI_EMAIL environment variable to supply one without
# editing the notebook; otherwise the placeholder below is used as before.
Entrez.email = os.environ.get("NCBI_EMAIL", "your_email@example.com")

def fetch_pubmed_abstracts(query, max_results=100):
    """
    Fetches PubMed abstracts for the given query.

    Runs an ESearch against the "pubmed" database to collect the matching
    record IDs, then a single EFetch to download those records as plain-text
    abstracts. Both response handles are closed even when parsing or reading
    raises, so a failed request does not leak a connection.

    Args:
        query (str): Search query for PubMed.
        max_results (int): Maximum number of results to fetch (passed to
            ESearch as ``retmax``).
    Returns:
        str: The text returned by EFetch for the matched records, or an
            empty string when the search matches nothing.
    """
    print(f"Fetching PubMed abstracts for query: '{query}'...")
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the search response even if parsing it fails.
        handle.close()

    ids = record["IdList"]
    if not ids:
        print("No results found!")
        return ""

    # Fetch the abstracts for every ID found by the search
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="abstract", retmode="text")
    try:
        abstracts = handle.read()
    finally:
        # Release the fetch response even if reading it fails.
        handle.close()

    return abstracts

def save_corpus_to_file(corpus, filename):
    """
    Write the abstract corpus to disk as UTF-8 text.

    The target is opened in "w" mode, so an existing file at ``filename``
    is overwritten. A confirmation line is printed once the write is done.

    Args:
        corpus (str): Text to write out.
        filename (str): Destination path for the corpus.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(corpus)

    confirmation = f"Corpus saved to '{filename}'"
    print(confirmation)


In [3]:
# Search parameters: the PubMed query string and the cap on fetched records
query = "cancer classification"
max_results = 1000  # Adjust as needed

# Download the abstracts matching the query
corpus = fetch_pubmed_abstracts(query, max_results=max_results)

# An empty corpus is falsy, so no file is written when nothing came back
if corpus:
    save_corpus_to_file(corpus, filename="pubmed_cancer_corpus.txt")

Fetching PubMed abstracts for query: 'cancer classification'...
Corpus saved to 'pubmed_cancer_corpus.txt'
