In [1]:
import requests
import time
import xml.etree.ElementTree as ET
from requests.adapters import HTTPAdapter
import json
from tqdm import tqdm
from urllib3.util.retry import Retry

def create_retry_session():
    """Build a ``requests`` session that retries failed requests with backoff.

    The retry policy allows up to 5 retries with a backoff factor of 1 and
    treats HTTP 429, 500, 502, 503 and 504 responses as retryable. One adapter
    carrying that policy is mounted for both ``https://`` and ``http://``.

    Returns:
        requests.Session: the configured session.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504, 429],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for scheme in ("https://", "http://"):
        http.mount(scheme, retrying_adapter)
    return http

def _element_text(element):
    """Return all text inside *element*, including text nested in inline markup."""
    return "".join(element.itertext()).strip()


def _parse_pubmed_articles(xml_payload):
    """Convert an efetch XML payload into a list of title/abstract/authors dicts."""
    root = ET.fromstring(xml_payload)

    articles = []
    for article in root.findall(".//PubmedArticle"):
        title_node = article.find(".//ArticleTitle")
        title = _element_text(title_node) if title_node is not None else ""

        # A structured abstract is split over several AbstractText elements;
        # keep every section instead of only the first one.
        sections = [
            _element_text(node)
            for node in article.findall(".//Abstract/AbstractText")
        ]
        abstract = " ".join(section for section in sections if section)

        author_names = []
        for author in article.findall(".//AuthorList/Author"):
            last_name = author.find(".//LastName")
            fore_name = author.find(".//ForeName")
            if last_name is None or fore_name is None:
                continue
            # Guard against empty elements, whose .text is None.
            full_name = f"{last_name.text or ''} {fore_name.text or ''}".strip()
            if full_name:
                author_names.append(full_name)

        articles.append({
            "title": title or "No Title",
            "abstract": abstract or "No Abstract",
            "authors": author_names,
        })
    return articles


def fetch_pubmed_articles_batch(pmids):
    """Fetch article metadata for a batch of PMIDs with one efetch request.

    Args:
        pmids: sequence of PubMed identifiers. They are joined into a single
            comma-separated ``id`` parameter.

    Returns:
        A list of dicts with keys ``"title"``, ``"abstract"`` and
        ``"authors"`` (one per ``PubmedArticle`` element in the response),
        an empty list when *pmids* is empty, or ``None`` when the request
        fails or the response is not well-formed XML.
    """
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    # Nothing to look up: avoid sending a request with an empty id list.
    if not pmids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(str(pmid) for pmid in pmids),
        "retmode": "xml",
        "retmax": str(len(pmids)),  # one record expected per requested PMID
    }

    try:
        # The context manager closes the session's connections when done.
        with create_retry_session() as session:
            # Without a timeout a stalled connection would block forever.
            response = session.get(efetch_url, params=params, timeout=30)
            response.raise_for_status()
            payload = response.content
        return _parse_pubmed_articles(payload)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching batch of PMIDs: {e}")
        return None
    except ET.ParseError as e:
        # A malformed body should skip this batch, not abort the whole run.
        print(f"Error parsing PubMed response: {e}")
        return None

def fetch_concept_description(concept_url):
    """Return a short textual description for a concept URL.

    * URLs containing ``"disease-ontology"`` are fetched and the ``"label"``
      field of the JSON response is returned when the status is 200.
    * URLs containing ``"mesh"`` yield a placeholder string built from the
      text after the last ``=`` in the URL (no request is made).
    * Anything else yields ``"No description available"``.

    Args:
        concept_url: the concept URL as a string.

    Returns:
        str: the description, ``"No description available"`` when none can
        be derived, or ``"Error fetching concept description"`` when the
        request fails or the response body is not valid JSON.
    """
    try:
        if "disease-ontology" in concept_url:
            # Fetching disease ontology metadata (dummy response, needs proper API call)
            # The timeout keeps one unresponsive host from stalling the run.
            response = requests.get(concept_url, timeout=30)
            if response.status_code == 200:
                payload = response.json()
                # Only a JSON object can carry a "label" field.
                if isinstance(payload, dict):
                    return payload.get("label", "No description available")
        elif "mesh" in concept_url:
            return "MeSH term description for " + concept_url.split("=")[-1]  # Placeholder text
        return "No description available"
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        return "Error fetching concept description"

def _fetch_articles_in_batches(pmids, batch_size):
    """Fetch metadata for *pmids* in slices of *batch_size*, skipping failed batches."""
    fetched = []
    for start in range(0, len(pmids), batch_size):
        batch = fetch_pubmed_articles_batch(pmids[start:start + batch_size])
        if batch:
            fetched.extend(batch)
    return fetched


def process_bioasq_question(question):
    """Enrich one BioASQ question with article and concept information.

    Args:
        question: dict for a single question. ``"body"`` is required;
            ``"documents"``, ``"snippets"``, ``"concepts"``,
            ``"ideal_answer"`` and ``"exact_answer"`` are optional.

    Returns:
        dict with the keys ``"question"``, ``"ideal_answer"``,
        ``"exact_answer"``, ``"articles"`` (metadata for the question's
        documents), ``"concepts"`` (one description per concept URL),
        ``"snippets"`` (metadata for the documents the snippets point to)
        and ``"snippet_texts"``.
    """
    # Treat a missing or null field as empty, the same way concepts are handled.
    documents = question.get("documents") or []
    concepts = question.get("concepts") or []
    snippets = question.get("snippets") or []

    # The ideal answer may be a single string or a list of strings; joining a
    # plain string would insert a space between every character.
    ideal_answer = question.get("ideal_answer") or ""
    if not isinstance(ideal_answer, str):
        ideal_answer = " ".join(ideal_answer)

    result = {
        "question": question["body"],
        "ideal_answer": ideal_answer,
        "exact_answer": question.get("exact_answer", []),
        "articles": [],
        "concepts": [fetch_concept_description(concept_url) for concept_url in concepts],
        "snippets": [],
        "snippet_texts": [snippet["text"] for snippet in snippets],
    }

    # Process PMIDs in batches (e.g., 100 PMIDs per batch)
    batch_size = 100

    snippet_pmids = [get_pmid_from_url(snippet["document"]) for snippet in snippets]
    result["snippets"] = _fetch_articles_in_batches(snippet_pmids, batch_size)

    document_pmids = [get_pmid_from_url(url) for url in documents]
    result["articles"] = _fetch_articles_in_batches(document_pmids, batch_size)

    return result

def get_pmid_from_url(url):
    """Return the final path segment of *url*, which is the PMID for PubMed links.

    PubMed URLs look like ``https://pubmed.ncbi.nlm.nih.gov/{pmid}/``; leading
    and trailing slashes are removed before the last segment is taken.
    """
    trimmed = url.strip('/')
    _, _, last_segment = trimmed.rpartition('/')
    return last_segment

def process_bioasq_data(input_data, file):
    """Process every question and checkpoint the results to two alternating files.

    After each question the complete list of results accumulated so far is
    written (overwriting, not appending) to ``{file}_output_batch_1.json`` or
    ``{file}_output_batch_2.json``. The first question goes to file 1, the
    second to file 2, and so on, so the file written last holds every result
    and the other one holds all but the final result.

    Args:
        input_data: iterable of BioASQ question dicts.
        file: prefix used to build the two output file names.
    """
    checkpoint_paths = (
        f"{file}_output_batch_1.json",
        f"{file}_output_batch_2.json",
    )
    processed = []

    with tqdm(input_data, desc="Processing questions", unit="question") as progress:
        for done, question in enumerate(progress, start=1):
            processed.append(process_bioasq_question(question))

            # Odd-numbered questions go to the first file, even-numbered to the second.
            target = checkpoint_paths[(done - 1) % 2]
            with open(target, 'w') as handle:
                json.dump(processed, handle, indent=4)

            # Show how many questions have been completed so far.
            progress.set_postfix(batch=done)

            # Pause between questions to respect the rate limits (1 request per second)
            time.sleep(1)

    # # Process each question in the input data
    # for question in input_data:
    #     result = process_bioasq_question(question)
    #     all_processed_questions.append(result)
    
    # # Write the entire results in alternating files after each batch
    # batch_count = 0
    # for i in tqdm(range(0, len(all_processed_questions), 1), desc="Writing to files"):
    #     batch = [all_processed_questions[i]]  # Each batch contains one question in this case
        
    #     # Alternate between the two output files
    #     output_file = output_file_1 if batch_count % 2 == 0 else output_file_2
    #     with open(output_file, 'w') as f:
    #         json.dump(batch, f, indent=4)
    #     batch_count += 1
        
    #     # Optionally, add a delay to respect the rate limits (1 request per second)
    #     time.sleep(1)

# Example usage:
# Assuming 'input_data' is your list of questions (from BioASQ or your JSON data)
# process_bioasq_data(input_data, file)  # `file` is the prefix of the output file names
# Enrich each of the four BioASQ 12B golden files in turn.
for file in ['12B1_golden', '12B2_golden', '12B3_golden', '12B4_golden']:
    input_file = f'/kaggle/input/bioasq-12b-golden-enriched/{file}.json'
    # Open inside a context manager so the handle is closed as soon as the
    # JSON is loaded instead of being left for the garbage collector.
    with open(input_file, encoding='utf-8') as handle:
        input_data = json.load(handle)
    process_bioasq_data(input_data.get('questions', []), file)
# input_file = '/kaggle/input/bioasq-training-12b-json/training12b_new.json'  # Path to your BioASQ dataset
# input_data = json.load(open(input_file))
# process_bioasq_data(input_data.get('questions', []))


Processing questions: 100%|██████████| 85/85 [03:26<00:00,  2.43s/question, batch=85]
Processing questions: 100%|██████████| 85/85 [03:19<00:00,  2.35s/question, batch=85]
Processing questions: 100%|██████████| 85/85 [03:33<00:00,  2.51s/question, batch=85]
Processing questions: 100%|██████████| 85/85 [03:20<00:00,  2.36s/question, batch=85]
