In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import pickle
import gzip
import glob
import os
import logging
from urllib.parse import urljoin
import concurrent.futures # For parallelization

# --- Configuration ---
# Politeness delay, in seconds: every worker sleeps this long right before it
# requests the final full-text URL (publisher or PMC site).
FETCH_DELAY_SECONDS = 2

# Size of the thread pool used for parallel fetching. Larger values put more
# simultaneous load on the remote servers.
MAX_WORKERS = 5

# Headers attached to every outgoing HTTP request (a browser-like User-Agent).
REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# Route INFO-and-above records to a log file. Note that basicConfig() is a
# no-op when the root logger already has handlers configured.
logging.basicConfig(
    level=logging.INFO,
    filename='fetch_papers_errors.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# --- Helper Functions ---

def fetch_url_content(url, retries=1, base_retry_delay=1, timeout=1):
    """Fetch *url* with an HTTP GET, repeating the request on transient failures.

    Args:
        url: Address to request. Redirects are followed.
        retries: Total number of attempts to make (not the number of
            *additional* tries). With the default of 1 a failed request is
            never repeated; with 0 or less no request is made at all.
        base_retry_delay: Seconds to wait after the first failed attempt. The
            wait grows linearly: ``base_retry_delay * attempt_number``.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The ``requests.Response`` of the first attempt that completes without
        an HTTP error status, or ``None`` when every attempt failed.
    """
    # 4xx statuses for which repeating the identical request can still help
    # (request timeout, too early, rate limited). Every other client error is
    # treated as permanent, like the 404 case.
    retryable_client_errors = {408, 425, 429}
    attempts_made = 0

    for attempt in range(retries):
        attempts_made = attempt + 1
        try:
            response = requests.get(url, headers=REQUEST_HEADERS, timeout=timeout, allow_redirects=True)
            response.raise_for_status()
        except requests.exceptions.Timeout:
            logging.warning(f"Timeout on attempt {attempt + 1}/{retries} for {url}")
        except requests.exceptions.HTTPError as e:
            # HTTPError normally carries the response, but guard against None
            # so a logging statement can never mask the real failure.
            status_code = e.response.status_code if e.response is not None else None
            logging.warning(f"HTTP error {status_code} on attempt {attempt + 1}/{retries} for {url}")
            if (status_code is not None and 400 <= status_code < 500
                    and status_code not in retryable_client_errors):
                break  # Permanent client error (e.g. 404/403): retrying is pointless.
        except requests.exceptions.RequestException as e:
            logging.warning(f"Request error on attempt {attempt + 1}/{retries} for {url}: {e}")
        else:
            logging.info(f"Successfully fetched {url} with status {response.status_code}")
            return response

        if attempt < retries - 1:
            # Simple incremental backoff for retries
            current_delay = base_retry_delay * (attempt + 1)
            logging.info(f"Waiting {current_delay} seconds before retry for {url}...")
            time.sleep(current_delay)

    # Report the attempts that were really made; the loop may have stopped
    # early on a permanent error, and `retries` counts attempts, not retries.
    logging.error(f"Failed to fetch {url} after {attempts_made} attempt(s).")
    return None

def get_full_text_data(pmid: str):
    """Locate and download the full text of a PubMed article.

    The PubMed page for *pmid* is fetched and scanned for full-text links. A
    link that looks like PubMed Central / Europe PMC is preferred; otherwise
    the first listed link is used. The chosen URL is fetched after sleeping
    ``FETCH_DELAY_SECONDS``.

    Args:
        pmid: PubMed identifier; it is appended to the PubMed base URL.

    Returns:
        A dict with the keys ``"pmid"``, ``"type"``, ``"content"`` and
        ``"final_url"``. On success ``"type"`` is ``"html"`` or ``"xml"`` and
        ``"content"`` is the response body text. On failure ``"type"`` is
        ``"error"`` and ``"content"`` is a short reason. ``"final_url"`` is
        the URL that was requested (not the post-redirect URL), or the PubMed
        page URL when no full-text URL could be resolved.
    """
    pubmed_base_url = 'https://pubmed.ncbi.nlm.nih.gov/'
    pmid_url = f'{pubmed_base_url}{pmid}/'

    pubmed_response = fetch_url_content(pmid_url)
    # Response.text decodes the body (and may run charset detection) on every
    # access, so read it exactly once and reuse the string.
    pubmed_html = pubmed_response.text if pubmed_response else ""
    if not pubmed_html:
        logging.error(f"Failed to fetch PubMed page for PMID {pmid}")
        return {"pmid": pmid, "type": "error", "content": "Failed to fetch PubMed page", "final_url": pmid_url}

    soup = BeautifulSoup(pubmed_html, 'html.parser')
    full_text_div = soup.find('div', class_='full-text-links-list')
    links_found = []

    if not full_text_div:
        logging.debug(f"No 'full-text-links-list' div found for PMID {pmid}. Checking for single prominent link.")
        single_link = soup.find('a', class_='link-item dialog-focus')
        if single_link and single_link.get('href'):
            links_found = [single_link]
            logging.debug(f"Found single prominent full text link for PMID {pmid}")
        else:
            logging.warning(f"No full text links (list or single) found for PMID {pmid}")
            return {"pmid": pmid, "type": "error", "content": "No full-text links found on PubMed page", "final_url": pmid_url}
    else:
        links_found = full_text_div.find_all('a', class_='link-item')

    # Reachable when the links div exists but holds no 'link-item' anchors.
    if not links_found:
        logging.warning(f"No links extracted from full_text_div or single link for PMID {pmid}")
        return {"pmid": pmid, "type": "error", "content": "No links extracted from full_text_div", "final_url": pmid_url}

    full_text_target_url = None
    is_pmc_link = False

    # First pass: prefer anything that looks like a PMC / Europe PMC link,
    # judged by the resolved URL, the analytics attribute, the link text or
    # the raw href.
    for link in links_found:
        href = link.get('href')
        if not href:
            continue

        current_link_url = urljoin(pmid_url, href)
        link_text_lower = link.get_text(strip=True).lower()
        href_lower = href.lower()

        if "ncbi.nlm.nih.gov/pmc" in current_link_url or "europepmc.org" in current_link_url or \
           "pmc" in link.get('data-ga-action', '').lower() or "pmc" in link_text_lower or "pmc" in href_lower:
            full_text_target_url = current_link_url
            is_pmc_link = True
            logging.info(f"Prioritized PMC link for PMID {pmid}: {full_text_target_url}")
            break

    # Fallback: no PMC-like link, so take the first listed link (links_found
    # is known to be non-empty here).
    if not full_text_target_url:
        first_link_href = links_found[0].get('href')
        if first_link_href:
            full_text_target_url = urljoin(pmid_url, first_link_href)
            logging.info(f"Using first available link (non-PMC priority) for PMID {pmid}: {full_text_target_url}")

    if not full_text_target_url:
        logging.error(f"No valid full text URL could be resolved for PMID {pmid}")
        return {"pmid": pmid, "type": "error", "content": "No valid full-text URL resolved", "final_url": pmid_url}

    logging.info(f"PMID {pmid}: Attempting to fetch full content from: {full_text_target_url}")
    time.sleep(FETCH_DELAY_SECONDS)  # Crucial delay before hitting external sites

    article_response = fetch_url_content(full_text_target_url)
    # Decode the (potentially large) article body once; see note above.
    article_text = article_response.text if article_response else ""
    if not article_text:
        logging.error(f"Failed to fetch full content for PMID {pmid} from {full_text_target_url}")
        return {"pmid": pmid, "type": "error", "content": f"Failed to fetch from {full_text_target_url}", "final_url": full_text_target_url}

    content_type_header = article_response.headers.get('Content-Type', '').lower()
    determined_type = 'html'

    if 'xml' in content_type_header:
        determined_type = 'xml'
    elif is_pmc_link and "PMC" in full_text_target_url and not full_text_target_url.endswith(('.pdf', '.epub')):
        # The header did not say XML; sniff the start of the body for
        # article-XML markers instead.
        head = article_text[:1000]
        if article_text.lstrip().startswith('<') and ("<article" in head or "<front>" in head):
            determined_type = 'xml'
            logging.info(f"Detected XML-like content from PMC for PMID {pmid} by inspection.")

    logging.info(f"Successfully retrieved content for PMID {pmid} from {full_text_target_url}. Type: {determined_type}, Length: {len(article_text)}")
    return {
        "pmid": pmid,
        "type": determined_type,
        "content": article_text,
        "final_url": full_text_target_url
    }

# --- Main Script ---
def main():
    """Fetch full-text content for every PMID in the newest results CSV.

    Steps:
      1. Pick the ``pubmed_genetic_results_*.csv`` file with the greatest
         ``os.path.getctime`` and read its ``PMID`` column.
      2. Load the existing ``content_dict.pkl.gz`` cache, if present.
      3. Fetch, in a thread pool, every PMID that is either missing from the
         cache or whose cached entry is an error from an earlier run.
      4. Merge the results into the cache and write it back atomically.

    Returns:
        None. Progress and errors are reported via ``print`` and ``logging``.
    """
    # tqdm is only a progress display; fall back to a pass-through wrapper so
    # the script still runs where tqdm is not installed.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    csv_files = glob.glob('pubmed_genetic_results_*.csv')
    if not csv_files:
        logging.error("No pubmed_genetic_results_*.csv files found in the current directory.")
        print("Error: No pubmed_genetic_results_*.csv files found.")
        return
    # NOTE(review): getctime is creation time on Windows but last
    # metadata-change time on Unix; confirm this is the intended "latest".
    latest_csv_file = max(csv_files, key=os.path.getctime)
    logging.info(f"Using input CSV file: {latest_csv_file}")
    print(f"Using input CSV file: {latest_csv_file}")

    try:
        df = pd.read_csv(latest_csv_file)
    except FileNotFoundError:
        logging.error(f"CSV file {latest_csv_file} not found.")
        print(f"Error: {latest_csv_file} not found.")
        return
    except Exception as e:
        logging.error(f"Error reading CSV {latest_csv_file}: {e}")
        print(f"Error reading CSV {latest_csv_file}: {e}")
        return

    if 'PMID' not in df.columns:
        logging.error("CSV file must contain a 'PMID' column.")
        print("Error: CSV file must contain a 'PMID' column.")
        return

    all_pmids_from_csv = df['PMID'].astype(str).unique().tolist()
    logging.info(f"Found {len(all_pmids_from_csv)} unique PMIDs to process from {latest_csv_file}.")
    print(f"Found {len(all_pmids_from_csv)} unique PMIDs to process.")

    content_dict = {}
    output_filename = "content_dict.pkl.gz"

    if os.path.exists(output_filename):
        try:
            # NOTE(security): unpickling can execute arbitrary code. Only load
            # a cache file that this script itself produced.
            with gzip.open(output_filename, 'rb') as f_load:
                content_dict = pickle.load(f_load)
            logging.info(f"Loaded {len(content_dict)} existing entries from {output_filename}")
            print(f"Loaded {len(content_dict)} existing entries from {output_filename}")
        except Exception as e:
            logging.warning(f"Could not load existing {output_filename}: {e}. Starting fresh.")
            content_dict = {}

    def needs_fetch(pmid):
        """Return True when *pmid* has no cached entry or only a cached error."""
        entry = content_dict.get(pmid)
        return entry is None or entry.get("type") == "error"

    # Cached errors are queued again: failures are often transient (timeouts,
    # rate limits), and the merge step below already expects to replace them.
    pmids_to_fetch = [pmid for pmid in all_pmids_from_csv if needs_fetch(pmid)]
    if not pmids_to_fetch:
        print("All PMIDs from the CSV have already been processed. Nothing new to fetch.")
        logging.info("All PMIDs from the CSV have already been processed.")
        return

    retry_count = sum(1 for pmid in pmids_to_fetch if pmid in content_dict)
    logging.info(f"Attempting to fetch content for {len(pmids_to_fetch)} new PMIDs.")
    print(f"Attempting to fetch content for {len(pmids_to_fetch)} new PMIDs.")
    if retry_count:
        logging.info(f"{retry_count} of these are retries of PMIDs that failed in an earlier run.")

    # Using ThreadPoolExecutor for parallel fetching
    # The main rate limiting per external site is handled by FETCH_DELAY_SECONDS within get_full_text_data
    # MAX_WORKERS limits simultaneous calls to PubMed for initial pages.
    results_from_fetch = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_to_pmid = {executor.submit(get_full_text_data, pmid): pmid for pmid in pmids_to_fetch}

        for future in tqdm(concurrent.futures.as_completed(future_to_pmid), total=len(pmids_to_fetch), desc="Fetching paper content", unit="PMID"):
            pmid = future_to_pmid[future]
            try:
                data = future.result()
                if data:
                    results_from_fetch.append(data)
            except Exception as exc:
                # Record the failure as an error entry so one bad PMID does
                # not abort the whole run.
                logging.error(f"PMID {pmid} generated an exception during parallel execution: {exc}")
                results_from_fetch.append({"pmid": pmid, "type": "error", "content": f"Exception: {exc}", "final_url": None})

    newly_processed_count = 0
    for result_item in results_from_fetch:
        pmid = result_item["pmid"]
        # Never replace a previously stored successful entry; only fill gaps
        # or replace earlier errors.
        if needs_fetch(pmid):
            content_dict[pmid] = {
                "type": result_item["type"],
                "content": result_item["content"],
                "final_url": result_item["final_url"]
            }
        if result_item["type"] != "error":
            newly_processed_count += 1

    if results_from_fetch:  # Save if any new processing was attempted
        # Write to a temporary file and rename it over the cache so that a
        # failure mid-write cannot corrupt the previously saved results.
        temp_filename = f"{output_filename}.tmp"
        try:
            with gzip.open(temp_filename, 'wb') as f_save:
                pickle.dump(content_dict, f_save)
            os.replace(temp_filename, output_filename)
            logging.info(f"Saved results: {newly_processed_count} new PMIDs successfully processed in this run.")
            logging.info(f"Total entries in {output_filename}: {len(content_dict)}")
            print(f"\nSaved results. Total entries in {output_filename}: {len(content_dict)}")
        except Exception as e_save:
            logging.error(f"Error saving final results to {output_filename}: {e_save}")
            print(f"\nError saving final results: {e_save}")
            # Best-effort cleanup of the partial temporary file.
            try:
                if os.path.exists(temp_filename):
                    os.remove(temp_filename)
            except OSError as e_cleanup:
                logging.warning(f"Could not remove temporary file {temp_filename}: {e_cleanup}")
    else:
        print("No new PMIDs were processed in this run.")

    logging.info(f"Finished processing. Total new PMIDs successfully processed in this run: {newly_processed_count}")
    print(f"\nFetching complete. Total successfully processed new PMIDs: {newly_processed_count}. Total entries in {output_filename}: {len(content_dict)}")

# Entry point: run the fetch pipeline only when this code is executed
# directly (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Using input CSV file: pubmed_genetic_results_68a3f3d2.csv
Found 923 unique PMIDs to process.
Attempting to fetch content for 923 new PMIDs.


Fetching paper content:   0%|          | 0/923 [00:00<?, ?PMID/s]

ERROR:root:Failed to fetch https://www.ahajournals.org/doi/10.1161/CIRCULATIONAHA.109.191959?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub  0pubmed after 1 retries.
ERROR:root:Failed to fetch full content for PMID 19246689 from https://www.ahajournals.org/doi/10.1161/CIRCULATIONAHA.109.191959?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub  0pubmed
ERROR:root:Failed to fetch https://doi.org/10.1038/ng.981 after 1 retries.
ERROR:root:Failed to fetch full content for PMID 22081228 from https://doi.org/10.1038/ng.981
ERROR:root:Failed to fetch https://ashpublications.org/blood/article-lookup/doi/10.1182/blood-2003-07-2531 after 1 retries.
ERROR:root:Failed to fetch full content for PMID 14630794 from https://ashpublications.org/blood/article-lookup/doi/10.1182/blood-2003-07-2531
ERROR:root:Failed to fetch https://doi.org/10.1038/ng.2220 after 1 retries.
ERROR:root:Failed to fetch full content for PMID 22446962 from https://doi.org/10.1038/ng.2220
ERROR:root


Saved results. Total entries in content_dict.pkl.gz: 923

Fetching complete. Total successfully processed new PMIDs: 711. Total entries in content_dict.pkl.gz: 923
