# In [None]:
# --- IMPORTS ---
import time
import pandas as pd
import requests
from Bio import Entrez, Medline
from habanero import Crossref
from semanticscholar import SemanticScholar
import pyalex
import os
import math
import warnings
from datetime import datetime
try:
    from dimcli.core.api import Dsl
except ImportError:
    print("Warning: 'dimcli' package not found. Dimensions.ai search will fail.")
    print("Run: pip install dimcli")
    Dsl = None

# --- CONFIGURATION ---
YOUR_EMAIL = "@ufl.edu" # Essential for CrossRef, OpenAlex, Entrez politeness

# !!! --- SECURITY WARNING --- !!!
# API keys/tokens are read from environment variables so that they never have
# to be pasted into (and accidentally shared with) this file. A key whose
# variable is not set resolves to the empty string.
SCOPUS_API_KEY = os.environ.get('SCOPUS_API_KEY', "")
NCBI_API_KEY = os.environ.get('NCBI_API_KEY', "")
SEMANTIC_SCHOLAR_API_KEY = os.environ.get('SEMANTIC_SCHOLAR_API_KEY', "")
SPRINGER_API_KEY = os.environ.get('SPRINGER_API_KEY', "")
WOS_API_KEY = os.environ.get('WOS_API_KEY', "")
DIMENSIONS_API_KEY = os.environ.get('DIMENSIONS_API_KEY', "")

# --- Check if keys are missing or seem like placeholders ---
# Each row: (variable name, current value, consequence appended to the warning).
# An empty value is reported too: the environment default is "", which a bare
# '"YOUR_" in key' test can never detect.
for _key_name, _key_value, _consequence in (
    ("SCOPUS_API_KEY", SCOPUS_API_KEY, ""),
    ("NCBI_API_KEY", NCBI_API_KEY, ""),
    ("SEMANTIC_SCHOLAR_API_KEY", SEMANTIC_SCHOLAR_API_KEY, ""),
    ("WOS_API_KEY", WOS_API_KEY, " Web of Science search will fail."),
    ("DIMENSIONS_API_KEY", DIMENSIONS_API_KEY, " Dimensions.ai search will fail."),
    ("SPRINGER_API_KEY", SPRINGER_API_KEY, " Springer search will fail."),
):
    if not _key_value:
        warnings.warn(f"{_key_name} is not set.{_consequence}")
    elif "YOUR_" in _key_value:
        warnings.warn(f"{_key_name} looks like a placeholder.{_consequence}")
# !!! --- END SECURITY WARNING --- !!!

# Maximum results to aim for PER SOURCE per species search term
MAX_TOTAL_RESULTS_PER_SOURCE = 100 # Adjust as needed (e.g., 500, 2000)

# API Client Initialization
# Unset keys arrive as "" (the environment default); they are normalised to
# None so that a missing key is never handed to a client as an empty credential.
Entrez.email = YOUR_EMAIL
Entrez.api_key = NCBI_API_KEY or None
pyalex.config.email = YOUR_EMAIL
ss = SemanticScholar(api_key=SEMANTIC_SCHOLAR_API_KEY or None)
cr = Crossref(mailto=YOUR_EMAIL) # Habanero uses mailto

# Initialize Dimensions.ai Client
# `dsl` stays None unless the optional dimcli import succeeded AND a real
# (non-empty, non-placeholder) key is configured.
# NOTE(review): confirm that this dimcli version's Dsl accepts `api_key`;
# any constructor failure is reported below and leaves `dsl` as None.
dsl = None
_dimensions_key_usable = bool(DIMENSIONS_API_KEY) and "YOUR_" not in DIMENSIONS_API_KEY
if Dsl and _dimensions_key_usable:
    try:
        dsl = Dsl(api_key=DIMENSIONS_API_KEY)
        print("Dimensions.ai client initialized.")
    except Exception as e:
        print(f"Warning: Could not initialize Dimensions.ai client: {e}")
elif Dsl:
    print("Dimensions.ai client not initialized (missing or placeholder key).")

# --- LOAD SPECIES LIST FROM CSV ---
# `species_df` ends up as the loaded DataFrame, or None when the file is
# missing or lacks usable name columns.
SPECIES_FILE = "valid_species_names.csv" # Adjust path if needed
species_df = None
try:
    species_df = pd.read_csv(SPECIES_FILE)
    _columns = species_df.columns

    # Usable if there is a populated 'scientificName' column ...
    _names_usable = (
        'scientificName' in _columns
        and species_df['scientificName'].notna().any()
    )
    # ... or, failing that, populated 'genus' AND 'specificEpithet' columns.
    if not _names_usable:
        _names_usable = (
            'genus' in _columns
            and species_df['genus'].notna().any()
            and 'specificEpithet' in _columns
            and species_df['specificEpithet'].notna().any()
        )

    if not _names_usable:
        print(f"ERROR: The file '{SPECIES_FILE}' is missing required name columns ('scientificName' or both 'genus' and 'specificEpithet' with valid data).")
        species_df = None
except FileNotFoundError:
    print(f"ERROR: The species file '{SPECIES_FILE}' was not found.")
except Exception as e:
    print(f"An error occurred while reading the CSV file '{SPECIES_FILE}': {e}")


# --- HELPER FUNCTIONS ---
# URL/scheme prefixes that may precede a bare DOI.
_DOI_PREFIXES = (
    'https://doi.org/', 'http://doi.org/',
    'https://dx.doi.org/', 'http://dx.doi.org/',
    'dx.doi.org/', 'doi.org/', 'doi:',
)


def _extract_doi(pub, source):
    """Return the record's DOI, lower-cased and prefix-free, or None if absent/invalid."""
    doi_val = pub.get("DOI") or pub.get("doi")
    # Semantic Scholar externalIds
    if not doi_val and isinstance(pub.get("externalIds"), dict):
        doi_val = pub["externalIds"].get("DOI")
    # Scopus specific key
    if not doi_val:
        doi_val = pub.get("prism:doi")
    # Web of Science specific key (pre-processed by the caller)
    if not doi_val and source == "WebOfScience":
        doi_val = pub.get("wos_doi")
    # PubMed MEDLINE: article identifiers are tagged, e.g. "10.1000/xyz [doi]"
    if not doi_val:
        article_ids = pub.get("AID") or []
        if isinstance(article_ids, str):
            article_ids = [article_ids]
        if isinstance(article_ids, (list, tuple)):
            for article_id in article_ids:
                if isinstance(article_id, str) and article_id.strip().lower().endswith("[doi]"):
                    doi_val = article_id.strip()[:-len("[doi]")].strip()
                    break

    if not doi_val or not isinstance(doi_val, str):
        return None

    cleaned_doi = doi_val.lower().strip()
    for prefix in _DOI_PREFIXES:
        if cleaned_doi.startswith(prefix):
            cleaned_doi = cleaned_doi[len(prefix):].strip()
            break

    # Basic validity check: every DOI starts with the '10.' directory indicator.
    return cleaned_doi if cleaned_doi.startswith('10.') else None


def _extract_authors(pub, source):
    """Return a de-duplicated list of author name strings (possibly empty)."""
    authors_data = pub.get("authors", []) # Semantic Scholar, some OpenAlex, Dimensions (pre-processed)
    if not authors_data:
        authors_data = pub.get("author", []) # Crossref
    if not authors_data and 'AU' in pub: # PubMed MEDLINE
        authors_data = pub.get('AU', [])
    if not authors_data and 'authorships' in pub: # OpenAlex standard
        authorships = pub.get('authorships') or []
        authors_data = [
            entry['author'].get('display_name')
            for entry in authorships
            if isinstance(entry, dict) and isinstance(entry.get('author'), dict)
        ]
    if not authors_data and source == "WebOfScience": # Web of Science (pre-processed)
        authors_data = pub.get("wos_authors", [])
    if not authors_data: # Scopus search results expose only the first author here
        authors_data = pub.get("dc:creator", [])

    if isinstance(authors_data, str): # Single author given as a plain string
        single = authors_data.strip()
        return [single] if single else []
    if not isinstance(authors_data, list):
        return []

    author_names = []
    for author in authors_data:
        name = None
        if isinstance(author, dict):
            # `or ''` so a key present with a None value never renders as "None"
            given = author.get('given') or ''
            family = author.get('family') or ''
            name = author.get('name') # Semantic Scholar style
            if not name:
                if given or family: # Crossref/Elsevier style
                    name = f"{given} {family}".strip()
                elif author.get('authname'): # Scopus author-list style
                    name = author['authname']
        elif isinstance(author, str): # PubMed, pre-processed Dimensions/WoS
            name = author.strip()

        if name and name not in author_names:
            author_names.append(name)
    return author_names


def _extract_title(pub, source):
    """Return the raw title value from whichever key the source uses, or None."""
    title = pub.get("title")
    if isinstance(title, list): # Often happens in Crossref
        title = title[0] if title else None
    if not title: # PubMed MEDLINE tag
        title = pub.get("TI")
    if not title and source == "WebOfScience": # Web of Science (pre-processed)
        title = pub.get("wos_title")
    if not title: # Scopus search results
        title = pub.get("dc:title")
    return title


def _extract_journal(pub):
    """Return the raw journal/venue name from whichever key the source uses, or None."""
    journal_info = pub.get("journal") # Semantic Scholar nested dict, Dimensions (pre-processed)
    journal = None
    if isinstance(journal_info, dict):
        journal = journal_info.get("name")
    elif isinstance(journal_info, str):
        journal = journal_info

    if not journal:
        journal_val = (
            pub.get("venue") # Semantic Scholar backup
            or pub.get("publicationName")
            or pub.get("prism:publicationName") # Scopus search results
            or pub.get("container-title") # Crossref often uses this
            or pub.get("JT") # PubMed MEDLINE tag
            or pub.get("wos_journal") # Web of Science (pre-processed)
        )
        if isinstance(journal_val, list): # Take first if list
            journal_val = journal_val[0] if journal_val else None
        journal = journal_val

    # OpenAlex journal location; type-checked so a malformed value cannot
    # raise and cause the whole record to be discarded.
    if not journal:
        loc = pub.get('primary_location')
        if isinstance(loc, dict) and isinstance(loc.get('source'), dict):
            journal = loc['source'].get('display_name')
    return journal


def _extract_year(pub, source):
    """Return the publication year as a string, or None if it cannot be determined."""
    year_val = pub.get("year") or pub.get("publication_year") or pub.get("publicationYear")
    if not year_val and source == "WebOfScience":
        year_val = pub.get("wos_year") # Web of Science (pre-processed)
    if year_val:
        try:
            # Accepts ints, floats and numeric strings such as "2020.0"
            return str(int(float(str(year_val))))
        except (ValueError, TypeError, OverflowError):
            pass # Fall through to the date-based fallbacks

    # Crossref date structures: use the first one that holds a usable year.
    for date_key in ("published-print", "published-online", "created", "issued"):
        date_info = pub.get(date_key)
        if isinstance(date_info, dict):
            try:
                return str(int(date_info['date-parts'][0][0]))
            except (IndexError, TypeError, KeyError, ValueError):
                continue # e.g. date-parts of [[None]]

    # PubMed 'DP' ("YYYY Mon ...") and Scopus 'prism:coverDate' ("YYYY-MM-DD")
    for date_key in ("DP", "prism:coverDate"):
        date_text = pub.get(date_key)
        if isinstance(date_text, str) and len(date_text) >= 4 and date_text[:4].isdigit():
            return date_text[:4]
    return None


def standardize_publication(pub, source):
    """
    Extract and standardize key publication info from an API record.

    Understands the key layouts of Crossref, OpenAlex, Semantic Scholar, Scopus,
    PubMed MEDLINE records and the pre-processed Web of Science / Dimensions dicts.

    Args:
        pub: The raw record; anything that is not a dict is rejected.
        source: Label of the originating API, copied into the result. The value
            "WebOfScience" additionally enables the ``wos_*`` fallback keys.

    Returns:
        A dict with keys ``doi`` (lower-case, no URL prefix), ``title``,
        ``authors`` (list of str), ``year`` (str or None), ``journal`` and
        ``source``; or None when no valid DOI (starting with '10.') is found
        or an unexpected error occurs while reading the record.
    """
    if not isinstance(pub, dict): return None
    try:
        doi = _extract_doi(pub, source)
        if not doi:
            return None # A valid DOI is required for every record

        title = _extract_title(pub, source)
        journal = _extract_journal(pub)

        return {
            "doi": doi,
            "title": str(title) if title else None,
            "authors": _extract_authors(pub, source),
            "year": _extract_year(pub, source),
            "journal": str(journal) if journal else None,
            "source": source
            }

    except Exception as e:
        doi_err = pub.get('doi', 'N/A')
        print(f"Error standardizing publication from {source} (DOI: {doi_err}): {e}")
        return None

def to_bibtex(pub_dict):
    """
    Render a standardized publication dictionary as an ``@article`` BibTeX entry.

    The citation key is "<last word of first author><year><first title word>",
    truncated to 50 characters. Returns an empty string (after printing a
    message) when the author list or year is empty, or when formatting fails.
    """
    try:
        author_list = pub_dict.get('authors', [])
        if not isinstance(author_list, list):
            author_list = []

        # --- Citation key pieces ---
        surname = "Unknown"
        if author_list:
            name_tokens = author_list[0].split()
            if name_tokens:
                surname = name_tokens[-1]

        key_year = pub_dict.get('year') or "ND"

        raw_title = pub_dict.get('title', 'Article') or 'Article'
        title_tokens = raw_title.split()
        if title_tokens:
            key_word = title_tokens[0].replace('{','').replace('}','').strip(" .,:;'\"")
        else:
            key_word = "Art"

        citation_key = f"{surname.replace(' ','')}{key_year}{key_word}"[:50]

        # --- Field values (None becomes an empty string) ---
        joined_authors = " and ".join(author_list)
        escaped_title = raw_title.replace('{','\\{').replace('}','\\}')
        journal_field = pub_dict.get('journal', '') or ''
        year_field = pub_dict.get('year', '') or ''
        doi_field = pub_dict.get('doi', '') or ''

        if not (escaped_title and joined_authors and year_field):
            print(f"Skipping BibTeX for DOI {doi_field} due to missing required fields (Title, Author, Year).")
            return ""

        entry_lines = [
            f"@article{{{citation_key},",
            f"  title = {{{escaped_title}}},",
            f"  author = {{{joined_authors}}},",
            f"  journal = {{{journal_field}}},",
            f"  year = {{{year_field}}},",
            f"  doi = {{{doi_field}}}",
            "}",
        ]
        return "\n".join(entry_lines)
    except Exception as e:
        print(f"Could not generate BibTeX for DOI {pub_dict.get('doi', 'N/A')}: {e}")
        return ""


# --- MODIFIED API SEARCH FUNCTIONS with PAGINATION ---

def search_pubmed(scientific_name, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """
    Search PubMed (via Entrez) and fetch record details in MEDLINE text batches.

    Args:
        scientific_name: Phrase searched in the Title/Abstract fields.
        max_total: Upper bound on the number of PubMed records fetched.

    Returns:
        List of standardized publication dicts (see standardize_publication).
        Errors are printed rather than raised; whatever was collected before a
        failure is still returned.
    """
    print(f"-> Searching PubMed for '{scientific_name}' (max {max_total})...")
    all_standardized = []
    retrieved_ids = 0
    try:
        query = f'("{scientific_name}"[Title/Abstract])'
        search_handle = Entrez.esearch(db="pubmed", term=query, retmax=str(max_total), usehistory='y')
        try:
            search_results = Entrez.read(search_handle)
        finally:
            search_handle.close()

        id_list = search_results["IdList"]
        count = int(search_results["Count"])
        webenv = search_results["WebEnv"]
        query_key = search_results["QueryKey"]

        print(f"   PubMed found {count} potential IDs, will fetch details for max {len(id_list)}...")

        if not id_list:
            return []

        batch_size = 200
        for start in range(0, len(id_list), batch_size):
            end = min(len(id_list), start + batch_size)
            print(f"   Fetching PubMed batch {start // batch_size + 1} (IDs {start+1}-{end})...")
            try:
                # efetch pages through the full history-server result set, not
                # through id_list, so retmax must be the exact size of this
                # batch; a fixed batch_size could return more than max_total.
                fetch_handle = Entrez.efetch(db="pubmed", rettype="medline", retmode="text",
                                            retstart=start, retmax=end - start,
                                            webenv=webenv, query_key=query_key)
                try:
                    records_text = fetch_handle.read()
                finally:
                    fetch_handle.close()

                # Tolerate handles that yield bytes instead of text.
                if isinstance(records_text, bytes):
                    records_text = records_text.decode('utf-8', errors='ignore')

                # Medline.parse expects an iterable of lines
                records = Medline.parse(records_text.splitlines(keepends=True))

                processed_in_batch = 0
                for pub in records:
                    if pub: # Ensure the record is not empty
                        std_pub = standardize_publication(pub, "PubMed")
                        if std_pub:
                            all_standardized.append(std_pub)
                            processed_in_batch += 1

                print(f"     Successfully processed {processed_in_batch} records in this batch.")
                retrieved_ids += (end - start) # Count based on requested IDs
                time.sleep(0.4)
            except requests.exceptions.RequestException as req_e:
                print(f"   Network error during PubMed batch fetch: {req_e}")
                time.sleep(5) # Wait longer after network error
            except Exception as batch_e:
                # A failed batch is reported and skipped; later batches still run.
                print(f"   Error fetching/parsing PubMed batch: {batch_e}")
                time.sleep(2)

        print(f"   Finished PubMed fetch. Attempted to fetch {retrieved_ids} IDs. Total standardized: {len(all_standardized)}")
        return all_standardized

    except Entrez.Parser.ValidationError as xml_err:
        print(f"An error occurred parsing PubMed ESearch results: {xml_err}")
        return all_standardized
    except Exception as e:
        print(f"An unexpected error occurred with the PubMed search: {e}")
        return all_standardized
    finally:
        time.sleep(0.5) # Pause after finishing this source

def search_openalex(scientific_name, max_results_per_page=200, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """
    Query OpenAlex page by page and standardize every returned work.

    Returns the list of standardized publication dicts, trimmed to `max_total`
    once that many raw results have been fetched. Errors are printed and the
    records collected so far are returned.
    """
    print(f"-> Searching OpenAlex for '{scientific_name}' (max {max_total})...")
    all_standardized = []
    fetched_so_far = 0
    try:
        # The pager walks successive result pages, bounded by n_max.
        pager = (
            pyalex.Works()
            .search(scientific_name)
            .paginate(per_page=max_results_per_page, n_max=max_total)
        )

        for works_page in pager:
            if not works_page:
                break # Defensive: an empty page ends the search

            fetched_so_far += len(works_page)
            print(f"   Fetched {fetched_so_far} OpenAlex results...")

            for work in works_page:
                record = standardize_publication(work, "OpenAlex")
                if record:
                    all_standardized.append(record)

            if fetched_so_far < max_total:
                continue

            # The pager may overshoot slightly; cut back to the requested cap.
            print(f"   Reached or exceeded max total ({max_total}) for OpenAlex.")
            all_standardized = all_standardized[:max_total]
            break

        print(f"   Finished OpenAlex search. Total standardized: {len(all_standardized)}")
        return all_standardized

    except Exception as e:
        print(f"An error occurred with the OpenAlex search: {e}")
        return all_standardized # Partial results are still useful
    finally:
        time.sleep(0.5)

def search_semantic_scholar(scientific_name, max_results_per_page=100, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """
    Search Semantic Scholar using the client library's paginated result iterator.

    Args:
        scientific_name: Free-text query.
        max_results_per_page: Requested page size; capped at 100 and at `max_total`.
        max_total: Maximum number of results to consume from the iterator.

    Returns:
        List of standardized publication dicts. Errors are printed and the
        records collected so far are returned.
    """
    print(f"-> Searching Semantic Scholar for '{scientific_name}' (max {max_total})...")
    all_standardized = []
    retrieved_count = 0

    try:
        if max_total <= 0:
            return all_standardized

        print(f"   Fetching Semantic Scholar results (up to {max_total})...")

        # 'doi' is not requested as a field; the DOI is read from externalIds.
        fields_to_get = ['title', 'year', 'authors', 'externalIds', 'journal', 'venue']

        # `limit` is the per-request page size, not the overall total, so it is
        # capped at 100; the loop below enforces max_total across pages.
        # NOTE(review): 100 is the documented page-size ceiling — confirm for
        # the installed semanticscholar version.
        page_size = max(1, min(max_results_per_page, max_total, 100))

        results_iterator = ss.search_paper(
            query=scientific_name,
            limit=page_size,
            fields=fields_to_get
        )

        if results_iterator:
            for pub_obj in results_iterator:
                external_ids = getattr(pub_obj, 'externalIds', None)

                # The standardizer understands a dict or a plain string for
                # 'journal'; anything else is reduced to its `name` attribute.
                journal_obj = getattr(pub_obj, 'journal', None)
                if isinstance(journal_obj, (dict, str)):
                    journal_val = journal_obj
                else:
                    journal_val = getattr(journal_obj, 'name', None)

                # Convert S2 object attributes to a dictionary for the standardizer
                pub_dict = {
                    'doi': getattr(pub_obj, 'doi', None) or (external_ids.get('DOI') if external_ids else None),
                    'externalIds': external_ids,
                    'title': getattr(pub_obj, 'title', None),
                    'authors': [{'name': author.name} for author in (getattr(pub_obj, 'authors', None) or []) if author and author.name],
                    'year': getattr(pub_obj, 'year', None),
                    'journal': journal_val,
                    'venue': getattr(pub_obj, 'venue', None),
                }

                standardized = standardize_publication(pub_dict, "SemanticScholar")
                if standardized:
                    all_standardized.append(standardized)

                retrieved_count += 1
                print(f"\r   Retrieved {retrieved_count} Semantic Scholar results...", end="") # Progress indicator

                if retrieved_count >= max_total:
                    print(f"\n   Reached max total ({max_total}) for Semantic Scholar.")
                    break
            print() # Newline after progress indicator
        else:
            print("   No Semantic Scholar results iterator returned.")

        if retrieved_count == 0:
            print("   No Semantic Scholar results found.")

        print(f"   Finished Semantic Scholar search. Total standardized: {len(all_standardized)}")
        return all_standardized

    except Exception as e:
        print(f"\nAn error occurred with the Semantic Scholar search: {e}") # Newline in case the progress indicator was active
        return all_standardized # Return what we have
    finally:
        time.sleep(0.5) # Pause after finishing this source


def _iter_crossref_items(response):
    """Yield individual work records from a habanero response (one page dict or a list of pages)."""
    if response is None:
        return
    pages = [response] if isinstance(response, dict) else response
    for page in pages:
        if not isinstance(page, dict):
            continue
        message = page.get("message")
        if isinstance(message, dict):
            # Standard API envelope: the works live under message.items
            yield from message.get("items") or []
        else:
            # Already a bare work record
            yield page


def search_crossref(scientific_name, max_results_per_chunk=1000, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """
    Search Crossref using cursor-based deep paging, treating 404 as "no results".

    Args:
        scientific_name: Free-text query.
        max_results_per_chunk: Rows requested per API call (capped at 1000 and
            at `max_total`).
        max_total: Maximum number of standardized records to return.

    Returns:
        List of standardized publication dicts. Errors are printed and the
        records collected so far are returned.
    """
    print(f"-> Searching Crossref for '{scientific_name}' (max {max_total})...")
    all_standardized = []
    count = 0
    # Uses the module-level habanero client `cr`

    try:
        try:
            # Deep paging is started with the literal cursor "*".
            # NOTE(review): per habanero's docs the call returns a single
            # response dict or a list of per-page dicts; both are handled below.
            response = cr.works(
                query=scientific_name,
                limit=max(1, min(max_results_per_chunk, max_total, 1000)),
                cursor="*",
                cursor_max=max_total
            )
        except requests.exceptions.HTTPError as http_err:
            if http_err.response.status_code == 404:
                print(f"   CrossRef returned 404 (Not Found) for query '{scientific_name}'. Assuming no results.")
                return [] # Return empty list gracefully
            print(f"   CrossRef HTTP Error ({http_err.response.status_code}) on initial query.")
            raise # Let the outer try-except handle it
        except Exception as api_call_err:
            if '404 Not Found' in str(api_call_err):
                # 404s can also surface as a different exception type
                print(f"   CrossRef returned 404 (Not Found) for query '{scientific_name}'. Assuming no results.")
            else:
                print(f"   Error during CrossRef API call setup: {api_call_err}")
            return [] # Nothing to iterate if the request itself failed

        for pub in _iter_crossref_items(response):
            std_pub = standardize_publication(pub, "Crossref")
            if std_pub:
                all_standardized.append(std_pub)
                count += 1
                if count % 200 == 0: # Print update every 200 results
                    print(f"   Fetched {count} Crossref results...")

            if count >= max_total:
                print(f"\n   Reached max total ({max_total}) for Crossref.")
                break

        print(f"\n   Finished Crossref search. Total standardized: {len(all_standardized)}")
        return all_standardized

    except Exception as e:
        print(f"\nAn error occurred during Crossref result processing: {e}")
        return all_standardized # Return whatever was collected
    finally:
        time.sleep(0.5)


def search_scopus(scientific_name, max_results_per_page=200, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """
    Search Scopus with offset pagination, respecting API limits.

    Args:
        scientific_name: Phrase searched via TITLE-ABS-KEY.
        max_results_per_page: Items requested per page.
        max_total: Maximum number of results to request (capped at 5000).

    Returns:
        List of standardized publication dicts; an empty list when no usable
        API key is configured. Errors are printed and the records collected so
        far are returned.
    """
    # The key defaults to "" when its environment variable is unset, so an
    # empty key must be rejected as well as a placeholder.
    if not SCOPUS_API_KEY or "YOUR_" in SCOPUS_API_KEY:
        print("-> Skipping Scopus: API Key not set.")
        return []

    # Scopus standard search limit is 200 per page, 5000 total without cursor.
    effective_max_total = min(max_total, 5000)
    print(f"-> Searching Scopus for '{scientific_name}' (max {effective_max_total})...")
    BASE_URL = "https://api.elsevier.com/content/search/scopus"
    headers = {"X-ELS-APIKey": SCOPUS_API_KEY, "Accept": "application/json"}
    all_standardized_results = []
    start_index = 0
    sleep_time = 0.2 # Politeness delay between pages
    # Bound consecutive 429 retries so an exhausted quota cannot loop forever.
    max_rate_limit_retries = 3
    rate_limit_retries = 0

    try:
        while start_index < effective_max_total:
            # Determine how many to request in this batch
            items_to_fetch_this_page = min(max_results_per_page, effective_max_total - start_index)
            if items_to_fetch_this_page <= 0: break # Safety check

            params = {
                "query": f'TITLE-ABS-KEY("{scientific_name}")',
                "count": items_to_fetch_this_page,
                "start": start_index,
                "view": "STANDARD"
            }
            print(f"   Fetching Scopus results from index {start_index}...")
            response = requests.get(BASE_URL, headers=headers, params=params, timeout=45)

            if response.status_code == 429:
                rate_limit_retries += 1
                if rate_limit_retries > max_rate_limit_retries:
                    print("   Scopus rate limit persists after retries; giving up on this search.")
                    break
                print("   Rate limit hit (429). Sleeping for 60 seconds...")
                time.sleep(60)
                continue # Retry the same request
            rate_limit_retries = 0

            response.raise_for_status() # Raise error for other bad status codes (4xx, 5xx)
            data = response.json().get("search-results", {})
            entries = data.get("entry", [])

            if not entries:
                print("   No more Scopus results found.")
                break

            retrieved_on_page = len(entries)
            print(f"   Retrieved {retrieved_on_page} Scopus results on this page.")

            for pub in entries:
                standardized = standardize_publication(pub, "Scopus")
                if standardized:
                    all_standardized_results.append(standardized)

            total_results_reported = int(data.get('opensearch:totalResults', 0))
            # Stop if Scopus reports no more results OR we've hit our effective max
            if start_index + retrieved_on_page >= total_results_reported or len(all_standardized_results) >= effective_max_total:
                 break

            start_index += retrieved_on_page # Advance the index for the next page
            time.sleep(sleep_time)

        print(f"   Finished Scopus search. Total standardized: {len(all_standardized_results)}")
        return all_standardized_results

    except requests.exceptions.HTTPError as e:
        print(f"ERROR: Scopus search HTTP error. Status: {e.response.status_code}\nResponse: {e.response.text}")
        return all_standardized_results # Return what we have so far
    except requests.exceptions.RequestException as e:
         print(f"ERROR: Scopus search network/request error: {e}")
         return all_standardized_results # Return partial results
    except Exception as e:
        print(f"An unexpected error occurred within the Scopus search function: {e}")
        return all_standardized_results # Return partial results
    finally:
        time.sleep(0.5) # Pause after finishing this source

# --- NEW SEARCH FUNCTION: Web of Science (Corrected for Starter API v1) ---
def _wos_hit_to_pub(hit):
    """Flatten one Web of Science Starter API hit into the ``wos_*`` keys the standardizer reads."""
    # NOTE(review): the nested layout (identifiers.doi, names.authors[].displayName,
    # source.sourceTitle / source.publishYear) follows the Starter API response
    # schema as documented by Clarivate — confirm against a live response. The
    # flat keys used previously are kept as fallbacks.
    identifiers = hit.get('identifiers')
    if not isinstance(identifiers, dict):
        identifiers = {}
    names = hit.get('names')
    if not isinstance(names, dict):
        names = {}
    source_info = hit.get('source')

    doi_val = identifiers.get('doi') or hit.get('DOI') or hit.get('doi')

    raw_authors = names.get('authors') or hit.get('authors') or []
    if not isinstance(raw_authors, list):
        raw_authors = []
    authors_list = []
    for author in raw_authors:
        if isinstance(author, dict):
            author = author.get('displayName') or author.get('wosStandard')
        if author:
            authors_list.append(author)

    if isinstance(source_info, dict):
        journal_title = source_info.get('sourceTitle')
        pub_year = source_info.get('publishYear') or hit.get('publishYear')
    else:
        journal_title = source_info
        pub_year = hit.get('publishYear')

    return {
        'wos_doi': doi_val,
        'wos_title': hit.get('title'),
        'wos_authors': authors_list,
        'wos_year': pub_year,
        'wos_journal': journal_title
    }


def search_wos(scientific_name, max_results_per_page=50, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """
    Search the Web of Science Starter API using the 'q', 'db', 'limit' and 'page' parameters.

    Args:
        scientific_name: Phrase searched as a topic (TS=) query.
        max_results_per_page: Page size; values above 50 are reduced to 50.
        max_total: Maximum number of records to retrieve.

    Returns:
        List of standardized publication dicts; an empty list when no usable
        API key is configured. Errors are printed and the records collected so
        far are returned.
    """
    # The key defaults to "" when its environment variable is unset, so an
    # empty key must be rejected as well as a placeholder.
    if not WOS_API_KEY or "YOUR_" in WOS_API_KEY:
        print("-> Skipping Web of Science: API Key not set.")
        return []

    # Starter API max limit is 50 per page
    if max_results_per_page > 50:
        max_results_per_page = 50

    print(f"-> Searching Web of Science Starter API for '{scientific_name}' (max {max_total})...")

    BASE_URL = "https://api.clarivate.com/apis/wos-starter/v1/documents"
    headers = {"X-ApiKey": WOS_API_KEY, "Accept": "application/json"}
    all_standardized_results = []

    current_page = 1
    total_retrieved = 0
    max_pages = math.ceil(max_total / max_results_per_page)

    sleep_time = 0.3 # Politeness delay between pages
    # Bound consecutive 429 retries so an exhausted quota cannot loop forever.
    max_rate_limit_retries = 3
    rate_limit_retries = 0

    try:
        while (current_page <= max_pages) and (total_retrieved < max_total):

            items_to_fetch_this_page = min(max_results_per_page, max_total - total_retrieved)
            if items_to_fetch_this_page <= 0: break

            params = {
                "q": f'TS=("{scientific_name}")', # TS = Topic Search (Title, Abs, Keywords)
                "db": "WOS",
                "limit": items_to_fetch_this_page,
                "page": current_page,
            }

            print(f"   Fetching Web of Science results page {current_page}...")
            response = requests.get(BASE_URL, headers=headers, params=params, timeout=45)

            if response.status_code == 429:
                rate_limit_retries += 1
                if rate_limit_retries > max_rate_limit_retries:
                    print("   Web of Science rate limit persists after retries; giving up on this search.")
                    break
                print("   Rate limit hit (429). Sleeping for 60 seconds...")
                time.sleep(60)
                continue # Retry same request
            rate_limit_retries = 0

            # 403 is reported as a subscription problem for the requested database
            if response.status_code == 403:
                 print(f"ERROR: Web of Science returned 403 (Forbidden).")
                 print(f"   Your API key is not subscribed to the '{params['db']}' database.")
                 print("   Check your Clarivate portal or try a different 'db' value.")
                 break

            # 401 is reported as a key problem
            if response.status_code == 401:
                print(f"ERROR: Web of Science returned 401 (Unauthorized).")
                print("   Your API key is invalid or missing. Stopping WoS search.")
                break

            response.raise_for_status() # Raise error for other bad status codes
            data = response.json()

            records = data.get('hits', [])
            total_results_reported = int(data.get('metadata', {}).get('total', 0))

            if not records:
                print("   No more Web of Science results found.")
                break

            retrieved_on_page = len(records)
            print(f"   Retrieved {retrieved_on_page} Web of Science results on this page.")

            for pub in records:
                if not isinstance(pub, dict):
                    continue
                # Flatten the hit, then hand the clean dict to the standardizer
                standardized = standardize_publication(_wos_hit_to_pub(pub), "WebOfScience")
                if standardized:
                    all_standardized_results.append(standardized)

            total_retrieved += retrieved_on_page

            if total_retrieved >= total_results_reported:
                break # We've fetched all available records

            current_page += 1
            time.sleep(sleep_time)

        print(f"   Finished Web of Science search. Total standardized: {len(all_standardized_results)}")
        return all_standardized_results

    except requests.exceptions.HTTPError as e:
        # Errors other than 401, 403 and 429
        print(f"ERROR: Web of Science search HTTP error. Status: {e.response.status_code}\nResponse: {e.response.text}")
        return all_standardized_results
    except requests.exceptions.RequestException as e:
         print(f"ERROR: Web of Science search network/request error: {e}")
         return all_standardized_results
    except Exception as e:
        print(f"An unexpected error occurred within the Web of Science search function: {e}")
        return all_standardized_results
    finally:
        time.sleep(0.5) # Pause after finishing this source

# --- NEW SEARCH FUNCTION: Springer Nature ---
def search_springer(scientific_name, max_results_per_page=100, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """Search the Springer Nature Metadata API for a scientific name.

    Results are paged with the ``s`` (1-based start record) and ``p`` (page
    size) query parameters until ``max_total`` records have been requested,
    the API returns an empty page, or the total reported by the API has been
    reached.

    Args:
        scientific_name: Name to match against the ``keyword`` and ``title``
            fields.
        max_results_per_page: Requested page size; never more than 100 per
            request.
        max_total: Upper bound on the number of records requested overall.

    Returns:
        List of the truthy values returned by ``standardize_publication``.
        Returns an empty list when no API key is configured. On an HTTP,
        network or unexpected error the results gathered so far are returned.
    """
    # Keys default to "" when the environment variable is missing, so an empty
    # key must be treated the same way as an unedited placeholder.
    if not SPRINGER_API_KEY or "YOUR_" in SPRINGER_API_KEY:
        print("-> Skipping Springer: API Key not set.")
        return []

    print(f"-> Searching Springer for '{scientific_name}' (max {max_total})...")
    # HTTPS so the API key in the query string is not sent in clear text.
    BASE_URL = "https://api.springernature.com/metadata/json"
    all_standardized_results = []
    start_index = 1  # Springer is 1-based

    # Springer limit is 5 req/sec
    sleep_time = 0.25

    # Bound the number of consecutive 429 retries so a persistent rate limit
    # cannot keep this function looping forever.
    max_rate_limit_retries = 3
    rate_limit_retries = 0

    try:
        while start_index <= max_total:
            # Never ask for more than the page limit (100) or the remaining budget.
            items_to_fetch_this_page = min(max_results_per_page, 100, max_total - start_index + 1)
            if items_to_fetch_this_page <= 0:
                break

            params = {
                "q": f'keyword:"{scientific_name}" OR title:"{scientific_name}"',
                "api_key": SPRINGER_API_KEY,
                "p": items_to_fetch_this_page,  # Page size
                "s": start_index,  # Start record
            }

            print(f"   Fetching Springer results from record {start_index}...")
            response = requests.get(BASE_URL, params=params, timeout=45)

            if response.status_code == 429:
                rate_limit_retries += 1
                if rate_limit_retries > max_rate_limit_retries:
                    print("   Rate limit still in effect after repeated retries. Stopping Springer search.")
                    break
                print("   Rate limit hit (429). Sleeping for 60 seconds...")
                time.sleep(60)
                continue  # Retry same request
            rate_limit_retries = 0

            response.raise_for_status()  # Raise error for other bad status codes
            data = response.json()
            records = data.get('records') or []

            if not records:
                print("   No more Springer results found.")
                break

            retrieved_on_page = len(records)
            print(f"   Retrieved {retrieved_on_page} Springer results on this page.")

            for pub in records:
                # --- Pre-process Springer JSON into a simpler dict ---
                # `or []` guards against a null "creators" value, which would
                # otherwise abort the whole page.
                authors_list = [
                    creator.get('creator')
                    for creator in (pub.get('creators') or [])
                    if creator and creator.get('creator')
                ]
                # Year is the first four characters of the publication date.
                publication_date = pub.get('publicationDate')
                pub_year = str(publication_date)[:4] if publication_date else None

                pub_dict = {
                    'doi': pub.get('doi'),
                    'title': pub.get('title'),
                    'authors': authors_list,
                    'year': pub_year,
                    'journal': pub.get('publicationName'),  # Springer's key for journal title
                }

                # Now pass the clean dict to the standardizer
                standardized = standardize_publication(pub_dict, "Springer")
                if standardized:
                    all_standardized_results.append(standardized)

            # Check total results reported by the API; tolerate a missing or
            # empty "result" list instead of raising IndexError.
            result_info = data.get('result') or [{}]
            total_results_reported = int((result_info[0] or {}).get('total') or 0)
            if start_index + retrieved_on_page > total_results_reported:
                break  # We've fetched all available records

            start_index += retrieved_on_page  # Advance the index
            time.sleep(sleep_time)  # Politeness

        print(f"   Finished Springer search. Total standardized: {len(all_standardized_results)}")
        return all_standardized_results

    except requests.exceptions.HTTPError as e:
        print(f"ERROR: Springer search HTTP error. Status: {e.response.status_code}\nResponse: {e.response.text}")
        return all_standardized_results
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Springer search network/request error: {e}")
        return all_standardized_results
    except Exception as e:
        print(f"An unexpected error occurred within the Springer search function: {e}")
        return all_standardized_results
    finally:
        time.sleep(0.5)  # Pause after finishing this source

# --- NEW SEARCH FUNCTION: Dimensions.ai ---
def search_dimensions(scientific_name, max_total=MAX_TOTAL_RESULTS_PER_SOURCE):
    """Search Dimensions.ai publications for a scientific name via dimcli.

    Runs a single DSL query for the exact phrase in ``full_data`` and converts
    each returned publication into the dict shape expected by
    ``standardize_publication``.

    Args:
        scientific_name: Phrase to search for.
        max_total: Maximum number of records to request.

    Returns:
        List of the truthy values returned by ``standardize_publication``.
        Returns an empty list when the client is not initialised, when the
        query reports errors, or when nothing is found. On an unexpected
        exception the results gathered so far are returned.
    """
    if not dsl:
        print("-> Skipping Dimensions.ai: Client not initialized (check key or install).")
        return []

    print(f"-> Searching Dimensions.ai for '{scientific_name}' (max {max_total})...")
    all_standardized_results = []

    # NOTE(review): the Dimensions DSL documents a per-query `limit` maximum of
    # 1000; larger values are rejected, so cap the request rather than fail.
    dsl_limit_cap = 1000
    query_limit = min(max_total, dsl_limit_cap)
    if query_limit < max_total:
        print(f"   Note: Dimensions.ai limit capped at {dsl_limit_cap} records per query.")

    # Dimensions Search Language (DSL) query
    # We ask for the fields we need to build our standard dict
    query = f"""
    search publications
      for "\\"{scientific_name}\\"" in full_data
    return publications[doi, title, authors, year, journal]
    limit {query_limit}
    """

    try:
        results = dsl.query(query)

        if results.errors:
            print(f"ERROR: Dimensions.ai query failed: {results.errors}")
            return []

        publications = results.publications
        if not publications:
            print("   No Dimensions.ai results found.")
            return []

        print(f"   Retrieved {len(publications)} Dimensions.ai results.")

        for pub in publications:
            # --- Pre-process Dimensions result into a simpler dict ---

            # Extract authors. `or` fallbacks cover keys that are present but
            # null, which would otherwise raise or render as the text "None".
            authors_list = []
            for author in pub.get('authors') or []:
                full_name = f"{author.get('first_name') or ''} {author.get('last_name') or ''}".strip()
                if full_name:
                    authors_list.append(full_name)

            pub_dict = {
                'doi': pub.get('doi'),
                'title': pub.get('title'),
                'authors': authors_list,
                'year': pub.get('year'),
                # Journal is a nested dict; tolerate a missing or null value so
                # one record cannot abort processing of the remaining ones.
                'journal': (pub.get('journal') or {}).get('title'),
            }

            # Now pass the clean dict to the standardizer
            standardized = standardize_publication(pub_dict, "Dimensions")
            if standardized:
                all_standardized_results.append(standardized)

        print(f"   Finished Dimensions.ai search. Total standardized: {len(all_standardized_results)}")
        return all_standardized_results

    except Exception as e:
        print(f"An unexpected error occurred within the Dimensions.ai search function: {e}")
        return all_standardized_results
    finally:
        time.sleep(1)  # Dimensions API has a rate limit


# --- MAIN EXECUTION ---
# Driver: for every species row, query each literature source, de-duplicate
# publications by DOI, then write the aggregated CSV, BibTeX file, the
# species-to-paper log and the unique DOI list.
if species_df is not None:
    all_results = {}  # Unique publications keyed by DOI: {doi: pub_dict}
    detailed_log = []  # One entry per species-publication link

    # --- Check if API keys are placeholders ---
    # Springer is included so this check covers every keyed source.
    keys_to_check = (
        SCOPUS_API_KEY,
        NCBI_API_KEY,
        SEMANTIC_SCHOLAR_API_KEY,
        WOS_API_KEY,
        DIMENSIONS_API_KEY,
        SPRINGER_API_KEY,
    )
    if any("YOUR_" in api_key for api_key in keys_to_check):
        print("\n*** WARNING: One or more placeholder API keys detected. Searches may be limited or fail. ***\n")

    total_rows = len(species_df)
    # Count rows with enumerate: the DataFrame index label is not guaranteed to
    # be a 0-based integer (e.g. after filtering or with string labels).
    for row_number, (index, row) in enumerate(species_df.iterrows(), start=1):
        # --- Determine Scientific Name ---
        sci_name = None
        genus = row.get('genus')
        epithet = row.get('specificEpithet')
        scientific_name_col = row.get('scientificName')

        full_name_text = str(scientific_name_col).strip() if pd.notna(scientific_name_col) else ""
        genus_text = str(genus).strip() if pd.notna(genus) else ""
        epithet_text = str(epithet).strip() if pd.notna(epithet) else ""

        # Blank cells are treated like missing ones so that an empty search
        # term is never sent to the APIs.
        if full_name_text:
            sci_name = full_name_text
        elif genus_text and epithet_text:
            sci_name = f"{genus_text} {epithet_text}"
        else:
            print(f"Skipping row {row_number}: No valid scientific name found.")
            continue  # Skip row if no name identified

        print(f"\n{'='*20}\nSearching for '{sci_name}' (Row {row_number}/{total_rows})\n{'='*20}")

        species_specific_dois = set()  # Track unique DOIs found for THIS species

        # --- Define Search Tasks ---
        search_tasks = [
            {"func": search_scopus, "name": "Scopus"},
            {"func": search_wos, "name": "WebOfScience"},
            {"func": search_springer, "name": "Springer"},
            {"func": search_dimensions, "name": "Dimensions"},
            {"func": search_pubmed, "name": "PubMed"},
            {"func": search_openalex, "name": "OpenAlex"},
            {"func": search_semantic_scholar, "name": "Semantic Scholar"},
            {"func": search_crossref, "name": "CrossRef"},
        ]

        # --- Execute Searches ---
        for task in search_tasks:
            search_func = task["func"]
            source_name = task["name"]
            try:
                # Pagination and the per-source cap are handled inside each function.
                results = search_func(sci_name)

                if results:
                    print(f"   Standardizing {len(results)} results from {source_name}...")
                    found_new_for_species = 0
                    for pub in results:
                        # Entries without a DOI cannot be de-duplicated and are ignored.
                        if pub and pub.get("doi"):
                            doi = pub["doi"]

                            # Add to the global results if this DOI is new overall
                            if doi not in all_results:
                                all_results[doi] = pub

                            # Log the species-paper link if it's new for THIS species run
                            if doi not in species_specific_dois:
                                species_specific_dois.add(doi)
                                found_new_for_species += 1
                                log_entry = {
                                    'genus': genus if pd.notna(genus) else None,  # Store original components
                                    'specificEpithet': epithet if pd.notna(epithet) else None,
                                    'scientificName_searched': sci_name,  # Store the actual term used
                                    'paper_title': pub.get('title'),
                                    'paper_doi': doi,
                                }
                                detailed_log.append(log_entry)
                    print(f"   Added {found_new_for_species} new unique DOIs for this species from {source_name}.")
                else:
                    print(f"   No results found from {source_name} for '{sci_name}'.")

            except Exception as search_e:
                # A failure in one source must not stop the remaining sources.
                print(f"!! Critical error during search with {source_name}: {search_e} !!")

            # Short pause between different API sources for general politeness
            time.sleep(1.5)

        print(f"-> Finished searches for '{sci_name}'. Found {len(species_specific_dois)} unique publications for this species across all sources.")

    # --- Processing Complete ---
    print(f"\n\n{'='*20}\nAll species processed. Found {len(all_results)} unique publications overall.\n{'='*20}")

    # --- SAVE RESULTS ---
    if all_results:
        # --- Save Aggregated CSV ---
        print("Preparing aggregated CSV...")
        results_df = pd.DataFrame(all_results.values())
        # Reorder columns for clarity. reindex (instead of label selection)
        # yields an empty column rather than a KeyError if a field is absent,
        # so a long run is not lost at the saving stage.
        cols_order = ['doi', 'title', 'authors', 'year', 'journal', 'source']
        results_df = results_df.reindex(columns=cols_order)
        # Robust author joining
        results_df['authors'] = results_df['authors'].apply(
            lambda x: '; '.join(x) if isinstance(x, list) and x else (x if isinstance(x, str) else '')
        )

        # Create timestamped filename for aggregated results
        timestamp_agg = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f"beetle_citations_aggregated_{timestamp_agg}.csv"
        try:
            results_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')  # utf-8-sig for Excel compatibility
            print(f"Aggregated results saved to {csv_filename}")
        except Exception as e:
            print(f"ERROR saving aggregated CSV: {e}")

        # --- Save BibTeX ---
        print("Preparing BibTeX file...")
        bibtex_filename = f"beetle_citations_{timestamp_agg}.bib"
        bib_count = 0
        try:
            with open(bibtex_filename, "w", encoding="utf-8") as f:
                for pub in all_results.values():
                    bibtex_entry = to_bibtex(pub)
                    if bibtex_entry:
                        f.write(bibtex_entry + "\n\n")
                        bib_count += 1
            print(f"{bib_count} BibTeX entries saved to {bibtex_filename}")
        except Exception as e:
            print(f"ERROR saving BibTeX file: {e}")

        # --- Save Detailed Log ---
        print("Preparing detailed species-paper log CSV...")
        if detailed_log:
            log_df = pd.DataFrame(detailed_log)
            # Reorder columns
            log_cols = ['genus', 'specificEpithet', 'scientificName_searched', 'paper_doi', 'paper_title']
            log_df = log_df.reindex(columns=log_cols)
            log_filename = f"species_citation_log_{timestamp_agg}.csv"
            try:
                log_df.to_csv(log_filename, index=False, encoding='utf-8-sig')
                print(f"Detailed species-paper log saved to {log_filename}")

                print("Preparing unique DOI list CSV...")
                # Unique DOIs are exactly the keys of the main results dictionary.
                unique_dois = list(all_results.keys())
                doi_df = pd.DataFrame(unique_dois, columns=['paper_doi'])

                # Create timestamped filename
                doi_list_filename = f"doi_list_unique_{timestamp_agg}.csv"

                # Save the unique DOI list
                doi_df.to_csv(doi_list_filename, index=False, encoding='utf-8-sig')
                print(f"Unique DOI list saved to {doi_list_filename}")
                print(f"This is the file to use with your download script: {doi_list_filename}")

            except Exception as e:
                print(f"ERROR saving detailed log or DOI list CSV: {e}")
        else:
            print("Detailed log is empty, not saving.")

    else:
        print("No unique publications were found across any species or sources.")
else:
    print("\nScript did not run because the species data could not be loaded.")

print("\n--- Script finished. ---")



Dimensions.ai client not initialized (placeholder key).



Searching for 'Nisiborus' (Row 1/7858)
-> Searching Scopus for 'Nisiborus' (max 100)...
   Fetching Scopus results from index 0...
   Retrieved 1 Scopus results on this page.
   Finished Scopus search. Total standardized: 1
   Standardizing 1 results from Scopus...
   Added 1 new unique DOIs for this species from Scopus.
-> Searching Web of Science Starter API for 'Nisiborus' (max 100)...
   Fetching Web of Science results page 1...
   Retrieved 1 Web of Science results on this page.
   Finished Web of Science search. Total standardized: 0
   No results found from WebOfScience for 'Nisiborus'.
-> Searching Springer for 'Nisiborus' (max 100)...
   Fetching Springer results from record 1...
ERROR: Springer search HTTP error. Status: 403
Response: {"status":"Fail","message":"Access to this resource is restricted. This is a premium feature","error":{"error":"Forbidden","error_description":"You are attempting to access a premium fea

KeyboardInterrupt: 