In [None]:
# --- IMPORTS ---
import requests
import os
import time
import xml.etree.ElementTree as ET
import pandas as pd
import json
import warnings
from urllib.parse import quote
from datetime import datetime
from wiley_tdm import TDMClient
import springernature_api_client.tdm as tdm # <<<--- IMPORT SPRINGER CLIENT MODULE

# --- Configuration ---
YOUR_EMAIL = "@ufl.edu" # Essential for APIs (sent to Unpaywall and NCBI with every request)

# !!! --- SECURITY WARNING --- !!!
# Credentials are read from environment variables so they never have to be
# committed or shared with this notebook. The second argument of each
# os.environ.get() call is the fallback used when the variable is not set;
# avoid pasting real keys there if the file will be shared.
ELSEVIER_API_KEY = os.environ.get("ELSEVIER_API_KEY", "")
NCBI_API_KEY = os.environ.get("NCBI_API_KEY", "")
SPRINGER_API_KEY = os.environ.get("SPRINGER_API_KEY", "") # May need "/YOUR_API_METRIC" appended
# Same variable name the client-initialisation code exports for the Wiley client.
WILEY_TDM_TOKEN = os.environ.get("TDM_API_TOKEN", "")

# --- Check if keys seem like placeholders ---
# Both "PASTE_" and "YOUR_" are treated as placeholder markers because the
# download functions in this file test for "YOUR_" while the client
# initialisation tests for "PASTE_".
for _key_name, _key_value in (
    ("ELSEVIER_API_KEY", ELSEVIER_API_KEY),
    ("NCBI_API_KEY", NCBI_API_KEY),
    ("SPRINGER_API_KEY", SPRINGER_API_KEY),
    ("WILEY_TDM_TOKEN", WILEY_TDM_TOKEN),
):
    if not _key_value or "PASTE_" in _key_value or "YOUR_" in _key_value:
        warnings.warn(f"{_key_name} looks like a placeholder.")
del _key_name, _key_value
# !!! --- END SECURITY WARNING --- !!!

# 1. Input CSV file configuration
CSV_FILE_PATH = 'doi_list.csv'
DOI_COLUMN_NAME = 'paper_doi'

# 2. Output Directory (Unified)
OUTPUT_DIR = "full_text_downloads"

# 3. Files for resuming progress (Unified)
CHECKPOINT_FILE = 'download_checkpoint.json'  # JSON map: original DOI string -> final status
PROCESSED_FILE = 'processed_dois.txt'         # one attempted DOI string per line

# 4. CSV Processing Chunk Size
CHUNK_SIZE = 500

# 5. API Base URLs (Springer URL no longer needed for direct requests)
ELSEVIER_BASE_URL = "https://api.elsevier.com/content/article/doi/"
# SPRINGER_BASE_URL = "https://api.springernature.com/xmldata/jats" # Now handled by package
NCBI_IDCONV_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
NCBI_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
UNPAYWALL_URL = "https://api.unpaywall.org/v2/"

# --- XML Namespaces ---
# Prefix map used when searching Elsevier responses for 'ce:para' elements.
ns_elsevier = {'ce': 'http://www.elsevier.com/xml/common/dtd'}

# --- Publisher DOI Prefixes ---
# Tuples so they can be passed directly to str.startswith().
ELSEVIER_PREFIXES = ('10.1016',)
SPRINGER_NATURE_PREFIXES = ('10.1007', '10.1038', '10.1186', '10.1140', '10.1365', '10.17530', '10.1891', '10.1594')
WILEY_PREFIXES = ('10.1002', '10.1111') # Simplified Wiley prefixes

# --- Initialize API Clients ---
# Both clients start as None; the download code treats None as
# "this publisher route is unavailable" and skips it.
wiley_client = None
springer_client = None

# Wiley Client Init
# NOTE(review): the ImportError arm below presumably never fires for a missing
# package, because wiley_tdm is imported at the top of the file -- confirm.
try:
    if not WILEY_TDM_TOKEN or "PASTE_" in WILEY_TDM_TOKEN:
        warnings.warn("Wiley TDM Token not set/valid. Wiley PDF download will be skipped.")
    else:
        # The token is handed to the client through the environment.
        os.environ['TDM_API_TOKEN'] = WILEY_TDM_TOKEN
        wiley_client = TDMClient(download_dir=OUTPUT_DIR)
        print("Wiley TDM Client initialized.")
except ImportError:
    warnings.warn("'wiley-tdm' package not installed. Wiley PDF download will be skipped. Run: pip install wiley-tdm")
except Exception as e:
    warnings.warn(f"Could not initialize Wiley TDM Client: {e}. Wiley PDF download will be skipped.")
    wiley_client = None

# Springer Client Init
# NOTE(review): same remark as above for the ImportError arm -- the package is
# imported at the top of the file.
try:
    if not SPRINGER_API_KEY or "PASTE_" in SPRINGER_API_KEY:
        warnings.warn("Springer API Key not set/valid. Springer XML download will be skipped.")
    else:
        # Note: Key might need "/YOUR_API_METRIC" appended if auth fails
        springer_client = tdm.TDMAPI(api_key=SPRINGER_API_KEY)
        print("Springer Nature API Client initialized.")
except ImportError:
    warnings.warn("'springernature-api-client' package not installed. Springer XML download will be skipped. Run: pip install springernature-api-client")
except Exception as e:
    warnings.warn(f"Could not initialize Springer API Client: {e}. Springer XML download will be skipped.")
    springer_client = None

# --- Helper Functions (Load/Save Checkpoint, Clean DOI, Sanitize Filename - Minor adjustments) ---
def load_checkpoint():
    """Read previously saved progress from disk.

    Returns:
        A ``(status_log, processed_dois)`` tuple. ``status_log`` is whatever
        JSON value CHECKPOINT_FILE contains (an empty dict when the file is
        missing or unreadable). ``processed_dois`` is the set of non-blank,
        stripped lines of PROCESSED_FILE (an empty set when the file is
        missing or unreadable).
    """
    status_log = {}
    if os.path.exists(CHECKPOINT_FILE):
        try:
            with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as checkpoint_fh:
                status_log = json.load(checkpoint_fh)
            print(f"Loaded status for {len(status_log)} DOIs from {CHECKPOINT_FILE}.")
        except json.JSONDecodeError:
            print(f"Warning: Could not read {CHECKPOINT_FILE}, starting fresh.")
        except Exception as e:
            print(f"Warning: Error loading {CHECKPOINT_FILE}: {e}")

    processed_dois = set()
    if os.path.exists(PROCESSED_FILE):
        try:
            with open(PROCESSED_FILE, 'r', encoding='utf-8') as processed_fh:
                # Blank lines are dropped; surrounding whitespace is not part of the key.
                processed_dois = {entry for entry in map(str.strip, processed_fh) if entry}
            print(f"Loaded {len(processed_dois)} processed DOIs from {PROCESSED_FILE}.")
        except Exception as e:
            print(f"Warning: Could not read {PROCESSED_FILE}: {e}. Starting fresh.")
            processed_dois = set()

    return status_log, processed_dois

def save_checkpoint(status_log, doi_attempted, status, processed_file_handle):
    """Record the outcome for one DOI in memory and on disk.

    Args:
        status_log: Mutable mapping of DOI string -> status; updated in place.
        doi_attempted: The DOI string to record.
        status: Final status string for that DOI.
        processed_file_handle: Open, writable text handle that receives one
            line per attempted DOI.

    Failures while writing either file are printed, not raised.
    """
    status_log[doi_attempted] = status

    # Write the full log to a scratch file first, then swap it into place, so
    # the checkpoint is never left half-written.
    try:
        scratch_path = CHECKPOINT_FILE + ".tmp"
        with open(scratch_path, 'w', encoding='utf-8') as scratch_fh:
            json.dump(status_log, scratch_fh, indent=4)
        os.replace(scratch_path, CHECKPOINT_FILE)
    except Exception as e:
        print(f"ERROR saving checkpoint {CHECKPOINT_FILE}: {e}")

    try:
        processed_file_handle.write("{}\n".format(doi_attempted))
        processed_file_handle.flush()
    except Exception as e:
        print(f"ERROR writing to processed {PROCESSED_FILE}: {e}")

def clean_doi(doi_str):
    """Normalise a raw DOI value to a bare, lower-case ``10.xxxx/...`` string.

    A single leading resolver/label prefix (``https://doi.org/``,
    ``http://dx.doi.org/``, ``doi:`` and similar) is removed, then the
    remainder is accepted only if it starts with ``10.``.

    Args:
        doi_str: Raw cell value; may be a string, ``None`` or NaN.

    Returns:
        The cleaned DOI, or ``None`` when the value is missing or does not
        look like a DOI (for example a PII or ISBN-style identifier).
    """
    if pd.isna(doi_str):
        return None
    candidate = str(doi_str).strip().lower()

    known_prefixes = (
        'https://doi.org/', 'http://doi.org/',
        'https://dx.doi.org/', 'http://dx.doi.org/',
        'dx.doi.org/', 'doi.org/', 'doi:',
    )
    for prefix in known_prefixes:
        if candidate.startswith(prefix):
            candidate = candidate[len(prefix):].strip()
            break

    # Validate AFTER stripping so "https://doi.org/garbage" is rejected too.
    return candidate if candidate.startswith('10.') else None

def sanitize_filename(name_base, extension):
    """Build a filesystem-safe file name from a base string and an extension.

    Path separators and the characters ``: * ? " < > |`` become underscores,
    newlines/carriage returns/tabs are removed, remaining whitespace runs
    collapse to a single underscore, and the base is cut to 150 characters
    before ``extension`` is appended. A non-string or blank base becomes
    ``"No_DOI"``.
    """
    if not (isinstance(name_base, str) and name_base.strip()):
        name_base = "No_DOI"

    unsafe_chars = '/\\:*?"<>|'
    table = str.maketrans(unsafe_chars, '_' * len(unsafe_chars), '\n\r\t')
    cleaned = name_base.translate(table)

    # str.split() with no argument drops empty pieces, so leading/trailing
    # whitespace disappears and inner runs become one underscore.
    collapsed = "_".join(cleaned.split())
    return collapsed[:150] + extension

# --- API Interaction Functions ---

def check_unpaywall_for_oa(doi, email):
    """Ask Unpaywall whether a DOI is open access and where its best copy is.

    Args:
        doi: Cleaned DOI string.
        email: Contact e-mail sent as the ``email`` query parameter.

    Returns:
        A dict with keys ``is_oa`` (bool), ``pdf_url``, ``landing_url``
        (strings or None) and ``status``. ``status`` is ``'OA_Checked'`` or
        ``'Paywalled'`` after a successful lookup, otherwise one of the
        ``'Unpaywall_*'`` / ``'InvalidDOI'`` codes set below. The function
        never raises for network or parsing problems.
    """
    if not doi:
        return {'is_oa': False, 'pdf_url': None, 'landing_url': None, 'status': 'InvalidDOI'}

    oa_info = {'is_oa': False, 'pdf_url': None, 'landing_url': None, 'status': 'Unpaywall_Error'}
    try:
        # Pass the e-mail through `params` so characters such as '+' or '@'
        # are URL-encoded instead of being spliced into the URL verbatim.
        response = requests.get(f"{UNPAYWALL_URL}{quote(doi)}", params={'email': email}, timeout=20)
        code = response.status_code
        if code == 200:
            try:
                data = response.json()
            except ValueError:
                # A 200 with a non-JSON body would otherwise escape this function.
                data = None
            if isinstance(data, dict):
                oa_info['is_oa'] = bool(data.get('is_oa', False))
                best_loc = data.get('best_oa_location') or {}
                oa_info['pdf_url'] = best_loc.get('url_for_pdf')
                oa_info['landing_url'] = best_loc.get('url_for_landing_page')
                oa_info['status'] = 'OA_Checked' if oa_info['is_oa'] else 'Paywalled'
            else:
                oa_info['status'] = 'Unpaywall_InvalidJSON'
                print(f"   Unpaywall returned an unreadable body for {doi}.")
        elif code == 404:
            oa_info['status'] = 'Unpaywall_NotFound'
        elif code == 429:
            oa_info['status'] = 'Unpaywall_RateLimit'
            print(f"   Unpaywall rate limit (429). Sleeping 10s...")
            time.sleep(10)
        else:
            oa_info['status'] = f'Unpaywall_Error_{code}'
            print(f"   Unpaywall Error {code} for {doi}.")
    except requests.exceptions.Timeout:
        oa_info['status'] = 'Unpaywall_Timeout'
        print(f"   Unpaywall Timeout for {doi}.")
    except requests.exceptions.RequestException as e:
        oa_info['status'] = 'Unpaywall_NetworkError'
        print(f"   Unpaywall Network Error for {doi}: {e}")
    finally:
        # Small pause after every attempted request, successful or not.
        time.sleep(0.1)
    return oa_info

def get_pmcid_from_doi(doi, email, api_key=None):
    """Resolve a DOI to a PMCID with the NCBI ID Converter.

    Args:
        doi: Cleaned DOI string.
        email: Contact e-mail sent with the request.
        api_key: Optional NCBI API key; ignored when empty or when it still
            contains a ``YOUR_`` / ``PASTE_`` placeholder marker.

    Returns:
        ``(pmcid, status)``. ``pmcid`` is None unless status is
        ``"Success_GotPMCID"``. Network and parsing problems are reported
        through ``status`` rather than raised.
    """
    if not doi:
        return None, "InvalidDOI"

    # Computed once; decides both whether the key is sent and how long to pause.
    key_usable = bool(api_key) and "YOUR_" not in api_key and "PASTE_" not in api_key
    params = {'ids': doi, 'format': 'json', 'tool': 'UF_PhD_Script', 'email': email}
    if key_usable:
        params['api_key'] = api_key

    pmcid = None
    status = "IDConv_Error"
    try:
        response = requests.get(NCBI_IDCONV_URL, params=params, timeout=15)
        if response.status_code == 400:
            status = "IDConv_NotFound"
        elif response.status_code == 429:
            status = "IDConv_RateLimit"
            print("   NCBI IDConv rate limit (429). Sleeping 10s...")
            time.sleep(10)
        else:
            response.raise_for_status()
            try:
                data = response.json()
            except ValueError:
                data = None
            if not isinstance(data, dict):
                status = "IDConv_InvalidJSON"
            else:
                wanted = doi.lower()
                for record in data.get('records') or []:
                    if not isinstance(record, dict):
                        continue
                    # `or ''` guards against a record whose doi field is null.
                    if (record.get('doi') or '').lower() == wanted and record.get('pmcid'):
                        pmcid = record['pmcid']
                        status = "Success_GotPMCID"
                        break
                if not pmcid:
                    status = "IDConv_NoPMCIDFound"
    except requests.exceptions.HTTPError as e:
        print(f"   NCBI IDConv HTTP Error {e.response.status_code} for {doi}")
        status = f"IDConv_HTTPError_{e.response.status_code}"
    except requests.exceptions.RequestException as e:
        print(f"   NCBI IDConv Network Error for {doi}: {e}")
        status = "IDConv_NetworkError"
    finally:
        # Shorter pause when a key is sent, longer when anonymous.
        time.sleep(0.15 if key_usable else 0.4)
    return pmcid, status

def download_pmc_xml(pmcid, email, api_key=None):
    """Fetch an article's XML from PMC via EFetch and sanity-check it.

    Args:
        pmcid: PMC identifier to fetch.
        email: Contact e-mail sent with the request.
        api_key: Optional NCBI API key; ignored when empty or when it
            contains ``YOUR_``.

    Returns:
        ``(xml_bytes, status)``. ``xml_bytes`` is the raw response body when
        it parses and contains (or is) an ``article`` / ``pmc-articleset``
        element, otherwise None with a ``PMC_*`` status explaining why.
    """
    if not pmcid:
        return None, "No PMCID"

    key_is_usable = bool(api_key) and "YOUR_" not in api_key
    query = {'db': 'pmc', 'id': pmcid, 'retmode': 'xml', 'rettype': 'full', 'tool': 'UF_PhD_Script', 'email': email}
    if key_is_usable:
        query['api_key'] = api_key

    xml_content = None
    status = "PMC_FetchError_Unknown"
    try:
        response = requests.get(NCBI_EFETCH_URL, params=query, timeout=60)
        if response.status_code == 429:
            print("   NCBI EFetch rate limit (429). Sleeping 60s...")
            time.sleep(60)
            return None, "PMC_RateLimit"
        response.raise_for_status()

        content_type = response.headers.get('Content-Type', '').lower()
        if 'xml' in content_type:
            try:
                root = ET.fromstring(response.content)
            except ET.ParseError:
                status = "PMC_InvalidXML_ParseError"
            else:
                has_article = root.tag in ('pmc-articleset', 'article') or root.find('.//article') is not None
                if has_article:
                    xml_content = response.content
                    status = "Success_PMC_XML"
                else:
                    status = "PMC_InvalidXML_Structure"
        else:
            status = f"PMC_BadContentType_{content_type[:30]}"
    except requests.exceptions.HTTPError as e:
        status = f"PMC_HTTPError_{e.response.status_code}"
    except requests.exceptions.RequestException as e:
        status = f"PMC_NetworkError: {e}"
    finally:
        # Runs on every path that reached the request, including the 429 return.
        time.sleep(0.15 if key_is_usable else 0.4)
    return xml_content, status


def download_elsevier_xml(doi, api_key):
    """Fetch an article's XML from the Elsevier article API and check it has body text.

    Args:
        doi: Cleaned DOI string.
        api_key: Elsevier API key. An empty, non-string or placeholder
            (``YOUR_`` / ``PASTE_``) key is rejected without a request.

    Returns:
        ``(xml_bytes, status)``. ``xml_bytes`` is the raw response body when
        it parses and contains at least one ``ce:para`` element; otherwise
        None with an ``Elsevier_*`` / ``Invalid*`` status. A 429 response
        additionally sleeps for five minutes before returning.
    """
    if not doi:
        return None, "InvalidDOI"
    # Reject unusable keys up front: an empty key would otherwise be sent to
    # the API, and a None key would raise TypeError on the `in` test.
    if not isinstance(api_key, str) or not api_key or "YOUR_" in api_key or "PASTE_" in api_key:
        return None, "InvalidAPIKey_Elsevier"

    headers = {"Accept": "text/xml", "X-ELS-APIKey": api_key}
    url = ELSEVIER_BASE_URL + quote(doi)
    xml_content = None
    status = "Elsevier_Error_Unknown"
    try:
        response = requests.get(url, headers=headers, timeout=45)
        code = response.status_code
        if code == 200:
            try:
                root = ET.fromstring(response.content)
            except ET.ParseError:
                status = "Elsevier_InvalidXML"
            else:
                # A response without any paragraph is treated as metadata only.
                if root.find('.//ce:para', namespaces=ns_elsevier) is not None:
                    xml_content = response.content
                    status = "Success_Elsevier_XML"
                else:
                    status = "Elsevier_MetadataOnly"
        elif code == 401:
            status = "Elsevier_AuthError"
            print("!! Elsevier Auth Error - Check Key/VPN !!")
        elif code == 403:
            status = "Elsevier_Forbidden"
        elif code == 404:
            status = "Elsevier_NotFound"
        elif code == 429:
            status = "Elsevier_RateLimit"
            print("   Elsevier rate limit (429). Sleeping 5 mins...")
            time.sleep(300)
        else:
            status = f"Elsevier_HTTPError_{code}"
    except requests.exceptions.Timeout:
        status = "Elsevier_Timeout"
    except requests.exceptions.RequestException as e:
        status = f"Elsevier_NetworkError: {e}"
    finally:
        time.sleep(1) # Elsevier allows 10/sec, 1s sleep is safe
    return xml_content, status

# --- UPDATED: Springer Nature XML Download via Package ---
def _springer_check_fulltext(raw_xml):
    """Return ``(xml_bytes, status)`` after checking that raw_xml parses and has body text."""
    if not raw_xml:
        return None, "SpringerPkg_EmptyResponseContent"
    try:
        root = ET.fromstring(raw_xml)
    except ET.ParseError:
        return None, "SpringerPkg_InvalidXMLResponse"
    # Any <body> or <p> element, in any namespace, counts as full text.
    if root.find('.//{*}body') is None and root.find('.//{*}p') is None:
        return None, "SpringerPkg_MetadataOrAbstractOnly"
    # Always hand back bytes: the result is written with mode 'wb'.
    xml_bytes = raw_xml.encode('utf-8') if isinstance(raw_xml, str) else raw_xml
    return xml_bytes, "Success_SpringerPkg_XML"


def download_springer_xml_package(doi, client):
    """Fetch an article's XML from Springer Nature through the API client package.

    Args:
        doi: Cleaned DOI string.
        client: Initialised Springer client exposing ``search(q=, p=, s=)``,
            or a falsy value when no client is available.

    Returns:
        ``(xml_bytes, status)``. ``xml_bytes`` is set only when the response
        parses as XML and contains a ``body`` or ``p`` element; otherwise it
        is None and ``status`` is one of the ``SpringerPkg_*`` /
        ``Invalid*`` / ``SpringerClient_NotInitialized`` codes. A rate-limit
        response additionally sleeps 60 seconds.
    """
    if not client:
        return None, "SpringerClient_NotInitialized"
    if not doi:
        return None, "InvalidDOI"
    # Check original key variable for placeholder
    if not SPRINGER_API_KEY or "PASTE_" in SPRINGER_API_KEY:
        return None, "InvalidAPIKey_Springer"

    xml_content = None
    status = "SpringerPkg_Error_Unknown"
    try:
        # p=1 / s=1: a DOI query should match at most one record.
        response = client.search(q=f'doi:{doi}', p=1, s=1)

        # NOTE(review): what `client.search` returns is not visible in this
        # file -- confirm against the package. Both shapes are handled: plain
        # text/bytes is treated as the XML payload, anything else is read as
        # a requests-style response object.
        if isinstance(response, (str, bytes)):
            xml_content, status = _springer_check_fulltext(response)
        else:
            code = getattr(response, 'status_code', None)
            if code == 200:
                content_type = getattr(response, 'headers', {}).get('Content-Type', '').lower()
                if 'xml' in content_type:
                    xml_content, status = _springer_check_fulltext(getattr(response, 'content', None))
                else:
                    status = f"SpringerPkg_BadContentType_{content_type[:30]}"
            elif code in (401, 403):
                status = f"SpringerPkg_AuthError_{code}"
                print(f"!! Springer Auth Error ({code}) via Package - Check Key/Metric !!")
            elif code == 404:
                status = "SpringerPkg_NotFound"
            elif code == 429:
                status = "SpringerPkg_RateLimit"
                print("   Springer rate limit (429) via Package. Sleeping 60s...")
                time.sleep(60)
            elif code is not None:
                status = f"SpringerPkg_HTTPError_{code}"
            else:
                # The object carried no status code at all.
                status = "SpringerPkg_NoResponseStatus"

    # Catch specific requests errors potentially raised by the package
    except requests.exceptions.HTTPError as e:
        status_code = e.response.status_code
        if status_code == 429:
            status = "SpringerPkg_RateLimit"
            print("   Springer rate limit (429) via Package. Sleeping 60s...")
            time.sleep(60)
        elif status_code in (401, 403):
            status = f"SpringerPkg_AuthError_{status_code}"
            print(f"!! Springer Auth Error ({status_code}) via Package - Check Key/Metric !!")
        elif status_code == 404:
            status = "SpringerPkg_NotFound"
        else:
            status = f"SpringerPkg_HTTPError_{status_code}"
        print(f"   Springer Package HTTP Error: {e}")
    except requests.exceptions.Timeout:
        status = "SpringerPkg_Timeout"
        print(f"   Springer Package Timeout for {doi}")
    except requests.exceptions.RequestException as e:
        status = "SpringerPkg_NetworkError"
        print(f"   Springer Package Network Error: {e}")
    # Catch other potential errors, including from the package itself
    except Exception as e:
        status = f"SpringerPkg_UnexpectedError: {type(e).__name__}"
        print(f"   Unexpected Error using Springer package for {doi}: {e}")
    finally:
        # Fixed 1-second pause after every attempt (at most one request per second).
        time.sleep(1)
    return xml_content, status


# --- Wiley PDF Download Function ---
def download_wiley_pdf_client(doi, client, output_path):
    """Download a PDF through the Wiley TDM client and verify a file was written.

    Args:
        doi: Cleaned DOI string.
        client: Initialised Wiley client exposing ``download_pdf(doi)`` and a
            ``download_dir`` attribute, or a falsy value when unavailable.
        output_path: Accepted for signature compatibility; this function does
            not read it. The file location is derived from
            ``client.download_dir`` and the sanitised DOI instead.

    Returns:
        ``(pdf_saved, status)``. ``pdf_saved`` is True only when a file
        larger than 1000 bytes exists at the expected location afterwards.
    """
    if not client:
        return False, "WileyClient_NotInitialized"
    if not doi:
        return False, "InvalidDOI"
    # Check the original token variable, not env var here
    token_unusable = not WILEY_TDM_TOKEN or "PASTE_" in WILEY_TDM_TOKEN
    if token_unusable:
        return False, "InvalidAPIKey_Wiley"

    outcome = "Wiley_Error_Unknown"
    saved = False
    try:
        print("   Attempting Wiley PDF download via TDM Client...")
        client.download_pdf(doi)

        # NOTE(review): assumes the client names the file after the DOI the
        # same way sanitize_filename does -- confirm against the package.
        target = os.path.join(client.download_dir, sanitize_filename(doi, "") + ".pdf")

        time.sleep(1.5) # Allow slightly more time for write operation

        looks_complete = os.path.exists(target) and os.path.getsize(target) > 1000
        if not looks_complete:
            outcome = "Wiley_PDFFail_ClientDidNotSave"
            if os.path.exists(target):
                os.remove(target) # Clean up empty/stub file
            print(f"   Wiley TDM Client did not save PDF for {doi}. Check logs/access.")
        else:
            saved = True
            outcome = "Success_Wiley_PDF"
            print(f"   ✅ SUCCESS: Wiley TDM Client saved PDF for {doi}")
    except Exception as e:
        outcome = f"Wiley_ClientError: {type(e).__name__}"
        print(f"   Error using Wiley TDM Client for {doi}: {e}")
    finally:
        time.sleep(1) # Pause after Wiley attempt
    return saved, outcome


# --- Unpaywall PDF Fallback Function ---
def download_pdf_unpaywall(pdf_url, filepath, email):
    """Download a PDF from a URL supplied by Unpaywall.

    The body is streamed to ``<filepath>.part`` and moved to ``filepath``
    only once the download finished and the file is larger than 1000 bytes,
    so an interrupted transfer never leaves a truncated file at the final
    path.

    Args:
        pdf_url: URL to fetch; a falsy value returns immediately.
        filepath: Destination path for the finished PDF.
        email: Contact e-mail placed in the User-Agent header.

    Returns:
        ``(saved, status)`` where ``saved`` is True only for
        ``"Success_OA_PDF"``; all failures are reported through an
        ``OA_PDFFail_*`` / ``NoPDF_URL`` status rather than raised.
    """
    if not pdf_url:
        return False, "NoPDF_URL"

    partial_path = f"{filepath}.part"
    try:
        headers = {'User-Agent': f'UF_PhD_Script/1.0 (mailto:{email})', 'Accept': 'application/pdf, */*'}
        # `with` closes the streamed response on every exit path.
        with requests.get(pdf_url, headers=headers, stream=True, timeout=90, allow_redirects=True) as response:
            code = response.status_code
            if code == 429:
                return False, "OA_PDFFail_RateLimit"
            if not 200 <= code < 300:
                return False, f"OA_PDFFail_HTTP_{code}"

            # Accept PDF, generic binary, or a missing Content-Type header.
            content_type = response.headers.get('Content-Type', '').lower()
            if content_type and 'pdf' not in content_type and 'application/octet-stream' not in content_type:
                return False, f"OA_PDFFail_NotPDF_{content_type[:30]}"

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=16384):
                    if chunk:
                        f.write(chunk)

        if os.path.getsize(partial_path) <= 1000:
            return False, "OA_PDFFail_EmptyFile"
        os.replace(partial_path, filepath)
        return True, "Success_OA_PDF"
    except requests.exceptions.Timeout:
        return False, "OA_PDFFail_Timeout"
    except requests.exceptions.TooManyRedirects:
        return False, "OA_PDFFail_RedirectLoop"
    except requests.exceptions.RequestException:
        return False, "OA_PDFFail_NetworkError"
    except OSError as e:
        # Disk/permission problems while writing or renaming the file.
        print(f"   ERROR writing PDF to {filepath}: {e}")
        return False, "OA_PDFFail_FileWriteError"
    finally:
        # Best-effort removal of a leftover partial download; after a
        # successful os.replace the partial path no longer exists.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        time.sleep(0.5)

# --- Main Script Logic ---
# Driver: stream DOIs from the CSV in chunks and, for each DOI not yet
# processed, try in order (1) the publisher XML API, (2) PMC XML for
# open-access articles, (3) the Wiley PDF client, (4) the Unpaywall PDF link.
# The outcome of every attempted DOI is checkpointed immediately.
os.makedirs(OUTPUT_DIR, exist_ok=True) # Unified output directory
status_log, processed_dois = load_checkpoint()
total_processed_in_session = 0
stop_processing = False  # set on an Elsevier authentication failure

print(f"--- Starting Unified Full-Text Download from {CSV_FILE_PATH} ---")
print(f"--- Saving XML/PDF files to '{OUTPUT_DIR}' ---")

try:
    with open(PROCESSED_FILE, 'a', buffering=1, encoding='utf-8') as processed_file:
        chunk_iterator = pd.read_csv(CSV_FILE_PATH, chunksize=CHUNK_SIZE, usecols=[DOI_COLUMN_NAME], dtype={DOI_COLUMN_NAME: str}, low_memory=False)

        for chunk_num, chunk in enumerate(chunk_iterator):
            if stop_processing: break
            print(f"\n--- Processing Chunk {chunk_num + 1} ---")

            for original_doi_str in chunk[DOI_COLUMN_NAME].tolist():
                if stop_processing: break
                status = "Unknown Error"; xml_saved = False; pdf_saved = False; oa_info = {}

                # --- RESUME ---
                # Blank cells and DOIs recorded by an earlier run are skipped silently.
                if pd.isna(original_doi_str) or not str(original_doi_str).strip(): continue
                original_doi_str = str(original_doi_str).strip()
                if original_doi_str in processed_dois: continue

                # --- Clean DOI ---
                doi = clean_doi(original_doi_str)
                if not doi:
                    status = "InvalidDOIFormat"
                else:
                    print(f"Processing DOI: {doi}")
                    # --- Define potential output paths ---
                    xml_output_path = os.path.join(OUTPUT_DIR, sanitize_filename(doi, ".xml"))
                    pdf_output_path = os.path.join(OUTPUT_DIR, sanitize_filename(doi, ".pdf"))

                    # --- 0. Check if File Already Exists ---
                    if os.path.exists(xml_output_path):
                        print(f"   Skipping: XML file already exists for {doi}")
                        status = "Success_AlreadyExisted_XML"; xml_saved = True
                    elif os.path.exists(pdf_output_path):
                        print(f"   Skipping: PDF file already exists for {doi}")
                        status = "Success_AlreadyExisted_PDF"; pdf_saved = True

                    # --- 1. Try Publisher XML APIs (if no file exists) ---
                    if not xml_saved and not pdf_saved:
                        publisher_attempted = False # Flag if we tried a specific publisher API

                        # --- Elsevier ---
                        if doi.startswith(ELSEVIER_PREFIXES):
                            publisher_attempted = True
                            print(f"   DOI is Elsevier -> Try XML API...")
                            xml_content, status = download_elsevier_xml(doi, ELSEVIER_API_KEY)
                            if xml_content:
                                try:
                                    with open(xml_output_path, 'wb') as f: f.write(xml_content)
                                    print(f"   ✅ SUCCESS: Saved Elsevier XML"); xml_saved = True
                                except Exception as write_e:
                                    print(f"   ERROR writing Elsevier XML: {write_e}"); status = "Elsevier_FileWriteError"
                            elif status == "Elsevier_AuthError":
                                # A bad key would fail for every Elsevier DOI: stop the run.
                                stop_processing = True
                            else:
                                print(f"   Elsevier Status: {status}")

                        # --- Springer Nature ---
                        elif doi.startswith(SPRINGER_NATURE_PREFIXES):
                            publisher_attempted = True
                            print(f"   DOI is Springer -> Try XML API Package...")
                            if springer_client: # Check if client initialized
                                xml_content, status = download_springer_xml_package(doi, springer_client)
                                if xml_content:
                                    try:
                                        with open(xml_output_path, 'wb') as f: f.write(xml_content)
                                        print(f"   ✅ SUCCESS: Saved Springer XML"); xml_saved = True
                                    except Exception as write_e:
                                        print(f"   ERROR writing Springer XML: {write_e}"); status = "SpringerPkg_FileWriteError"
                                # Auth statuses carry the HTTP code as a suffix
                                # (e.g. "SpringerPkg_AuthError_401"), so match on the prefix.
                                elif status.startswith("SpringerPkg_AuthError"):
                                    print("!! Check Springer API Key/Metric !!") # Don't stop
                                else:
                                    print(f"   Springer Package Status: {status}")
                            else:
                                status = "SpringerClient_NotInitialized"
                                print("   Skipping Springer XML: Client not initialized.")

                        # --- Add other PUBLISHER XML API checks here ---
                        # elif doi.startswith(SOME_OTHER_XML_API_PREFIXES): ...

                        # If no specific publisher XML API was attempted or matched
                        if not publisher_attempted:
                            status = "UnknownPublisherXML_TryOA"; print(f"   Unknown publisher for XML API -> Check OA.")

                    # --- 2. Try Open Access XML via PMC (if XML not found yet) ---
                    if not xml_saved and not pdf_saved and not stop_processing:
                        print(f"   Checking OA status / trying PMC XML...")
                        oa_info = check_unpaywall_for_oa(doi, YOUR_EMAIL)
                        if oa_info.get('is_oa', False):
                            pmcid, id_conv_status = get_pmcid_from_doi(doi, YOUR_EMAIL, NCBI_API_KEY)
                            if pmcid:
                                xml_content, pmc_status = download_pmc_xml(pmcid, YOUR_EMAIL, NCBI_API_KEY)
                                if xml_content:
                                    try:
                                        with open(xml_output_path, 'wb') as f: f.write(xml_content)
                                        print(f"   ✅ SUCCESS: Saved PMC XML"); xml_saved = True; status = pmc_status
                                    except Exception as write_e:
                                        print(f"   ERROR writing PMC XML: {write_e}"); status = "PMC_FileWriteError"
                                else:
                                    status = pmc_status; print(f"   PMC Status: {status}") # Use PMC failure status
                            else:
                                status = f"OA_NoPMCID_{id_conv_status}"
                                print(f"   OA but no PMCID ({id_conv_status}).")
                                # Proceed to check Wiley/Unpaywall PDF
                        else:
                            status = oa_info.get('status', 'Paywalled_CheckFailed')
                            print(f"   Not OA according to Unpaywall (Status: {status}).")

                    # --- 3. Try Wiley PDF (if NO XML found AND it's a Wiley DOI) ---
                    # Placed after OA XML attempt. Requires VPN/Subscription if not OA.
                    is_wiley = doi.startswith(WILEY_PREFIXES)
                    if not xml_saved and not pdf_saved and not stop_processing and is_wiley:
                        if wiley_client:
                            print(f"   DOI is Wiley -> Try PDF via Wiley TDM Client...")
                            # Note: This attempts download even if Unpaywall said Paywalled,
                            # relying on institutional access via VPN + Token.
                            pdf_saved_flag, pdf_status = download_wiley_pdf_client(doi, wiley_client, pdf_output_path)
                            status = pdf_status # Update status with Wiley outcome
                            if pdf_saved_flag: pdf_saved = True
                            else: print(f"   Wiley PDF download failed. Status: {status}")
                        elif not status.startswith("Success"):
                            status = "WileySkipped_NoClient"
                            print("   Skipping Wiley PDF: Client not initialized.")

                    # --- 4. Try Generic Unpaywall OA PDF Fallback (if NO file saved) ---
                    if not xml_saved and not pdf_saved and not stop_processing:
                        # Wiley DOIs are looked up a second time before the fallback;
                        # `not oa_info` covers the case where step 2 did not run.
                        if not oa_info or is_wiley:
                            print(f"   Re-checking Unpaywall for PDF fallback link...")
                            oa_info = check_unpaywall_for_oa(doi, YOUR_EMAIL)

                        # Try PDF ONLY if Unpaywall says OA OR if it gave a PDF URL
                        if oa_info.get('is_oa', False) or oa_info.get('pdf_url'):
                            pdf_url = oa_info.get('pdf_url')
                            if pdf_url:
                                print(f"   Attempting Generic OA PDF fallback download...")
                                pdf_saved_flag, pdf_status = download_pdf_unpaywall(pdf_url, pdf_output_path, YOUR_EMAIL)
                                status = pdf_status # Update main status
                                if pdf_saved_flag: print(f"   ✅ SUCCESS: Saved Generic OA PDF fallback"); pdf_saved = True
                                else: print(f"   Generic PDF download failed. Status: {status}")
                            else:
                                # Still considered OA by Unpaywall, but no PDF link found.
                                # Publisher/PMC failure statuses are deliberately kept.
                                if status.startswith("OA_") or status == "UnknownPublisherXML_TryOA":
                                    status = "OA_NoPDFLink"
                                print(f"   OA but no Generic PDF link found. Final Status: {status}")
                        # If not OA and no PDF URL, status was already set (Paywalled/Error)

                    # Final status cleanup if nothing worked
                    if not xml_saved and not pdf_saved and status in ["Unknown Error", "UnknownPublisherXML_TryOA", "OA_Checked"]:
                        if oa_info and not oa_info.get('is_oa', False): status = oa_info.get('status', 'Paywalled_Final')
                        elif oa_info and oa_info.get('is_oa', False): status = "OA_DownloadFailed"
                        else: status = "DownloadFailed_Unknown"

                # An authentication failure aborts the run. Leave this DOI
                # unrecorded so the next run (with a corrected key) retries it
                # instead of skipping it as already processed.
                if stop_processing:
                    print(f"   Stopping run ({status}); {original_doi_str} will be retried on the next run.")
                    break

                # --- Save Progress ---
                processed_dois.add(original_doi_str)
                save_checkpoint(status_log, original_doi_str, status, processed_file)
                total_processed_in_session += 1

            print(f"--- Finished Chunk {chunk_num + 1}. Processed {total_processed_in_session} new DOIs this session. Checkpoint saved. ---")

except FileNotFoundError: print(f"ERROR: Cannot find input CSV '{CSV_FILE_PATH}'.")
except KeyError: print(f"ERROR: Column '{DOI_COLUMN_NAME}' not found in '{CSV_FILE_PATH}'.")
# pandas reports a `usecols` column that is missing from the file as ValueError.
except ValueError as e: print(f"ERROR: Problem reading column '{DOI_COLUMN_NAME}' from '{CSV_FILE_PATH}' (is the column present?): {e}")
except ImportError as e: print(f"ERROR: Missing library. Please install required packages (e.g., pip install requests pandas wiley-tdm springernature-api-client). Details: {e}")
except Exception as e: print(f"An unexpected error occurred: {e}\nStopping. Run again to resume.")
finally:
    # Final save: write to a scratch file and swap it in, so an interruption
    # here cannot truncate the existing checkpoint.
    try:
        final_tmp_path = CHECKPOINT_FILE + ".tmp"
        with open(final_tmp_path, 'w', encoding='utf-8') as f: json.dump(status_log, f, indent=4)
        os.replace(final_tmp_path, CHECKPOINT_FILE)
        print(f"\nFinal status log saved to {CHECKPOINT_FILE}")
    except Exception as e: print(f"ERROR saving final checkpoint {CHECKPOINT_FILE}: {e}")
    print(f"\n--- Unified Download process finished. ---")
    print(f"--- Processed {total_processed_in_session} new DOIs in this run. ---")
    print(f"--- Check logs and output directory ('{OUTPUT_DIR}'). ---")

2025-10-28 14:15:30,457 - INFO - Downloading to: full_text_downloads
2025-10-28 14:15:30,542 - INFO - Your public IP address, (only) used to check entitlements: 128.227.78.146


Wiley TDM Client initialized.
Springer Nature API Client initialized.
Loaded status for 135835 DOIs from download_checkpoint.json.
Loaded 135991 processed DOIs from processed_dois.txt.
--- Starting Unified Full-Text Download from doi_list.csv ---
--- Saving XML/PDF files to 'full_text_downloads' ---

--- Processing Chunk 1 ---
--- Finished Chunk 1. Processed 0 new DOIs this session. Checkpoint saved. ---

--- Processing Chunk 2 ---
--- Finished Chunk 2. Processed 0 new DOIs this session. Checkpoint saved. ---

--- Processing Chunk 3 ---
--- Finished Chunk 3. Processed 0 new DOIs this session. Checkpoint saved. ---

--- Processing Chunk 4 ---
--- Finished Chunk 4. Processed 0 new DOIs this session. Checkpoint saved. ---

--- Processing Chunk 5 ---
--- Finished Chunk 5. Processed 0 new DOIs this session. Checkpoint saved. ---

--- Processing Chunk 6 ---
--- Finished Chunk 6. Processed 0 new DOIs this session. Checkpoint saved. ---

--- Processing Chunk 7 ---
--- Finished Chunk 7. Process