In [221]:
%run set_up.py
%run lit_utility_functions_2025.ipynb

import rispy
import unicodedata
import logging

In [139]:
#Import openalex records
oalex_records_pkl = regex_list_files(in_dir=resdir, 
                       in_pattern='oalex_records_.*[.]pkl'
                      )[-1]
with open(oalex_records_pkl, 'rb') as f:
    oalex_records_dict = pickle.load(f)

#Import RIS of selected sources
oalex_records_dict = {x['doi']:x for v in oalex_records_dict.values() for x in v}

In [135]:
#Export to RIS to create a test sample
oalex_records_ris_path = os.path.join(
        resdir, 
        f"oalex_records_ris_{datetime.now(UTC).strftime('%Y%m%d%H%M')}.ris"
    )
print(f'Writing out {oalex_records_ris_path}')
export_oalex_works_to_ris(
    works=list(oalex_records_list.values()),
    filename=oalex_records_ris_path
)

Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_ris_202503221902.ris


In [137]:
#Read test sample
with open(oalex_records_ris_path, 'r', encoding='utf-8') as ris_file:
    entries = rispy.load(ris_file)
    oalex_records_to_get_pd = pd.DataFrame.from_dict(entries)
oalex_records_sample = oalex_records_to_get_pd.sample(n=200, random_state=1)

In [None]:
#Try to retrieve urls and pdf_urls for samples
def get_oalex_fulltext_info(oalex_sample_ris_pd, oalex_fullrecords_dict):
    oalex_loc_dict_list = []

    for doi_stub in oalex_sample_ris_pd.doi.values.tolist():
        full_doi = f'https://doi.org/{doi_stub}'
        temp_dict = oalex_fullrecords_dict[full_doi]['primary_location']
        temp_dict['doi'] = doi_stub
        oalex_loc_dict_list.append(temp_dict)

    oalex_records_loc_pd = list_of_dicts_to_dataframe(
        list_of_dicts=oalex_loc_dict_list,
        keys_to_keep=['doi','is_oa','landing_page_url','pdf_url']
    )

    out_pd = oalex_sample_ris_pd.merge(oalex_records_loc_pd, on='doi')
    
    return(out_pd)

oalex_fulltext_info_pd = get_oalex_fulltext_info(
    oalex_sample_ris_pd=oalex_records_sample, 
    oalex_fullrecords_dict=oalex_records_dict
)

#print(oalex_fulltext_info_pd)

In [213]:
print(pd.isna(oalex_fulltext_info_pd.iloc[4,:].authors))

True


In [215]:
# Function to download and rename PDFs
def _sanitize_filename(filename: str) -> str:
    """Sanitizes a filename by removing invalid characters and normalizing spaces.

    Args:
        filename: The filename to sanitize.

    Returns:
        The sanitized filename.
    """
    # Normalize unicode characters to closest ASCII equivalent (handle accents, etc.)
    filename = unicodedata.normalize('NFKD', filename).encode('ascii', 'ignore').decode('ascii')

    # Replace spaces and slashes with underscores
    filename = filename.replace(' ', '_').replace('/', '_')

    # Remove invalid characters using a regular expression
    filename = re.sub(r'[\\:*?"<>|]', '', filename)

    # Remove any leading/trailing underscores or periods (clean-up)
    filename = filename.strip('_').strip('.')

     # Truncate filename to avoid OS errors (255 is a common limit).  Keep extension.
    if len(filename) > 250:  # Leave room for ".pdf"
        name, ext = os.path.splitext(filename)
        filename = name[:246] + ext #246+4 = 250

    return filename

def _format_ris_authors(authors: Union[List[str], str, float, None]) -> str:
    """Formats author names for the filename.  Handles None/NaN and lists.

    Args:
      authors:  A string, list of strings, NaN, or None representing authors.

    Returns:
      A formatted author string.
    """
    
    if isinstance(authors, str):  # Handle case where it's already a single string
        return authors
    elif isinstance(authors, list):
        if len(authors) > 1:
            return f"{authors[0]} et al."
        elif authors: # Check for empty list
             return authors[0]
        else:
            return "noauthor" # empty list
    elif (authors is None) or (pd.isna(authors)):
        return "noauthor"
    else:
        return "noauthor" # Catch-all for other unexpected types

In [223]:
def download_and_rename_pdfs(
    df: pd.DataFrame, out_dir: str, timeout: int = 10, verbose: bool = True
) -> None:
    """
    Downloads and renames PDFs based on information in a Pandas DataFrame.

    Args:
        df: DataFrame with 'pdf_url', 'authors', 'year', 'secondary_title', and 'title' columns.
        out_dir: Output directory to save PDFs.
        timeout: Timeout in seconds for the requests.get() call.
        verbose: If True, print progress messages.

    Raises:
        TypeError: If `df` is not a Pandas DataFrame or `out_dir` is not a string.
        ValueError: if input dataframe is empty or has missing required columns
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a Pandas DataFrame")
    if not isinstance(out_dir, str):
        raise TypeError("out_dir must be a string")
    if not isinstance(timeout, int) or timeout <= 0:
        raise TypeError("timeout must be a positive integer.")
    if not isinstance(verbose, bool):
        raise TypeError("verbose must be a boolean.")

    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    required_columns = ['pdf_url', 'authors', 'year', 'secondary_title', 'title']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"DataFrame must contain columns: {required_columns}")

    # Create directory if it doesn't exist. exist_ok=True prevents errors if it already exists.
    os.makedirs(out_dir, exist_ok=True)  

    for index, row in df.iterrows():
        pdf_url = row['pdf_url']
        authors = row['authors']
        year = row['year']
        source = row['secondary_title']
        title = row['title']

        author_formatted = _format_ris_authors(authors)
        filename = f"{author_formatted}-{year}-{source}-{title}.pdf"
        filename = _sanitize_filename(filename)  # Sanitize the filename

        pdf_path = os.path.join(out_dir, filename)

        if pdf_url and not pd.isna(pdf_url):  # Check for valid and non-null URL
            if not os.path.exists(pdf_path):
                try:
                    response = requests.get(pdf_url, 
                                            timeout=timeout,
                                            allow_redirects=True) #added allow_redirects
                    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

                    with open(pdf_path, 'wb') as pdf_file:
                        pdf_file.write(response.content)

                    # if verbose:
                    #     logging.info(f"Downloaded and saved: {pdf_path}")

                except requests.exceptions.RequestException as e:
                    logging.error(f"Error downloading {pdf_url}: {e}")
                except (IOError, OSError) as e:
                    logging.error(f"Error saving file {pdf_path}: {e}")
            elif verbose: #if file exist
                logging.info(f'File already exists, skipping download: {pdf_path}')
        # elif verbose:
        #     logging.warning(f"No PDF URL for: {filename}")


In [None]:
# Call the function
out_pdfdir_basename = re.sub('[.]*ris_*',
                             '', 
                             os.path.basename(oalex_records_ris_path))
download_and_rename_pdfs(
    df=oalex_fulltext_info_pd,
    out_dir=os.path.join(resdir, out_pdfdir_basename)
    verbose=False
)

ERROR:root:Error downloading https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/j.1752-1688.2010.00467.x: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdfdirect/10.1111/j.1752-1688.2010.00467.x
ERROR:root:Error downloading https://www.sciencedirect.com/science/article/am/pii/S1462901116303550: 403 Client Error: Forbidden for url: https://www.sciencedirect.com/science/article/am/pii/S1462901116303550
ERROR:root:Error saving file D:\WWF_SBTN\BTT_analysis\results\oalex_records_202503221902\Hodges,_Mary_et_al._-_2015_-_Scientific_investigations_report_-_New_argon-argon_(&lt;sup&gt;40&lt;_sup&gt;Ar_&lt;sup&gt;39&lt;_sup&gt;Ar)_radiometric_age_dates_from_selected_subsurface_basalt_flows_at_the_Idaho_National_Laboratory,_Idaho.pdf: [Errno 2] No such file or directory: 'D:\\WWF_SBTN\\BTT_analysis\\results\\oalex_records_202503221902\\Hodges,_Mary_et_al._-_2015_-_Scientific_investigations_report_-_New_argon-argon_(&lt;sup&gt;40&lt;_sup&gt;Ar_&lt;sup&gt;39&lt;_sup&

In [179]:
print()

oalex_records_202503221902
