In [58]:
%run set_up.py
%run lit_utility_functions_2025.ipynb
%run create_search_strings.ipynb

from dataclasses import dataclass, field
from datetime import datetime, UTC
import pyalex #https://github.com/J535D165/pyalex
from pyalex import config, Works
from typing import Any, Dict, List, Pattern, Tuple, Union

config.max_retries = 1
config.email = "mathis.messager@mail.mcgill.ca"

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\messa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
search_terms_dict = create_generic_search_terms()
oalex_string_dict = {}
for search_number, search_terms in search_terms_dict.items():
    oalex_string_dict[search_number] = create_search_string(
        search_terms, inflect=False, or_chars=' OR ', and_chars=' AND ',
        inner_separators=[" ", "-", ""],  use_quotes=True) 

In [6]:
def extract_last_url_segment(url):
    """
    Extracts the last segment of a URL path. 

    Args:
        url: The URL string.

    Returns:
        The last segment of the URL path, or None if the URL is invalid
        or has no path.
    """
    try:
        parsed_url = urlparse(url)
        path = parsed_url.path
        if not path:
            return None  # No path component

        # Split the path by '/' and get the last element
        segments = path.split('/')
        return segments[-1]  # Handle cases with trailing slashes correctly

    except Exception:  # Catch any parsing errors
        return None

def extract_concept_from_url_df(df, url_col, include_col=None):
    """
    Extracts the last segment of URLs from a specific column in a DataFrame,
    filtering by a boolean column, and adds the result as a new column.

    Args:
      df: The Pandas DataFrame.
      url_col: The name of the column containing URLs (string).
      include_col: The name of the boolean column to filter by (string).

    Returns:
        A new Pandas DataFrame with an additional column 'openalex_id_last_segment'
        containing the extracted last segment, or None if the input is invalid.
    """
    # Input validation: Check for required columns
    required_columns = [url_col]
    if include_col is not None:
        required_columns.append(include_col)
    if not all(col in df.columns for col in required_columns):
        print("DataFrame is missing some columns.")
        return None

    # Make a copy to avoid modifying the original DataFrame
    df_copy = df.copy()

    if include_col is not None:
        # Convert 'Include?' (or whatever include_col is) to boolean, handling various representations.
        df_copy[include_col] = df_copy[include_col].astype(str).str.lower().isin(['y', 'yes', 'true', '1', 't'])
        # Apply the extraction function ONLY to rows where 'include_col' is True,
        # and ONLY to the 'url_col' of those rows. Use .loc for proper indexing.
        included_clist = df_copy.loc[df_copy[include_col], url_col].apply(extract_last_url_segment)
    else:
        included_clist = df_copy.loc[:, url_col].apply(extract_last_url_segment)

    return included_clist.tolist()

#Get open alex concepts to filter with
concepts_toinclude_pd = pd.read_csv(
    os.path.join(datdir, 'openalex_concepts_toinclude.csv'))
concepts_toinclude_list =  extract_concept_from_url_df(
    df = concepts_toinclude_pd,
    url_col = 'openalex_id', 
    include_col = 'include')

In [135]:
#Get toponyms to filter with
target_toponyms_pkl = regex_list_files(in_dir=resdir, 
                       in_pattern='target_toponyms_umrb_.*'
                      )[-1]

with open(target_toponyms_pkl, 'rb') as f:
    target_toponyms_set = pickle.load(f)
target_toponyms_quoted_set = {f'"{t}"' for t in target_toponyms_set}

In [182]:
#Run on each search string
def _retrieve_oalex_records(title_and_abstract_string, 
                            concepts_list=None, 
                            search_all_string=None,
                            n_max=200):
    if search_all_string:
        oa_query =  Works().search(search_all_string)
    else:
        oa_query =  Works()
        
    oa_query = oa_query.\
    search_filter(title_and_abstract=title_and_abstract_string).\
    filter(is_retracted='False')

    if concepts_list:
        oa_query = oa_query.filter(concept={"id": '|'.join(concepts_list)})

    #print(oa_query.url)
    out_records_list = [
        record for record in itertools.chain(
        *oa_query.paginate(per_page=min([n_max, 200]), n_max=n_max)
    )]
    return(out_records_list)

def retrieve_oalex_records_dict(title_and_abstract_string_dict,
                                concepts_list=None, 
                                search_all_string_list=None,
                                n_max=200,
                                verbose=True
                               ):
    out_records_dict = collections.defaultdict(list)
    seen_ids = set()
    for search_number, search_terms in title_and_abstract_string_dict.items():
        print(f'Retrieving {search_number}')
        initial_records_list = []
        for search_all_string in search_all_string_list:
            if verbose:
                print(f'Subsetting for {search_all_string}')
            retrieved_recs = _retrieve_oalex_records(
                    title_and_abstract_string=search_terms,
                    concepts_list=concepts_list, 
                    search_all_string=search_all_string,
                    n_max=n_max
                )
            if retrieved_recs:
                initial_records_list.extend(retrieved_recs)

        #Remove duplicates within the search
        new_records_list = []  # Create a new list for each search_number
        for record in initial_records_list:
            rid = record['id']
            if rid not in seen_ids:
                new_records_list.append(record)
                seen_ids.add(rid)
        out_records_dict[search_number] = new_records_list
        
    return(out_records_dict)

oalex_records_dict = retrieve_oalex_records_dict(
    title_and_abstract_string_dict=oalex_string_dict,
    concepts_list=concepts_toinclude_list, 
    search_all_string_list=list(target_toponyms_quoted_set)[0:50],
    n_max=200,
    verbose=False
)

print([len(rec_list) for rec_list in oalex_records_dict.values() if rec_list])

Retrieving search1
Retrieving search2
Retrieving search3
Retrieving search4
Retrieving search5
Retrieving search6
[1430, 725, 133, 20, 2, 814]


In [183]:
def remove_redundant_parentheses(in_str):
    #Remove unnecessary parentheses *if* they enclose the entire expression:
    if in_str.startswith('(') and in_str.endswith(')'):
        # Check if they are *actually* unnecessary (i.e., not (a|b) AND (c|d))
        open_count = 0
        unnecessary = True
        while unnecessary:
            for i, char in enumerate(in_str):
                if char == '(':
                    open_count += 1
                elif char == ')':
                    open_count -= 1
                if open_count == 0 and i < len(in_str) - 1:
                    unnecessary = False
                    break
            if unnecessary:
                in_str = in_str[1:-1]
        return(in_str)
    
#Generate regex to filter OpenAlex after the search
#to make up for lemmatization performed by Open Alex
post_oalex_regex_dict = {}
for search_number, search_terms in search_terms_dict.items():
    #Generated an initial filter to be adjusted 
    #(split it in two regex queries for the AND rather than using greedy lookaheads
    post_oalex_regex_dict[search_number] = create_search_string(
        search_terms, inflect=True, or_chars='|', and_chars='AND',
        inner_separators=[r"[-\s]*"], use_quotes=False)
    #Remove redundant parentheses and split for nested regex filters
    post_oalex_regex_dict[search_number] = remove_redundant_parentheses(
        post_oalex_regex_dict[search_number]).split('AND')

In [184]:
def _matches_patterns(
    record: Dict[str, List[Any]], 
    patterns: List[re.Pattern], 
    match_all: bool = True
) -> bool:
    """
    Checks if a record matches regex patterns, 
    with options for 'all' or 'any' matching and including ngrams.

    Args:
        record: The record to check.
        patterns: List of compiled regex patterns.
        match_all: If True, all patterns must match.  If False, at least one must match.
        include_ngrams: If True, include ngrams in the searchable text.

    Returns:
        True if the record matches the patterns according to the conditions, False otherwise.
    """
    if not isinstance(patterns, list):
        raise TypeError("patterns must be a list of compiled regex patterns.")
    if not all(isinstance(p, re.Pattern) for p in patterns):
         raise TypeError("patterns must be a list of compiled regex patterns.")
    if not isinstance(match_all, bool):
        raise TypeError("match_all must be a boolean.")

    searchable_text_parts = [
        str(record['title']),
        str(record['abstract']),
        *[str(kw['display_name']) for kw in record['keywords']]
    ]

    searchable_text = " ".join(searchable_text_parts)

    if match_all:
        match_bool = all(pattern.search(searchable_text) for pattern in patterns)
    else:
        match_bool = any(pattern.search(searchable_text) for pattern in patterns)

    return match_bool


def filter_records(
    records_dict: Dict[str, List[Dict]],
    regex_dict: Dict[str, List[re.Pattern]],
    match_all: bool = True
) -> Dict[str, List[Dict]]:
    """
    Filters records based on regex patterns, with options for n-gram inclusion.

    Args:
        records_dict: Dictionary mapping search numbers to lists of raw OpenAlex records (dictionaries).
        regex_dict: Dictionary mapping search numbers to lists of compiled regex patterns.
        match_all: If True, all patterns must match. If False, at least one must match.
        include_ngrams: If True, include ngrams in the searchable text for matching.

    Returns:
        Dictionary containing filtered records (as Record objects) that match the
        regex patterns for their respective search number, according to the conditions.
    """
    if not isinstance(records_dict, dict):
        raise TypeError("records_dict must be a dictionary.")
    if not isinstance(regex_dict, dict):
        raise TypeError("regex_dict must be a dictionary.")
    if not isinstance(match_all, bool):
        raise TypeError("match_all must be a boolean.")

     # Check if keys in records_dict and regex_dict match
    if records_dict.keys() != regex_dict.keys():
        raise ValueError("Keys in records_dict and regex_dict must be identical.")

    #check if values are lists in dict
    if not all(isinstance(val, list) for val in regex_dict.values()):
          raise TypeError("Values of regex_dict must be lists.")

    filtered_records = {}
    for search_number, records_list in records_dict.items():
        print(f"Processing search number: {search_number}")
        print(f"Initial number of records: {len(records_list)}")

        if not records_list:
            print(f"No records for search number {search_number}, skipping.")
            filtered_records[search_number] = []  # Consistent return type
            continue

        # Convert raw OpenAlex records (dictionaries) to Record objects *and* extract ngrams
        matching_records = [
            record 
            for record in records_list
            if _matches_patterns(record, 
                                 regex_dict[search_number],
                                 match_all)
        ]

        print(f"Number of records after filtering: {len(matching_records)}")
        filtered_records[search_number] = matching_records

    return filtered_records

In [185]:
# Pre-compile the regex patterns:
post_oalex_regex_compiled_dict = {
    search_number: [re.compile(pattern) for pattern in patterns]
    for search_number, patterns in post_oalex_regex_dict.items()
}

#Run filter
oalex_records_dict_filtered = filter_records(
    records_dict=oalex_records_dict,
    regex_dict=post_oalex_regex_compiled_dict,
    match_all=True
)

Processing search number: search1
Initial number of records: 1430
Number of records after filtering: 812
Processing search number: search2
Initial number of records: 725
Number of records after filtering: 684
Processing search number: search3
Initial number of records: 133
Number of records after filtering: 0
Processing search number: search4
Initial number of records: 20
Number of records after filtering: 2
Processing search number: search5
Initial number of records: 2
Number of records after filtering: 2
Processing search number: search6
Initial number of records: 814
Number of records after filtering: 755


In [27]:
#Serialize list of records
oalex_records_pkl = os.path.join(
    resdir, 
    f"oalex_records_{datetime.now(UTC).strftime('%Y%m%d%H%M')}"
)
with open(oalex_records_pkl, 'wb') as f:
    pickle.dump(oalex_records_dict_filtered, f)

In [194]:
#Export to RIS
def export_works_to_ris(works, filename):
    """
    Export a list of open alex works to an RIS file.

    Parameters
    ----------
    works : list of Work
        List of works to be exported.
    filename : str
        The name of the RIS file to be created.
    """
    with open(filename, 'w', encoding='utf-8') as ris_file:
        for work in works:
            ris_file.write("TY  - JOUR\n")  # Type of reference
            ris_file.write(f"TI  - {work.get('title', '')}\n")  # Title
            ris_file.write(f"AU  - {work.get('authors', '')}\n")  # Authors
            ris_file.write(f"PY  - {work.get('publication_year', '')}\n")  # Year
            ris_file.write(f"JO  - {work.get('journal', '')}\n")  # Journal
            ris_file.write(f"VL  - {work.get('volume', '')}\n")  # Volume
            ris_file.write(f"IS  - {work.get('issue', '')}\n")  # Issue
            ris_file.write(f"SP  - {work.get('start_page', '')}\n")  # Start page
            ris_file.write(f"EP  - {work.get('end_page', '')}\n")  # End page
            ris_file.write(f"DO  - {work.get('doi', '')}\n")  # DOI
            ris_file.write("ER  - \n\n")  # End of reference

for search_number, records_list in oalex_records_dict_filtered.items():
    oalex_records_ris_path = os.path.join(
        resdir, 
        f"oalex_records_{search_number}_ris_{datetime.now(UTC).strftime('%Y%m%d%H%M')}.ris"
    )
    print(f'Writing out {oalex_records_ris_path}')
    export_works_to_ris(works=records_list, 
                        filename=oalex_records_ris_path)

Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_search1_ris_202503221536.ris
Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_search2_ris_202503221536.ris
Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_search3_ris_202503221536.ris
Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_search4_ris_202503221536.ris
Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_search5_ris_202503221536.ris
Writing out D:\WWF_SBTN\BTT_analysis\results\oalex_records_search6_ris_202503221536.ris


In [None]:
######################### USEFUL UNUSED FUNCTIONS ####################################

In [None]:
#Import openalex records
oalex_records_pkl = regex_list_files(in_dir=resdir, 
                       in_pattern='oalex_records_.*'
                      )[-1]

with open(oalex_records_pkl, 'rb') as f:
    oalex_records = pickle.load(f)

print(list(list(oalex_records.values())[0][0].values()))
#Get n-grams for records, filter by title, abstract, keywords and n-grams


#Remove duplicates
def remove_oalex_search_duplicates(records_dict):
    seen_ids = set()
    new_records_dict = {}
    for search_number, records_list in records_dict.items():
        print(search_number)
        print(len(records_list))
        new_records_list = []  # Create a new list for each search_number
        for record in records_list:
            rid = record['id']
            if rid not in seen_ids:
                new_records_list.append(record)
                seen_ids.add(rid)
        new_records_dict[search_number] = new_records_list
        print(len(new_records_list))
    return(new_records_dict)
#oalex_records_dict = remove_oalex_search_duplicates(oalex_records_dict) # Replace the old dict

In [None]:
'''
~~~~~ Search for works in OpenAlex based on search string ~~~~~~~~~~~~~~~~~~~~~~
Reference info on the API: 
# https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/search-entities
# https://docs.openalex.org/api-entities/works/search-works

#EX: https://api.openalex.org/works?search=(elmo AND "sesame street") NOT (cookie OR monster)
#Filter categories based on csv
#&per-page=100&cursor=*

#~~~~~~~~~~~~~~~~ PAGING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Basic paging only works to get the first 10,000 results of any list. If you want to see more than 10,000 results, you'll need to use cursor paging.
To use cursor paging, you request a cursor by adding the cursor=* parameter-value pair to your query.
    Get a cursor in order to start cursor pagination:
    https://api.openalex.org/works?filter=publication_year:2020&per-page=100&cursor=*
The response to your query will include a next_cursor value in the response's meta object. Here's what it looks like:
{
  "meta": {
    "count": 8695857,
    "db_response_time_ms": 28,
    "page": null,
    "per_page": 100,
    "next_cursor": "IlsxNjA5MzcyODAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI0ODg0OTk3NjQnXSI="
  },
  "results" : [
    // the first page of results
  ]
}

To retrieve the next page of results, copy the meta.next_cursor value into the cursor field of your next request.

    Get the next page of results using a cursor value:
    https://api.openalex.org/works?filter=publication_year:2020&per-page=100&cursor=IlsxNjA5MzcyODAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI0ODg0OTk3NjQnXSI=

To get all the results, keep repeating this process until meta.next_cursor is null and the results set is empty.
'''

#
