In [40]:
%run set_up.py
%run lit_utility_functions_2025.ipynb
%run create_search_strings.ipynb

from dataclasses import dataclass
import pyalex #https://github.com/J535D165/pyalex
from pyalex import config, Works
from typing import Dict, List, Pattern

config.max_retries = 1
config.email = "mathis.messager@mail.mcgill.ca"

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mamessager\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
search_terms_dict = create_generic_search_terms()
oalex_string_dict = {}
for search_number, search_terms in search_terms_dict.items():
    oalex_string_dict[search_number] = create_search_string(
        search_terms, inflect=False, or_chars=' OR ', and_chars=' AND ',
        inner_separators=[" ", "-", ""],  use_quotes=True) 

In [13]:
def extract_last_url_segment(url):
    """
    Extracts the last segment of a URL path. 

    Args:
        url: The URL string.

    Returns:
        The last segment of the URL path, or None if the URL is invalid
        or has no path.
    """
    try:
        parsed_url = urlparse(url)
        path = parsed_url.path
        if not path:
            return None  # No path component

        # Split the path by '/' and get the last element
        segments = path.split('/')
        return segments[-1]  # Handle cases with trailing slashes correctly

    except Exception:  # Catch any parsing errors
        return None

def extract_concept_from_url_df(df, url_col, include_col=None):
    """
    Extracts the last segment of URLs from a specific column in a DataFrame,
    filtering by a boolean column, and adds the result as a new column.

    Args:
      df: The Pandas DataFrame.
      url_col: The name of the column containing URLs (string).
      include_col: The name of the boolean column to filter by (string).

    Returns:
        A new Pandas DataFrame with an additional column 'openalex_id_last_segment'
        containing the extracted last segment, or None if the input is invalid.
    """
    # Input validation: Check for required columns
    required_columns = [url_col]
    if include_col is not None:
        required_columns.append(include_col)
    if not all(col in df.columns for col in required_columns):
        print("DataFrame is missing some columns.")
        return None

    # Make a copy to avoid modifying the original DataFrame
    df_copy = df.copy()

    if include_col is not None:
        # Convert 'Include?' (or whatever include_col is) to boolean, handling various representations.
        df_copy[include_col] = df_copy[include_col].astype(str).str.lower().isin(['y', 'yes', 'true', '1', 't'])
        # Apply the extraction function ONLY to rows where 'include_col' is True,
        # and ONLY to the 'url_col' of those rows. Use .loc for proper indexing.
        included_clist = df_copy.loc[df_copy[include_col], url_col].apply(extract_last_url_segment)
    else:
        included_clist = df_copy.loc[:, url_col].apply(extract_last_url_segment)

    return included_clist.tolist()

#Get open alex concepts to filter with
concepts_toinclude_pd = pd.read_csv(
    os.path.join(datdir, 'openalex_concepts_toinclude.csv'))
concepts_toinclude_list =  extract_concept_from_url_df(
    df = concepts_toinclude_pd,
    url_col = 'openalex_id', 
    include_col = 'include')

In [15]:
#Run on each search string
oalex_records_dict = {}
for search_number, search_terms in oalex_string_dict.items():
    print(f'Retrieving {search_number}')
    oa_query = Works().search_filter(title_and_abstract=search_terms).filter(
        concept={"id": '|'.join(concepts_toinclude_list)},
        is_retracted='False')

    #print(oa_query.url)
    
    oalex_records_dict[search_number] = [
        record for record in itertools.chain(
        *oa_query.paginate(per_page=200, n_max=200)
    )]

print([len(rec_list) for rec_list in oalex_records_dict.values()])

Retrieving search1
Retrieving search2
Retrieving search3
Retrieving search4
Retrieving search5
Retrieving search6
[200, 200, 200, 200, 117, 200]


In [21]:
#Remove duplicates
def remove_oalex_search_duplicates(records_dict):
    seen_ids = set()
    new_records_dict = {}
    for search_number, records_list in records_dict.items():
        print(search_number)
        print(len(records_list))
        new_records_list = []  # Create a new list for each search_number
        for record in records_list:
            rid = record['id']
            if rid not in seen_ids:
                new_records_list.append(record)
                seen_ids.add(rid)
        new_records_dict[search_number] = new_records_list
        print(len(new_records_list))
    return(new_records_dict)
    
oalex_records_dict = remove_oalex_search_duplicates(oalex_records_dict) # Replace the old dict

search1
200
200
search2
178
178
search3
194
194
search4
195
195
search5
115
115
search6
190
190


In [23]:
def remove_redundant_parentheses(in_str):
    #Remove unnecessary parentheses *if* they enclose the entire expression:
    if in_str.startswith('(') and in_str.endswith(')'):
        # Check if they are *actually* unnecessary (i.e., not (a|b) AND (c|d))
        open_count = 0
        unnecessary = True
        while unnecessary:
            for i, char in enumerate(in_str):
                if char == '(':
                    open_count += 1
                elif char == ')':
                    open_count -= 1
                if open_count == 0 and i < len(in_str) - 1:
                    unnecessary = False
                    break
            if unnecessary:
                in_str = in_str[1:-1]
        return(in_str)
    
#Generate regex to filter OpenAlex after the search
#to make up for lemmatization performed by Open Alex
post_oalex_regex_dict = {}
for search_number, search_terms in search_terms_dict.items():
    #Generated an initial filter to be adjusted 
    #(split it in two regex queries for the AND rather than using greedy lookaheads
    post_oalex_regex_dict[search_number] = create_search_string(
        search_terms, inflect=True, or_chars='|', and_chars='AND',
        inner_separators=[r"[-\s]*"], use_quotes=False)
    #Remove redundant parentheses and split for nested regex filters
    post_oalex_regex_dict[search_number] = remove_redundant_parentheses(
        post_oalex_regex_dict[search_number]).split('AND')

In [25]:
post_oalex_regex_dict['search1']

['((environmental)|(maintenances)|(eco[-\\s]*hydrologically)|(hydro[-\\s]*ecologically)|(restoration)|(minimuss)|(hydro[-\\s]*ecologic)|(hydro[-\\s]*ecological)|(augmentations)|(compensation)|(restoratives)|(maintenance)|(restorations)|(compensations)|(restorative)|(ecologically)|(minimal)|(augmentation)|(optimum)|(acceptable)|(minima)|(optima)|(ecological)|(eco[-\\s]*hydrologic)|(flushing)|(eco[-\\s]*hydrological)|(ecologic)|(minimus)|(augmented)|(in[-\\s]*stream)|(experimental)|(minimum))',
 '((floods)|(discharges)|(water[-\\s]*level)|(discharging)|(discharge)|(flow)|(flood)|(flooding)|(flowing)|(flows))']

In [48]:
@dataclass
class Record:
    """Represents a scientific record from open """
    title: str
    abstract: str
    keywords: List[Dict[str, str]]
    
def _matches_all_patterns(record: Record, patterns: List[Pattern]) -> bool:
    """
    Checks if a record matches all given regex patterns.
    
    Args:
        record: The record (dictionary) to check
        patterns: List of compiled regex patterns
    
    Returns:
        True if the record matches all patterns, False otherwise
    """
    #Create a single string from all record information to filter
    searchable_text = ' '.join([
        str(record['title']),
        str(record['abstract']),
        *[str(kw['display_name']) for kw in record['keywords']]
    ])

    #the all-for loop combination enables to implement the AND between two regex patterns
    match_bool = all(
        pattern.search(searchable_text) 
        for pattern in patterns
    )
    
    return(match_bool)

def filter_records(
    records_dict: Dict[str, List[Record]], 
    regex_dict: Dict[str, List[Pattern]]
) -> Dict[str, List[Record]]:
    """
    Filters records based on regex patterns matching in title, abstract, or keywords.
    
    Args:
        records_dict: Dictionary mapping search numbers to lists of records
        regex_dict: Dictionary mapping search numbers to lists of compiled regex patterns
    
    Returns:
        Dictionary containing filtered records that match all regex patterns
        for their respective search number
    """
    
    filtered_records = {}
    for search_number, records_list in records_dict.items():
        print(f"Processing search number: {search_number}")
        print(f"Initial number of records: {len(records_list)}")

        if not records_list:
            continue

        matching_records = [
            record for record in records_list
            if _matches_all_patterns(record, regex_dict[search_number])
        ]

        print(f"Number of records after filtering: {len(matching_records)}")
        filtered_records[search_number] = matching_records

    return(filtered_records)

# Pre-compile the regex patterns:
post_oalex_regex_compiled_dict = {
    search_number: [re.compile(pattern) for pattern in patterns]
    for search_number, patterns in post_oalex_regex_dict.items()
}

oalex_records_dict_filtered = filter_records(
    records_dict=oalex_records_dict,
    regex_dict=post_oalex_regex_compiled_dict
)

Processing search number: search1
Initial number of records: 200
Number of records after filtering: 127
Processing search number: search2
Initial number of records: 178
Number of records after filtering: 153
Processing search number: search3
Initial number of records: 194
Number of records after filtering: 8
Processing search number: search4
Initial number of records: 195
Number of records after filtering: 115
Processing search number: search5
Initial number of records: 115
Number of records after filtering: 102
Processing search number: search6
Initial number of records: 190
Number of records after filtering: 169


In [50]:
#Serialize list of records
#All results from PyAlex can be serialized. For example, save the results to a JSON file:
import json
from pathlib import Path
from pyalex import Work

with open(Path("works.json"), "w") as f:
    json.dump(Works().get(), f)

# with open(Path("works.json")) as f:
#     works = [Work(w) for w in json.load(f)]



0.5616666666666666

In [None]:
'''
~~~~~ Search for works in OpenAlex based on search string ~~~~~~~~~~~~~~~~~~~~~~
Reference info on the API: 
# https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/search-entities
# https://docs.openalex.org/api-entities/works/search-works

#EX: https://api.openalex.org/works?search=(elmo AND "sesame street") NOT (cookie OR monster)
#Filter categories based on csv
#do not lemmatize
#&per-page=100&cursor=*

#~~~~~~~~~~~~~~~~ PAGING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Basic paging only works to get the first 10,000 results of any list. If you want to see more than 10,000 results, you'll need to use cursor paging.
To use cursor paging, you request a cursor by adding the cursor=* parameter-value pair to your query.
    Get a cursor in order to start cursor pagination:
    https://api.openalex.org/works?filter=publication_year:2020&per-page=100&cursor=*
The response to your query will include a next_cursor value in the response's meta object. Here's what it looks like:
{
  "meta": {
    "count": 8695857,
    "db_response_time_ms": 28,
    "page": null,
    "per_page": 100,
    "next_cursor": "IlsxNjA5MzcyODAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI0ODg0OTk3NjQnXSI="
  },
  "results" : [
    // the first page of results
  ]
}

To retrieve the next page of results, copy the meta.next_cursor value into the cursor field of your next request.

    Get the next page of results using a cursor value:
    https://api.openalex.org/works?filter=publication_year:2020&per-page=100&cursor=IlsxNjA5MzcyODAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI0ODg0OTk3NjQnXSI=

To get all the results, keep repeating this process until meta.next_cursor is null and the results set is empty.
'''

#
