# Create search strings

Generates structured search strings for querying OpenAlex based on pre-defined keywords and patterns. The script leverages text-processing utilities so that the queries can be adapted either to lemmatized searches or to strict searches (adding American and British spellings, word inflections, participles and plural forms where needed). Groups of search strings are then combined with Boolean operators.
This script only defines functions, which are then used in create_api_call.ipynb.

In [3]:
%run lit_utility_functions_2025.ipynb

import bream
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from pyinflect import getInflection
import spacy
from textblob import Word


# Make sure every NLTK resource this notebook relies on is installed.
# nltk.data.find raises LookupError for a missing resource, in which case
# the corresponding package is downloaded.
for _resource_path, _package_name in (
    ('corpora/wordnet', 'wordnet'),
    ('corpora/words', 'words'),
    ('tokenizers/punkt', 'punkt'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Load the spaCy pipeline into the module-level name `spacy_nlp`.
# spacy.load raises OSError when the model is not installed; it is then
# downloaded once and loaded again.
try:
    spacy_nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading en_core_web_sm model...")
    spacy.cli.download("en_core_web_sm")
    spacy_nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\messa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def find_matching_words(pattern: str, 
                        bound_pattern: bool = True,
                        word_list: "Optional[List[str]]" = None) -> List[str]:
    """
      Finds all words that match a given regular expression pattern.

      Args:
        pattern: The regular expression pattern (string).
        bound_pattern (bool, optional): If True, a word is kept only when the
                                        pattern matches the *entire* word.
                                        If False, a match anywhere in the word
                                        is enough. Defaults to True.
        word_list (optional): Words to filter. When None (the default), the
                              NLTK `words` corpus is used. The annotation is
                              quoted so that it is never evaluated at
                              definition time.
      Returns:
        A list of the words that match the pattern, in the order in which they
        appear in the word list. Returns an empty list if no word matches, if
        the pattern is not a valid regular expression, or if the NLTK words
        corpus is unavailable (a warning is printed in the last two cases).
    """
    try:
        # Compile once; the same pattern object is applied to every word.
        regex = re.compile(pattern)

        # Use fullmatch rather than wrapping the pattern in '^...$': the textual
        # wrapping binds the anchors to the first/last alternative only, so
        # 'cat|dog' would become '^cat|dog$' and wrongly accept "catalog".
        matcher = regex.fullmatch if bound_pattern else regex.search

        # Default vocabulary: the list of English words from the NLTK corpus.
        candidates = words.words() if word_list is None else word_list

        return [word for word in candidates if matcher(word)]
    except re.error as e:
        # Handle cases where the provided pattern is an invalid regular expression.
        print(f"Warning: Invalid regex pattern provided: {pattern}. Error: {e}")
        return []
    except LookupError:
        # Handle cases where the NLTK 'words' corpus is not downloaded.
        print("Warning: NLTK 'words' corpus not found. Please download it (e.g., nltk.download('words')).")
        return []
    except NameError:
        # Handle cases where 'words' object (from NLTK) is not defined/imported.
        print("Warning: NLTK 'words' corpus object not found. Ensure NLTK is imported correctly.")
        return []

def combinate_concats(
    prefixes: List[str], 
    suffixes: List[str], 
    separators: List[str] = [" ", "-", ""], 
    add_quotes: bool = False
) -> List[str]:
    """
    Builds every "prefix + separator + suffix" string.

    Args:
        prefixes (List[str]): Strings placed first.
        suffixes (List[str]): Strings placed last.
        separators (List[str], optional): Strings inserted between a prefix 
                                          and a suffix. Defaults to 
                                          [" ", "-", ""].
        add_quotes (bool, optional): If True, each result is wrapped in 
                                     double quotes. Defaults to False.

    Returns:
        List[str]: One string per (prefix, separator, suffix) triple, ordered
                   as itertools.product yields them (suffix varies fastest,
                   then separator, then prefix). Empty if any input is empty.
    """
    # Either a double quote on both sides, or nothing at all.
    wrapper = '"' if add_quotes else ""

    return [
        wrapper + "".join(parts) + wrapper
        for parts in itertools.product(prefixes, separators, suffixes)
    ]


def plural_form_exists(word: str, 
                       in_lemmatizer: WordNetLemmatizer) -> bool:
    """
    Heuristically decides whether a word has (or already is) a plural form,
    combining WordNet lookups, noun lemmatization and simple spelling rules.

    Args:
        word (str): The word to check.
        in_lemmatizer (WordNetLemmatizer): An initialized NLTK WordNetLemmatizer 
                                           instance.

    Returns:
        bool: True as soon as one of the checks below succeeds, False if none
              does or if WordNet is unavailable (a warning is printed then).
    """
    try:
        # Check 1: some lemma in the word's WordNet synsets ends in 's'.
        # With no synsets the generator is empty and the check fails.
        senses = wordnet.synsets(word)
        if any(lemma.name().endswith('s')
               for synset in senses
               for lemma in synset.lemmas()):
            return True

        # Check 2: the noun lemma differs from the word, i.e. the word was
        # reduced to a base form and is therefore likely inflected already.
        if in_lemmatizer.lemmatize(word, pos=wordnet.NOUN) != word:
            return True

        # Rule-based plural: 'es' after s/x/z/ch/sh, 'ies' for consonant + 'y',
        # plain 's' otherwise (which also covers vowel + 'y').
        if word.endswith(("s", "x", "z", "ch", "sh")):
            rule_plural = word + "es"
        elif word.endswith("y") and len(word) > 1 and word[-2].lower() not in "aeiou":
            rule_plural = word[:-1] + "ies"
        else:
            rule_plural = word + "s"

        # Checks 3 and 4: the plain 'word + s' form, then the rule-based form,
        # must be known to WordNet.
        for candidate in (word + 's', rule_plural):
            if wordnet.synsets(candidate):
                return True

    except NameError:
        # 'wordnet' or the lemmatizer is not defined/imported.
        print("Warning: NLTK 'wordnet' or lemmatizer not found. Ensure NLTK is imported/initialized.")
        return False
    except LookupError:
        # The NLTK 'wordnet' corpus is not downloaded.
        print("Warning: NLTK 'wordnet' corpus not found. Please download it (e.g., nltk.download('wordnet')).")
        return False

    # No check found a plausible plural form.
    return False


def textblob_pluralize(word: str) -> str:
    """
    Returns the plural of a word as computed by TextBlob.

    Args:
        word (str): The word to pluralize.

    Returns:
        str: The result of `Word(word).pluralize()`.

    Note:
        No fallback is applied here: any error raised by TextBlob propagates
        to the caller.
    """
    # Wrap the string in a TextBlob Word and delegate the pluralization to it.
    return Word(word).pluralize()


def get_spelling_variants(word: str) -> List[str]:
    """
    Gets American and British spelling variants of a word using the 'bream' library.

    Args:
        word (str): The word for which to find spelling variants.

    Returns:
        List[str]: The original word first, followed by its American and then
                   its British spelling when a conversion yields a new,
                   non-empty string. The list has no duplicates and its order
                   is deterministic. If a conversion fails for any reason, that
                   variant is simply omitted, so the result may contain only
                   the original word.
    """
    # A list (not a set) keeps the output order stable from run to run.
    variants: List[str] = [word]

    # NOTE(review): confirm that 'bream' really exposes to_american/to_british.
    # The broad except below is a deliberate best-effort, but it would also
    # hide a missing attribute or a missing import and silently return [word].
    for converter_name in ("to_american", "to_british"):
        try:
            converted = getattr(bream, converter_name)(word)
        except Exception:
            # Conversion unavailable for this word: keep what we already have.
            continue

        # Guard against None/empty results so callers only ever get real terms.
        if isinstance(converted, str) and converted and converted not in variants:
            variants.append(converted)

    return variants

In [7]:
def create_eflows_search_terms():
    """
    Returns the pre-defined keyword groups used to build the search strings.

    Returns:
        dict: Keys 'search1' to 'search6'. Each value is a two-item list
              [mode, [first_terms, second_terms]] where mode is 'with' or
              'pre' and each term list holds regex patterns (str) or lists of
              patterns meant to be concatenated. This is the structure that
              create_search_string reads as `in_search_duo`.
    """
    # search1: qualifier ... flow/flood/water level/discharge
    qualifiers_1 = [
        "ecologic\\S*",
        ["eco", "hydrologic\\S*"],
        ["hydro", "ecologic\\S*"],
        'environmental', 'minim\\S\\S', 'acceptable',
        'augmented', 'augmentation', 'compensation',
        'experimental', 'flushing', ['in', 'stream'], 'maintenance',
        'optimum', 'restorati\\S{2}',
    ]
    targets_1 = ['flood', 'flow', ['water', 'level'], 'discharge']

    # search2: qualifier ... flow
    qualifiers_2 = [
        'compensat[a-z]{1,3}', 'conservation', 'cultural', ['cut', 'off'],
        'design', 'fish', 'functional', 'indigenous', 'limit', 'maintenance',
        'management', 'maximum', 'natural', 'preference',
        'protection', 'rating', 'regime[a-z]{0,1}', 'residual',
        'right', 'sanita(ry|tion)', 'scenario', 'standard',
        'suitable', 'surplus', 'sustainable', 'threshold',
        'use', 'vital',
    ]
    targets_2 = ['flow']

    # search3: downstream/dam/reservoir followed by a release term
    qualifiers_3 = ['downstream', 'dam', 'reservoir']
    targets_3 = [['water', 'release'], ['flow', 'release'], 'reoperation']

    # search4: controlled/artificial flood
    qualifiers_4 = ['controlled', 'artificial']
    targets_4 = ['flood']

    # search5: hydrologic(al) requirement/manipulation
    qualifiers_5 = ['hydrologic(al)*']
    targets_5 = ['requirement', 'manipulation']

    # search6: water-related noun ... management term
    qualifiers_6 = ['flow', ['stream', 'flow'], 'freshwater', 'water', ['water', 'level']]
    targets_6 = [
        'abstraction', 'allocation', 'criteri\\S{1,2}', 'delivery*',
        'demand', 'guideline',
        'need', 'prescription', 'recommendation', 'recovery', 'requirement',
        'reserve', 'restoration', 'restriction', 'withdrawal',
    ]

    # (mode, first term list, second term list), in search1..search6 order.
    search_specs = [
        ('with', qualifiers_1, targets_1),
        ('with', qualifiers_2, targets_2),
        ('pre', qualifiers_3, targets_3),
        ('pre', qualifiers_4, targets_4),
        ('pre', qualifiers_5, targets_5),
        ('with', qualifiers_6, targets_6),
    ]

    return {
        f'search{number}': [mode, [first_terms, second_terms]]
        for number, (mode, first_terms, second_terms) in enumerate(search_specs, start=1)
    }

In [None]:
def _expand_search_pattern(repattern: str, inflect: bool) -> List[str]:
    """Expands one regex pattern into the list of terms it stands for."""
    # English words matching the pattern; fall back to the raw pattern itself
    # when nothing matches so that the term is never silently lost.
    terms = find_matching_words(repattern)
    if not terms:
        terms = [repattern]
    elif not isinstance(terms, list):
        # Defensive: keep working even if find_matching_words returns a scalar.
        terms = [terms]

    # Lemmatization happens only when 'inflect' is False: the terms are then
    # reduced to their lemmas instead of being expanded into inflected forms.
    if not inflect:
        try:
            terms = [token.lemma_ for token in spacy_nlp(" ".join(terms)) if token.lemma_]
        except NameError:
            # Keep the terms as they are if the spaCy pipeline is not loaded.
            print("Warning: 'spacy_nlp' object not found. Cannot lemmatize. Ensure spaCy is loaded.")

    return terms


def _inflect_search_term(word: str, in_lemmatizer) -> List[str]:
    """Returns a term followed by its plural, present-participle and spelling variants."""
    # The term may carry the double quotes added by combinate_concats.
    clean_word = word.strip('"')
    was_quoted = word.startswith('"') and word.endswith('"')

    # The term itself is always kept, in first position.
    variants = [word]

    # Inflection is only attempted on single words, not on multi-word phrases.
    if " " in clean_word or "-" in clean_word:
        return variants

    def add_variant(variant):
        # Add the bare variant, plus a quoted copy when the source term was quoted.
        variants.append(variant)
        if was_quoted:
            variants.append(f'"{variant}"')

    # Plural form, only when one plausibly exists.
    try:
        if plural_form_exists(clean_word, in_lemmatizer):
            plural_word = textblob_pluralize(clean_word)
            if plural_word != clean_word:
                add_variant(plural_word)
    except NameError:
        print("Warning: textblob_pluralize or plural_form_exists not found.")
    except Exception as e:
        print(f"Warning: Error during pluralization of '{clean_word}': {e}")

    # Present participle(s); getInflection may return nothing for a word.
    try:
        for pp_word in getInflection(clean_word, 'VBG') or ():
            if pp_word != clean_word:
                add_variant(pp_word)
    except Exception as e:
        print(f"Warning: Error during present participle generation for '{clean_word}': {e}")

    # Spelling variants (british vs american).
    try:
        for variant in get_spelling_variants(clean_word):
            if variant != clean_word:
                add_variant(variant)
    except Exception as e:
        print(f"Warning: Error during spelling variant generation for '{clean_word}': {e}")

    return variants


def create_search_string(
    in_search_duo: List[str], 
    inflect: bool, 
    or_chars: str = ' OR ', 
    and_chars: str = ' AND ',
    inner_separators: List[str] = [" ", "-", ""], 
    use_quotes: bool = True # Default for potential quoting, but logic inside might override
) -> str:
    """
    Creates a search query string based on structured input, 
    optionally handling word inflections, combinations, and logical operators.

    Args:
        in_search_duo: A two-item sequence (as produced by 
            create_eflows_search_terms):
            - [0] (str): The type of combination, 'pre' or 'with'.
            - [1]: Two lists. Each list represents a conceptual part of the 
              search and contains elements that are either single regex 
              patterns (str) or lists of two regex patterns (List[str]) whose 
              matches are concatenated with `inner_separators`.
        inflect (bool): If True, expand words by adding plural forms,
                      present participles, and spelling variants. If False, 
                      words found via regex matching are lemmatized with spaCy 
                      instead.
        or_chars (str, optional): String used to join alternatives within a group. 
                                Defaults to ' OR '.
        and_chars (str, optional): String used to join the two main groups when 
                                 in_search_duo[0] is 'with'. Defaults to ' AND '.
        inner_separators (List[str], optional): Separators used when combining 
                                            multi-word patterns via `combinate_concats`. 
                                            Defaults to [" ", "-", ""]. Never 
                                            modified by this function.
        use_quotes (bool, optional): For 'pre', whether each final 
                                 "first second" combination is quoted. For 
                                 'with', whether multi-pattern phrases are 
                                 quoted. Defaults to True.

    Returns:
        str: The formatted search query string. An empty string is returned 
             (after printing a warning) when the combination type is unknown 
             or when the final combination step fails. Terms appear in a 
             deterministic order (first occurrence wins).

    Raises:
        NameError: If `find_matching_words` or, with inflect=True, 
                   `WordNetLemmatizer` is not defined/imported.

    Dependencies:
        Requires NLTK (WordNetLemmatizer, wordnet), TextBlob, spaCy, pyinflect,
        a 'bream' library, and a `recomb` helper that is not defined in this 
        notebook (presumably provided by lit_utility_functions_2025.ipynb - 
        verify).
    """
    combo_type = in_search_duo[0]

    # The NLTK lemmatizer is only needed for the plural checks of inflect=True.
    nltk_lemmatizer = WordNetLemmatizer() if inflect else None

    # Multi-pattern phrases are only quoted here for 'with' searches; for 'pre'
    # searches the quotes are added around the final combination instead.
    should_quote_combination = (combo_type == 'with') and use_quotes

    # One de-duplicated list of terms per main list of in_search_duo[1].
    combo_list_formatted = []

    for combo_list in in_search_duo[1]:
        collected_terms = []

        for repattern_group in combo_list:
            if isinstance(repattern_group, list):
                # --- Several patterns to concatenate, e.g. ["water", "level"] ---
                sub_results = [
                    _expand_search_pattern(repattern, inflect)
                    for repattern in repattern_group
                ]

                if len(sub_results) >= 2:
                    if len(sub_results) > 2:
                        # Only pairs are supported; say so instead of dropping silently.
                        print(f"Warning: Only the first two patterns of {repattern_group} are combined; "
                              f"{len(sub_results) - 2} extra pattern(s) ignored.")
                    processed = combinate_concats(
                        prefixes=sub_results[0],
                        suffixes=sub_results[1],
                        separators=inner_separators,
                        add_quotes=should_quote_combination
                    )
                else:
                    print(f"Warning: Expected two lists in repattern_sub_results for combination, but got {len(sub_results)}. Skipping combination.")
                    # Fallback: just flatten whatever was found.
                    processed = [term for sublist in sub_results for term in sublist]
            else:
                # --- A single regex pattern ---
                processed = _expand_search_pattern(repattern_group, inflect)

            collected_terms.extend(processed)

        # Drop duplicates while keeping first-seen order (dicts preserve
        # insertion order), so the generated query is reproducible.
        word_group_formatted = list(dict.fromkeys(collected_terms))

        # Inflection/variant generation runs *only* if inflect=True.
        if inflect:
            expanded_terms = []
            for word in word_group_formatted:
                expanded_terms.extend(_inflect_search_term(word, nltk_lemmatizer))
            word_group_formatted = list(dict.fromkeys(expanded_terms))

        combo_list_formatted.append(word_group_formatted)

    # --- Final combination of the two processed groups ---
    # Bound up front so that a failure below yields "" rather than an
    # UnboundLocalError at the return statement.
    search_duo_formatted = ""
    try:
        if combo_type == 'pre':
            # 'pre': every term of the first list followed by a space and every
            # term of the second list; all combinations are joined with OR.
            combinations = combinate_concats(
                prefixes=combo_list_formatted[0], 
                suffixes=combo_list_formatted[1],
                separators=[" "],
                add_quotes=use_quotes
            )
            search_duo_formatted = recomb(combinations, 
                                          recomb_sep=or_chars)

        elif combo_type == 'with':
            # 'with': two OR-joined blocks, joined together with AND.
            block1 = recomb(combo_list_formatted[0], recomb_sep=or_chars)
            block2 = recomb(combo_list_formatted[1], recomb_sep=or_chars)
            search_duo_formatted = f"{block1}{and_chars}{block2}"

        else:
            print(f"Warning: Unknown search duo type '{in_search_duo[0]}'. Returning empty string.")

    except NameError as e:
         print(f"Warning: Function 'combinate_concats' or 'recomb' not found. Error: {e}")
    except Exception as e:
         print(f"Error during final combination: {e}")

    return search_duo_formatted