In [13]:
%run lit_utility_functions_2025.ipynb

In [29]:
def find_matching_words(pattern, bound_pattern=True):
  """
  Finds all English words that match a given regular expression pattern.

  Args:
    pattern: The regular expression pattern (string).

  Returns:
    A list of English words that match the pattern.  Returns an empty list if no words match
    or if there's an invalid regex pattern.  Prints a warning if the NLTK words corpus is not found.
  """
  if bound_pattern:
      pattern = f'^{pattern}$'
    
  try:
    regex = re.compile(pattern)
  except re.error as e:
    print(f"Invalid regular expression pattern: {e}")
    return []

  try:
    english_words = words.words()
  except LookupError:
    print("Warning: NLTK words corpus not found. Downloading...")
    try:
      nltk.download('words')
      english_words = words.words()
    except Exception as e:
      print(f"Error downloading NLTK words corpus: {e}")
      return []
        
  matching_words = [word for word in english_words if regex.search(word)]
  
  return matching_words

def combinate_concats(prefixes, suffixes, separators=[" ", "-", ""]):
    """
    Generates all combinations using itertools.product (most efficient).
    """
    combinations = [
        "".join(combination)
        for combination in itertools.product(prefixes, separators, suffixes)
    ]
    return combinations

def plural_form_exists(word):
    """
    Checks if a plausible plural form of a word exists, with improved logic
    and handling of irregular plurals.  Uses WordNet and a rule-based fallback.

    Args:
      word: The word (string) to check.

    Returns:
      True if a plausible plural form is found, False otherwise.
    """
    lemmatizer = WordNetLemmatizer()

    # 1. Check if the word is already plural (common case):
    if wordnet.synsets(word) and any(lemma.name().endswith('s') for
                                     synset in wordnet.synsets(word) for lemma in synset.lemmas()):
       return True

    # 2. Lemmatize the word (get the singular form):
    lemma = lemmatizer.lemmatize(word, wordnet.NOUN)


    #3. Check if word is the lemma (if so, it is most likely singular)
    if lemma != word:
        return True #Word is not the lemma (it's likely already a plural form)
    
    # 4. If lemma and word are the same, then add an s and try again with wordnet
    if wordnet.synsets(word + 's'):
        return True
    
    # 5. Try common plural endings
    if word.endswith(("s", "x", "z", "ch", "sh")):
        plural = word + "es"
    elif word.endswith("y") and len(word) > 1 and word[-2] not in "aeiou":
        plural = word[:-1] + "ies"
    else:
        plural = word + "s"
    
    if wordnet.synsets(plural):
        return True

    return False
    
def textblob_pluralize(word):
    w = Word(word)
    return w.pluralize()

def get_spelling_variants(word):
    """Gets American and British spelling variants of a word using bream."""
    variants = set()
    variants.add(word)  # Add the original word

    try:
      #The following lines will generate errors if the words are not int he dictionary. We capture these.
        american = bream.to_american(word)
        variants.add(american)
    except:
        pass
    try:
        british = bream.to_british(word)
        variants.add(british)
    except:
      pass
    return list(variants)

def create_generic_search_string():
    combo1_1 = ["ecologic\\S*", ["eco", "hydrologic\\S*"], ["hydro", "ecologic\\S*"],
                'environmental', 'minim\\S\\S', 'acceptable',
                'augmented', 'augmentation', 'compensation', 
                'experimental', 'flushing', ['in', 'stream'], 'maintenance',
                'optimum', 'restorati\\S{2}']

    combo1_2 = ['flood', 'flow', ['water', 'level'], 'discharge']
    
    combo2_1 = ['compensat[a-z]{1,3}', 'conservation', 'cultural', ['cut', 'off'], 
                'design', 'fish', 'functional', 'indigenous', 'limit', 'maintenance',
                'management', 'maximum', 'natural', 'preference', 
                'protection', 'rating', 'regime[a-z]{0,1}', 'residual',
                'right', 'sanita(ry|tion)', 'scenario', 'standard', 
                'suitable', 'surplus', 'sustainable', 'threshold',
                'use', 'vital']
    combo2_2 = ['flow']
    
    combo3_1 = ['downstream', 'dam', 'reservoir']
    combo3_2 = [['water', 'release'], ['flow', 'release'], 'reoperation']
    
    combo4_1 = ['controlled', 'artificial']
    combo4_2 = ['flood']
    
    combo5_1 = ['hydrologic(al)*']
    combo5_2 = ['requirement', 'manipulation']
    
    combo6_1 = ['flow', ['stream', 'flow'], 'freshwater', 'water', ['water', 'level']]
    combo6_2 = ['abstraction', 'allocation', 'criteri\\S{1,2}', 'delivery*', 
                'demand', 'guideline',
                'need', 'prescription', 'recommendation', 'recovery', 'requirement', 
                'reserve', 'restoration', 'restriction', 'withdrawal']
    
    search_dict = [
            ['with', [combo1_1, combo1_2]],
          ['with', [combo2_1, combo2_2]],
          ['pre', [combo3_1, combo3_2]],
          ['pre', [combo4_1, combo4_2]],
          ['pre', [combo5_1, combo5_2]],
          ['with', [combo6_1, combo6_2]],
          [None, [['e-flow', 'e-flows']]]
    ]
    return(search_dict)

In [30]:
def create_openalex_search_string(in_searchlist):
    #For each combo item~~~~~~~~~~~~~~~~~~~~
    search_list_formatted = []
    for combo_duo in in_searchlist:
        combo_list_formatted = []
        for combo_list in combo_duo[1]:
            word_group_formatted = []
            for repattern_group in combo_list:
                #When multiple words in a group
                if isinstance(repattern_group, list):
                    #print(repattern_group)
                    #Apply find_matching_words to each
                    repattern_group_antistemmed = []
                    for repattern in repattern_group:
                        k = find_matching_words(repattern)
                        if (len(k) == 0):
                            k = repattern
                        if not isinstance(k, list):
                            k = [k]
                        repattern_group_antistemmed.append(k)
                    #Then join them in the order with a space, no space, and with a hyphen
                    repattern_group_antistemmed = combinate_concats(
                        prefixes=repattern_group_antistemmed[0],
                        suffixes=repattern_group_antistemmed[1]
                    )
                else:
                    repattern_group_antistemmed = find_matching_words(repattern_group)
                    if (len(repattern_group_antistemmed) == 0):
                        repattern_group_antistemmed = repattern_group
                    if not isinstance(repattern_group_antistemmed, list):
                        repattern_group_antistemmed = [repattern_group_antistemmed]
                        
                #Remove duplicates
                word_group_formatted += list(set(repattern_group_antistemmed)) #Remove duplicates

            # Create a new list to store all variations
            new_word_group = []
            for word in word_group_formatted:
                new_word_group.append(word)  # Add original word

                # Add plural form
                if plural_form_exists(word):
                    plural_word = textblob_pluralize(word)
                    new_word_group.append(plural_word)

                # Add present participle
                pre_participle = getInflection(word, 'VBG')
                if pre_participle is not None:  # Avoid adding if it's the same
                    if isinstance(pre_participle, list):
                        for w in pre_particile:
                            new_word_group.append(pre_participle)
                    else:
                        new_word_group.append(pre_participle[0])

                # Add spelling variants (british vs americna or vice-versa)
                spelling_variants = get_spelling_variants(word)
                for variant in spelling_variants:
                    if variant != word: # Avoid adding if its the same
                      new_word_group.append(variant)

            word_group_formatted = list(set(new_word_group)) # Remove dups (again, after adding variants)
            combo_list_formatted.append(word_group_formatted)
            
            #Then create actual combinations for each combo duo with spaces in between when 'pre' and 'OR' when 'with'
            #combinate_concats
            
            #Then combine with OR and AND, or NOT
            #         search_list = []
            # for combo in create_generic_search_string().values():
            #     if len(combo) > 1:
            #         v = combine_2w_regex(combo[1][0], combo[1][1], combo[0]=='pre')
            #     else:
            #         v = combo[0]
            #     search_list.append(v)
            # print(search_list)
            search_list_formatted.append([combo_duo[0], combo_list_formatted])

    return(search_list_formatted)

    
# def create_openalex_api_call(in_search_string):
#     #Do NOT allow lemmatization

    


In [31]:
in_searchlist = create_generic_search_string()
check = create_openalex_search_string(in_searchlist)
print(check)

[['with', [['ecohydrologic', 'eco hydrologically', 'hydro-ecological', 'hydro ecological', 'minimuss', 'optima', 'hydro ecologic', 'eco-hydrologically', 'augmentations', 'restorations', 'eco hydrologic', 'hydroecologic', 'eco hydrological', 'minimum', 'hydroecological', 'minimal', 'optimum', 'flushing', 'environmental', 'minimus', 'ecological', 'hydroecologically', 'ecologically', 'in stream', 'ecologic', 'maintenances', 'hydro ecologically', 'eco-hydrologic', 'ecohydrologically', 'maintenance', 'minima', 'in-stream', 'instream', 'compensations', 'restoratives', 'eco-hydrological', 'hydro-ecologically', 'compensation', 'experimental', 'ecohydrological', 'acceptable', 'hydro-ecologic', 'augmented', 'augmentation', 'restoration', 'restorative'], ['flow', 'waterlevel', 'discharge', 'discharges', 'flowing', 'water-level', 'discharging', 'flows', 'water level', 'flood', 'floods', 'flooding']]], ['with', [['ecohydrologic', 'eco hydrologically', 'hydro-ecological', 'hydro ecological', 'minimu

In [None]:
#For each word in the group
plural_list = []
for word in word_group_formatted:
    if plural_form_exists(word):
        print(word)
        plural_word = textblob_pluralize(word)
        plural_list += plural_word
word_group_formatted += plural_list