In [145]:
# Execute the companion notebook in this kernel.
# NOTE(review): presumably this is where create_generic_search_terms,
# create_search_string and remove_redundant_parentheses come from — confirm.
%run create_search_strings.ipynb
# Execute the project set-up script in this kernel.
# NOTE(review): later cells use pd, os, itertools, urlparse and datdir without
# importing or defining them in this notebook — presumably set_up.py provides
# them; confirm, otherwise a fresh-kernel "Run All" will fail.
%run set_up.py

import pyalex #https://github.com/J535D165/pyalex
from pyalex import config
from pyalex import Works

# pyalex client settings: retry budget for failed requests and the contact
# e-mail attached to API calls. Both lines write to the same pyalex config
# object (`config` is `pyalex.config`).
config.max_retries = 1
pyalex.config.email = "mathis.messager@mail.mcgill.ca"

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\messa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [146]:
# Generic search terms, keyed by search identifier.
search_terms_dict = create_generic_search_terms()

# One OpenAlex boolean query string per search: terms are not inflected,
# compound words are expanded with space / hyphen / no separator, and
# quoting is enabled.
oalex_string_dict = {
    search_number: create_search_string(
        search_terms,
        inflect=False,
        or_chars=' OR ',
        and_chars=' AND ',
        inner_separators=[" ", "-", ""],
        use_quotes=True,
    )
    for search_number, search_terms in search_terms_dict.items()
}

In [149]:
# Display the query string generated for 'search3' as a sanity check.
oalex_string_dict['search3']

'((downstream water release) OR (downstream water-release) OR (downstream flowrelease) OR (downstream flow-release) OR (downstream reoperation) OR (downstream flow release) OR (downstream waterrelease) OR (reservoir water release) OR (reservoir water-release) OR (reservoir flowrelease) OR (reservoir flow-release) OR (reservoir reoperation) OR (reservoir flow release) OR (reservoir waterrelease) OR (dam water release) OR (dam water-release) OR (dam flowrelease) OR (dam flow-release) OR (dam reoperation) OR (dam flow release) OR (dam waterrelease))'

In [142]:
def extract_last_url_segment(url):
    """
    Extracts the last non-empty segment of a URL path.

    Trailing slashes are ignored, so both "https://openalex.org/C123" and
    "https://openalex.org/C123/" yield "C123".

    Args:
        url: The URL string.

    Returns:
        The last segment of the URL path, or None if the input is not a
        parseable string (e.g. None, NaN, bytes) or its path contains no
        segment (e.g. "https://openalex.org" or "https://openalex.org/").
    """
    try:
        path = urlparse(url).path
        # Strip trailing slashes first: a plain split('/')[-1] would return
        # '' for ".../C123/" instead of "C123".
        segment = path.rstrip('/').rsplit('/', 1)[-1]
    except (AttributeError, TypeError, ValueError):
        # Non-string input (None, float NaN, bytes) or a malformed URL.
        return None

    # An empty segment means the URL had no usable path component.
    return segment or None

def extract_concept_from_url_df(df, url_col, include_col=None):
    """
    Extracts the last URL path segment from a DataFrame column, optionally
    restricted to the rows flagged for inclusion.

    The input DataFrame is not modified.

    Args:
      df: The Pandas DataFrame.
      url_col: The name of the column containing URLs (string).
      include_col: Optional name of the column flagging the rows to keep
        (string). A row is kept when its value, converted to text, stripped
        of surrounding whitespace and lower-cased, is one of
        'y', 'yes', 'true', '1', 't'. If None, every row is used.

    Returns:
        A list with one entry per selected row, in row order, holding the
        value returned by extract_last_url_segment for that row's URL
        (which may be None for unparseable URLs). Returns None, after
        printing a message, if a required column is missing from df.
    """
    # Input validation: report exactly which required columns are absent.
    required_columns = [url_col] if include_col is None else [url_col, include_col]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"DataFrame is missing some columns: {missing_columns}")
        return None

    urls = df[url_col]
    if include_col is not None:
        # Build the boolean mask on the side instead of copying the whole
        # frame and overwriting the flag column. Whitespace is stripped so
        # that flags such as " yes" read from a CSV are still recognised.
        include_mask = (
            df[include_col]
            .astype(str)
            .str.strip()
            .str.lower()
            .isin(['y', 'yes', 'true', '1', 't'])
        )
        urls = urls.loc[include_mask]

    return urls.apply(extract_last_url_segment).tolist()

#Read the table of OpenAlex concepts, then keep the ID (last URL segment)
#of every concept flagged in the 'include' column
concepts_toinclude_pd = pd.read_csv(
    os.path.join(datdir, 'openalex_concepts_toinclude.csv')
)
concepts_toinclude_list = extract_concept_from_url_df(
    df=concepts_toinclude_pd, url_col='openalex_id', include_col='include'
)

In [150]:
#Retrieve the OpenAlex works matching each search string
oalex_records_dict = {}
for search_number, search_terms in oalex_string_dict.items():
    print(f'Retrieving {search_number}')

    # Title/abstract search on the query string...
    oa_query = Works().search_filter(title_and_abstract=search_terms)
    # ...restricted to the selected concepts (IDs joined with '|').
    # NOTE(review): is_retracted is passed as the string 'False', not the
    # boolean False — confirm the API treats both the same way.
    oa_query = oa_query.filter(
        concept={"id": '|'.join(concepts_toinclude_list)},
        is_retracted='False')

    print(oa_query.url)

    # paginate() yields pages of records; flatten them into a single list.
    # NOTE(review): n_max=200 looks like a testing cap — confirm before a
    # full retrieval.
    oalex_records_dict[search_number] = list(
        itertools.chain.from_iterable(
            oa_query.paginate(per_page=200, n_max=200)))

# Number of records retrieved for each search
print([len(rec_list) for rec_list in oalex_records_dict.values()])

#Remove duplicates

# #Serialize
# #All results from PyAlex can be serialized. For example, save the results to a JSON file:
# import json
# from pathlib import Path
# from pyalex import Work

# with open(Path("works.json"), "w") as f:
#     json.dump(Works().get(), f)

# # with open(Path("works.json")) as f:
# #     works = [Work(w) for w in json.load(f)]


Retrieving search1
https://api.openalex.org/works?filter=title_and_abstract.search:%28%28%22hydroecologic%22%29+OR+%28%22ecohydrologically%22%29+OR+%28minimus%29+OR+%28acceptable%29+OR+%28%22ecohydrological%22%29+OR+%28restoration%29+OR+%28%22instream%22%29+OR+%28%22eco-hydrologic%22%29+OR+%28%22hydro+ecologic%22%29+OR+%28experimental%29+OR+%28%22hydro-ecological%22%29+OR+%28restorative%29+OR+%28%22hydro+ecological%22%29+OR+%28ecological%29+OR+%28%22hydroecological%22%29+OR+%28%22in-stream%22%29+OR+%28%22eco+hydrological%22%29+OR+%28%22eco+hydrologic%22%29+OR+%28minimum%29+OR+%28ecologically%29+OR+%28augmentation%29+OR+%28flush%29+OR+%28optimum%29+OR+%28%22eco+hydrologically%22%29+OR+%28augment%29+OR+%28%22hydro-ecologic%22%29+OR+%28%22hydro+ecologically%22%29+OR+%28minimal%29+OR+%28compensation%29+OR+%28%22ecohydrologic%22%29+OR+%28%22eco-hydrological%22%29+OR+%28%22hydroecologically%22%29+OR+%28%22in+stream%22%29+OR+%28ecologic%29+OR+%28%22hydro-ecologically%22%29+OR+%28maintenance%2

In [121]:
# post_oalex_filter_string = [create_search_string([terms], inflect=True,
#                                                or_chars='|', and_chars='AND',
#                                                use_quotes=False)
#                             for terms in search_terms_list]
#Generate the regex to be adjusted: build the pattern for the first set of
#search terms (inflected, '|' as OR, optional hyphen/whitespace inside
#compound words), remove redundant parentheses, then split it on 'AND' into
#separate regex queries rather than using greedy lookaheads.
post_oalex_filter_string = remove_redundant_parentheses(
    create_search_string(
        [search_terms_list[0]],
        inflect=True,
        or_chars='|',
        and_chars='AND',
        inner_separators=[r"[-\s]*"],
        use_quotes=False,
    )
).split('AND')


['((flushing)|(minimus)|(eco[-\\s]*hydrological)|(eco[-\\s]*hydrologically)|(acceptable)|(restoration)|(compensations)|(experimental)|(maintenances)|(in[-\\s]*stream)|(restorative)|(ecological)|(augmentations)|(minima)|(hydro[-\\s]*ecological)|(minimum)|(restoratives)|(ecologically)|(augmented)|(augmentation)|(hydro[-\\s]*ecologic)|(optimum)|(minimal)|(compensation)|(eco[-\\s]*hydrologic)|(hydro[-\\s]*ecologically)|(ecologic)|(minimuss)|(maintenance)|(environmental)|(restorations)|(optima))', '((water[-\\s]*level)|(flood)|(discharges)|(flows)|(discharging)|(floods)|(flooding)|(discharge)|(flow)|(flowing))']


In [92]:
# record = record_list[4000]
# print(record.keys())
# print(record['is_retracted'])

# Concatenate the searchable text of one record: title, abstract and keyword
# display names. A title or abstract can be None (pyalex rebuilds 'abstract'
# from the inverted index and gives None when there is none), which would
# make str.join raise TypeError, so None parts are dropped. A null keyword
# list is treated as empty.
' '.join(
    [text for text in (record['title'], record['abstract']) if text is not None]
    + [kw['display_name'] for kw in (record['keywords'] or [])]
)


dict_keys(['id', 'doi', 'title', 'display_name', 'relevance_score', 'publication_year', 'publication_date', 'ids', 'language', 'primary_location', 'type', 'type_crossref', 'indexed_in', 'open_access', 'authorships', 'institution_assertions', 'countries_distinct_count', 'institutions_distinct_count', 'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list', 'apc_paid', 'fwci', 'has_fulltext', 'fulltext_origin', 'cited_by_count', 'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio', 'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords', 'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location', 'sustainable_development_goals', 'grants', 'datasets', 'versions', 'referenced_works_count', 'referenced_works', 'related_works', 'abstract_inverted_index', 'abstract_inverted_index_v3', 'cited_by_api_url', 'counts_by_year', 'updated_date', 'created_date'])
False


'Thermal regime of a headwater stream within a clear-cut, coastal British Columbia, Canada This study examined the thermal regime of a headwater stream within a clear-cut. The stream had a complex morphology dominated by step–pool features, many formed by sediment accumulation upstream of woody debris. Maximum daily temperatures increased up to 5 °C after logging, and were positively associated with maximum daily air temperature and negatively with discharge. Maximum daily temperatures generally increased with downstream distance through the cut block, but decreased with distance in two segments over distances of tens of metres, where the topography indicated relatively concentrated lateral inflow. Localized cool areas within a step–pool unit were associated with zones of concentrated upwelling. Bed temperatures tended to be higher and have greater ranges in areas of downwelling flow into the bed. Heat budget estimates were made using meteorological measurements over the water surface 

In [None]:
'''
~~~~~ Search for works in OpenAlex based on search string ~~~~~~~~~~~~~~~~~~~~~~
Reference info on the API: 
# https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/search-entities
# https://docs.openalex.org/api-entities/works/search-works

#EX: https://api.openalex.org/works?search=(elmo AND "sesame street") NOT (cookie OR monster)
#Filter categories based on csv
#do not lemmatize
#&per-page=100&cursor=*

#~~~~~~~~~~~~~~~~ PAGING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Basic paging only works to get the first 10,000 results of any list. If you want to see more than 10,000 results, you'll need to use cursor paging.
To use cursor paging, you request a cursor by adding the cursor=* parameter-value pair to your query.
    Get a cursor in order to start cursor pagination:
    https://api.openalex.org/works?filter=publication_year:2020&per-page=100&cursor=*
The response to your query will include a next_cursor value in the response's meta object. Here's what it looks like:
{
  "meta": {
    "count": 8695857,
    "db_response_time_ms": 28,
    "page": null,
    "per_page": 100,
    "next_cursor": "IlsxNjA5MzcyODAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI0ODg0OTk3NjQnXSI="
  },
  "results" : [
    // the first page of results
  ]
}

To retrieve the next page of results, copy the meta.next_cursor value into the cursor field of your next request.

    Get the next page of results using a cursor value:
    https://api.openalex.org/works?filter=publication_year:2020&per-page=100&cursor=IlsxNjA5MzcyODAwMDAwLCAnaHR0cHM6Ly9vcGVuYWxleC5vcmcvVzI0ODg0OTk3NjQnXSI=

To get all the results, keep repeating this process until meta.next_cursor is null and the results set is empty.
'''

#
