# Query Example Notebook

In [None]:
import time
import rasterio
import multiprocessing
import numpy as np
import pandas as pd

In [None]:
def point_data_from_file(
    filename: str,
    lon_lat_tuples: list,
    queue: multiprocessing.Queue = None,
    sub_error_val: float = np.nan,
    *,
    max_attempts: int = 3,
    retry_delay: float = 3.0,
) -> dict:
    """
    Sample the first 3 bands of a raster file at the given points.

    `filename` is assumed to be a raster file with at least 3 bands in
    the epsg4326 coordinate system. The returned dictionary is intended
    to be used as input to create or add to a pandas dataframe, with keys
    being the column names ("band1", "band2", "band3") and values being
    the row values (one entry per coordinate, in input order).

    Opening/sampling is retried when rasterio raises a RasterioIOError.
    If every attempt fails, a message is printed and a dictionary of the
    same shape is returned with every value set to `sub_error_val`.

    :param filename: raster file to extract values from
    :param lon_lat_tuples: list of (lon, lat) values in the raster file
      to extract values for
    :param queue: multiprocessing queue; if it is not None, the return
      value will also be put on it
    :param sub_error_val: value to use for every entry of the dict if the
      file cannot be read due to a RasterioIOError, default is np.nan
    :param max_attempts: number of times to try reading the file before
      giving up, default is 3
    :param retry_delay: seconds to wait between failed attempts,
      default is 3.0
    :return: dictionary with keys "band<index>" and values being the list
      of that band's values for each given coordinate
    """
    bands = [1, 2, 3]
    d = None
    for attempt in range(1, max_attempts + 1):
        try:
            # start from empty columns on every attempt so a partially
            # sampled attempt never leaks into the result
            columns = {f"band{b}": [] for b in bands}
            with rasterio.open(filename) as src:
                # src.sample yields one sequence per point, holding one
                # value per requested band index (so 3 values each)
                for pt_value in src.sample(lon_lat_tuples, indexes=bands):
                    for val, b in zip(pt_value, bands):
                        columns[f"band{b}"].append(val)
        except rasterio.errors.RasterioIOError:
            # only wait when another attempt is going to follow
            if attempt < max_attempts:
                time.sleep(retry_delay)
        else:
            d = columns
            break

    # if it failed, make the dictionary exactly the same shape as if it
    # succeeded, but with all values equal to sub_error_val
    if d is None:
        print(
            f"Failed to open file {filename} because of RasterioIOError, "
            f"substituting values with {sub_error_val}"
        )
        n_points = len(lon_lat_tuples)
        d = {f"band{b}": [sub_error_val] * n_points for b in bands}

    # put dict in multiprocessing queue if given one
    if queue is not None:
        queue.put(d)
    return d

## example extraction

In [None]:
# inputs: one raster file and the points to sample from it
filename = "s3://jupiter-climatescoreglobal-eos/production/heat/fit_v7/daysExceeding35C/agg_v2/ssp585/remap_v1/2045/n40w075.tif"
lon_lat_tuples = [(-73.946, 40.693), (-73.973, 40.773)]

# seed the output columns with the coordinates themselves
data = {
    "lon": [lon for lon, _ in lon_lat_tuples],
    "lat": [lat for _, lat in lon_lat_tuples],
}

# query the file and report how long the query took
start_time = time.perf_counter()
results = point_data_from_file(filename, lon_lat_tuples)
elapsed = time.perf_counter() - start_time
print(f"took {round(elapsed, 2)}s to query")

# add one column per band returned by the query
data.update(results)

# display results (last expression of the cell is rendered by the notebook)
pd.DataFrame(data)

## example extraction using multiprocessing

#### IMPORTANT:
##### !!! jupyter notebooks can only queue up execution of a single cell if that cell uses the multiprocessing library !!!

In [None]:
# values
lon_lat_tuples = [(-73.946, 40.693), (-73.973, 40.773)]
files_to_query = [
    "s3://jupiter-climatescoreglobal-eos/production/heat/fit_v7/daysExceeding35C/agg_v2/ssp245/remap_v1/2025/n40w075.tif",
    "s3://jupiter-climatescoreglobal-eos/production/heat/fit_v7/daysExceeding35C/agg_v2/ssp585/remap_v1/2045/n40w075.tif",
]

# create out dictionary with lon lat data
data = {}
data["lon"] = [t[0] for t in lon_lat_tuples]
data["lat"] = [t[1] for t in lon_lat_tuples]

# get point data using multiprocessing (print time it took to query)
# Each process gets its own queue: with a single shared queue the results
# arrive in completion order, so "file{i}" could be labelled with another
# file's values.
queues = [multiprocessing.Queue() for _ in files_to_query]
processes = [
    multiprocessing.Process(
        target=point_data_from_file,
        args=(filename, lon_lat_tuples, queue, np.nan),
    )
    for filename, queue in zip(files_to_query, queues)
]
# start all the processes before doing anything else
start_time = time.perf_counter()
for p in processes:
    p.start()
# collect results, in the same order as files_to_query; this must happen
# BEFORE join, because a process that has put items on a queue may not
# terminate until that data has been consumed (joining first can deadlock)
results = [queue.get() for queue in queues]
# call join now that every queue has been drained
for p in processes:
    p.join()
print(f"took {round(time.perf_counter() - start_time, 2)}s to query")

# store results in dict; index i matches files_to_query[i]
for i, result in enumerate(results):
    for k, v in result.items():
        data[f"file{i}_{k}"] = v

# display results
pd.DataFrame(data)