In [None]:
import os
import sys
from pathlib import Path
import logging

from tqdm.notebook import tqdm
import geopandas as gpd
import pandas as pd

sys.path.append("..")
from nmaipy.feature_api import FeatureApi
from nmaipy.constants import AREA_CRS, API_CRS
import nmaipy.coverage_utils

# Fallback lower date bound, applied when the points file has no "since" column.
SINCE = "2007-01-01"

# Input/output locations, all relative to the notebook's parent data directory.
DATA_DIR = Path("../data")
COVERAGE_CHUNK_DIR = DATA_DIR.joinpath("coverage_chunks")  # per-chunk parquet cache of coverage results
POINTS_FILE = DATA_DIR.joinpath("points.csv")

# Name of the column/index level that identifies a point.
ID_COL = "point_id"

# The API key is taken from the environment; a missing variable raises KeyError here.
API_KEY = os.environ["API_KEY"]

## Load Points
Load a file of points to check for coverage of imagery, 3D and AI.

In [None]:
# Read the points table (first CSV column becomes the index) and parse the
# day/month/year "until" strings into datetimes.
df_points = (
    pd.read_csv(POINTS_FILE, index_col=0)
    .assign(until=lambda points: pd.to_datetime(points["until"], format="%d/%m/%Y"))
)
# Quick look at how the "until" dates are distributed.
df_points["until"].hist()
df_points

In [None]:
# Make sure every point has a "since" bound; fall back to the notebook-wide default.
if "since" not in df_points.columns:
    df_points["since"] = SINCE # A very old date just to have a since column.

# Store "until" as an ISO date string. Going through pd.to_datetime first keeps
# this cell safe to re-run: on the first run the column is already datetime
# (no-op), and on later runs the ISO strings are parsed back before formatting,
# instead of failing on the missing `.dt` accessor.
df_points["until"] = pd.to_datetime(df_points["until"]).dt.strftime("%Y-%m-%d")

## Test Coverage
For every point, get a full history of all surveys that intersect with it from the coverage API, and whether the survey has AI and/or 3D resources attached. This doesn't tell us what generation of AI data is available - that will require a subsequent run against the AI Feature API coverage endpoint to determine versions (or just pulling the data and ignoring version).

In [None]:
CHUNK_SIZE = 10000 # For big tests, useful to save chunks of coverage calls in case of failure.

def get_survey_resource_id(resources):
    """
    Get the survey resource id from the resources mapping. This is the id that can be used
    with the AI Feature API to get an exact match (rather than since/until dates).

    Args:
        resources: Mapping of resource type to a list of resource records, where each
            record carries an "id" key. May be None.

    Returns:
        The "id" of the last entry under "aifeatures", or None when `resources` is None,
        has no "aifeatures" key, or that entry is None/empty.
    """
    if resources is None or "aifeatures" not in resources:
        return None
    ai_resources = resources["aifeatures"]
    # An empty (or null) list means there is nothing to pick an id from; previously
    # this case raised IndexError on the [-1] lookup.
    if ai_resources is None or len(ai_resources) == 0:
        return None
    return ai_resources[-1]["id"]

# Pull coverage for all points in chunks of CHUNK_SIZE rows. Each chunk that
# returns any coverage is cached as a parquet file so an interrupted run can
# resume without re-querying the API. Chunks with no coverage at all are not
# cached, so they are requested again on the next run.
coverage_chunks = []  # one trimmed DataFrame per chunk; concatenated once at the end
coverage_columns = [ID_COL, "captureDate", "survey_id", "survey_resource_id", "aifeatures", "3d"]

chunk_dir = Path(COVERAGE_CHUNK_DIR)
chunk_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(0, len(df_points), CHUNK_SIZE)):
    f = chunk_dir / f"coverage_chunk_{i}-{i+CHUNK_SIZE}.parquet"
    if f.exists():
        logging.debug(f"Reading chunk from parquet for {f}.")
        c = pd.read_parquet(f)
    else:
        df_point_chunk = df_points.iloc[i:i+CHUNK_SIZE, :]
        logging.debug(f"Pulling chunk from API for {f}.")
        # Multi-threaded pulls are ok - the API is designed to cope fine with 10-20 threads running in parallel pulling requests.
        results = nmaipy.coverage_utils.threaded_get_coverage_from_point_results(
            df_point_chunk, since_col="since", until_col="until", apikey=API_KEY, threads=20
        )
        # Tag each non-empty result with the id of the point it belongs to.
        # Results are matched to points by position within the chunk.
        c_with_idx = []
        for j in range(len(results)):
            result = results[j]
            if len(result) > 0:
                tagged = result.copy()
                # index[j] is the row label; avoids materialising a whole row just to read it.
                tagged[ID_COL] = df_point_chunk.index[j]
                c_with_idx.append(tagged)
        if len(c_with_idx) > 0:
            c = pd.concat(c_with_idx)
            c["survey_resource_id"] = c.resources.apply(get_survey_resource_id)
            c = (c
                 .drop(columns=["resources"])
                 .rename(columns={"id": "survey_id"})
                )
            c.to_parquet(f)
        else:
            c = None

    if c is not None:
        coverage_chunks.append(c.loc[:, coverage_columns].set_index(ID_COL))

# Concatenate once rather than growing a DataFrame inside the loop (which copies
# all accumulated rows on every iteration). Stays None when nothing was found,
# matching the previous behaviour.
df_coverage = pd.concat(coverage_chunks) if coverage_chunks else None

In [None]:
# Every survey that covers each point, whether or not it has AI. The right join
# keeps one row per coverage record and attaches the point's own columns to it.
df = pd.merge(
    df_points,
    df_coverage,
    how="right",
    left_index=True,
    right_index=True,
)
df

In [None]:
# Most recent survey per point inside its since-until window, AI or not:
# order by capture date, then keep the last entry of each point's group.
df_latest = (
    df
    .sort_values(by="captureDate")
    .groupby(ID_COL)
    .last()
)
df_latest

In [None]:
# Fraction of points whose latest survey does / does not have AI features.
df_latest["aifeatures"].value_counts().div(len(df_latest))

In [None]:
# Most recent AI-enabled survey per point inside its since-until window.
# Points with no AI-enabled survey do not appear in the result.
df_ai_latest = (
    df
    .query("aifeatures == True")
    .sort_values(by="captureDate")
    .groupby(ID_COL)
    .last()
)
df_ai_latest

In [None]:
# Report AI coverage against two denominators: all supplied points, and only
# those points that have at least one imagery survey.
msg_vs_all_points = f" {len(df_ai_latest) / len(df_points) * 100:0.1f}% of the originally provided points have valid AI coverage in the since-until range."
display(msg_vs_all_points)
msg_vs_imagery_points = f" {len(df_ai_latest) / len(df_latest) * 100:0.1f}% of the points that have at least one valid imagery survey have valid AI coverage in the since-until range."
display(msg_vs_imagery_points)

## Which points don't have AI coverage?
It is often useful to explore which points don't have AI coverage yet (but do have imagery coverage). The following section identifies these points, and then runs a greedy search to build a list of surveys that would incrementally increase AI coverage: at each step it picks the survey that covers the largest number of still-uncovered points, then removes those points from consideration. This allows us to answer the question "if we ran AI on this list of extra surveys, how far would AI coverage increase (from x% to y%)?".

In [None]:
# Coverage rows for points that have imagery but no AI-enabled survey,
# i.e. points in "df" whose id is absent from "df_ai_latest".
point_has_ai = df.index.isin(df_ai_latest.index)
df_no_coverage = df.loc[~point_has_ai, :]
df_no_coverage

In [None]:
# Latest survey for each point that currently lacks AI coverage.
df_no_coverage_latest = (
    df_no_coverage
    .sort_values(by="captureDate")
    .groupby(ID_COL)
    .last()
)
df_no_coverage_latest

In [None]:
# Share of imagery-covered points that have no AI-enabled survey in range.
(
    f" {len(df_no_coverage_latest) / len(df_latest) * 100:0.1f}% of the points that have valid imagery DON'T have valid AI coverage in the since-until range."
)

In [None]:
# Greedy selection: repeatedly pick the survey_id that appears against the most
# still-uncovered points, record it, then drop every point that survey covers.
# Stops after SURVEY_NUMBER_BUDGET picks, or earlier once no uncovered points
# (or no survey ids) remain, so the result may hold fewer rows than the budget.

SURVEY_NUMBER_BUDGET = 200

surveys_to_run_records = []
for _ in range(SURVEY_NUMBER_BUDGET):
    survey_counts = df_no_coverage["survey_id"].value_counts()
    if survey_counts.empty:
        # Nothing left to cover; previously this raised IndexError on index[0].
        break
    most_common_survey = survey_counts.index[0]
    # Filter once and reuse the rows for both the capture date and the point ids.
    rows_on_survey = df_no_coverage.loc[df_no_coverage["survey_id"] == most_common_survey]
    surveys_to_run_records.append(
        {"survey_id": most_common_survey, "captureDate": rows_on_survey["captureDate"].iloc[0]}
    )
    # Remove all rows of every point covered by the chosen survey, not just the
    # rows for that survey, because those points no longer need another one.
    df_no_coverage = df_no_coverage.loc[~df_no_coverage.index.isin(rows_on_survey.index), :]

# Explicit columns keep set_index working even when no survey was selected.
df_surveys_to_run = (
    pd.DataFrame(surveys_to_run_records, columns=["survey_id", "captureDate"])
    .set_index("survey_id")
)
df_surveys_to_run

In [None]:
# Latest survey per point for whatever remains in df_no_coverage at this stage.
df_no_coverage_latest = (
    df_no_coverage
    .sort_values(by="captureDate")
    .groupby(ID_COL)
    .last()
)
df_no_coverage_latest

In [None]:
# Compare current AI coverage with the coverage implied by df_no_coverage_latest,
# both measured against the points that have imagery (df_latest).
num_with_imagery = len(df_latest)
num_still_without_ai = len(df_no_coverage_latest)

frac_with_ai_already = len(df_ai_latest.query('aifeatures')) / num_with_imagery
frac_without_ai_after_extra_surveys = num_still_without_ai / num_with_imagery
simulated_num_with_coverage = num_with_imagery - num_still_without_ai

msg_fraction = f"If an additional {SURVEY_NUMBER_BUDGET} surveys are run with AI, the AI coverage (out of possible imagery surveys) would increase from {frac_with_ai_already*100:0.1f}% to {(1-frac_without_ai_after_extra_surveys)*100:0.1f}%"
display(msg_fraction)
msg_totals = f"i.e. {simulated_num_with_coverage:,} points covered in total, out of the original {len(df_points):,} provided ({simulated_num_with_coverage / len(df_points)*100:0.1f}%) after running  {SURVEY_NUMBER_BUDGET} surveys."
display(msg_totals)

In [None]:
# Persist the results next to the input, using the points file's stem as prefix.
# All point/survey coverage rows:
df.to_parquet(DATA_DIR.joinpath(f"{POINTS_FILE.stem}_all.parquet"))
# Only the coverage rows whose survey has AI features:
df.query("aifeatures").to_parquet(DATA_DIR.joinpath(f"{POINTS_FILE.stem}_all_with_ai_coverage.parquet"))
# NOTE(review): df_latest is the latest survey per point regardless of AI, yet the
# file name says "with_ai_coverage" - confirm whether df_ai_latest was intended.
df_latest.to_parquet(DATA_DIR.joinpath(f"{POINTS_FILE.stem}_latest_with_ai_coverage.parquet"))
# Greedy list of surveys proposed for AI processing:
df_surveys_to_run.to_csv(DATA_DIR.joinpath(f"{POINTS_FILE.stem}_surveys_to_run.csv"))