This notebook creates two subsets of NAIP tiles from the list of _all_ NAIP tiles:
- A list of the most recent 100cm resolution imagery per state
- A list of the most recent imagery per state

It also creates per-state splits for each of the above subsets.

In [1]:
# Copyright (c) Microsoft Corporation. All rights reserved
# Licensed under the MIT License.
from collections import defaultdict

In [2]:
# Read the blob listing; entries are separated by whitespace.
with open("../data/naip_blob_list.txt") as f:
    lines = f.read().strip().split()

# Keep only the `.tif` entries and prefix each with the container URL.
urls = [
    "https://naipeuwest.blob.core.windows.net/naip/" + blob_path
    for blob_path in lines
    if blob_path.endswith(".tif")
]

In [3]:
# Index the listing by state and year, and record the resolution token found
# in each URL's directory name.
state_year_sets = defaultdict(set)        # state code -> {year, ...}
state_resolution_sets = defaultdict(set)  # state code -> {(year, resolution), ...}
state_year_resolutions = {}               # (state code, year) -> resolution
unique_resolutions = set()
for url in urls:
    # Path components 5, 6 and 7 hold the state code, the year and a
    # "_"-separated directory name whose second field is the resolution.
    parts = url.split("/")
    state_code, year = parts[5], int(parts[6])
    # strip("0") drops zeros at either end of the token (presumably to
    # normalise zero-padded values such as "060cm" -- confirm against the listing).
    resolution = parts[7].split("_")[1].strip("0")

    unique_resolutions.add(resolution)
    state_year_sets[state_code].add(year)
    state_resolution_sets[state_code].add((year, resolution))
    # A later URL overwrites any earlier entry for the same (state, year).
    state_year_resolutions[(state_code, year)] = resolution

In [4]:
# Display the distinct resolution strings collected while parsing the URLs.
unique_resolutions

{'100cm', '50cm', '60cm'}

## Most recent 100cm resolution imagery

In [5]:
# For every state, select the latest year whose recorded resolution is 100cm.
most_recent_100cm_state_year_pairs = set()
for state, years in state_year_sets.items():
    # Walk the years newest-first and stop at the first 100cm year.
    for year in sorted(years, reverse=True):
        resolution = state_year_resolutions[(state, year)]
        if resolution == "100cm":
            most_recent_100cm_state_year_pairs.add((state, year))
            break
    # A state with no 100cm year contributes nothing. The previous loop fell
    # through in that case and added the state's oldest year regardless of
    # its resolution; it also treated any value other than "60cm"/"50cm" as 100cm.

In [6]:
# Keep the URLs whose (state, year) is one of the selected 100cm pairs.
# Path components 5 and 6 of each URL are the state code and the year.
filtered_urls = [
    url
    for url in urls
    if (url.split("/")[5], int(url.split("/")[6])) in most_recent_100cm_state_year_pairs
]

In [7]:
# Number of URLs selected for the most-recent-100cm subset.
len(filtered_urls)

212354

In [8]:
# Save the subset as a one-column CSV: the header line, then one URL per line
# (no newline after the final URL).
with open("../data/naip_most_recent_100cm.csv", "w") as f:
    f.write("image_fn\n" + "\n".join(filtered_urls))

In [10]:
# Bucket every URL under its (state, year) key in a single pass, so the
# per-state files below can be written without rescanning (and re-splitting)
# the full URL list once per state. URL order within each bucket follows `urls`.
urls_by_state_year = defaultdict(list)
for url in urls:
    parts = url.split("/")
    urls_by_state_year[(parts[5], int(parts[6]))].append(url)

# Write one CSV per selected (state, year) pair, named "<state>_<year>.csv".
for state_year_pair in most_recent_100cm_state_year_pairs:
    # .get() avoids inserting empty buckets for pairs that have no URLs.
    filtered_urls = urls_by_state_year.get(state_year_pair, [])
    with open("../data/naip_most_recent_100cm_by_state/%s_%d.csv" % (state_year_pair[0], state_year_pair[1]), "w") as f:
        f.write("image_fn\n")
        f.write("\n".join(filtered_urls))

## Most recent imagery

In [11]:
# Pair every state with the latest year in which it has any imagery.
most_recent_state_year_pairs = {
    (state, max(years)) for state, years in state_year_sets.items()
}

In [12]:
# Keep the URLs whose (state, year) is one of the selected most-recent pairs.
# Path components 5 and 6 of each URL are the state code and the year.
filtered_urls = [
    url
    for url in urls
    if (url.split("/")[5], int(url.split("/")[6])) in most_recent_state_year_pairs
]

In [13]:
# Number of URLs selected for the most-recent subset.
len(filtered_urls)

215381

In [14]:
# Save the subset as a one-column CSV: the header line, then one URL per line
# (no newline after the final URL).
with open("../data/naip_most_recent.csv", "w") as f:
    f.write("image_fn\n" + "\n".join(filtered_urls))

In [16]:
# Bucket every URL under its (state, year) key in a single pass, so the
# per-state files below can be written without rescanning (and re-splitting)
# the full URL list once per state. URL order within each bucket follows `urls`.
urls_by_state_year = defaultdict(list)
for url in urls:
    parts = url.split("/")
    urls_by_state_year[(parts[5], int(parts[6]))].append(url)

# Write one CSV per selected (state, year) pair, named "<state>_<year>.csv".
for state_year_pair in most_recent_state_year_pairs:
    # .get() avoids inserting empty buckets for pairs that have no URLs.
    filtered_urls = urls_by_state_year.get(state_year_pair, [])
    with open("../data/naip_most_recent_by_state/%s_%d.csv" % (state_year_pair[0], state_year_pair[1]), "w") as f:
        f.write("image_fn\n")
        f.write("\n".join(filtered_urls))