In [1]:
# Copyright (c) Microsoft Corporation. All rights reserved
# Licensed under the MIT License.
import os
import numpy as np
import pandas as pd
import rasterio

## Calculate the number of pixels in each split

In [2]:
# Sum the pixel count (height * width) of every image listed in each split
# file and print one total per split.
for fn in os.listdir("../data/splits/"):
    # os.listdir returns every directory entry; skip anything that is not a
    # split CSV (e.g. a ".ipynb_checkpoints" folder) so read_csv does not fail.
    if not fn.endswith(".csv"):
        continue
    df = pd.read_csv(f"../data/splits/{fn}")
    naip_urls = df["image_fn"].values

    num_pixels = 0
    for url in naip_urls:
        # Only the dataset's shape attribute is read here.
        with rasterio.open(url) as f:
            height, width = f.shape
        num_pixels += height * width
    print(fn, num_pixels)

all.csv 160049799502
train-all.csv 124084523923
all-single.csv 23713268170
train-augment.csv 124084523923
test-single.csv 5326745583
test-all.csv 35965275579
train-single.csv 18386522587


## Compute average tile size

In [11]:
# Gather the height and width of every image listed in the "all" split, then
# print the mean height followed by the mean width.
df = pd.read_csv("../data/splits/all.csv")
naip_urls = df["image_fn"].values
heights, widths = [], []
for image_url in naip_urls:
    with rasterio.open(image_url) as src:
        n_rows, n_cols = src.shape
    heights.append(n_rows)
    widths.append(n_cols)
print(np.mean(heights))
print(np.mean(widths))

8593.338690709526
6918.482947863583


## Print number of tiles in each split

In [3]:
# Print the shape (rows, columns) of each split file; per the section header,
# the row count is the number of tiles in that split.
for fn in os.listdir("../data/splits/"):
    # os.listdir returns every directory entry; skip anything that is not a
    # split CSV (e.g. a ".ipynb_checkpoints" folder) so read_csv does not fail.
    if not fn.endswith(".csv"):
        continue
    df = pd.read_csv(f"../data/splits/{fn}")
    print(fn, df.shape)

all.csv (2551, 2)
train-all.csv (1983, 2)
all-single.csv (511, 2)
train-augment.csv (1983, 2)
test-single.csv (114, 2)
test-all.csv (568, 2)
train-single.csv (397, 2)


## Print number of positive samples in each split

In [36]:
import fiona

In [13]:
# Maps split name -> number of polygons in that split; starts empty and is
# filled in by later cells.
number_polygons_per_split = dict()

In [17]:
# Record the number of features in the testing and training polygon files.
# The "-all" entries are stored as 5x the corresponding "-single" count.
# NOTE(review): the factor 5 presumably reflects the number of imagery years
# per location — confirm.
with fiona.open("../data/delmarva_testing_set_polygons.geojson") as test_polygons:
    num_test_polygons = len(test_polygons)
number_polygons_per_split["test-single"] = num_test_polygons
number_polygons_per_split["test-all"] = num_test_polygons * 5

with fiona.open("../data/delmarva_training_set_polygons.geojson") as train_polygons:
    num_train_polygons = len(train_polygons)
number_polygons_per_split["train-single"] = num_train_polygons
number_polygons_per_split["train-all"] = num_train_polygons * 5

In [41]:
# Collect the (state, year) pair encoded in each tile's URL, then reduce the
# list to the set of distinct pairs.
tile_state_years = []
with fiona.open("../data/delmarva_all_set_tiles.geojson") as f:
    for row in f:
        # The 4th "/"-separated component of the URL must split on "_" into
        # exactly three fields; the middle one is not used here.
        # NOTE(review): presumably "<state>_<resolution>_<year>" — confirm
        # against the tile index.
        state, _res, year = row["properties"]["url"].split("/")[3].split("_")
        tile_state_years.append((state, year))
state_years = set(tile_state_years)

In [69]:
# Total the features in the per-year prediction files for every (state, year)
# pair whose state is "md", printing each year's count along the way, and store
# the total under the "train-augment" split.
train_augment_counts = 0
for state, year in state_years:
    if state != "md":
        continue
    predictions_path = f"../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_{year}_train.geojson"
    with fiona.open(predictions_path) as predictions:
        num_predictions = len(predictions)
    train_augment_counts += num_predictions
    print(state, year, num_predictions)
number_polygons_per_split["train-augment"] = train_augment_counts

md 2011 4554
md 2018 5280
md 2015 5004
md 2017 5280
md 2013 4749


In [70]:
# Display the per-split polygon counts collected in the cells above.
number_polygons_per_split

{'test-single': 733,
 'test-all': 3665,
 'train-single': 5280,
 'train-all': 26400,
 'train-augment': 24867}

## Number of tiles in the US

In [4]:
# Load the US-wide tile listing used for the extrapolation below.
# NOTE(review): presumably one row per tile — confirm against the CSV.
naip_index_csv = "../data/naip_most_recent_100cm.csv"
df = pd.read_csv(naip_index_csv)

## Number of pixels in the US

In [10]:
# Estimate the pixel count for the whole US: mean pixels per tile in the "all"
# split multiplied by the number of rows in `df`.
# Both constants are copied by hand from outputs printed earlier in this
# notebook for all.csv; update them if those cells are re-run on new data.
total_pixels_all_split = 160049799502  # pixel total printed for all.csv
num_tiles_all_split = 2551  # row count printed for all.csv
(total_pixels_all_split / num_tiles_all_split) * df.shape[0]

13323094913150.807