We are going to do a stratified random sampling to select glaciers to use for validation

The stratified random sampling will work like this:
1- All benchmark glaciers, glacier with in situ glaciological observations will be used: Wolverine, Gulkana, Lemon Creek, Taku, Eklutna
2- A further series of 40 glaciers will be randomly chosen, with an equal number from each of the 4 major RGI subregions (regions 2,4,5,6)
3- additionally, glaciers from each region will be chosen in a manner to ensure a distribution of glacier sizes. out of the 10 from each region, 5 will be 2-10km2, 3 will be 10-40km2, and 2 will be 40+ km2

For each glacier, all available imagery (July to November) will be downloaded for all years (2000-2023), and we will determine the image which best captures the end of summer snow line (if such an image exists).
Then, using that best image, multiple users will draw the 'idealized' snow line.
This process will be repeated for both Landsat and Sentinel-2 imagery, with separate 'best' products chosen for each

In [1]:
import os
import numpy as np
import shapely
import pyproj
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

# define folder and file paths
folder_AGVA = os.path.join('C:',os.sep,'Users','lzell','OneDrive - Colostate','Desktop',"AGVA")

# open rgi
path_rgi = os.path.join(folder_AGVA, 'RGI', "01_rgi60_Alaska", "01_rgi60_Alaska.shp")
rgi_gdf = gpd.read_file(path_rgi)

In [2]:
### select the benchmark glaciers which will be used
benchmark_glacier_names = ["Wolverine Glacier", "Gulkana Glacier", "Lemon Creek Glacier", "Taku Glacier", "Eklutna Glacier"]
benchmark_glacier_df = rgi_gdf[rgi_gdf['Name'].isin(benchmark_glacier_names)]

# add a flag indicating the benchmark glaciers
benchmark_glacier_df['Benchmark'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [3]:
# for each of the other regions, select ten glaciers that will be used
# define size ranges to split upon, numbers of each
sizes = [2,10,40,99999]
numbers = [5,3,2]

validation_list = [benchmark_glacier_df]
for r in [2,4,5,6]:
    
    # subset to this region
    rgis_region = rgi_gdf[rgi_gdf['O2Region']==f'{r}']

    # don't include benchmark glaciers
    rgis_region = rgis_region[~rgis_region['Name'].isin(benchmark_glacier_names)]
    
    # for each size range, subset to those sizes and select a certain number
    for i in (0,1,2):
        
        # subset sizes
        rgis_subset = rgis_region[ (rgis_region['Area']>=sizes[i]) & (rgis_region['Area']<sizes[i+1]) ]
        
        # randomly select
        randomly_sampled = rgis_subset.sample(n=numbers[i], replace=False, random_state=r)
        
        # append to list
        validation_list.append(randomly_sampled)

# make into single gdf
rgi_all_validation = gpd.GeoDataFrame( pd.concat( validation_list, ignore_index=True) ).sort_values(['O2Region', 'Area']).reset_index()
rgi_all_validation['Benchmark'] = np.nan_to_num(rgi_all_validation['Benchmark'], nan=0) 

# save to file
out_path = os.path.join(folder_AGVA, 'Validation', 'Validation Glaciers.csv')
out_df = rgi_all_validation[["O2Region", "RGIId", "Area", "Name", "Benchmark"]].copy()
# out_df.to_csv(out_path, index=False)


In [4]:
# save separate csv to hold the manually identified dates of best imagery
# out_path = os.path.join(folder_AGVA, 'Validation', 'Validation Glaciers - Best Dates.csv')
# out_df = rgi_all_validation[["O2Region", "RGIId", "Area", "Name", "Benchmark"]].copy().sort_values('Benchmark',ascending=False)
# for y in range(2018,2023):
#     out_df[f"{y}_S2"] = [0 for i in range(len(out_df))]
# for y in range(2000,2023):
#     out_df[f"{y}_LS"] = [0 for i in range(len(out_df))]
# # out_df.to_csv(out_path, index=False)
# out_df.head()

In [5]:
# save separate csvs (one for each glacier) to hold the manually identified dates of best imagery
out_path = os.path.join(folder_AGVA, 'Validation', 'Validation Glaciers - Best Dates.csv')
out_df = rgi_all_validation[["O2Region", "RGIId", "Area", "Name", "Benchmark"]].copy().sort_values('Benchmark',ascending=False)

years_all = ['2018_S2','2019_S2','2020_S2','2021_S2','2022_S2'] + [f"{i}_LS" for i in range(2000,2023)]
zeros = [0 for i in years_all]

for idx, row in out_df.iterrows():
    # format some info
    rgi_i = row['RGIId']
    O2_i = row['O2Region']
    name_i = str(row['Name']).replace(f"/", "_").replace(" ", "_").replace("nan", "noname")
    
    # create df
    out_df = pd.DataFrame({"RGIId":[rgi_i for i in years_all], 'Year':years_all, 'Best Image':zeros, 'Next Image':zeros, 'Previous Image':zeros, "All Ablation":zeros, "No Good Imagery":zeros})
    
    out_path = os.path.join(folder_AGVA, 'Validation', 'Best Images', f'{rgi_i}_{O2_i}_{name_i}.csv')
#     out_df.to_csv(out_path, index=False)

In [9]:
# save shapefile with only these glaciers
out_path = os.path.join(folder_AGVA, 'Validation', 'rgi', "validation_rgi.shp")
rgi_all_validation.to_file(out_path)