In [1]:
"""
Model Result Processing
Author: Liam Megraw, RIT Envirionmental Science Technician
Date last edited: 10/19/2022
ESRI ArcGIS Pro Version 2.5.2


Description:

This code creates stratified random points in a set of spatial 
attributes and selects the nearest point to create a representative 
sample for analysis.


Outputs:

The final output is a point layer sampled from a larger layer that 
more closely matches a target distribution of selected variables


How to Use:
"""

'\nModel Result Processing\nAuthor: Liam Megraw, RIT Envirionmental Science Technician\nDate last edited: 10/19/2022\nESRI ArcGIS Pro Version 2.5.2\n\n\nDescription:\n\nThis code creates stratified random points in a set of spatial \nattributes and selects the nearest point to create a representative \nsample for analysis.\n\n\nOutputs:\n\nThe final output is a point layer sampled from a larger layer that \nmore closely matches a target distribution of selected variables\n\n\nHow to Use:\n'

In [None]:
"""
Pseudocode overview

Set workspace
Define inputs & load packages
Join attributes to model predictions


"""

In [1]:
#----- Get and set WD to gdb -----
import os
gdb = "Final_Deployment.gdb" # Put your relative (or absolute) geodatabase path here
gdb_abs = os.path.abspath(gdb) # Gets absolute path of geodatabase

dwd = os.getcwd() # Gets current working directory
if dwd in gdb_abs:
    print("Workspace path is already your defined geodatabase")
else:
    os.chdir(gdb) # Changes directory to geodatabase 
    new_path = os.getcwd() # Assigns current working directory to variable
    print("Workspace path changed to: " + new_path)

Workspace path is already your defined geodatabase


In [3]:
# Define inputs


# Import packages to manipulate spatial dataframes
import pandas as pd
from arcgis.features import GeoAccessor, GeoSeriesAccessor

# Packages for stratifying data
import numpy as np
import random

In [8]:
# Convert feature to spatial dataframe
fc = "model_predicted_presences_Orange"
sdf = pd.DataFrame.spatial.from_featureclass(fc)
sdf.head()

Unnamed: 0,OBJECTID,lat,long,date,path,phra,knot,wild_parsnip,purple_loosestrife,tree_of_heaven_with_seeds,Common_Nam,op_criteria,SHAPE
0,1,41.530173,-74.353212,2012-07,2020-01-22_16-31-05,0.07,0.11,0.0,0.01,0.01,"Phragmites, Unspecified",recall,"{'x': 553957.9364, 'y': 4597815.008199999, 'sp..."
1,2,41.512276,-74.13846,2018-08,2020-01-22_16-31-05,0.18,0.01,0.01,0.01,0.01,"Phragmites, Unspecified",recall,"{'x': 571893.4068, 'y': 4595984.422800001, 'sp..."
2,3,41.391915,-74.333993,2017-09,2020-01-22_16-31-05,1.0,0.02,0.01,0.01,0.01,"Phragmites, Unspecified",recall,"{'x': 555679.4143000003, 'y': 4582477.87749999..."
3,4,41.484375,-74.011023,2012-06,2020-01-22_16-31-05,0.83,0.05,0.01,0.01,0.02,"Phragmites, Unspecified",recall,"{'x': 582563.2433000002, 'y': 4593000.6524, 's..."
4,5,41.5136,-74.051624,2014-10,2020-01-22_16-31-05,0.07,0.02,0.0,0.01,0.01,"Phragmites, Unspecified",recall,"{'x': 579138.1085999999, 'y': 4596207.2775, 's..."


In [15]:
# See proportions of entire dataset (adapted from Graham Harrison in Towards Data Science)

# Define dataframe to use
df_credit = sdf


ownership_filter = df_credit['Common_Nam'].isin(df_credit.Common_Nam.unique())
df_credit = df_credit.drop(df_credit[~ownership_filter].index)

# Print out proportions for selected attributes
(df_credit['Common_Nam'].value_counts() / len(df_credit)).sort_values(ascending=False), (df_credit['op_criteria'].value_counts() / len(df_credit)).sort_values(ascending=False)

(Knotweed, Unspecified         0.455902
Phragmites, Unspecified       0.356575
Tree-of-Heaven (Ailanthus)    0.141267
Purple Loosestrife            0.039914
Wild Parsnip                  0.006343
Name: Common_Nam, dtype: float64, recall       0.707124
F1           0.207293
precision    0.085583
Name: op_criteria, dtype: float64)

In [16]:
# Combine classes to get joint proportions
df_credit['Stratify'] = df_credit['op_criteria'] + ", " + df_credit['Common_Nam']
(df_credit['Stratify'].value_counts() / len(df_credit)).sort_values(ascending=False)

recall, Knotweed, Unspecified            0.370879
recall, Phragmites, Unspecified          0.192232
F1, Phragmites, Unspecified              0.112198
recall, Tree-of-Heaven (Ailanthus)       0.109246
F1, Knotweed, Unspecified                0.061492
precision, Phragmites, Unspecified       0.052145
recall, Purple Loosestrife               0.030603
F1, Tree-of-Heaven (Ailanthus)           0.024826
precision, Knotweed, Unspecified         0.023530
precision, Tree-of-Heaven (Ailanthus)    0.007195
F1, Purple Loosestrife                   0.007046
recall, Wild Parsnip                     0.004163
precision, Purple Loosestrife            0.002264
F1, Wild Parsnip                         0.001730
precision, Wild Parsnip                  0.000450
Name: Stratify, dtype: float64

In [17]:
# Define function to stratify data 
def stratify_data(df_data, stratify_column_name, stratify_values, stratify_proportions, random_state=None):
    """Stratifies data according to the values and proportions passed in
    Args:
        df_data (DataFrame): source data
        stratify_column_name (str): The name of the single column in the dataframe that holds the data values that will be used to stratify the data
        stratify_values (list of str): A list of all of the potential values for stratifying e.g. "Male, Graduate", "Male, Undergraduate", "Female, Graduate", "Female, Undergraduate"
        stratify_proportions (list of float): A list of numbers representing the desired propotions for stratifying e.g. 0.4, 0.4, 0.2, 0.2, The list values must add up to 1 and must match the number of values in stratify_values
        random_state (int, optional): sets the random_state. Defaults to None.
    Returns:
        DataFrame: a new dataframe based on df_data that has the new proportions represnting the desired strategy for stratifying
    """
    df_stratified = pd.DataFrame(columns = df_data.columns) # Create an empty DataFrame with column names matching df_data

    pos = -1
    for i in range(len(stratify_values)): # iterate over the stratify values (e.g. "Male, Undergraduate" etc.)
        pos += 1
        if pos == len(stratify_values) - 1: 
            ratio_len = len(df_data) - len(df_stratified) # if this is the final iteration make sure we calculate the number of values for the last set such that the return data has the same number of rows as the source data
        else:
            ratio_len = int(len(df_data) * stratify_proportions[i]) # Calculate the number of rows to match the desired proportion

        df_filtered = df_data[df_data[stratify_column_name] ==stratify_values[i]] # Filter the source data based on the currently selected stratify value
        df_temp = df_filtered.sample(replace=True, n=ratio_len, random_state=random_state) # Sample the filtered data using the calculated ratio
        
        df_stratified = pd.concat([df_stratified, df_temp]) # Add the sampled / stratified datasets together to produce the final result
        
    return df_stratified # Return the stratified, re-sampled data   


In [20]:
# Sample the data in a stratified way according to specified proportions
stratify_values = ['recall, Knotweed, Unspecified','recall, Phragmites, Unspecified', 'F1, Phragmites, Unspecified']
stratify_proportions = [0.65, 0.20, 0.15]
df_stratified = stratify_data(df_credit, 'Stratify', stratify_values, stratify_proportions, random_state=42)
df_stratified

Unnamed: 0,OBJECTID,lat,long,date,path,phra,knot,wild_parsnip,purple_loosestrife,tree_of_heaven_with_seeds,Common_Nam,op_criteria,SHAPE,Stratify
83199,83200,41.467249,-74.366737,2012-07,2020-01-22_16-31-05,0.01,0.48,0.01,0.01,0.01,"Knotweed, Unspecified",recall,"{'x': 552880.7801999999, 'y': 4590820.83170000...","recall, Knotweed, Unspecified"
68264,68265,41.561822,-74.170640,2016-09,2020-01-22_16-31-05,0.01,0.07,0.00,0.01,0.01,"Knotweed, Unspecified",recall,"{'x': 569155.2450000001, 'y': 4601458.81880000...","recall, Knotweed, Unspecified"
122290,122291,41.543911,-74.008922,2012-06,2020-01-22_16-31-05,0.01,0.16,0.00,0.01,0.01,"Knotweed, Unspecified",recall,"{'x': 582662.8561000004, 'y': 4599612.434, 'sp...","recall, Knotweed, Unspecified"
73669,73670,41.512392,-74.146967,2012-07,2020-01-22_16-31-05,0.01,0.70,0.00,0.01,0.01,"Knotweed, Unspecified",recall,"{'x': 571183.3863000004, 'y': 4595990.2608, 's...","recall, Knotweed, Unspecified"
104598,104599,41.417008,-73.998478,2015-07,2020-01-22_16-31-05,0.00,0.98,0.00,0.01,0.01,"Knotweed, Unspecified",recall,"{'x': 583697.1059999997, 'y': 4585533.5989, 's...","recall, Knotweed, Unspecified"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41628,41629,41.516428,-74.073488,2011-06,2020-01-22_16-31-05,0.79,0.01,0.01,0.01,0.01,"Phragmites, Unspecified",F1,"{'x': 577310.2652000003, 'y': 4596501.4585, 's...","F1, Phragmites, Unspecified"
56960,56961,41.398172,-74.356449,2017-09,2020-01-22_16-31-05,0.91,0.01,0.01,0.01,0.01,"Phragmites, Unspecified",F1,"{'x': 553796.8866999997, 'y': 4583158.33060000...","F1, Phragmites, Unspecified"
56256,56257,41.519857,-74.091629,2011-10,2020-01-22_16-31-05,0.35,0.01,0.01,0.01,0.01,"Phragmites, Unspecified",F1,"{'x': 575792.5251000002, 'y': 4596866.08280000...","F1, Phragmites, Unspecified"
51182,51183,41.513695,-74.047845,2018-08,2020-01-22_16-31-05,0.66,0.01,0.01,0.01,0.01,"Phragmites, Unspecified",F1,"{'x': 579453.3376000002, 'y': 4596221.29130000...","F1, Phragmites, Unspecified"


In [21]:
(df_stratified['Stratify'].value_counts() / len(df_stratified)).sort_values(ascending=False)

recall, Knotweed, Unspecified      0.649996
recall, Phragmites, Unspecified    0.199998
F1, Phragmites, Unspecified        0.150006
Name: Stratify, dtype: float64

In [None]:
# Export spatial dataframe to feature class
sdf.spatial.to_featureclass(location=r"sdf_test.shp")