# <b>MODIS Water Notebook - Compare models with Boxplot </b>

Purpose: Used to perform statistical validation of C61 MOD44W products from different models. Compares those products to the previous version, C6 MOD44W.


In [1]:
from ipyleaflet import Map, Marker, basemaps, ScaleControl, LayersControl
from sklearn.metrics import accuracy_score,matthews_corrcoef,f1_score
from localtileserver import TileClient, get_leaflet_tile_layer 
from ipyleaflet import LegendControl, FullScreenControl, Popup
import matplotlib.colors as mcolors
from ipysheet import from_dataframe
import matplotlib.pyplot as plt
import ipywidgets as widgets
import rioxarray as rxr
from osgeo import gdal
import seaborn as sns
from glob import glob
import xarray as xr
import pandas as pd
import numpy as np
import warnings
import tempfile
import ipysheet
import joblib
import math
import os

# Route localtileserver's client URLs through the JupyterHub proxy by setting
# LOCALTILESERVER_CLIENT_PREFIX to "<service prefix>/proxy/{port}" ("{port}" is
# left as a literal placeholder). JUPYTERHUB_SERVICE_PREFIX is absent when the
# kernel was not started by JupyterHub; in that case leave the environment
# untouched instead of aborting the import cell with a KeyError.
_hub_service_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX')
if _hub_service_prefix is not None:
    os.environ['LOCALTILESERVER_CLIENT_PREFIX'] = \
        f"{_hub_service_prefix.lstrip('/')}/proxy/{{port}}"

### Data Parameters

In [2]:
# Tile identifiers substituted into the truth and prediction file patterns.
TILE = [
    'h22v01',
    'h12v09',
    'h21v10',
    'h09v05',
]

# Years to score; every tile is combined with every year listed here.
YEAR = [2019]

#NOTE: ONLY ONE YEAR IS PROCESSED (YEAR ABOVE IS CURRENTLY [2019], NOT 2006) - MAKE SURE TO CHANGE YEAR IF OTHER YEARS ARE WANTED

In [3]:
# Number of bootstrap resamples drawn per tile/year by the scoring cells below.
# None disables resampling: each full truth/prediction raster pair is scored once.
BOOTSTRAP = None

In [4]:
# Switches selecting which models' products get scored; each one gates the
# scoring cell that reads from the matching *_PATH directory.
noc = False     # products under NOC_PATH
eb = True       # products under EB_PATH
match = True    # products under MATCH_PATH
target = False  # products under TAR_PATH

In [5]:
# Root directory; each model's products live in their own sub-directory of it.
data_dir = '/explore/nobackup/projects/ilab/data/MODIS/PRODUCTION/Burke_MW_RFA/no_outlier_cluster'

# Prediction product directories, one per model.
NOC_PATH = f'{data_dir}/v2_total'  # rfa trained using v2.0.1 data
EB_PATH = f'{data_dir}/v2_cluster'  # even balance trained rfa
MATCH_PATH = f'{data_dir}/v2_cluster_all_land'  # random subset match size of even balance rfa
TAR_PATH = f'{data_dir}/v4_sample'  # targeted rfa trained using v4.2.1 data

# MOD44W C6 products used as truth. The scoring cells look for
# <basepath>/<year>/MOD44W_<tile>_<year>_v5.tif under this directory.
MOD44W_C6_BASEPATH = '/explore/nobackup/people/mcarrol2/MODIS_water/v5_outputs/'
MOD44_C6_YEAR = YEAR

### Statistics

In [6]:
def data_stats(truth,pred):
    """
    Score a binary prediction array against a binary truth array.

    Only elements where `truth` and `pred` are each exactly 0 or 1 are counted
    (1 is the positive class); an element holding any other value in either
    array contributes to none of the confusion-matrix counts.

    In
        truth (np.ndarray): Reference values.
        pred (np.ndarray): Predicted values, element-wise comparable to `truth`.

    Out
        tuple: (accuracy, mcc, f1)
            accuracy: (TP + TN) / (TP + TN + FP + FN); NaN when no element
                was counted.
            mcc: Matthews correlation coefficient; 0.0 when its denominator
                is zero (e.g. only one class is present).
            f1: TP / (TP + 0.5 * (FP + FN)); 0.0 when TP, FP and FN are all
                zero.
    """
    # Count straight from the boolean masks rather than building four
    # full-size integer arrays with np.where and counting those.
    truePositives = np.count_nonzero((truth == 1) & (pred == 1))
    trueNegatives = np.count_nonzero((truth == 0) & (pred == 0))
    falsePositives = np.count_nonzero((truth == 0) & (pred == 1))
    falseNegatives = np.count_nonzero((truth == 1) & (pred == 0))

    # Each ratio below can have a zero denominator on degenerate input (an
    # all-fill tile, or a small bootstrap sample with a single class). Return
    # a defined value instead of raising ZeroDivisionError mid-run.
    scored = truePositives + trueNegatives + falsePositives + falseNegatives
    accuracy = (truePositives + trueNegatives) / scored if scored else float('nan')

    f1_denom = truePositives + (0.5*(falsePositives + falseNegatives))
    f1 = truePositives / f1_denom if f1_denom else 0.0

    mcc_denom_nosqrt = (truePositives+falsePositives)*(truePositives+falseNegatives)*(trueNegatives+falsePositives)*(trueNegatives+falseNegatives)
    mcc_numerator = (truePositives*trueNegatives) - (falsePositives*falseNegatives)
    mcc = mcc_numerator/math.sqrt(mcc_denom_nosqrt) if mcc_denom_nosqrt else 0.0

    return accuracy,mcc,f1

In [7]:
%%time

if eb:
    # Accuracy / MCC / F1 of the even-balance model against the MOD44W C6
    # truth: one entry per tile-year (BOOTSTRAP is None) or per bootstrap draw.
    eb_boot_acc, eb_boot_mcc, eb_boot_f1 = [], [], []
    for T in TILE:
        for Y in YEAR:
            # First file matching each pattern; band 1 as a plain array.
            truth_array = rxr.open_rasterio(glob(f'{MOD44W_C6_BASEPATH}/{str(Y)}/MOD44W_{T}_{Y}_v5.tif')[0]).sel(band=1).data
            eb_pred_array = rxr.open_rasterio(glob(f'{EB_PATH}/*{Y}*{T}*Product.*tif')[0]).sel(band=1).data
            if BOOTSTRAP is None:
                # No resampling: score the full raster pair once.
                eb_acc,eb_mcc,eb_f1 = data_stats(truth_array,eb_pred_array)
                eb_boot_acc.append(eb_acc)
                eb_boot_mcc.append(eb_mcc)
                eb_boot_f1.append(eb_f1)
            else:
                # len() is the size of the first axis, so each draw keeps a
                # random 10% of first-axis slices (whole rows, assuming the
                # band is a 2-D rows x cols array - TODO confirm), without
                # replacement. The same indices are applied to both arrays.
                # NOTE(review): the target/match/noc cells instead pair row
                # and column indices and score individual pixels - confirm
                # which sampling scheme is intended.
                total_num = len(truth_array)
                small_sample_num = int(len(truth_array)*0.1)
                for b in np.arange(BOOTSTRAP):
                    # Only row indices are used here, so the second (column)
                    # index draw that was never read has been removed.
                    x_eb_rand_sample = np.random.choice(total_num,small_sample_num,replace=False)
                    subset_eb_truth = truth_array[x_eb_rand_sample]
                    subset_eb = eb_pred_array[x_eb_rand_sample]
                    eb_acc,eb_mcc,eb_f1 = data_stats(subset_eb_truth,subset_eb)
                    eb_boot_acc.append(eb_acc)
                    eb_boot_mcc.append(eb_mcc)
                    eb_boot_f1.append(eb_f1)

CPU times: user 1.06 s, sys: 787 ms, total: 1.85 s
Wall time: 1.85 s


In [8]:
if target:
    # Accuracy / MCC / F1 of the targeted model against the MOD44W C6 truth:
    # one entry per tile-year (BOOTSTRAP is None) or per bootstrap draw.
    tar_boot_acc, tar_boot_mcc, tar_boot_f1 = [], [], []
    for T in TILE:
        for Y in YEAR:
            # Truth first, then prediction; the first match of each pattern is used.
            truth_matches = glob(f'{MOD44W_C6_BASEPATH}/{str(Y)}/MOD44W_{T}_{Y}_v5.tif')
            truth_array = rxr.open_rasterio(truth_matches[0]).sel(band=1).data
            tar_matches = glob(f'{TAR_PATH}/*{Y}*{T}*Product.*tif')
            tar_pred_array = rxr.open_rasterio(tar_matches[0]).sel(band=1).data

            if BOOTSTRAP is None:
                # No resampling: score the full raster pair once.
                tar_acc, tar_mcc, tar_f1 = data_stats(truth_array, tar_pred_array)
                tar_boot_acc.append(tar_acc)
                tar_boot_mcc.append(tar_mcc)
                tar_boot_f1.append(tar_f1)
                continue

            # Bootstrap: row and column indices are each drawn (without
            # replacement) from range(len(truth_array)) and paired element-wise,
            # so every draw scores `small_sample_num` individual pixels.
            # NOTE(review): the column bound is the row count, which presumes a
            # square raster - confirm.
            total_num = len(truth_array)
            small_sample_num = int(total_num * 0.1)
            for b in np.arange(BOOTSTRAP):
                x_tar_rand_sample = np.random.choice(total_num, small_sample_num, replace=False)
                y_tar_rand_sample = np.random.choice(total_num, small_sample_num, replace=False)
                sample_idx = (x_tar_rand_sample, y_tar_rand_sample)
                subset_tar_truth = truth_array[sample_idx].ravel()
                subset_tar = tar_pred_array[sample_idx].ravel()
                tar_acc, tar_mcc, tar_f1 = data_stats(subset_tar_truth, subset_tar)
                tar_boot_acc.append(tar_acc)
                tar_boot_mcc.append(tar_mcc)
                tar_boot_f1.append(tar_f1)

In [9]:
if match:
    # Accuracy / MCC / F1 of the random-subset ("match") model against the
    # MOD44W C6 truth: one entry per tile-year (BOOTSTRAP is None) or per
    # bootstrap draw.
    match_boot_acc, match_boot_mcc, match_boot_f1 = [], [], []
    for T in TILE:
        for Y in YEAR:
            # Truth first, then prediction; the first match of each pattern is used.
            truth_matches = glob(f'{MOD44W_C6_BASEPATH}/{str(Y)}/MOD44W_{T}_{Y}_v5.tif')
            truth_array = rxr.open_rasterio(truth_matches[0]).sel(band=1).data
            match_matches = glob(f'{MATCH_PATH}/*{Y}*{T}*Product.*tif')
            match_pred_array = rxr.open_rasterio(match_matches[0]).sel(band=1).data

            if BOOTSTRAP is None:
                # No resampling: score the full raster pair once.
                match_acc, match_mcc, match_f1 = data_stats(truth_array, match_pred_array)
                match_boot_acc.append(match_acc)
                match_boot_mcc.append(match_mcc)
                match_boot_f1.append(match_f1)
                continue

            # Bootstrap: row and column indices are each drawn (without
            # replacement) from range(len(truth_array)) and paired element-wise,
            # so every draw scores `small_sample_num` individual pixels.
            # NOTE(review): the column bound is the row count, which presumes a
            # square raster - confirm.
            total_num = len(truth_array)
            small_sample_num = int(total_num * 0.1)
            for b in np.arange(BOOTSTRAP):
                x_match_rand_sample = np.random.choice(total_num, small_sample_num, replace=False)
                y_match_rand_sample = np.random.choice(total_num, small_sample_num, replace=False)
                sample_idx = (x_match_rand_sample, y_match_rand_sample)
                subset_match_truth = truth_array[sample_idx].ravel()
                subset_match = match_pred_array[sample_idx].ravel()
                match_acc, match_mcc, match_f1 = data_stats(subset_match_truth, subset_match)
                match_boot_acc.append(match_acc)
                match_boot_mcc.append(match_mcc)
                match_boot_f1.append(match_f1)

In [10]:
if noc:
    # Accuracy / MCC / F1 of the model under NOC_PATH against the MOD44W C6
    # truth: one entry per tile-year (BOOTSTRAP is None) or per bootstrap draw.
    noc_boot_acc, noc_boot_mcc, noc_boot_f1 = [], [], []
    for T in TILE:
        for Y in YEAR:
            # Truth first, then prediction; the first match of each pattern is used.
            truth_matches = glob(f'{MOD44W_C6_BASEPATH}/{str(Y)}/MOD44W_{T}_{Y}_v5.tif')
            truth_array = rxr.open_rasterio(truth_matches[0]).sel(band=1).data
            noc_matches = glob(f'{NOC_PATH}/*{Y}*{T}*Product.*tif')
            noc_pred_array = rxr.open_rasterio(noc_matches[0]).sel(band=1).data

            if BOOTSTRAP is None:
                # No resampling: score the full raster pair once.
                noc_acc, noc_mcc, noc_f1 = data_stats(truth_array, noc_pred_array)
                noc_boot_acc.append(noc_acc)
                noc_boot_mcc.append(noc_mcc)
                noc_boot_f1.append(noc_f1)
                continue

            # Bootstrap: row and column indices are each drawn (without
            # replacement) from range(len(truth_array)) and paired element-wise,
            # so every draw scores `small_sample_num` individual pixels.
            # NOTE(review): the column bound is the row count, which presumes a
            # square raster - confirm.
            total_num = len(truth_array)
            small_sample_num = int(total_num * 0.1)
            for b in np.arange(BOOTSTRAP):
                x_noc_rand_sample = np.random.choice(total_num, small_sample_num, replace=False)
                y_noc_rand_sample = np.random.choice(total_num, small_sample_num, replace=False)
                sample_idx = (x_noc_rand_sample, y_noc_rand_sample)
                subset_noc_truth = truth_array[sample_idx].ravel()
                subset_noc = noc_pred_array[sample_idx].ravel()
                noc_acc, noc_mcc, noc_f1 = data_stats(subset_noc_truth, subset_noc)
                noc_boot_acc.append(noc_acc)
                noc_boot_mcc.append(noc_mcc)
                noc_boot_f1.append(noc_f1)

In [None]:
# Build the per-metric DataFrames consumed by the plotting cell (acc_df,
# mcc_df, f1_df): one column per model, one row per scored sample.
# A model's *_boot_* lists only exist when its flag is on, so each model is
# added behind its own flag; referencing all four unconditionally raises
# NameError whenever a model is disabled.
# Column labels match the `models` / palette keys used by the plotting cell.
model_metrics = {}
if noc:
    model_metrics['v2 all'] = (noc_boot_acc, noc_boot_mcc, noc_boot_f1)
if eb:
    model_metrics['v2 cluster'] = (eb_boot_acc, eb_boot_mcc, eb_boot_f1)
if match:
    model_metrics['v2 rand'] = (match_boot_acc, match_boot_mcc, match_boot_f1)
if target:
    model_metrics['v4'] = (tar_boot_acc, tar_boot_mcc, tar_boot_f1)

# Each tuple is ordered (accuracy, mcc, f1).
acc_dict = {label: metrics[0] for label, metrics in model_metrics.items()}
acc_df = pd.DataFrame(acc_dict)

mcc_dict = {label: metrics[1] for label, metrics in model_metrics.items()}
mcc_df = pd.DataFrame(mcc_dict)

f1_dict = {label: metrics[2] for label, metrics in model_metrics.items()}
f1_df = pd.DataFrame(f1_dict)

### Plots

In [None]:
# bxpt_file = 'MW_RFA_Statistics_No_P.csv'
# df = pd.read_csv(bxpt_file).dropna()
# df.var(numeric_only=True)

In [None]:

# One fixed colour per model label, shared by all three panels.
models = ['v2 all','v2 cluster','v2 rand','v4'] #,'P Cluster','P Random Match']
colors = ['grey','plum','darkorchid','goldenrod'] #,'lightblue','darkblue']
all_colors = {model: color for model, color in zip(models, colors)}

# Three stacked panels: MCC, accuracy, F1.
fig, axes = plt.subplots(3,1,figsize=(5, 7))
plt.subplots_adjust(hspace=0.5)

label_size = 7
panels = [
    (mcc_df, 'Matthew Correlation Coefficient'),
    (acc_df, 'Accuracy'),
    (f1_df, 'F1 Score'),
]
for ax, (metric_df, title) in zip(axes, panels):
    ax.tick_params(axis='both', which='major', labelsize=label_size)
    sns.boxplot(data=metric_df, ax=ax, palette=all_colors)
    ax.set_title(title, size=15)
    # Same y-range on every panel so the three metrics share a scale.
    ax.set_ylim([0.2, 1.1])
    ax.yaxis.grid(True)

plt.show()
