# Generate Statistics and TIFFs for Site Location Analysis

In [None]:
import os
import json
import numpy as np
import pandas as pd

import initialise
import common
from analysis_utils import calc_statistics

In [None]:
# Export the project's GDAL data directory, when one is configured, before the
# osgeo modules are imported below.
# NOTE(review): presumably GDAL reads GDAL_DATA when it initialises, which is
# why the assignment precedes the imports — confirm before reordering.
if common.GDAL_DATA is not None:
    os.environ['GDAL_DATA'] = common.GDAL_DATA
from osgeo import gdal
from osgeo import osr

In [None]:
# Directory holding model_params.json and the per-test prediction CSVs read by
# the scenario cells.
MODEL_DIR = os.path.join(common.MODELS_DIR, 'evaluation_models')
# Destination directory for the site TIFFs (only referenced by the
# commented-out gen_locations_tiff calls).
OUTPUT_DIR = os.path.join(common.MAPS_DIR, 'Gridded_sites')
# No-data value handed to gen_locations_tiff for every raster band.
NODATA = common.GDAL_NODATA_VALUE
# Grid-cell columns written as raster bands, in band order.
BANDS = ['RMSE', 'Bias', 'R2', 'Uncertainty']

In [None]:
# Sample table; its first CSV column becomes the index that the prediction
# files are matched against.
all_samples = pd.read_csv(
    os.path.join(common.DATASETS_DIR, 'samples_730days.csv'),
    index_col=0,
)

# Scenarios to analyse: a display name and the sub-directory of MODEL_DIR that
# holds the scenario's result files.
tests = [
    {'name': 'Nowcasting', 'dir': 'test0'},
    {'name': '3-month Projection', 'dir': 'test3'},
]

# File names (inside each scenario directory) of the ensemble predictions and
# of the ensemble standard deviations.
ifile_name = f'ensemble{common.ENSEMBLE_SIZE}_{common.ANALYSIS_MODEL}.csv'
ufile_name = f'ensemble{common.ENSEMBLE_SIZE}_stds.csv'

# Output TIFF names, in the same order as `tests`.
ofile_names = [
    'projection_nowcast_sites.tif',
    'projection_3months_sites.tif',
]

# Gridded results, keyed by scenario name; filled in by the cells below.
locations = {}

### Calculate the statistics for each grid cell

In [None]:
def gen_gridded_data(predictions, samples, std_devs, y):
    """Aggregate per-sample predictions into statistics per 0.5 x 0.5 grid cell.

    Every sample is assigned to a grid cell by rounding its latitude *up* and
    its longitude *down* to the nearest 0.5, so a cell is identified by its
    upper-left corner. ``calc_statistics`` is evaluated per cell for each
    prediction column, and the per-column results are averaged.

    Parameters
    ----------
    predictions : pandas.DataFrame
        One column per prediction series, indexed like ``samples``.
    samples : pandas.DataFrame
        Must contain ``Latitude``, ``Longitude`` and the column named by ``y``.
    std_devs : pandas.DataFrame
        Per-sample values whose row mean is stored as ``Uncertainty``.
    y : str
        Name of the target (observed) column in ``samples``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (Latitude, Longitude) and restricted to cells with at least
        10 samples. Holds the column-averaged ``calc_statistics`` output plus
        ``Uncertainty`` (cell mean) and ``NumSamples`` (cell sample count).
    """
    cell_keys = ['Latitude', 'Longitude']

    merged = predictions.merge(
        samples[['Latitude', 'Longitude', y]], left_index=True, right_index=True)
    merged['Latitude'] = np.ceil(merged['Latitude'] * 2) / 2
    merged['Longitude'] = np.floor(merged['Longitude'] * 2) / 2
    merged['Uncertainty'] = std_devs.mean(axis=1)

    by_cell = merged.groupby(cell_keys)
    counts = by_cell.size()
    uncertainty = by_cell['Uncertainty'].mean()

    # Statistics are only computed for cells holding at least 10 samples.
    populated = counts[counts >= 10].index
    cell_groups = merged.set_index(cell_keys).loc[populated].groupby(cell_keys)
    ybar = samples[y].mean()

    def stats_for(col):
        # calc_statistics output per cell, expanded into one row per cell.
        per_cell = cell_groups.apply(
            lambda cell: calc_statistics(cell[y], cell[col], ybar=ybar))
        return pd.DataFrame(list(per_cell.values), index=per_cell.index)

    # The trailing four columns are Latitude, Longitude, the target and
    # Uncertainty; everything before them is a prediction column.
    per_column = [stats_for(col) for col in merged.columns[:-4]]
    gridded = pd.concat(per_column).groupby(level=[0, 1]).mean()
    gridded['Uncertainty'] = uncertainty
    gridded['NumSamples'] = counts
    return gridded

### Summary statistics

In [None]:
def display_summary(locations):
    """Print the share of low-RMSE grid cells and show per-column statistics.

    Parameters
    ----------
    locations : pandas.DataFrame
        Gridded statistics; must contain an ``RMSE`` column.

    Prints the proportion of rows with RMSE below 20, then displays a table
    with the min, median, mean, max and standard deviation of every column,
    rounded to two decimals.
    """
    n_cells = locations.shape[0]
    low_rmse_share = np.round(locations.RMSE.lt(20).sum() / n_cells, 2)
    print('Proportion of grid cells with RMSE < 20:', low_rmse_share, '\n')

    summary_rows = {
        'min': locations.min(),
        'median': locations.median(),
        'mean': locations.mean(),
        'max': locations.max(),
        'std dev': locations.std(),
    }
    summary = pd.DataFrame(list(summary_rows.values()), index=list(summary_rows))
    display(summary.round(2))

### Region summary

In [None]:
def display_region(locations, long_range, lat_range):
    """Print sample-weighted RMSE and bias for the grid cells inside a box.

    Parameters
    ----------
    locations : pandas.DataFrame
        Gridded statistics indexed by (Latitude, Longitude) with ``RMSE``,
        ``Bias`` and ``NumSamples`` columns.
    long_range : sequence
        ``(min, max)`` longitude of the box; both bounds are inclusive.
    lat_range : sequence
        ``(min, max)`` latitude of the box; both bounds are inclusive.

    The RMSE is combined as the square root of the sample-weighted mean of the
    squared cell RMSEs; the bias is the sample-weighted mean of the cell biases.
    """
    cells = locations.reset_index()
    in_lat = cells.Latitude.between(lat_range[0], lat_range[1])
    in_lon = cells.Longitude.between(long_range[0], long_range[1])
    region = cells[in_lat & in_lon]

    weighted_sq_error = (region.RMSE ** 2 * region.NumSamples).sum()
    region_rmse = np.sqrt(weighted_sq_error / region.NumSamples.sum())
    print('RMSE weighted average:', np.round(region_rmse, 2))

    weighted_bias = (region.Bias * region.NumSamples).sum()
    region_bias = weighted_bias / region.NumSamples.sum()
    print('Bias weighted average:', np.round(region_bias, 2))

### Grid cell LFMC estimation bias

In [None]:
def display_grid_bias(locations):
    """Print the proportion of grid cells falling in several bias ranges.

    Parameters
    ----------
    locations : pandas.DataFrame
        Gridded statistics; must contain a ``Bias`` column.

    Four proportions of the total row count are printed, each rounded to two
    decimals: cells with negative bias, cells with ``abs(bias) < 5``, cells
    with ``abs(bias) < 10`` and cells with ``abs(bias) > 20``. All comparisons
    are strict, matching the printed labels.
    """
    n_cells = locations.shape[0]
    bias = locations.Bias
    abs_bias = bias.abs()
    print('Proportion of grid cells with under-estimated LFMC:', np.round(bias.lt(0).sum() / n_cells, 2))
    # Strict comparisons: Series.between() would also count values exactly at
    # the bounds, which contradicts the "<" shown in the labels.
    print('Proportion of grid cells with abs(bias) < 5:', np.round(abs_bias.lt(5).sum() / n_cells, 2))
    print('Proportion of grid cells with abs(bias) < 10:', np.round(abs_bias.lt(10).sum() / n_cells, 2))
    print('Proportion of grid cells with abs(bias) > 20:', np.round(abs_bias.gt(20).sum() / n_cells, 2))

### Generate sites TIFF

In [None]:
def gen_locations_tiff(locations, output_file, bands, nodata_value, longitude='Longitude', latitude='Latitude'):
    """Write gridded statistics to a multi-band GeoTIFF with 0.5-degree pixels.

    Parameters
    ----------
    locations : pandas.DataFrame
        Grid-cell data indexed by a two-level MultiIndex whose levels are named
        by ``latitude`` and ``longitude`` (in either order). Index values are
        expected to be multiples of 0.5.
    output_file : str
        Path of the GeoTIFF to create.
    bands : sequence of str
        Columns of ``locations`` to write, one raster band per column, in order.
    nodata_value : float
        Value declared as no-data on every band and written to every pixel
        that has no value in ``locations``.
    longitude, latitude : str
        Names of the index levels holding the cell coordinates.

    Raises
    ------
    RuntimeError
        If GDAL cannot create ``output_file``.

    The raster covers the integer-degree bounding box of the index, uses
    EPSG:4326, and is north-up: row 0 holds the largest latitude.
    """
    lon_values = locations.index.get_level_values(longitude)
    lat_values = locations.index.get_level_values(latitude)
    x_coords = (int(np.floor(lon_values.min())), int(np.ceil(lon_values.max())))
    y_coords = (int(np.floor(lat_values.min())), int(np.ceil(lat_values.max())))
    # Two pixels per degree, plus one so both integer bounds are included.
    x_size = (x_coords[1] - x_coords[0]) * 2 + 1
    y_size = (y_coords[1] - y_coords[0]) * 2 + 1
    multi_index = pd.MultiIndex.from_product(
        [np.linspace(y_coords[0], y_coords[1], y_size),
         np.linspace(x_coords[0], x_coords[1], x_size)],
        names=[latitude, longitude])

    pixel_size = 0.5
    # GDAL geotransform: (origin x, pixel width, row rotation,
    #                     origin y, column rotation, pixel height).
    # The negative pixel height makes the raster north-up.
    transform = [x_coords[0], pixel_size, 0.0, y_coords[1], 0.0, -pixel_size]

    srs = osr.SpatialReference()
    srs.ImportFromEPSG(4326)

    driver = gdal.GetDriverByName('GTiff')
    out_map_raster = driver.Create(output_file, x_size, y_size, len(bands), gdal.GDT_Float32)
    if out_map_raster is None:
        # Without GDAL exceptions enabled, Create() signals failure with None.
        raise RuntimeError(f'Unable to create raster file: {output_file}')
    out_map_raster.SetGeoTransform(transform)
    out_map_raster.SetProjection(srs.ExportToWkt())

    for num, band in enumerate(bands, 1):
        band_data = (
            locations[band]
            # Align by level name so the index level order does not matter.
            .reorder_levels([latitude, longitude])
            .reindex(multi_index)
            # Rows = latitude, columns = longitude; flip so the largest
            # latitude becomes the first raster row.
            .unstack()[::-1]
            # Cells without data must carry the declared no-data value
            # rather than NaN.
            .fillna(nodata_value)
        )
        out_map_band = out_map_raster.GetRasterBand(num)
        out_map_band.SetNoDataValue(nodata_value)
        out_map_band.SetDescription(band)
        out_map_band.WriteArray(band_data.values)
        out_map_band.FlushCache()

    # Dropping the dataset reference closes the file and flushes it to disk.
    del out_map_raster

## Nowcasting

In [None]:
# Nowcasting scenario: load the ensemble predictions and their standard
# deviations, grid them, and print the summary reports.
print(f"{tests[0]['name']} Scenario")
print('===================')

# model_params is loaded once here and reused by the projection cell below.
with open(os.path.join(MODEL_DIR, 'model_params.json'), 'r') as f:
    model_params = json.load(f)

predict = pd.read_csv(os.path.join(MODEL_DIR, tests[0]['dir'], ifile_name), index_col=0)
std_dev = pd.read_csv(os.path.join(MODEL_DIR, tests[0]['dir'], ufile_name), index_col=0)
# Keep only the samples that have predictions, in prediction order.
samples = all_samples.reindex(predict.index)

locations['nowcasting'] = gen_gridded_data(predict, samples, std_dev, model_params['targetColumn'])
print('\nResults summary')
print('---------------')
display_summary(locations['nowcasting'])

# Regional reports: arguments are (longitude range, latitude range).
print('\nResults for southern Texas')
print('--------------------------')
display_region(locations['nowcasting'], (-100.0, -98.0), (26.0, 29.5))

print('\nResults for Rocky Mountains')
print('---------------------------')
display_region(locations['nowcasting'], (-118.0, -113.0), (47.0, 50.0))

print('\nBias summary')
print('------------')
display_grid_bias(locations['nowcasting'])

# TIFF generation is currently disabled; uncomment to write the raster.
#ofile = os.path.join(OUTPUT_DIR, ofile_names[0])
#gen_locations_tiff(locations['nowcasting'], ofile, BANDS, NODATA)

## 3-month Projection

In [None]:
# 3-month projection scenario: same processing as the nowcasting cell, using
# the second entry of `tests`. Relies on `model_params` loaded by the
# nowcasting cell, so that cell must have been run first.
print(f"{tests[1]['name']} Scenario")
print('===========================')

# NOTE: predict, std_dev and samples are overwritten here; later cells that
# read them therefore see the projection data.
predict = pd.read_csv(os.path.join(MODEL_DIR, tests[1]['dir'], ifile_name), index_col=0)
std_dev = pd.read_csv(os.path.join(MODEL_DIR, tests[1]['dir'], ufile_name), index_col=0)
# Keep only the samples that have predictions, in prediction order.
samples = all_samples.reindex(predict.index)

locations['projection'] = gen_gridded_data(predict, samples, std_dev, model_params['targetColumn'])
print('\nResults summary')
print('---------------')
display_summary(locations['projection'])

# Regional reports: arguments are (longitude range, latitude range).
print('\nResults for southern Texas')
print('--------------------------')
display_region(locations['projection'], (-100.0, -98.0), (26.0, 29.5))

print('\nResults for Rocky Mountains')
print('---------------------------')
display_region(locations['projection'], (-118.0, -113.0), (47.0, 50.0))

print('\nBias summary')
print('------------')
display_grid_bias(locations['projection'])

# TIFF generation is currently disabled; uncomment to write the raster.
#ofile = os.path.join(OUTPUT_DIR, ofile_names[1])
#gen_locations_tiff(locations['projection'], ofile, BANDS, NODATA)

In [None]:
# Per-cell change in every column between the scenarios (projection minus
# nowcasting). The subtraction aligns on the grid-cell index, so a cell present
# in only one scenario yields NaN.
locations['differences'] = locations['projection'] - locations['nowcasting']

In [None]:
# Summary statistics of the per-cell differences.
locations['differences'].describe()

In [None]:
# Density plots of the four main statistics: projection scenario.
locations['projection'][['RMSE', 'R2', 'Bias', 'Uncertainty']].plot.density(subplots=True, sharex=False, sharey=False, layout=(2,2))

In [None]:
# Density plots of the four main statistics: nowcasting scenario.
locations['nowcasting'][['RMSE', 'R2', 'Bias', 'Uncertainty']].plot.density(subplots=True, sharex=False, sharey=False, layout=(2,2))

In [None]:
# Density plots of the four main statistics: projection minus nowcasting.
locations['differences'][['RMSE', 'R2', 'Bias', 'Uncertainty']].plot.density(subplots=True, sharex=False, sharey=False, layout=(2,2))

In [None]:
# The cells below each compute the fraction of grid cells meeting a condition.
# The denominator is `.size`, which counts NaN entries as well.

# RMSE difference within [-5, 5] (bounds inclusive).
locations['differences'].RMSE[locations['differences'].RMSE.between(-5, 5)].size / locations['differences'].RMSE.size

In [None]:
# RMSE difference within [0, 5] (bounds inclusive).
locations['differences'].RMSE[locations['differences'].RMSE.between(0, 5)].size / locations['differences'].RMSE.size

In [None]:
# RMSE difference below 5.
locations['differences'].RMSE[locations['differences'].RMSE < 5].size / locations['differences'].RMSE.size

In [None]:
# RMSE difference negative, i.e. projection RMSE lower than nowcasting RMSE.
locations['differences'].RMSE[locations['differences'].RMSE < 0].size / locations['differences'].RMSE.size

In [None]:
# Projection RMSE below 30.
locations['projection'].RMSE[locations['projection'].RMSE < 30].size / locations['projection'].RMSE.size

In [None]:
# R2 difference within [-0.1, 0.1] (bounds inclusive).
locations['differences'].R2[locations['differences'].R2.between(-0.1, 0.1)].size / locations['differences'].R2.size

In [None]:
# R2 difference within [-0.1, 0] (bounds inclusive).
locations['differences'].R2[locations['differences'].R2.between(-0.1, 0)].size / locations['differences'].R2.size

In [None]:
# R2 difference positive, i.e. projection R2 higher than nowcasting R2.
locations['differences'].R2[locations['differences'].R2 > 0].size / locations['differences'].R2.size

In [None]:
# R2 difference below -0.1.
locations['differences'].R2[locations['differences'].R2 < -0.1].size / locations['differences'].R2.size

In [None]:
# Projection cells with R2 at or below zero.
locations['projection'].R2[locations['projection'].R2 <= 0].size / locations['projection'].R2.size

In [None]:
# Nowcasting cells with R2 at or below zero.
locations['nowcasting'].R2[locations['nowcasting'].R2 <= 0].size / locations['nowcasting'].R2.size

In [None]:
# Deciles of the projection bias.
locations['projection'].Bias.quantile([.10, .20, .30, .40, .50, .60, .70, .80, .90])

In [None]:
# Nowcasting bias quantiles from 0.40 to 0.50 in steps of 0.01.
locations['nowcasting'].Bias.quantile([.40, .41, .42, .43, .44, .45, .46, .47, .48, .49, .5])

In [None]:
# Projection bias quantiles over the same 0.40-0.50 range.
locations['projection'].Bias.quantile([.40, .41, .42, .43, .44, .45, .46, .47, .48, .49, .5])

In [None]:
# Deciles of the projection R2.
locations['projection'].R2.quantile([.10, .20, .30, .40, .50, .60, .70, .80, .90])

In [None]:
# Selected projection R2 quantiles.
locations['projection'].R2.quantile([.23, .33, .50, .67, .75, .76])

In [None]:
# Deciles of the projection RMSE.
locations['projection'].RMSE.quantile([.10, .20, .30, .40, .50, .60, .70, .80, .90])

In [None]:
# Minimum, quartiles and maximum of the projection RMSE.
locations['projection'].RMSE.quantile([.0, .25, .50, .75, 1])

In [None]:
# Cells where the projection has both a higher-or-equal R2 and a higher RMSE
# than nowcasting.
locations['differences'][(locations['differences'].R2 >= 0) & (locations['differences'].RMSE > 0)]

In [None]:
# Cells where the projection has both a lower R2 and a lower RMSE.
locations['differences'][(locations['differences'].R2 < 0) & (locations['differences'].RMSE < 0)]

In [None]:
# All cells where the projection RMSE is lower than the nowcasting RMSE.
locations['differences'][locations['differences'].RMSE < 0]

In [None]:
# Projection R2 (x axis) against projection bias (y axis), one point per cell.
locations['projection'][['R2', 'Bias']].plot.scatter('R2', 'Bias')

In [None]:
# Summary of projection cells with negative bias; reset_index() brings
# Latitude and Longitude into the described columns.
locations['projection'][locations['projection'].Bias < 0].reset_index().describe()

In [None]:
# Summary of projection cells with positive bias.
locations['projection'][locations['projection'].Bias > 0].reset_index().describe()

In [None]:
# Latitude and longitude densities of all projection cells.
locations['projection'].reset_index()[['Latitude', 'Longitude']].plot.density(subplots=True, sharex=False)

In [None]:
# Latitude and longitude densities of projection cells with negative bias.
locations['projection'][locations['projection'].Bias < 0].reset_index()[['Latitude', 'Longitude']].plot.density(subplots=True, sharex=False)

In [None]:
# Latitude and longitude densities of projection cells with positive bias.
locations['projection'][locations['projection'].Bias > 0].reset_index()[['Latitude', 'Longitude']].plot.density(subplots=True, sharex=False)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# 3 x 2 figure for the projection scenario. Left column: latitude density;
# right column: longitude density. Rows: all cells, cells with negative bias,
# cells with positive bias. The x axis is shared within each column.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
    3, 2, figsize=(6, 6), sharex='col', #sharey='row', #
    constrained_layout=True, dpi=100, linewidth=2, edgecolor="black")
locations['projection'].reset_index()['Latitude'].plot.density(ax=ax1)
locations['projection'][locations['projection'].Bias < 0].reset_index()['Latitude'].plot.density(ax=ax3)
locations['projection'][locations['projection'].Bias > 0].reset_index()['Latitude'].plot.density(ax=ax5)
locations['projection'].reset_index()['Longitude'].plot.density(ax=ax2)
locations['projection'][locations['projection'].Bias < 0].reset_index()['Longitude'].plot.density(ax=ax4)
locations['projection'][locations['projection'].Bias > 0].reset_index()['Longitude'].plot.density(ax=ax6)

In [None]:
# Same 3 x 2 layout for the scenario differences: rows are all cells, cells
# where the bias difference is negative, and cells where it is positive.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(
    3, 2, figsize=(6, 6), sharex='col', #sharey='row', #
    constrained_layout=True, dpi=100, linewidth=2, edgecolor="black")
locations['differences'].reset_index()['Latitude'].plot.density(ax=ax1)
locations['differences'][locations['differences'].Bias < 0].reset_index()['Latitude'].plot.density(ax=ax3)
locations['differences'][locations['differences'].Bias > 0].reset_index()['Latitude'].plot.density(ax=ax5)
locations['differences'].reset_index()['Longitude'].plot.density(ax=ax2)
locations['differences'][locations['differences'].Bias < 0].reset_index()['Longitude'].plot.density(ax=ax4)
locations['differences'][locations['differences'].Bias > 0].reset_index()['Longitude'].plot.density(ax=ax6)

In [None]:
# Show the full projection grid table.
locations['projection']

In [None]:
# Per-sample predictions joined with location and observed value, with the
# coordinates snapped to the same grid as gen_gridded_data (latitude rounded
# up, longitude rounded down, to the nearest 0.5). `predict` and `samples`
# hold whatever the most recently run scenario cell loaded.
predict2 = predict.merge(samples[['Latitude', 'Longitude', 'LFMC value']], left_index=True, right_index=True)
predict2['Latitude'] = np.ceil((predict2.Latitude * 2))/2
predict2['Longitude'] = np.floor((predict2.Longitude * 2))/2
predict2

In [None]:
# Copy of the samples with coordinates snapped to the grid in the same way.
samples2 = samples.copy()
samples2['Latitude'] = np.ceil((samples2.Latitude * 2))/2
samples2['Longitude'] = np.floor((samples2.Longitude * 2))/2
samples2

In [None]:
# Density of the observed values.
samples2['LFMC value'].plot.density()

In [None]:
# One density subplot per remaining column (predictions plus 'LFMC value').
# NOTE(review): layout=(7,8) allows at most 56 subplots — confirm it matches
# the number of prediction columns.
_ = predict2.drop(columns=['Latitude', 'Longitude']).plot.density(subplots=True, layout=(7,8), figsize=(12, 10), sharex=True, sharey=True, legend=False)

In [None]:
# 3 x 3 grid: eight selected prediction columns around the observed
# 'LFMC value' density (centre panel), with a common x range of 0-450.
_ = predict2[['5', '23', '46', '17', 'LFMC value', '39', '2', '28', '34']].plot.density(subplots=True, layout=(3, 3), figsize=(9, 9),
                                                                                        sharex=True, sharey=True, legend=False, xlim=(0,450))

In [None]:
# Mean kurtosis across the columns of the currently loaded predictions.
predict.kurtosis().mean()

In [None]:
# Kurtosis of the observed values, as computed by pandas.
samples['LFMC value'].kurtosis()

In [None]:
# Manual kurtosis calculation: g2 is the mean fourth power of the standardised
# values (population standard deviation, ddof=0) minus 3; G2 applies a
# sample-size correction to g2.
# NOTE(review): presumably a cross-check of the pandas value above — confirm.
g2 = (((samples['LFMC value'] - samples['LFMC value'].mean())/samples['LFMC value'].std(ddof=0)) ** 4).mean() - 3
n = len(samples['LFMC value'])
G2 = (n-1) / ((n-2)*(n-3)) * ((n+1) * g2 + 6)
G2

In [None]:
# Number of samples used above.
len(samples['LFMC value'])

In [None]:
# Mean kurtosis across the nowcasting prediction columns, re-read from disk
# because `predict` may hold the projection data at this point.
predict_now = pd.read_csv(os.path.join(MODEL_DIR, tests[0]['dir'], ifile_name), index_col=0)
predict_now.kurtosis().mean()

In [None]:
# Overlay the per-cell R2 densities of both scenarios, x axis limited to [0, 1].
ax = pd.concat([locations['projection']['R2'], locations['nowcasting']['R2']], axis=1, keys=['Projection', 'Nowcasting']).plot.density()
ax.set_xlim([0, 1])

In [None]:
# Overlay the per-cell RMSE densities, x axis limited to [0, 100].
ax = pd.concat([locations['projection']['RMSE'], locations['nowcasting']['RMSE']], axis=1, keys=['Projection', 'Nowcasting']).plot.density()
ax.set_xlim([0, 100])

In [None]:
# Overlay the per-cell bias densities, x axis limited to [-50, 50].
ax = pd.concat([locations['projection']['Bias'], locations['nowcasting']['Bias']], axis=1, keys=['Projection', 'Nowcasting']).plot.density()
ax.set_xlim([-50, 50])

In [None]:
# Minimum, deciles and maximum of R2 for both scenarios and their difference.
pd.concat([locations['projection']['R2'], locations['nowcasting']['R2'], locations['differences']['R2']],
          axis=1,
          keys=['Projection', 'Nowcasting', 'Differences']
         ).quantile([0, .10, .20, .30, .40, .50, .60, .70, .80, .90, 1])

In [None]:
# Minimum, quartiles and maximum of the same three R2 series.
pd.concat([locations['projection']['R2'], locations['nowcasting']['R2'], locations['differences']['R2']],
          axis=1,
          keys=['Projection', 'Nowcasting', 'Differences']
         ).quantile([0, .25, .50, .75, 1])

In [None]:
# Projection R2 against the 'Count' column, one point per cell.
# NOTE(review): 'Count' is not created in this notebook; presumably it is part
# of the calc_statistics output — verify.
locations['projection'].plot.scatter('R2', 'Count')

In [None]:
# Same scatter for the nowcasting scenario.
locations['nowcasting'].plot.scatter('R2', 'Count')

In [None]:
# Projection cells whose Count is at least 300.
locations['projection'][locations['projection'].Count >= 300]

In [None]:
# Summary of projection cells with negative R2.
locations['projection'][locations['projection'].R2 < 0].describe()

In [None]:
# Summary of projection cells with positive R2.
locations['projection'][locations['projection'].R2 > 0].describe()

In [None]:
# List the result sets stored so far.
locations.keys()

In [None]:
# Descriptive statistics of the observed 'LFMC value' per grid cell. The
# coordinates are snapped to the grid exactly as in gen_gridded_data (latitude
# rounded up, longitude rounded down, to the nearest 0.5).
samples_temp = samples[['Longitude', 'Latitude', 'LFMC value']].copy()
samples_temp['Latitude'] = np.ceil((samples_temp.Latitude * 2))/2
samples_temp['Longitude'] = np.floor((samples_temp.Longitude * 2))/2
# Group by (Latitude, Longitude) so the index has the same level order as the
# other tables stored in `locations`. droplevel removes the 'LFMC value' level
# that describe() places above the statistic names.
locations['LFMC_stats'] = samples_temp.groupby(['Latitude', 'Longitude']).describe().droplevel(0, axis=1)

In [None]:
# Summary of projection cells with negative R2, joined on the grid-cell index
# with the per-cell standard deviation of the observed values.
locations['projection'][locations['projection'].R2 < 0].merge(locations['LFMC_stats']['std'], left_index=True, right_index=True).describe()

In [None]:
# Distribution of the per-cell standard deviation over all cells with samples.
locations['LFMC_stats']['std'].describe()

In [None]:
# Cells containing exactly 300 samples.
locations['LFMC_stats'][locations['LFMC_stats']['count'] == 300]