# Generate Statistics and TIFFs for Site Location Analysis

In [None]:
import os
import json
import numpy as np
import pandas as pd

import initialise
import common
from analysis_utils import calc_statistics

In [None]:
# Export the project-configured GDAL_DATA location (when one is set) to the
# environment before the gdal modules are imported below.
if common.GDAL_DATA is not None:
    os.environ['GDAL_DATA'] = common.GDAL_DATA
# NOTE(review): recent GDAL releases expose these modules as
# `from osgeo import gdal, osr` - confirm against the installed GDAL version.
import gdal
from gdal import osr

In [None]:
# Evaluation scenarios; their order fixes the layout of MODELS_DIR and of the
# input/output file-name lists built further down.
SCENARIOS = ['within-site', 'out-of-site']
DISPLAY_NAMES = [' '.join((scenario.capitalize(), 'Models')) for scenario in SCENARIOS]

# Model directories, indexed as:
#   0, 1 -> '<scenario>_models' for each scenario, in SCENARIOS order
#   2, 3 -> the 'test2' and 'test3' comparison models
MODELS_DIR = [
    *(os.path.join(common.MODELS_DIR, f'{scenario}_models') for scenario in SCENARIOS),
    *(os.path.join(common.MODELS_DIR, 'comparison_models', test) for test in ('test2', 'test3')),
]

# Destination directory for the generated rasters.
OUTPUT_DIR = os.path.join(common.MAPS_DIR, 'Gridded_sites')
# Value registered as "no data" on every raster band.
NODATA = common.GDAL_NODATA_VALUE
# Statistics written to the raster, one band each.
BANDS = ['RMSE', 'Bias']

In [None]:
# All samples, indexed by the first CSV column.
all_samples = pd.read_csv(
    os.path.join(common.DATASETS_DIR, 'samples_365days.csv'),
    index_col=0,
)

# One entry per model type, naming the predictions file stored in its model directory.
models = [
    {'type': 'Multi-tempCNN', 'preds_file': f'ensemble{common.ENSEMBLE_SIZE}_{common.ANALYSIS_MODEL}.csv'},
    {'type': 'Modis-tempCNN', 'preds_file': f'predictions_{common.MODIS_TEMPCNN_MODEL}.csv'},
]

# Both lists are ordered model-major, scenario-minor: each model's predictions
# file name is repeated once per scenario, and each output file name combines
# the model type with the scenario.
ifile_names = [model['preds_file'] for model in models for _ in SCENARIOS]
ofile_names = [f'{model["type"]}_{scenario}_sites.tif' for model in models for scenario in SCENARIOS]

### Calculate the statistics for each grid cell

In [None]:
def gen_gridded_data(predictions, samples, y):
    """Aggregate prediction statistics onto a 0.5-degree latitude/longitude grid.

    Parameters
    ----------
    predictions : pandas.DataFrame
        One column per set of predictions, indexed by sample id.
    samples : pandas.DataFrame
        Sample data sharing the index of ``predictions``; must contain the
        ``Latitude`` and ``Longitude`` columns and the target column ``y``.
    y : str
        Name of the column in ``samples`` holding the measured values.

    Returns
    -------
    pandas.DataFrame
        Indexed by (Latitude, Longitude) grid cell. Holds the statistics
        produced by ``calc_statistics`` averaged over all prediction columns,
        plus a ``NumSamples`` column with the number of samples in the cell.
        Only cells containing at least 10 samples are included.
    """
    grid_keys = ['Latitude', 'Longitude']
    # Remember the prediction columns explicitly instead of relying on the
    # position of the merged columns.
    prediction_columns = list(predictions.columns)

    # Use the samples passed by the caller (previously the module-level
    # ``all_samples`` was read here and the ``samples`` argument was ignored).
    predict = predictions.merge(samples[grid_keys + [y]], left_index=True, right_index=True)

    # Snap each sample to its grid cell: latitude is rounded up and longitude
    # rounded down to the nearest half degree.
    predict['Latitude'] = np.ceil(predict.Latitude * 2) / 2
    predict['Longitude'] = np.floor(predict.Longitude * 2) / 2

    counts = predict.groupby(grid_keys).size()
    # Keep only the cells that hold at least 10 samples.
    populated_cells = counts[counts >= 10].index
    cells = predict.set_index(grid_keys).loc[populated_cells].groupby(grid_keys)

    per_column_stats = []
    for col in prediction_columns:
        # calc_statistics is called as (measured, predicted) for every cell;
        # presumably it returns one record of named statistics - verify
        # against analysis_utils.
        cell_stats = cells.apply(lambda cell, col=col: calc_statistics(cell[y], cell[col]))
        per_column_stats.append(pd.DataFrame(list(cell_stats.values), index=cell_stats.index))

    # Average each statistic over all prediction columns, cell by cell.
    locations = pd.concat(per_column_stats).groupby(level=[0, 1]).mean()
    locations['NumSamples'] = counts
    return locations

### Summary statistics

In [None]:
def display_summary(locations):
    """Print the share of grid cells with RMSE below 20 and show summary statistics.

    ``locations`` is a DataFrame with one row per grid cell and an ``RMSE``
    column. The minimum, median, mean, maximum and standard deviation of every
    column are displayed, rounded to two decimals.
    """
    n_cells = locations.shape[0]
    low_rmse_share = np.round(locations.RMSE.lt(20).sum() / n_cells, 2)
    print('Proportion of grid cells with RMSE < 20:', low_rmse_share, '\n')

    # Row label -> per-column statistic, in display order.
    summary_rows = {
        'min': locations.min(),
        'median': locations.median(),
        'mean': locations.mean(),
        'max': locations.max(),
        'std dev': locations.std(),
    }
    summary = pd.DataFrame(list(summary_rows.values()), index=list(summary_rows))
    display(summary.round(2))

### Region summary

In [None]:
def display_region(locations, long_range, lat_range):
    """Print the sample-weighted RMSE and bias for the grid cells inside a region.

    ``long_range`` and ``lat_range`` are (low, high) pairs; cells whose
    longitude and latitude lie within both ranges (bounds included) are used.
    RMSE values are combined as the square root of the weighted mean of their
    squares, bias values as a weighted mean, both weighted by ``NumSamples``.
    """
    cells = locations.reset_index()
    in_lat_range = cells.Latitude.between(lat_range[0], lat_range[1])
    in_long_range = cells.Longitude.between(long_range[0], long_range[1])
    region = cells[in_lat_range & in_long_range]

    total_samples = region.NumSamples.sum()
    weighted_mse = (region.RMSE ** 2 * region.NumSamples).sum() / total_samples
    weighted_bias = (region.Bias * region.NumSamples).sum() / total_samples

    print('RMSE weighted average:', np.round(np.sqrt(weighted_mse), 2))
    print('Bias weighted average:', np.round(weighted_bias, 2))

### Grid cell LFMC estimation bias

In [None]:
def display_grid_bias(locations):
    """Print the proportions of grid cells falling into several bias bands.

    ``locations`` is a DataFrame with one row per grid cell and a ``Bias``
    column. Each proportion is relative to the total number of rows and is
    rounded to two decimals.
    """
    bias = locations.Bias
    n_cells = locations.shape[0]

    negative_share = bias.lt(0).sum() / n_cells
    within_5_share = bias.between(-5, 5).sum() / n_cells
    within_10_share = bias.between(-10, 10).sum() / n_cells
    beyond_20_share = bias.abs().gt(20).sum() / n_cells

    print('Proportion of grid cells with under-estimated LFMC:', np.round(negative_share, 2))
    print('Proportion of grid cells with abs(bias) < 5:', np.round(within_5_share, 2))
    print('Proportion of grid cells with abs(bias) < 10:', np.round(within_10_share, 2))
    print('Proportion of grid cells with abs(bias) > 20:', np.round(beyond_20_share, 2))

### Generate sites TIFF

In [None]:
def gen_locations_tiff(locations, output_file, bands, nodata_value, longitude='Longitude', latitude='Latitude'):
    """Write gridded statistics to a multi-band Float32 GeoTIFF (EPSG:4326).

    Parameters
    ----------
    locations : pandas.DataFrame
        Indexed by a two-level (latitude, longitude) MultiIndex whose values
        lie on a 0.5-degree grid, with one column per entry of ``bands``.
        The levels must be in (latitude, longitude) order because the full
        grid is built in that order before re-indexing.
    output_file : str
        Path of the GeoTIFF to create.
    bands : sequence of str
        Column names to write, one raster band each, in this order. The
        column name is also used as the band description.
    nodata_value : float
        Value registered as "no data" on every band and written to every
        pixel that has no value in ``locations``.
    longitude, latitude : str
        Names of the index levels holding the cell coordinates.

    Raises
    ------
    RuntimeError
        If GDAL cannot create ``output_file``.
    """
    longitudes = locations.index.get_level_values(longitude)
    latitudes = locations.index.get_level_values(latitude)

    # Expand the extent outwards to whole degrees.
    x_coords = (int(np.floor(longitudes.min())), int(np.ceil(longitudes.max())))
    y_coords = (int(np.floor(latitudes.min())), int(np.ceil(latitudes.max())))

    # Two pixels per degree, plus one so that both end coordinates are included.
    pixel_size = 0.5
    x_size = (x_coords[1] - x_coords[0]) * 2 + 1
    y_size = (y_coords[1] - y_coords[0]) * 2 + 1

    # Every (latitude, longitude) cell of the rectangular output grid.
    multi_index = pd.MultiIndex.from_product([
        np.linspace(y_coords[0], y_coords[1], y_size),
        np.linspace(x_coords[0], x_coords[1], x_size),
    ])

    # Origin is (minimum longitude, maximum latitude); rows run towards
    # decreasing latitude, hence the negative pixel height.
    transform = [x_coords[0], pixel_size, 0.0, y_coords[1], 0.0, -pixel_size]

    srs = osr.SpatialReference()
    srs.ImportFromEPSG(4326)

    driver = gdal.GetDriverByName('GTiff')
    out_map_raster = driver.Create(output_file, x_size, y_size, len(bands), gdal.GDT_Float32)
    if out_map_raster is None:
        # Without GDAL exceptions enabled, Create() signals failure by returning None.
        raise RuntimeError(f'GDAL could not create raster file: {output_file}')
    out_map_raster.SetGeoTransform(transform)
    out_map_raster.SetProjection(srs.ExportToWkt())

    for num, band in enumerate(bands, 1):
        # Lay the values out on the full grid and flip the rows so that the
        # first row holds the maximum latitude, matching the geotransform.
        band_data = locations[band].reindex(multi_index).unstack()[::-1]
        # Cells without a value come back from reindex() as NaN; write the
        # declared no-data value instead so the data agrees with the band metadata.
        band_data = band_data.fillna(nodata_value)
        out_map_band = out_map_raster.GetRasterBand(num)
        out_map_band.SetNoDataValue(nodata_value)
        out_map_band.SetDescription(band)
        out_map_band.WriteArray(band_data.values)
        out_map_band.FlushCache()

    # Drop the dataset reference so GDAL finalises the file.
    del out_map_raster

## Multi-tempCNN Within-site Scenario

In [None]:
# Report header for the Multi-tempCNN models in the first scenario.
print(f'Multi-tempCNN {SCENARIOS[0].capitalize()} Scenario', '==================================', sep='\n')

# The model parameters name the column that holds the target values.
with open(os.path.join(MODELS_DIR[0], 'model_params.json'), 'r') as f:
    model_params = json.load(f)

# Load the predictions and line the samples up with them.
predict = pd.read_csv(os.path.join(MODELS_DIR[0], ifile_names[0]), index_col=0)
samples = all_samples.reindex(predict.index)

# Per-grid-cell statistics, followed by the overall, regional and bias reports.
locations = gen_gridded_data(predictions=predict, samples=samples, y=model_params['targetColumn'])
print('\nResults summary', '---------------', sep='\n')
display_summary(locations)

print('\nResults for southern Texas', '--------------------------', sep='\n')
display_region(locations, long_range=(-100.0, -98.0), lat_range=(26.0, 29.5))

print('\nResults for Rocky Mountains', '---------------------------', sep='\n')
display_region(locations, long_range=(-118.0, -113.0), lat_range=(47.0, 50.0))

print('\nBias summary', '------------', sep='\n')
display_grid_bias(locations)

# Save the gridded statistics as a raster.
ofile = os.path.join(OUTPUT_DIR, ofile_names[0])
gen_locations_tiff(locations=locations, output_file=ofile, bands=BANDS, nodata_value=NODATA)

## Multi-tempCNN Out-of-site Scenario

In [None]:
# Report header for the Multi-tempCNN models in the second scenario.
print(f'Multi-tempCNN {SCENARIOS[1].capitalize()} Scenario', '==================================', sep='\n')

# The model parameters name the column that holds the target values.
with open(os.path.join(MODELS_DIR[1], 'model_params.json'), 'r') as f:
    model_params = json.load(f)

# Load the predictions for this model directory.
predict = pd.read_csv(os.path.join(MODELS_DIR[1], ifile_names[1]), index_col=0)

# Per-grid-cell statistics, followed by the overall, regional and bias reports.
locations = gen_gridded_data(predictions=predict, samples=all_samples, y=model_params['targetColumn'])
print('\nResults summary', '---------------', sep='\n')
display_summary(locations)

print('\nResults for southern Texas', '--------------------------', sep='\n')
display_region(locations, long_range=(-100.0, -98.0), lat_range=(26.0, 29.5))

print('\nResults for Rocky Mountains', '---------------------------', sep='\n')
display_region(locations, long_range=(-118.0, -113.0), lat_range=(47.0, 50.0))

print('\nBias summary', '------------', sep='\n')
display_grid_bias(locations)

# Save the gridded statistics as a raster.
ofile = os.path.join(OUTPUT_DIR, ofile_names[1])
gen_locations_tiff(locations=locations, output_file=ofile, bands=BANDS, nodata_value=NODATA)

## Modis-tempCNN Within-site Scenario

In [None]:
# Report header for the Modis-tempCNN models in the first scenario.
print(f'Modis-tempCNN {SCENARIOS[0].capitalize()} Scenario', '==================================', sep='\n')

# The model parameters name the column that holds the target values.
with open(os.path.join(MODELS_DIR[2], 'model_params.json'), 'r') as f:
    model_params = json.load(f)

# Load the predictions and line the samples up with them.
predict = pd.read_csv(os.path.join(MODELS_DIR[2], ifile_names[2]), index_col=0)
samples = all_samples.reindex(predict.index)

# Per-grid-cell statistics, followed by the overall, regional and bias reports.
locations = gen_gridded_data(predictions=predict, samples=samples, y=model_params['targetColumn'])
print('\nResults summary', '---------------', sep='\n')
display_summary(locations)

print('\nResults for southern Texas', '--------------------------', sep='\n')
display_region(locations, long_range=(-100.0, -98.0), lat_range=(26.0, 29.5))

print('\nResults for Rocky Mountains', '---------------------------', sep='\n')
display_region(locations, long_range=(-118.0, -113.0), lat_range=(47.0, 50.0))

print('\nBias summary', '------------', sep='\n')
display_grid_bias(locations)

# Save the gridded statistics as a raster.
ofile = os.path.join(OUTPUT_DIR, ofile_names[2])
gen_locations_tiff(locations=locations, output_file=ofile, bands=BANDS, nodata_value=NODATA)

## Modis-tempCNN Out-of-site Scenario

In [None]:
# Report header for the Modis-tempCNN models in the second scenario.
print(f'Modis-tempCNN {SCENARIOS[1].capitalize()} Scenario', '==================================', sep='\n')

# The model parameters name the column that holds the target values.
with open(os.path.join(MODELS_DIR[3], 'model_params.json'), 'r') as f:
    model_params = json.load(f)

# Load the predictions for this model directory.
predict = pd.read_csv(os.path.join(MODELS_DIR[3], ifile_names[3]), index_col=0)

# Per-grid-cell statistics, followed by the overall, regional and bias reports.
locations = gen_gridded_data(predictions=predict, samples=all_samples, y=model_params['targetColumn'])
print('\nResults summary', '---------------', sep='\n')
display_summary(locations)

print('\nResults for southern Texas', '--------------------------', sep='\n')
display_region(locations, long_range=(-100.0, -98.0), lat_range=(26.0, 29.5))

print('\nResults for Rocky Mountains', '---------------------------', sep='\n')
display_region(locations, long_range=(-118.0, -113.0), lat_range=(47.0, 50.0))

print('\nBias summary', '------------', sep='\n')
display_grid_bias(locations)

# Save the gridded statistics as a raster.
ofile = os.path.join(OUTPUT_DIR, ofile_names[3])
gen_locations_tiff(locations=locations, output_file=ofile, bands=BANDS, nodata_value=NODATA)