# Analyze_CETB_Cubes

Calculate SIR and/or GRD analysis by year for selected subset areas. For example, running melt-onset-dates by year and pixel, or calculating intrapixel stddevs.

Use this notebook for any analysis/display that is examining the TBs from the cubefiles.

Saves MOD/EHD DAV booleans with geolocation information in pickle files that can be examined/displayed elsewhere.

Makes geotiff maps of various annual and/or average results.


## Load in all the modules needed

In [None]:
%matplotlib inline
# Windows needs special attention: point PROJ_LIB and GDAL_DATA at the data
# directories inside the active conda environment before the basemap import
# further down, which bypasses an error from mpl_toolkits.basemap.
# Note: os.environ["CONDA_PREFIX"] raises KeyError if that variable is not set.
import os
if os.name == 'nt':
    os.environ["PROJ_LIB"] = os.path.join(os.environ["CONDA_PREFIX"], "Library", "share")
    os.environ["GDAL_DATA"] = os.path.join(os.environ["CONDA_PREFIX"], "Library", "share", "gdal")
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from netCDF4 import Dataset, num2date
import numpy as np
import pandas as pd
from pathlib import Path
import re
from cetbtools.ease2conv import Ease2Transform
from mpl_toolkits.basemap import Basemap
import glob
from pathlib import Path

# Set parameters to display more than default rows/cols in Data frames

In [None]:
# Widen the pandas display limits so that large data frames are shown with
# more rows than the defaults allow. Options are applied in the listed order.
for _option, _value in (
    ('display.max_rows', 999),
    ('display.min_rows', 200),
    ('display.max_columns', 10),
    ('display.width', 120),
    ('display.max_colwidth', 10),
):
    pd.set_option(_option, _value)

In [None]:
# Define the local machine location of CETB data cubes
# This directory is expected to contain subdirectories in the following hierarchy
# that duplicates the hierarchy on the Google Shared Drive NSIDC-SD-CETB/v1/, 
# for example:
# dataDir/F13_SSMI/N/nc_cubes/cubes_<regionName>
user = 'Joan' #Mariah #MJWindows #MAHMac
if ('Joan' == user):
    dataDir = Path(Path.home(), 'ceph', 'jmr204group','CETB_cubes')
    #dataDir = '/mnt/data3/cetb/nsidc0630_v1/' #jmr machine fringe 
    scriptDir = Path(Path.home(), 'ipynb_melt_onset', 'scripts')
    outDir = Path(Path.home(), 'cetb/ipynb_melt__onset_plots')
elif ('Mariah' == user):
    dataDir = Path(Path.home(), 'nsidc0630_v1') # Mariah's PC or Mary Jo's Mac
    scriptDir = Path(Path.home(), 'ipynb_melt_onset', 'scripts')
    outDir = Path(Path.home(), 'nsidc0630_v1', 'MODs')
elif ('MJWindows' == user):
    dataDir = Path('Z:/mj On My Mac/nsidc0630_v1') # Mary Jo's Windows machine
    scriptDir = Path(Path.home(), 'ipynb_melt_onset', 'scripts')
    outDir = ''
elif ('MJMac' == user):
    dataDir = Path(Path.home(), 'nsidc0630_v1') # Mary Jo's Mac
    scriptDir = Path(Path.home(), 'ipynb_melt_onset', 'scripts')  
    outDir = Path(Path.home(), 'nsidc0630_v1', 'pkls')
elif ('MAHMac' == user):
    dataDir = Path(Path.home(), 'nsidc0630_v1') # Molly's Mac
    scriptDir = Path(Path.home(), 'Projects', 'ipynb_melt_onset', 'scripts')  
    outDir = Path(Path.home(), 'nsidc0630_v1', 'pkls')
else:
    raise ValueError("unknown user= %s\n" % (user) )
    
%cd $scriptDir
dataDir, outDir, user

In [None]:
# load the custom functions
from CETB_IO import read_Tb_whole
from CETB_IO import coords
from CETB_algorithms import calc_DAV
from CETB_IO import grid_locations_of_subset
from CETB_IO import years_for
from CETB_IO import get_sir_info
from CETB_IO import write_df_to_geotiff
from CETB_IO import get_site_boundaries
from CETB_algorithms import DAV_MOD
from CETB_analysis import MOD_array

## Specify inputs

This cell is the place to specify the cube name to analyze ('WesternCA', 'AKYukon', etc)
and the sensor and channels to process.

Here is the current map of coverages of our subset cubes for the Northern Hemisphere:

<img src='graphics/CETB_EASE2_N_cubes_geolocations.v2.png' width="800" height="800">



In [None]:
# Specify region, satellite, sensor, channel, and image reconstruction
# algorithm of interest. The values are used to build the cube directory and
# file-name prefix, so they must use the same syntax as the cube file names
# and sub-directories.
# This notebook reads 2 CETB datasets (one GRD, one SIR) so that
# channels/algorithms/sensors can be compared.
region = 'EEurope'  # other cubes: 'AKYukon', 'Laptev', 'WesternCA', 'GLaIL'
sat_GRD = 'GCOMW1'  # 'AQUA' for AMSRE; 'F13', 'F14', 'F15', ... for SSMI; also 'F18', 'GCOMW1'
sat_SIR = 'GCOMW1'
sensor_GRD = 'AMSR2'  # 'AMSRE', 'SSMI', 'SSMIS', 'AMSR2', etc.
sensor_SIR = 'AMSR2'
channel_GRD = '36V'  # '36V', '36H', '18V', '18H', etc. ('19V', '19H' and '37V', '37H' for SSMI)
channel_SIR = '36V'
alg_GRD = 'GRD'  # SIR or GRD
alg_SIR = 'SIR'

hemName = 'N'

# get sir to grd factor and sir_gpd name
sir_2_grd_factor, sir_gpd = get_sir_info(channel_SIR, hem=hemName)
print("channel=%s, sir_2_grd_factor=%d, sir_gpd=%s" % (
    channel_SIR, sir_2_grd_factor, sir_gpd))

cubeType_GRD = channel_GRD + '-' + alg_GRD
cubeType_SIR = channel_SIR + '-' + alg_SIR

# Data provider and version string that appear in the cube file names.
# NOTE(review): these are chosen from sensor_GRD only and reused for the SIR
# prefix, which assumes sensor_SIR has the same provider/version -- confirm
# before comparing two different sensors.
if sensor_GRD in ('SSMI', 'SSMIS'):
    provider = 'CSU'
    version = 'v1.*'
elif sensor_GRD == 'AMSRE':
    provider = 'RSS'
    version = 'v1.3'
elif sensor_GRD == 'AMSR2':
    provider = 'PPS_XCAL'
    version = 'v1.*'
else:
    # Fail here with a clear message instead of a NameError on provider/version
    # when the prefixes are built below.
    raise ValueError("unknown sensor_GRD= %s\n" % (sensor_GRD))

# Cube directories follow dataDir/<sat>_<sensor>/<hem>/nc_cubes/cubes_<region>/
# (kept as strings with a trailing slash, as in the original cell)
datadir_GRD = "%s/%s_%s/%s/nc_cubes/cubes_%s/" % (
    dataDir, sat_GRD, sensor_GRD, hemName, region)
datadir_SIR = "%s/%s_%s/%s/nc_cubes/cubes_%s/" % (
    dataDir, sat_SIR, sensor_SIR, hemName, region)

# File-name prefix: CETB.cubefile.<region>.<sat>_<sensor>-<channel>-<alg>-<provider>-<version>
prefix_GRD = "CETB.cubefile.%s.%s_%s-%s-%s-%s" % (
    region, sat_GRD, sensor_GRD, cubeType_GRD, provider, version)
prefix_SIR = "CETB.cubefile.%s.%s_%s-%s-%s-%s" % (
    region, sat_SIR, sensor_SIR, cubeType_SIR, provider, version)

Years = years_for(sat_GRD)
# Open question from the original notes: if more years are requested than are
# available, the preferred behaviour is to warn but return what is found.

# Only truncate Years here for speed and/or testing
subYears = Years[0:2]
#subYears = Years


In [None]:
# Display the (possibly truncated) list of years that will be processed
subYears

In [None]:
# NOTE(review): called with an empty site label; presumably this shows the
# site labels that get_site_boundaries knows about -- confirm in CETB_IO
get_site_boundaries('')

## Specify the geographic bounds of the subset area inside the cube to process

Also set the 'Site' name for identifying output files.

In [None]:
# Look up the lat/lon bounds and the site name for the chosen label;
# SiteLabel is also used later in the output file names
SiteLabel='NEUkraine'
lat_start, lat_end, lon_start, lon_end, Site=get_site_boundaries(SiteLabel)
# Display the values that were returned
lat_start, lat_end, lon_start, lon_end, Site

In [None]:
# Display the data root plus the GRD cube directory and file-name prefix
# that are passed to coords() in the next cell
dataDir, datadir_GRD, prefix_GRD

In [None]:
# Get the GRD pixel IDs for the chosen lat/lon rectangle, then scale them by
# the SIR-to-GRD factor to get the corresponding SIR pixel row/col numbers
rows_cols_GRD = coords(
    datadir_GRD, prefix_GRD, lat_start, lat_end, lon_start, lon_end)
rows_cols_env = tuple(sir_2_grd_factor * np.array(rows_cols_GRD))
print(rows_cols_GRD, rows_cols_env, sep='\n')

In [None]:
# Load the Tb data for the selected years: the GRD cube over the GRD pixel
# bounds, and the SIR cube over the scaled (SIR) pixel bounds.
# The first four entries of each rows/cols set are passed through in order.
data_GRD = read_Tb_whole(
    datadir_GRD, prefix_GRD, subYears, *rows_cols_GRD[:4])
data_SIR = read_Tb_whole(
    datadir_SIR, prefix_SIR, subYears, *rows_cols_env[:4])


In [None]:
# Compute DAV from the 'TB' entry of each imported dataset (GRD first, then SIR)
DAV_GRD, DAV_SIR = (calc_DAV(cube['TB']) for cube in (data_GRD, data_SIR))

In [None]:
# Compare the shapes of the SIR and GRD TB arrays with their DAV arrays
# (per the original notes the DAVs are masked arrays and the dimensions are
# (timeSteps, rows, cols))
data_SIR['TB'].shape, DAV_SIR.shape, data_GRD['TB'].shape, DAV_GRD.shape

## TODO: Placeholder here to calculate the std dev of the 64 SIR pixels in each GRD pixel
## Will also need to decide how to save this additional information

## Specify the MOD and EHD parameters

window : number of observations in the rolling window for the MOD algorithm; '10' would be 5 days (remember that the CETB data come as 2 measurements per day)

count : number of Tb/DAV exceedances needed to trigger melt-onset-date

DAV and TB thresholds here are from these publications:

From Johnson et al 2020, AMSRE: rSIR Tb >= 249, DAV >= 13; GRD Tb >= 243, DAV >= 14

From Johnson et al 2020, SSMI: rSIR and GRD Tb >= 247, DAV >= 10

DAV_threshold : diurnal amplitude variation threshold in kelvins; default is 10 (per pubs)

Tb_threshold : TB threshold above which melt is possibly triggered

Colorado (Johnson et al 2020) used 5 times in a 7 day window

Patagonia (Monahan and Ramage 2010) and Yukon (Semmens et al 2013?) used 3 times in a 5 day window


In [None]:
# --- MOD (melt onset date) rolling-sum settings ---
# MOD_window is the number of observations in the window; with 2 measurements
# per day, '14' would be 7 days.
# MOD_count is the number of Tb/DAV exceedances needed to trigger MOD.
MOD_window = 10
MOD_count = 1

# From Matias Fall 2022: a forward-facing indexer assigns the rolling sum
# value to the beginning of the window. To assign it to the end of the window
# instead, set "window" to the plain integer (see the commented line below).
window = indexer = pd.api.indexers.FixedForwardWindowIndexer(
    window_size=MOD_window)
#window = MOD_window # this assigns the rolling sum value to the end of the window

# --- EHD (end of high DAV) rolling-sum settings ---
# EHD_count is the number of Tb/DAV exceedances needed to trigger EHD.
# At the current time EHD does not use the forward-facing indexer; it uses
# the default behavior (From Matias Fall 2022). Theoretically the same thing
# could be done with EHD_window, but it is not clear that would ever be wanted.
EHD_window = 20
EHD_count = 7

# --- Thresholds that count as an exceedance ---
# From Johnson et al 2020 AMSRE rSIR Tb >= 249 DAV>=13 and AMSRE GRD Tb>=243 DAV>=14
# From Johnson et al 2020 SSMI rSIR and GRD Tb>=247 DAV>=10
DAV_threshold = 8
Tb_threshold = 252

In [None]:
# Histogram of the SIR brightness temperatures for one calendar year.
# Every pixel of the SIR subset is included.
year = 2021

# Select the time steps whose calendar year matches, across all rows/cols
in_year = data_SIR['cal_year'] == year
data = data_SIR['TB'][in_year, :, :]
# Keep only values >= 0 (presumably drops fill/missing values -- confirm)
data = data[data >= 0]
bins = range(150, 300)  # bins for histogram

fig, ax = plt.subplots()
ax.hist(data, bins)
ax.axvline(x=Tb_threshold, color='red')  # mark the Tb threshold
ax.set_title('TB SIR Histogram, area=%s, year=%s' % (Site, str(year)))
ax.set_xlabel('Brightness Temp (K)')

# Create data frames of MOD and EHD


In [None]:
# Run the MOD/EHD calculation on the GRD data; the four returned data frames
# hold the MOD day-of-year, the melt flags, the EHD day-of-year and the EHD flags
grd_results = MOD_array(
    datadir_GRD, prefix_GRD, data_GRD, DAV_GRD, rows_cols_GRD,
    subYears, window, MOD_count,
    EHD_window, EHD_count,
    DAV_threshold, Tb_threshold)
MOD_DOY_GRD_df, meltflag_GRD_df, EHD_DOY_GRD_df, EHDflag_GRD_df = grd_results
MOD_DOY_GRD_df

# the meltflag dataframe 

Has an entry for each date (morning and evening) on rows, and for each pixel on columns 

It contains a 1 for any location/date that the melt criteria were met

In [None]:
# Display the GRD melt flags returned by MOD_array
meltflag_GRD_df

In [None]:
# Run the MOD/EHD calculation on the SIR data -- MOD will be in day of year (DOY).
# Pass Years instead of subYears to process all years.
sir_results = MOD_array(
    datadir_SIR, prefix_SIR, data_SIR, DAV_SIR, rows_cols_env,
    subYears, window, MOD_count,
    EHD_window, EHD_count,
    DAV_threshold, Tb_threshold)
MOD_DOY_df, meltflag_df, EHD_DOY_df, EHDflag_df = sir_results
MOD_DOY_df

# Make a histogram of the MODs for a selected year

In [None]:
# Histogram of the GRD melt onset dates for the selected year.
# `year` is the value set in the TB-histogram cell; using it for the column
# as well keeps the plotted data and the title in agreement (the column was
# previously hard-coded to 2021 while the title used `year`).
fig, ax = plt.subplots()
MOD_DOY_GRD_df.hist(ax=ax, column=year)
ax.set_title('MOD Histogram, area=%s, year=%s' % (Site, str(year)))
ax.set_xlabel('DOY')

# Notes about changes from original notebooks:

The old notebooks used to call MOD_array to get the average MOD for a set of years 
and then call MOD_array_year for a given year of interest.

Now we just call MOD_array for SIR data and GRD data, and get back a data frame with 
MOD columns for each individual year, and one column for the avg MOD for all the years.

This runs much faster, and can be saved and just re-read from a saved file on disk.

The old notebooks only calculated MOD, the current versions of the MOD_array function also calculate EHD.

You might decide to change MOD_array to only do one or the other based on an input switch if you don't always
want both calculations.

The cells below save these data frames to disk; to read one back in another notebook, use:

new = pd.read_pickle(MOD_DOY_filename)

# Save the MOD by year data frames for SIR and GRD to pickle files

Also saving geolocation and melt onset flag data frames

In [None]:
# Create the output directory if it does not exist yet (exist_ok avoids the
# separate check-then-create step)
os.makedirs(outDir, exist_ok=True)

# Short string in the filename to indicate whether window was the indexer
# function ('beg') or a plain integer ('end')
rollingSumLabel = 'end' if isinstance(window, int) else 'beg'

# Parameter tag shared by both names: C<count>W<window><label>T<Tb>D<DAV>
paramTag = "C%1dW%02d%sT%03dD%02d" % (
    MOD_count, MOD_window, rollingSumLabel, Tb_threshold, DAV_threshold)
MODinfo = "MOD_" + paramTag
meltflaginfo = "meltflag_" + paramTag


def _pickle_basename(gpd, sat, channel, info):
    """Return outDir/<gpd>.<region>.<SiteLabel>.<sat>.<channel>.<first>-<last>.<info>."""
    return "%s/%s.%s.%s.%s.%s.%s-%s.%s" % (
        outDir, gpd, region, SiteLabel, sat, channel,
        subYears[0], subYears[-1], info)


sirMODBasename = _pickle_basename(
    data_SIR['gpd'], sat_SIR, channel_SIR, MODinfo)
grdMODBasename = _pickle_basename(
    data_GRD['gpd'], sat_GRD, channel_GRD, MODinfo)
sirmeltflagBasename = _pickle_basename(
    data_SIR['gpd'], sat_SIR, channel_SIR, meltflaginfo)
grdmeltflagBasename = _pickle_basename(
    data_GRD['gpd'], sat_GRD, channel_GRD, meltflaginfo)

# Write each data frame to <basename>.pkl, in the same order as before
for _label, _basename, _frame in (
    ("MOD_DOY", sirMODBasename, MOD_DOY_df),
    ("MOD_DOY_GRD", grdMODBasename, MOD_DOY_GRD_df),
    ("meltflag_df", sirmeltflagBasename, meltflag_df),
    ("meltflag_GRD_df", grdmeltflagBasename, meltflag_GRD_df),
):
    filename = "%s.pkl" % (_basename)
    _frame.to_pickle(filename)
    print("%s dataframe saved to %s\n" % (_label, filename))

# Save the EHD by year data frames for SIR and GRD to pickle files

Also saving geolocation and EHD flag data frames

In [None]:
# Create the output directory if it does not exist yet (exist_ok avoids the
# separate check-then-create step)
os.makedirs(outDir, exist_ok=True)

# Short string in the filename to indicate whether EHD_window was an indexer
# object ('beg') or a plain integer ('end')
rollingSumLabel = 'end' if isinstance(EHD_window, int) else 'beg'

# Parameter tag shared by both names: C<count>W<window><label>T<Tb>D<DAV>
paramTag = "C%1dW%02d%sT%03dD%02d" % (
    EHD_count, EHD_window, rollingSumLabel, Tb_threshold, DAV_threshold)
EHDinfo = "EHD_" + paramTag
EHDflaginfo = "EHDflag_" + paramTag


def _pickle_basename(gpd, sat, channel, info):
    """Return outDir/<gpd>.<region>.<SiteLabel>.<sat>.<channel>.<first>-<last>.<info>."""
    return "%s/%s.%s.%s.%s.%s.%s-%s.%s" % (
        outDir, gpd, region, SiteLabel, sat, channel,
        subYears[0], subYears[-1], info)


sirEHDBasename = _pickle_basename(
    data_SIR['gpd'], sat_SIR, channel_SIR, EHDinfo)
grdEHDBasename = _pickle_basename(
    data_GRD['gpd'], sat_GRD, channel_GRD, EHDinfo)
sirEHDflagBasename = _pickle_basename(
    data_SIR['gpd'], sat_SIR, channel_SIR, EHDflaginfo)
grdEHDflagBasename = _pickle_basename(
    data_GRD['gpd'], sat_GRD, channel_GRD, EHDflaginfo)

# Write each data frame to <basename>.pkl, in the same order as before
for _label, _basename, _frame in (
    ("EHD_DOY", sirEHDBasename, EHD_DOY_df),
    ("EHD_DOY_GRD", grdEHDBasename, EHD_DOY_GRD_df),
    ("EHDflag_df", sirEHDflagBasename, EHDflag_df),
    ("EHDflag_GRD_df", grdEHDflagBasename, EHDflag_GRD_df),
):
    filename = "%s.pkl" % (_basename)
    _frame.to_pickle(filename)
    print("%s dataframe saved to %s\n" % (_label, filename))

# Optionally, save the MOD for each year as a geotiff

In [None]:
# Rebuild the MOD basenames without the subYears date range, so that the
# geotiff routine can add the year itself
basenameTemplate = "%s/%s.%s.%s.%s.%s.%s"
sirMODBasename = basenameTemplate % (
    outDir, data_SIR['gpd'], region, SiteLabel, sat_SIR, channel_SIR, MODinfo)
grdMODBasename = basenameTemplate % (
    outDir, data_GRD['gpd'], region, SiteLabel, sat_GRD, channel_GRD, MODinfo)

# Write the SIR data frame first, then the GRD one
outSIR = write_df_to_geotiff(
    MOD_DOY_df, data_SIR['gpd'], sirMODBasename, verbose=True)
outGRD = write_df_to_geotiff(
    MOD_DOY_GRD_df, data_GRD['gpd'], grdMODBasename, verbose=True)

# Optionally, save the EHD for each year as a geotiff

In [None]:
# Rebuild the EHD basenames without the subYears date range, so that the
# geotiff routine can add the year itself
basenameTemplate = "%s/%s.%s.%s.%s.%s.%s"
sirEHDBasename = basenameTemplate % (
    outDir, data_SIR['gpd'], region, SiteLabel, sat_SIR, channel_SIR, EHDinfo)
grdEHDBasename = basenameTemplate % (
    outDir, data_GRD['gpd'], region, SiteLabel, sat_GRD, channel_GRD, EHDinfo)

# Write the SIR data frame first, then the GRD one
outSIR = write_df_to_geotiff(
    EHD_DOY_df, data_SIR['gpd'], sirEHDBasename, verbose=True)
outGRD = write_df_to_geotiff(
    EHD_DOY_GRD_df, data_GRD['gpd'], grdEHDBasename, verbose=True)

# TODO: Other things to potentially save here

Examples include: saving the DAV, or saving the std dev
    
    