# Imports

In [2118]:
from matplotlib import pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn as sns
import scipy as sp
import hvplot.pandas
import numpy as np
import holoviews as hv

## Load TSM RS data

In [2259]:
paths = list(Path('validation').glob(f'rrs_timeseries_siwrp.csv'))
path_csv = paths[0]
df_rs = pd.read_csv(path_csv, dtype={'station_id':str})
df_rs['dt_utc'] = pd.to_datetime(df_rs['system:time_start'], format='mixed', utc=True)
df_rs['dt_loc'] = df_rs['dt_utc'].dt.tz_convert('Asia/Ho_Chi_Minh')
df_rs['date'] = df_rs['dt_loc'].dt.date
df_rs = df_rs.drop(columns=['.geo', 'system:time_start'], errors='ignore')
df_rs = df_rs.sort_values('date')
df_rs['data_source'] = 'siwrp'
print(f'Loaded "{path_csv}" file.')

Loaded "validation\sr_timeseries_siac_downstream.csv" file.


## Load TSM RS data (SIAC)

In [2260]:
paths = list(Path('validation').glob(f'sr_timeseries_siac_siwrp.csv'))
path_csv = paths[0]
df_rs = pd.read_csv(path_csv, dtype={'station_id':str})
df_rs['dt_utc'] = pd.to_datetime(df_rs['system:time_start'], format='mixed', utc=True)
df_rs['dt_loc'] = df_rs['dt_utc'].dt.tz_convert('Asia/Ho_Chi_Minh')
df_rs['date'] = df_rs['dt_loc'].dt.date
df_rs = df_rs.drop(columns=['.geo', 'system:time_start'], errors='ignore')
df_rs = df_rs.sort_values('date')
df_rs['data_source'] = 'siwrp'
df_rs_siwrp = df_rs
print(f'Loaded "{path_csv}" file.')

Loaded "validation\sr_timeseries_siac_siwrp.csv" file.


## Load TSM insitu data

In [2257]:
# read SIWRP files
path_metadata = 'data/insitu_siwrp/Tss_LongDataset_VMD.xlsx'
df_metadata = pd.read_excel(path_metadata, sheet_name=1)
df_metadata['station_id'] = df_metadata['Station'].apply(lambda x: x.replace(" ", "").lower())
path_data = 'data/insitu_siwrp/siwrp_data.csv'
df_insitu_siwrp = pd.read_csv(path_data, parse_dates=['date', 'dt_loc_estimated'])
df_insitu_siwrp = df_insitu_siwrp.rename(columns={'dt_loc_estimated': 'dt_loc'})
df_insitu_siwrp['date'] = df_insitu_siwrp['date'].dt.date
df_insitu_siwrp = df_insitu_siwrp.sort_values('date')

print(f'Loaded "{path_data}" file.')

Loaded "data/insitu_siwrp/siwrp_data.csv" file.


In [2251]:
df_insitu = pd.concat([df_insitu_siwrp])
df_insitu = df_insitu.reset_index().drop(columns='index')
df_insitu['station_id'] = df_insitu['station_id'].astype(str)
df_insitu['dt_loc'] = df_insitu.dt_loc.astype('datetime64[ns, UTC+07:00]')

### Get matchups

In [2252]:
import datetime

delta_dt = 6 # allowed timedifference in hours
df_rs = df_rs.reset_index().drop(columns=['index'])

n_matchups = 0
for idx, row in df_rs.iterrows():
    station_id = row.station_id
    timestamp = row.dt_loc
    t_start = timestamp - datetime.timedelta(hours=delta_dt)
    t_end = timestamp + datetime.timedelta(hours=delta_dt)
    matches = df_insitu.loc[(df_insitu.dt_loc <= t_end) & (df_insitu.dt_loc >= t_start)]
    matches = matches.loc[matches.station_id == station_id]
    
    if matches.shape[0] == 0:
        value = np.nan
        dt_loc_insitu = pd.Timestamp('NaT')
        dt_diff = pd.Timestamp('NaT')
    elif matches.shape[0] == 1:
        value = matches.iloc[0].value
        dt_loc_insitu = matches.iloc[0].dt_loc
        dt_diff = abs(matches.iloc[0]['dt_loc']-timestamp)
        n_matchups += 1
    else:
        matches['dt_diff'] = abs(matches['dt_loc']-timestamp)
        matches = matches.sort_values('dt_diff')
        value = matches.iloc[0].value 
        dt_loc_insitu = matches.iloc[0].dt_loc
        dt_diff = matches.iloc[0].dt_diff
        n_matchups += 1
    
    df_rs.loc[idx, 'value'] = value
    df_rs.loc[idx, 'dt_loc_insitu'] = dt_loc_insitu
    df_rs.loc[idx, 'dt_diff'] = dt_diff

print(f'{n_matchups} ({(n_matchups/df_rs.shape[0])*100:0.1f}%) matchups found with maximal difference of {delta_dt} hours.')

418 (40.2%) matchups found with maximal difference of 6 hours.


### Correlation matrix

In [2253]:
centerwl_lut_msi = {
    'B1':442.7, 'B2':492.4, 'B3':559.8, 'B4':664.6, 'B5':704.1, 'B6':740.5,'B7':782.8, 'B8':832.8, 'B8A':864.7, 'B9':945.1, 'B10':1373.5, 'B11':1613.7, 'B12':2202.4
}
centerwl_lut_oli = {
    'B1':442.96, 'B2':482.04, 'B3':561.41, 'B4':654.59, 'B5':864.67, 'B6':1608.86, 'B7':2200.73,'B8':590,'B9':1375,'B10':10800,'B11':1200  
}

In [2254]:
# Prepare data
thresh_cloud_cover = 80
thresh_roi_coverage = 95
data = df_rs.copy()
data = data.loc[data.CLOUD_COVER<=thresh_cloud_cover]
data = data.loc[data.roi_coverage>=thresh_roi_coverage]
data_msi = data.loc[data.platform=='SENTINEL-2']
data_msi = data_msi[data_msi.columns[((data_msi.columns.str.startswith('B'))&(data_msi.columns.str.endswith('_median')))|(data_msi.columns=='value')]]
data_oli = data.loc[data.platform=='LANDSAT-8']
data_oli = data_oli[data_oli.columns[((data_oli.columns.str.startswith('B'))&(data_oli.columns.str.endswith('_median')))|(data_oli.columns=='value')]]

# Compute a correlation matrix and convert to long-form
corr_mat = data_msi.corr().stack().reset_index(name="correlation")
corr_mat = corr_mat.loc[corr_mat.level_0=='value'].reset_index().drop(columns=['level_0', 'index']).rename(columns={'level_1': 'band'})
corr_mat_msi = corr_mat.loc[corr_mat.band!='value']
corr_mat_msi['band'] = corr_mat_msi.band.apply(lambda x: x.split('_')[0])
corr_mat_msi['wavelength'] = corr_mat_msi.band.apply(lambda x: centerwl_lut_msi[x])
corr_mat_msi = corr_mat_msi.sort_values('wavelength')

corr_mat = data_oli.corr().stack().reset_index(name="correlation")
corr_mat = corr_mat.loc[corr_mat.level_0=='value'].reset_index().drop(columns=['level_0', 'index']).rename(columns={'level_1': 'band'})
corr_mat_oli = corr_mat.loc[corr_mat.band!='value']
corr_mat_oli['band'] = corr_mat_oli.band.apply(lambda x: x.split('_')[0])
corr_mat_oli['wavelength'] = corr_mat_oli.band.apply(lambda x: centerwl_lut_oli[x])
corr_mat_oli = corr_mat_oli.sort_values('wavelength')

hline = hv.HLine(0).opts(
    color='grey', 
    line_dash='dashed', 
    line_width=1.0,
)

clim = (-1, 1)

plot_msi =  hline *\
            corr_mat_msi.hvplot.line(x='wavelength', y='correlation', clim=clim) *\
            corr_mat_msi.hvplot.scatter(x='wavelength', y='correlation', 
                xlabel='Wavelength (nm)', ylabel="Correlation\n(Pearson's R)",
                color='correlation', size=100, clim=clim, cmap='bwr',
                title='Sentinel-2 - Rrs-TSM correlation',
                line_color='black',
                xlim=(400,1500), rot=45) * \
            corr_mat_msi.hvplot.labels(x='wavelength', y='correlation', text='band', text_baseline='top')

plot_oli =  hline *\
            corr_mat_oli.hvplot.line(x='wavelength', y='correlation', clim=clim) *\
            corr_mat_oli.hvplot.scatter(x='wavelength', y='correlation', 
                xlabel='Wavelength (nm)', ylabel="Correlation\n(Pearson's R)",
                color='correlation', size=100, clim=clim, cmap='bwr',
                title='Landsat-8 OLI - Rrs-TSM correlation',
                line_color='black',
                xlim=(400,1500), rot=45) * \
            corr_mat_oli.hvplot.labels(x='wavelength', y='correlation', text='band', text_baseline='top')

hv.Layout([plot_msi, plot_oli]).cols(1).opts(hv.opts.Scatter(ylim=(-1,1), clim=(-1, 1)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_mat_msi['band'] = corr_mat_msi.band.apply(lambda x: x.split('_')[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_mat_msi['wavelength'] = corr_mat_msi.band.apply(lambda x: centerwl_lut_msi[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corr_mat_oli['band'] = corr_mat_oli.band.apply(