In [35]:
from pysal.viz.mapclassify import UserDefined
import pysal.viz.mapclassify as mc

import pandas as pd
import numpy as np
from tqdm import tqdm

from sandpyper.hotspot import LISA_site_level

### Create multitemporal dataset (dh)

In [2]:
# Load the classified-data CSV into a DataFrame.
full_dataset = pd.read_csv(
    filepath_or_buffer=r"C:\my_packages\doc_data\profiles\classified_data.csv"
)

In [3]:
def compute_multitemporal (df,
                           date_field='survey_date',
                           sand_label_field='label_sand',
                           common_field="geometry"):
    """Compute elevation change (dh) between consecutive surveys, per location.

    For every distinct value of the ``location`` column in ``df``, the survey
    dates found in ``date_field`` are sorted and every pair of consecutive
    dates (pre, post) is processed: rows whose ``sand_label_field`` equals 0
    and whose ``z`` is not null are selected for both dates, inner-joined
    one-to-one on ``common_field``, and ``dh = z_post - z_pre`` is computed.

    Args:
        df: DataFrame holding all surveys. Besides ``date_field``,
            ``sand_label_field`` and ``common_field`` it must contain the
            columns ``location``, ``z``, ``tr_id`` and ``distance``.
        date_field: Name of the column holding the survey date.
        sand_label_field: Name of the label column; only rows where it
            equals 0 are used.
        common_field: Name of the column used to pair pre and post rows.
            NOTE(review): the output's ``geometry`` column is read from the
            merged ``geometry`` column, so a value other than "geometry"
            looks unsupported as written -- confirm before relying on it.

    Returns:
        A long-format DataFrame with one row per matched point and interval
        (columns: geometry, location, tr_id, distance, dt, date_pre,
        date_post, z_pre, z_post, dh). Intervals processed last come first
        in the result. An empty DataFrame is returned when no interval
        exists.

    Raises:
        pandas.errors.MergeError: if ``common_field`` values are not unique
            within one of the two surveys being paired (``validate`` check).
    """
    # Collect one frame per interval and concatenate once at the end, instead
    # of re-copying the growing result at every iteration.
    interval_frames = []

    # Iterate over the DataFrame that was passed in, not a notebook global.
    for location in df["location"].unique():
        print(f"working on {location}")
        loc_data = df.query(f"location=='{location}'")
        list_dates = sorted(loc_data[date_field].unique())

        # Consecutive (pre, post) date pairs: n dates give n - 1 intervals,
        # so the progress bar total matches the work actually done.
        intervals = list(zip(list_dates[:-1], list_dates[1:]))

        for i, (date_pre, date_post) in enumerate(tqdm(intervals)):
            print(f"Calculating dt{i}, from {date_pre} to {date_post} in {location}.")

            df_pre = loc_data.query(f"{date_field} =='{date_pre}' & {sand_label_field} == 0").dropna(subset=['z'])
            df_post = loc_data.query(f"{date_field} =='{date_post}' & {sand_label_field} == 0").dropna(subset=['z'])

            merged = pd.merge(df_pre, df_post, how='inner', on=common_field,
                              validate="one_to_one", suffixes=('_pre', '_post'))
            z_pre = merged.z_pre.astype(float)
            z_post = merged.z_post.astype(float)

            interval_frames.append(pd.DataFrame({"geometry": merged.geometry,
                                                 "location": location,
                                                 "tr_id": merged.tr_id_pre,
                                                 "distance": merged.distance_pre,
                                                 "dt": f"dt_{i}",
                                                 "date_pre": date_pre,
                                                 "date_post": date_post,
                                                 "z_pre": z_pre,
                                                 "z_post": z_post,
                                                 "dh": z_post - z_pre}))

    print("done")

    if not interval_frames:
        return pd.DataFrame()

    # Reversed so that the most recently computed interval comes first, which
    # is the row order the previous prepend-in-loop implementation produced.
    return pd.concat(interval_frames[::-1], ignore_index=True)

In [4]:
# Build the multitemporal dh table from the loaded dataset.
dh_df = compute_multitemporal(
    full_dataset,
    date_field="survey_date",
    sand_label_field="label_sand",
)

 33%|████████████████████████████                                                        | 3/9 [00:00<00:00, 24.45it/s]

working on mar
Calculating dt0, from 2018-06-01 to 2018-06-21 in mar.
Calculating dt1, from 2018-06-21 to 2018-07-27 in mar.
Calculating dt2, from 2018-07-27 to 2018-09-25 in mar.
Calculating dt3, from 2018-09-25 to 2018-11-13 in mar.
Calculating dt4, from 2018-11-13 to 2018-12-11 in mar.


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 24.67it/s]
  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

Calculating dt5, from 2018-12-11 to 2019-02-05 in mar.
Calculating dt6, from 2019-02-05 to 2019-03-13 in mar.
Calculating dt7, from 2019-03-13 to 2019-05-16 in mar.
working on leo
Calculating dt0, from 2018-06-06 to 2018-07-13 in leo.
Calculating dt1, from 2018-07-13 to 2018-07-25 in leo.


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 33.58it/s]

Calculating dt2, from 2018-07-25 to 2018-09-20 in leo.
Calculating dt3, from 2018-09-20 to 2019-02-11 in leo.
Calculating dt4, from 2019-02-11 to 2019-03-28 in leo.
Calculating dt5, from 2019-03-28 to 2019-07-31 in leo.
done





## Compute location-level hotspots

In [21]:
# CRS definition for each location code, passed to LISA_site_level below.
crs_dict_string = {
    site: {"init": f"epsg:{epsg_code}"}
    for site, epsg_code in (("mar", 32754), ("leo", 32755))
}

In [23]:
# Neighbourhood radius: enough to include two adjacent transects and some
# obliques without getting to the second transect.
distance_value: int = 35
k_value: int = 0
# Select from "knn" or "distance".
# NOTE(review): "idw" is not one of the options listed above, and the
# LISA_site_level call in this notebook passes mode='distance' and
# distance_value=35 as literals rather than these variables -- confirm
# which setting is intended.
mode: str = "idw"

In [24]:
# Locations present in the dh table.
dh_df["location"].unique()

array(['leo', 'mar'], dtype=object)

In [25]:
# Site-level LISA using a row-standardised binary weight matrix
# with a neighbourhood radius of 35 m.

lisa_df = LISA_site_level(
    df=dh_df,
    mode="distance",
    distance_value=35,
    unique_field="geometry",
    crs_dict_string=crs_dict_string,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for loc in tqdm(locs):


  0%|          | 0/2 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in['geometry'] = df_in.loc[:, unique_field].apply(


Working on leo


  return _prepare_from_string(" ".join(pjargs))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dt in tqdm(dts):


  0%|          | 0/6 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_input.dropna(axis=0, how='any', subset=['dh'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_input.dropna(axis=0, how='any', subset=['dh'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats

Working on mar


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in['geometry'] = df_in.loc[:, unique_field].apply(
  return _prepare_from_string(" ".join(pjargs))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for dt in tqdm(dts):


  0%|          | 0/8 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_input.dropna(axis=0, how='any', subset=['dh'], inplace=True)
Island id: 6525
  w = W(neighbors, weights, ids, **kwargs)
Island id: 6525
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_input.dropna(axis=0, how='any', subset=['dh'], inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_input.dropna(axis=0, how='any', subset=['dh'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documen

In [27]:
# The result contains, for each observation, the FDR threshold, the local Moran's I,
# its p and z values, and the quadrant it falls in on a Moran's scatter plot.

# We are interested in HH and LL clusters, which we call hotspots,
# while LH and HL points are spatial outliers.

lisa_df.tail(5)

Unnamed: 0,geometry,location,tr_id,distance,dt,date_pre,date_post,z_pre,z_post,dh,lisa_fdr,lisa_q,lisa_I,lisa_n_val_obs,lisa_opt_dist,lisa_dist_mode,lisa_p_sim,lisa_z_sim,lisa_z,decay
38676,POINT (731460.363 5705157.201),mar,3,52.4,dt_0,2018-06-01,2018-06-21,4.663569,4.66578,0.00221,0.042946,2,-0.074988,4083,35,distance_band,0.001,-5.634389,-0.26433,0
38677,POINT (731460.264 5705157.212),mar,3,52.5,dt_0,2018-06-01,2018-06-21,4.780209,4.787802,0.007593,0.042946,2,-0.069911,4083,35,distance_band,0.001,-5.678814,-0.246477,0
38678,POINT (731460.164 5705157.222),mar,3,52.6,dt_0,2018-06-01,2018-06-21,4.875618,4.903659,0.028041,0.042946,2,-0.050642,4083,35,distance_band,0.001,-5.780715,-0.178654,0
38679,POINT (731460.065 5705157.233),mar,3,52.7,dt_0,2018-06-01,2018-06-21,4.970114,4.974202,0.004088,0.042946,2,-0.073217,4083,35,distance_band,0.001,-5.848848,-0.258102,0
38680,POINT (731458.673 5705157.383),mar,3,54.1,dt_0,2018-06-01,2018-06-21,5.917675,5.955576,0.037901,0.042946,2,-0.041359,4083,35,distance_band,0.001,-5.841232,-0.14595,0


In [26]:
# Save the location-level LISA table to disk.
lisa_df.to_csv(
    path_or_buf=r"C:\my_packages\doc_data\profiles\lisa_location.csv"
)

### Classify dh magnitudes and create classes of elevation changes (transient states)

In [41]:
# Keep only the significant HH and LL hotspots (lisa_q 1 or 3), then drop
# every dh inside the global LoD band (-0.05 to 0.05, bounds included) and
# renumber the rows from 0.

sig_hhll = (
    lisa_df
    .query("lisa_p_sim <= 0.001 & lisa_q in [1,3]")
    .loc[lambda hotspots: ~hotspots["dh"].between(-0.05, 0.05)]
    .reset_index(drop=True)
)

In [42]:
# Separate the erosion (dh < 0) and the deposition (dh > 0) clusters.
# .copy() makes each subset an independent DataFrame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.

sig_hhll_ero = sig_hhll[sig_hhll.dh < 0].copy()
sig_hhll_depo = sig_hhll[sig_hhll.dh > 0].copy()

## Equal classes adapted from JC

First, we use the __Jenks-Caspall classification method (Natural Breaks)__ to obtain 5 classes of erosional and depositional elevation change values in hotspots (LL and HH).

Then, in order to have the same magnitude classes for both erosion and deposition, we compute the __mean between cut-off values__ and obtain a Jenks-Caspall informed classification scheme.

__UPDATED__: we apply JC to the absolute values of dh that fall outside the -0.05 to 0.05 range.

In [43]:
# Fit a Jenks-Caspall classifier on the absolute dh of the filtered hotspots
# and report its fit (adcm) before displaying the class table.
absolute = sig_hhll["dh"].abs()
bins_abs_JC = mc.JenksCaspall(absolute)
print(f"Fit of the classifier: {bins_abs_JC.adcm}")

bins_abs_JC

Fit of the classifier: 1106.8712480317356


              JenksCaspall              
 
Lower          Upper               Count
        x[i] <= 0.170               6568
0.170 < x[i] <= 0.324               5265
0.324 < x[i] <= 0.537               3744
0.537 < x[i] <= 1.213               2073
1.213 < x[i] <= 4.973                737

In [49]:
# Upper limits of the depositional classes: the Jenks-Caspall break points
# obtained on the absolute dh values, rounded to two decimals.
bins_depo = [0.17, 0.32, 0.54, 1.21]
# Erosional limits: the same magnitudes, negated and in ascending order.
bins_ero = [-limit for limit in reversed(bins_depo)]

# Classify erosion and deposition dh separately with these user-defined limits.
bins_ero_JC = UserDefined(sig_hhll_ero.dh, bins_ero)
bins_depo_JC = UserDefined(sig_hhll_depo.dh, bins_depo)

# Bin index of every observation.
class_erosion = bins_ero_JC.yb.tolist()
class_deposition = bins_depo_JC.yb.tolist()

In [50]:
# Assign every bin index to its state label (see table 1).

states_ero = {0: "ee", 1: "he", 2: "me", 3: "se", 4: "ue"}
states_depo = {0: "ud", 1: "sd", 2: "md", 3: "hd", 4: "ed"}

tags_erosion = [states_ero[i] for i in class_erosion]
tags_deposition = [states_depo[i] for i in class_deposition]

# .assign() returns a new DataFrame instead of writing into what may be a
# slice of sig_hhll, which avoids pandas' SettingWithCopyWarning and keeps
# the cell safe to re-run. Column order (jc_bin, then markov_tag) is kept.
sig_hhll_ero = sig_hhll_ero.assign(jc_bin=class_erosion,
                                   markov_tag=tags_erosion)
sig_hhll_depo = sig_hhll_depo.assign(jc_bin=class_deposition,
                                     markov_tag=tags_deposition)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


In [53]:
# Stack the classified erosion and deposition tables (original row labels kept)
# to finally obtain a dataset of HH and LL hotspots (or clusters)
# of sand-only beach change.

labelled_hotspot_df = pd.concat(
    objs=[sig_hhll_ero, sig_hhll_depo],
    axis=0,
    ignore_index=False,
)
labelled_hotspot_df

Unnamed: 0,geometry,location,tr_id,distance,dt,date_pre,date_post,z_pre,z_post,dh,...,lisa_I,lisa_n_val_obs,lisa_opt_dist,lisa_dist_mode,lisa_p_sim,lisa_z_sim,lisa_z,decay,jc_bin,markov_tag
0,POINT (299913.616 5773633.212),leo,66,25.9,dt_5,2019-03-28,2019-07-31,0.254045,0.063604,-0.190440,...,0.064880,1526,35,distance_band,0.001,2.855193,-0.324401,0,3,se
1,POINT (299913.323 5773633.148),leo,66,26.2,dt_5,2019-03-28,2019-07-31,0.282150,0.101436,-0.180713,...,0.052581,1526,35,distance_band,0.001,2.782864,-0.262422,0,3,se
2,POINT (299912.834 5773633.041),leo,66,26.7,dt_5,2019-03-28,2019-07-31,0.334896,0.155535,-0.179361,...,0.050868,1526,35,distance_band,0.001,2.748782,-0.253808,0,3,se
3,POINT (299912.151 5773632.892),leo,66,27.4,dt_5,2019-03-28,2019-07-31,0.422973,0.256882,-0.166091,...,0.034007,1526,35,distance_band,0.001,2.681361,-0.169253,0,4,ue
4,POINT (299910.196 5773632.465),leo,66,29.4,dt_5,2019-03-28,2019-07-31,0.664177,0.523974,-0.140203,...,0.000868,1526,35,distance_band,0.001,2.835987,-0.004299,0,4,ue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18382,POINT (731468.019 5705156.375),mar,3,44.7,dt_0,2018-06-01,2018-06-21,2.075071,2.708724,0.633653,...,0.508978,4083,35,distance_band,0.001,5.684459,1.830060,0,3,hd
18383,POINT (731467.820 5705156.397),mar,3,44.9,dt_0,2018-06-01,2018-06-21,2.090222,2.918043,0.827821,...,0.683857,4083,35,distance_band,0.001,5.455209,2.474081,0,3,hd
18384,POINT (731467.720 5705156.408),mar,3,45.0,dt_0,2018-06-01,2018-06-21,2.116984,3.143203,1.026220,...,0.860269,4083,35,distance_band,0.001,5.430229,3.132136,0,3,hd
18385,POINT (731467.621 5705156.418),mar,3,45.1,dt_0,2018-06-01,2018-06-21,2.140646,3.303227,1.162581,...,0.980183,4083,35,distance_band,0.001,5.837255,3.584423,0,3,hd


In [54]:
# Save the tagged hotspots; this table is ready for BCD indices computation.
labelled_hotspot_df.to_csv(
    path_or_buf=r"C:\my_packages\doc_data\profiles\markov_tagged_df.csv"
)