# Example 3 - Labels correction and multitemporal table

<img src="images/banner3.png" width="100%" />

<font face="Calibri">
<br>
<font size="5"> <b>Sand classification, beachface clipping and multitemporal analysis</b></font>

<br>
<font size="4"> <b> Nicolas Pucino; PhD Student @ Deakin University, Australia </b> <br>
<img style="padding:7px;" src="images/sandpiper_sand_retouched.png" width="170" align="right" /></font>

<font size="3">This notebook illustrates how to assign the final Sand or no-sand labels to the points, clip only beachface areas and create an organised dataframe storing elevation changes from each period available in all locations. <br>

<b>This notebook covers the following concepts:</b>

- Sand vs No-Sand classification.
- Beachface clipping.
- Multitemporal extraction.
</font>


</font>

In [37]:
import pandas as pd
import geopandas as gpd
from datetime import datetime
from tqdm.notebook import tqdm
import glob
import matplotlib.pyplot as plt
from sandpyper.outils import coords_to_points
import os

# Coordinate reference system of each survey location, keyed by location code.
# It is passed to cleanit(), which looks up crs_dict_string[loc] when
# reprojecting the water/shore mask polygons of that location.
# NOTE(review): {'init': 'epsg:XXXX'} is the legacy PROJ "init" syntax, which
# recent pyproj releases warn about; presumably the plain 'EPSG:XXXX' string is
# an equivalent replacement for these projected CRSs. Confirm before changing.
crs_dict_string= {
                 'mar': {'init': 'epsg:32754'},
                 'leo':{'init': 'epsg:32755'}
                 }

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# session.
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np

def check_dicts_duplicated_values(l_dicts):
    """Ensure no cluster label is assigned more than once within a survey.

    The label lists of every class dictionary are pooled per survey key. A
    label that appears more than once in a pooled list (either in two
    different class dictionaries, or twice in the same list) is ambiguous.

    Args:
        l_dicts (dict): Mapping of class name to a class dictionary. Each
            class dictionary maps a survey key to the list of labels that
            belong to that class in that survey.

    Returns:
        None. The function only validates its input.

    Raises:
        ValueError: If at least one survey has a duplicated label. The
            message lists the duplicated labels per survey.
    """
    from collections import Counter

    # Pool the labels of all classes survey by survey (single pass).
    labels_per_survey = {}
    for class_dict in l_dicts.values():
        for survey, labels in class_dict.items():
            labels_per_survey.setdefault(survey, []).extend(labels)

    # Counting once per survey avoids the quadratic list.count() scan.
    dict_dups = {}
    for survey, labels in labels_per_survey.items():
        duplicated = {label for label, hits in Counter(labels).items() if hits > 1}
        if duplicated:
            dict_dups[survey] = duplicated

    if dict_dups:
        raise ValueError(f"Duplicated label_k found in the following dictionaries.\n\n{dict_dups}\n\nPlease revise and assigned those labels_k to only one class dictionary.")
        
        
        
def classify_labelk(labelled_dataset,l_dicts, cluster_field='label_k', fill_class='sand'):
    """Assign a class name to every point from its survey and cluster label.

    Each point is identified by its survey tag ``"<location>_<raw_date>"`` and
    its cluster label. The class is the one whose dictionary lists that label
    under that survey tag. Points whose (survey, label) pair is not listed in
    any class dictionary receive ``fill_class``.

    Args:
        labelled_dataset (pd.DataFrame): Points to classify. Must contain the
            ``location`` and ``raw_date`` columns and the ``cluster_field``
            column. An existing ``pt_class`` column is replaced.
        l_dicts (dict): Mapping of class name to a class dictionary, which
            maps survey tags to lists of cluster labels.
        cluster_field (str): Name of the column holding the cluster labels.
        fill_class (str): Class given to points not covered by ``l_dicts``.

    Returns:
        pd.DataFrame: A new frame (the input is left untouched) with a fresh
        0..n-1 index, the input columns and ``pt_class`` as last column.

    Raises:
        ValueError: If a label is assigned more than once within a survey
            (raised by ``check_dicts_duplicated_values``).
    """
    check_dicts_duplicated_values(l_dicts)

    # Flatten the class dictionaries into one (survey tag, label) -> class
    # lookup. The duplicate check above guarantees each pair has one class.
    class_of = {}
    known_surveys = set()
    for class_name, surveys in l_dicts.items():
        known_surveys.update(surveys)
        for survey_tag, labels in surveys.items():
            for label in labels:
                class_of[(survey_tag, label)] = class_name

    # Work on a copy and drop any previous classification by NAME, not by
    # column position, so the caller's frame and its other columns are safe.
    classified = labelled_dataset.drop(columns='pt_class', errors='ignore').reset_index(drop=True)

    survey_tags = classified['location'].astype(str) + "_" + classified['raw_date'].astype(str)

    for loc_date_tag in survey_tags.unique():
        if loc_date_tag not in known_surveys:
            print(f"{loc_date_tag} not in the class dictionaries. All their labels assigned to fill_class {fill_class}.")

    classified['pt_class'] = [
        class_of.get(key, fill_class)
        for key in zip(survey_tags, classified[cluster_field])
    ]

    return classified

def cleanit(to_clean, l_dicts, cluster_field='label_k', fill_class='sand',
            watermasks_path=None, water_label='water',
            shoremasks_path=None, label_corrections_path=None,
            default_crs={'init': 'epsg:32754'}, crs_dict_string=None,
           geometry_field='coordinates'):
    """Classify the points and optionally refine them with polygon files.

    The steps run in this order, each one starting from the result of the
    previous one:

    1. Classification with ``classify_labelk`` (always; writes ``pt_class``).
    2. Polygon fine-tuning (``label_corrections_path``): points intersecting
       a correction polygon get its ``new_class`` in ``finetuned_label``.
    3. Watermasking (``watermasks_path``): points intersecting the watermask
       of their survey get ``water_label`` in ``watermasked_label``.
    4. Shoremasking (``shoremasks_path``): only points intersecting the shore
       polygon of their location are kept.

    Args:
        to_clean (pd.DataFrame): Points to classify and clean.
        l_dicts (dict): Class dictionaries, see ``classify_labelk``.
        cluster_field (str): Column holding the cluster labels.
        fill_class (str): Class for points not covered by ``l_dicts``.
        watermasks_path (str, optional): Path to the watermask polygons file.
        water_label (str): Label written to watermasked points.
        shoremasks_path (str, optional): Path to the shore polygons file.
        label_corrections_path (str, optional): Path to the label correction
            polygons file.
        default_crs: CRS given to the points when the classified dataset is a
            plain DataFrame. The default dict is never mutated.
        crs_dict_string (dict, optional): CRS of each location, used to
            reproject the water and shore masks. When None, the CRS of the
            points is used for every location.
        geometry_field (str): Column from which point geometries are built
            with ``coords_to_points``.

    Returns:
        The classified dataset if no path is given, otherwise the dataset
        produced by the last step that ran (a GeoDataFrame).

    Raises:
        ValueError: If the classified dataset is neither a GeoDataFrame nor a
            DataFrame, or if class dictionaries contain duplicated labels.
    """
    print("Reclassifying dataset with the provided dictionaries." )
    # Forward the user's column/class choices instead of silently using the
    # defaults of classify_labelk.
    to_clean_classified = classify_labelk(to_clean, l_dicts,
                                          cluster_field=cluster_field,
                                          fill_class=fill_class)

    if watermasks_path is None and shoremasks_path is None and label_corrections_path is None:
        print("No cleaning polygones have been passed. Returning classified dataset.")
        return to_clean_classified

    processes = []

    # GeoDataFrame is a DataFrame subclass, so it has to be tested first;
    # otherwise its own CRS would never be used.
    if isinstance(to_clean_classified, gpd.GeoDataFrame):
        input_crs = to_clean_classified.crs
    elif isinstance(to_clean_classified, pd.DataFrame):
        print(f"Pandas DataFrame provided. Transforming into GeoDataFrame with default_crs ({default_crs}) and provided field {geometry_field} .")

        input_crs = default_crs
        to_clean_classified['geometry'] = to_clean_classified[geometry_field].apply(coords_to_points)
        to_clean_classified = gpd.GeoDataFrame(to_clean_classified, geometry='geometry', crs=input_crs)
    else:
        raise ValueError("to_clean_classified must be either a GeoDataFrame or a DataFRame with a geometry column in string dtype.")

    def crs_for(loc):
        # Fall back to the CRS of the points when no per-location CRS is given.
        if crs_dict_string is None:
            return input_crs
        return crs_dict_string[loc]

    # Dataset and label column produced by the latest completed step. Every
    # step starts from these, so no correction is lost along the chain.
    current = to_clean_classified
    current_labels = 'pt_class'

    if label_corrections_path is not None:
        if os.path.isfile(label_corrections_path):
            label_corrections = gpd.read_file(label_corrections_path)
            print(f"Label corrections provided in CRS: {label_corrections.crs}")
            processes.append("polygon finetuning")

            # NOTE(review): unlike the masks below, the correction polygons
            # are not reprojected before the intersection. Confirm they share
            # the coordinate system of the points.
            selections = _collect_finetuned_points(current, label_corrections, cluster_field)
            current = _apply_relabelling(current, selections, 'finetuned_label',
                                         current_labels, geometry_field, input_crs)
            current_labels = 'finetuned_label'
        else:
            print(f"Label corrections file not found: {label_corrections_path}. Skipping polygon finetuning.")

    if watermasks_path is not None:
        if os.path.isfile(watermasks_path):
            watermask = gpd.read_file(watermasks_path)
            print(f"watermask  provided in CRS: {watermask.crs}")

            print("Applying watermasks cleaning.")
            processes.append("watermasking")

            selections = _collect_watermasked_points(current, watermask, water_label,
                                                     geometry_field, crs_for)
            current = _apply_relabelling(current, selections, 'watermasked_label',
                                         current_labels, geometry_field, input_crs)
            current_labels = 'watermasked_label'
        else:
            print(f"Watermasks file not found: {watermasks_path}. Skipping watermasking.")

    if shoremasks_path is not None:
        if os.path.isfile(shoremasks_path):
            shoremask = gpd.read_file(shoremasks_path)
            print(f"shoremask  provided in CRS: {shoremask.crs}")
            print("Applying shoremasks cleaning.")
            processes.append("shoremasking")

            current = _clip_to_shore(current, shoremask, crs_for)
        else:
            print(f"Shoremasks file not found: {shoremasks_path}. Skipping shoremasking.")

    print(f"{processes} completed.")
    return current


def _collect_finetuned_points(dataset, label_corrections, cluster_field):
    """Return one frame per correction polygon with the points it relabels."""
    selections = []

    for loc in label_corrections.location.unique():
        print(f"Fine tuning in {loc}.")

        data_in_loc = dataset.query(f" location == '{loc}'")
        polys_in_loc = label_corrections.query(f"location=='{loc}'")

        for raw_date in tqdm(polys_in_loc.raw_date.unique()):

            data_in_survey = data_in_loc.query(f"raw_date == {raw_date}")
            survey_polys = polys_in_loc.query(f"raw_date=={raw_date}")

            for _, row in survey_polys.iterrows():

                target_k = int(row['target_label_k'])
                new_class = row['new_class']

                # 999 means "no label filter": every point of the survey
                # inside the polygon is relabelled, whatever its cluster.
                if target_k == 999:
                    candidates = data_in_survey
                else:
                    candidates = data_in_survey[data_in_survey[cluster_field] == target_k]

                selection = candidates[candidates.geometry.intersects(row['geometry'])].copy()
                selection["finetuned_label"] = new_class

                print(f"Fine-tuning label_k {target_k} to {new_class} in {loc}-{raw_date}, found {selection.shape[0]} pts.")
                selections.append(selection)

    return selections


def _collect_watermasked_points(dataset, watermask, water_label, geometry_field, crs_for):
    """Return one frame per survey with the points covered by its watermask."""
    selections = []

    for loc in watermask.location.unique():
        print(f"Watermasking in {loc}.")

        loc_crs = crs_for(loc)
        masks_in_loc = watermask.query(f"location=='{loc}'")

        for raw_date in tqdm(masks_in_loc.raw_date.unique()):

            subset_data = dataset.query(f"location=='{loc}' and raw_date == {raw_date}").copy()
            subset_data["geometry"] = subset_data[geometry_field].apply(coords_to_points)
            subset_gdf = gpd.GeoDataFrame(subset_data, geometry='geometry', crs=loc_crs)

            subset_masks = masks_in_loc.query(f"raw_date == {raw_date}")

            # NOTE(review): only the first mask geometry of the survey is
            # used; additional rows for the same survey are ignored.
            mask_geometry = subset_masks.to_crs(loc_crs).geometry.iloc[0]

            selection = subset_gdf[subset_gdf.geometry.intersects(mask_geometry)].copy()
            print(f"Setting to {water_label} {selection.shape[0]} pts overlapping provided watermasks.")

            selection["watermasked_label"] = water_label
            selections.append(selection)

    return selections


def _apply_relabelling(dataset, selections, new_field, previous_field, geometry_field, crs):
    """Add ``new_field`` to dataset: new label where selected, else the previous one."""
    # Keep only the join key and the new label, as plain DataFrames, so that
    # geometries (and their possibly different CRSs) never take part in the
    # concatenation.
    updates = [pd.DataFrame(selection[['point_id', new_field]])
               for selection in selections if not selection.empty]

    if updates:
        to_update = pd.concat(updates, ignore_index=True)
        # one_to_one: a point selected by two polygons is reported as an
        # error instead of silently duplicating rows.
        relabelled = pd.merge(left=dataset, right=to_update, on='point_id',
                              how='left', validate='one_to_one')
        relabelled[new_field] = relabelled[new_field].fillna(relabelled[previous_field])
    else:
        # Nothing was selected: the new label equals the previous one.
        relabelled = dataset.reset_index(drop=True)
        relabelled[new_field] = relabelled[previous_field]

    relabelled["geometry"] = relabelled[geometry_field].apply(coords_to_points)
    return gpd.GeoDataFrame(relabelled, geometry='geometry', crs=crs)


def _clip_to_shore(dataset, shoremask, crs_for):
    """Keep only the points intersecting the shore polygon of their location."""
    inshore_parts = []

    for loc in shoremask.location.unique():
        print(f"Shoremasking in {loc}.")

        shore = shoremask.query(f"location=='{loc}'")
        loc_selection = dataset.query(f"location=='{loc}'")

        # NOTE(review): only the first shore geometry of the location is used.
        shore_geometry = shore.to_crs(crs_for(loc)).geometry.iloc[0]
        in_shore = loc_selection[loc_selection.geometry.intersects(shore_geometry)]

        print(f"Removing {loc_selection.shape[0] - in_shore.shape[0]} pts falling outside provided shore polygones.")
        inshore_parts.append(in_shore)

    if not inshore_parts:
        return gpd.GeoDataFrame()

    # Last processed location first, matching the historical row order.
    return pd.concat(inshore_parts[::-1], ignore_index=True)

In [29]:
# Load the test table of points to classify (absolute path on the author's
# machine; adjust it to your local copy of the test data).
labelled_dataset=pd.read_csv(r"C:\my_packages\sandpyper\tests\test_data\test_to_classify.csv")

In [46]:
labelled_dataset.columns

Index(['Unnamed: 0', 'point_id', 'label_k', 'distance', 'z', 'tr_id',
       'raw_date', 'coordinates', 'location', 'survey_date', 'x', 'y', 'band1',
       'band2', 'band3', 'spatial_id', 'pt_class'],
      dtype='object')

In [30]:
# QGIS Labels

In [31]:
# One dictionary per class. Keys are survey tags in the form
# "<location>_<raw_date>" (the same tag classify_labelk builds for each
# survey); values list the label_k cluster ids that belong to the class in
# that survey. An empty list means no cluster of that survey has the class.
# A label must not appear in more than one class for the same survey.

# Clusters classified as water.
water_dict={'leo_20180606':[0,9,10],
'leo_20180713':[0,3,4,7],
'leo_20180920':[0,2,6,7],
'leo_20190211':[0,2,5],
'leo_20190328':[2,4,5],
'leo_20190731':[0,2,8,6],
'mar_20180601':[1,6],
'mar_20180621':[4,6],
'mar_20180727':[0,5,9,10],
'mar_20180925':[0],
'mar_20181113':[1],
'mar_20181211':[4],
'mar_20190205':[],
'mar_20190313':[],
'mar_20190516':[4,7]}

# Clusters classified as no-sand.
no_sand_dict={'leo_20180606':[5],
'leo_20180713':[],
'leo_20180920':[],
'leo_20190211':[1],
'leo_20190328':[],
'leo_20190731':[1],
'mar_20180601':[4,5],
'mar_20180621':[3,5],
'mar_20180727':[4,7],
'mar_20180925':[1,6],
'mar_20181113':[0],
'mar_20181211':[0],
'mar_20190205':[0,5],
'mar_20190313':[4],
'mar_20190516':[2,5]}

# Clusters classified as vegetation.
veg_dict={'leo_20180606':[1,3,7,8],
'leo_20180713':[1,5,9],
'leo_20180920':[1,4,5],
'leo_20190211':[4],
'leo_20190328':[0,1,6],
'leo_20190731':[3,7],
'mar_20180601':[0,7],
'mar_20180621':[1,7],
'mar_20180727':[1,3],
'mar_20180925':[4],
'mar_20181113':[3],
'mar_20181211':[2],
'mar_20190205':[3],
'mar_20190313':[1,5],
'mar_20190516':[0]}

# Clusters classified as sand.
sand_dict={'leo_20180606':[2,4,6],
'leo_20180713':[2,6,8],
'leo_20180920':[3],
'leo_20190211':[3],
'leo_20190328':[3],
'leo_20190731':[4,5],
'mar_20180601':[2,3],
'mar_20180621':[0,2],
'mar_20180727':[2,6,8],
'mar_20180925':[2,3,5],
'mar_20181113':[2,4],
'mar_20181211':[3,1],
'mar_20190205':[1,2,4],
'mar_20190313':[0,2,3],
'mar_20190516':[1,3,6]}

In [32]:
# Class name -> class dictionary. The keys of this mapping are the values
# written to the pt_class column by classify_labelk.
l_dicts={'no_sand': no_sand_dict,
         'sand': sand_dict,
        'water': water_dict,
        'veg':veg_dict}

In [34]:
# Polygon files accepted by cleanit (absolute paths on the author's machine;
# adjust them to your local copy of the test data).
label_corrections_path=r"C:\my_packages\sandpyper\tests\test_data\label_corrections.gpkg"
watermasks_path=r"C:\my_packages\sandpyper\tests\test_data\watermasks.gpkg"
shoremasks_path=r"C:\my_packages\sandpyper\tests\test_data\shoremasks.gpkg"

In [38]:
# Classify the points and apply the shoremask clipping only: label corrections
# and watermasks are explicitly disabled by passing None.
inshore_cleaned=cleanit(labelled_dataset, l_dicts, crs_dict_string=crs_dict_string,
                        watermasks_path=None, 
                        shoremasks_path=shoremasks_path,
                        label_corrections_path=None)

# Preview the first rows of the cleaned dataset.
inshore_cleaned.head()

Reclassifying dataset with the provided dictionaries.
Pandas DataFrame provided. Transforming into GeoDataFrame with default_crs ({'init': 'epsg:32754'}) and provided field coordinates .


  return _prepare_from_string(" ".join(pjargs))
  for feature in features_lst:


shoremask  provided in CRS: epsg:32754
Applying shoremasks cleaning.
Shoremasking in mar.
Removing 3621 pts falling outside provided shore polygones.
Shoremasking in leo.
Removing 2358 pts falling outside provided shore polygones.
['shoremasking'] completed.


Unnamed: 0.1,Unnamed: 0,point_id,label_k,distance,z,tr_id,raw_date,coordinates,location,survey_date,x,y,band1,band2,band3,spatial_id,pt_class,geometry
0,29,0400o21665066708le106,8,16.0,1.173529,46,20180606,POINT (299891.3490051675 5773713.769093407),leo,2018-06-06,299891.349005,5773714.0,105.0,113.0,101.0,l00e46o016,veg,POINT (299891.349 5773713.769)
1,30,0400o21666017508le106,1,17.0,1.49427,46,20180606,POINT (299892.3427646156 5773713.880637836),leo,2018-06-06,299892.342765,5773714.0,84.0,91.0,77.0,l00e46o017,veg,POINT (299892.343 5773713.881)
2,31,0400o21667068308le106,1,18.0,2.086379,46,20180606,POINT (299893.3365240637 5773713.992182263),leo,2018-06-06,299893.336524,5773714.0,73.0,81.0,86.0,l00e46o018,veg,POINT (299893.337 5773713.992)
3,32,0400o21667019108le106,7,19.0,2.528307,46,20180606,POINT (299894.3302835117 5773714.103726692),leo,2018-06-06,299894.330284,5773714.0,51.0,58.0,66.0,l00e46o019,veg,POINT (299894.330 5773714.104)
4,33,0400o21668050908le206,3,20.0,2.864518,46,20180606,POINT (299895.3240429598 5773714.215271119),leo,2018-06-06,299895.324043,5773714.0,96.0,102.0,107.0,l00e46o020,veg,POINT (299895.324 5773714.215)


In [42]:
# Run the classification step on its own (no polygon cleaning).
to_clean_classified=classify_labelk(labelled_dataset, l_dicts)

In [43]:
to_clean_classified

Unnamed: 0.1,Unnamed: 0,point_id,label_k,distance,z,tr_id,raw_date,coordinates,location,survey_date,x,y,band1,band2,band3,spatial_id,pt_class
0,0,67144080l2610600eo00,8,0.0,1.130296,47,20180606,POINT (299873.2179654416 5773731.859571524),leo,2018-06-06,299873.217965,5.773732e+06,133.0,143.0,104.0,0le4o0700,veg
1,1,67148080l2690700eo10,8,1.0,1.085163,47,20180606,POINT (299874.2117248897 5773731.971115951),leo,2018-06-06,299874.211725,5.773732e+06,109.0,107.0,106.0,0le4o0710,veg
2,2,67143080l2670800eo20,8,2.0,1.033864,47,20180606,POINT (299875.2054843378 5773732.08266038),leo,2018-06-06,299875.205484,5.773732e+06,98.0,94.0,105.0,0le4o0720,veg
3,3,67148080l2650800eo30,8,3.0,1.025817,47,20180606,POINT (299876.1992437858 5773732.194204807),leo,2018-06-06,299876.199244,5.773732e+06,99.0,97.0,108.0,0le4o0730,veg
4,4,67143080l2630900eo40,8,4.0,1.041824,47,20180606,POINT (299877.1930032339 5773732.305749236),leo,2018-06-06,299877.193003,5.773732e+06,103.0,109.0,127.0,0le4o0740,veg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23482,23482,60102091m2535900ar70,5,75.0,9.786558,0,20190516,POINT (731437.8933010239 5705159.623220895),mar,2019-05-16,731437.893301,5.705160e+06,92.0,107.0,92.0,0ma0r0075,no_sand
23483,23483,60109091m2566800ar70,5,76.0,12.814320,0,20190516,POINT (731436.8990619968 5705159.730406515),mar,2019-05-16,731436.899062,5.705160e+06,75.0,86.0,72.0,0ma0r0076,no_sand
23484,23484,60106091m2597800ar70,0,77.0,9.619781,0,20190516,POINT (731435.9048229698 5705159.837592136),mar,2019-05-16,731435.904823,5.705160e+06,64.0,73.0,58.0,0ma0r0077,veg
23485,23485,60104091m2528800ar70,0,78.0,8.493135,0,20190516,POINT (731434.9105839428 5705159.944777756),mar,2019-05-16,731434.910584,5.705160e+06,56.0,70.0,50.0,0ma0r0078,veg


In [None]:
# Save the shoremasked dataset as CSV (absolute path on the author's machine).
inshore_cleaned.to_csv(r"C:\my_packages\sandpyper\tests\test_data\test_cleaned.csv")