In [None]:
from dask.distributed import Client, LocalCluster

# Reuse a Dask scheduler that is already listening on the default port (8786);
# start a new LocalCluster on that port only when the connection attempt fails.
try:
    # This creates a new Client connection to an existing Dask scheduler if one exists.
    # There is no practical way to get the LocalCluster object from the existing scheduler,
    # although the scheduler details can be accessed with `client.scheduler`.
    # The LocalCluster object is only available from the notebook that created it.
    # Restart the kernel or `client.close();cluster.close()` in each notebook that
    # created one to remove existing LocalClusters.
    client = Client('localhost:8786', timeout='2s')
    cluster = client.cluster  # None
except Exception:
    # Catch Exception instead of using a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate rather than silently starting a new cluster.
    cluster = LocalCluster(scheduler_port=8786)
    client = Client(cluster)

# Show the cluster widget when this notebook created the cluster, otherwise the client.
display(cluster if cluster else client)

Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

In [2]:
# Initialize the Gateway client
from dask.distributed import Client
from dask_gateway import Gateway

gateway = Gateway()
# Last expression in the cell, so its return value is rendered as the cell output.
gateway.cluster_options() 

VBox(children=(HTML(value='<h2>Cluster Options</h2>'), GridBox(children=(HTML(value="<p style='font-weight: bo…

In [3]:
# Reuse the first cluster the gateway reports, or create a new one when none is listed.
clusters = gateway.list_clusters()
if clusters:
    print(f'An existing cluster was found. Connecting to: {clusters[0].name}')
    cluster = gateway.connect(clusters[0].name)
else:
    print('Creating new cluster. Please wait for this to finish.')
    cluster = gateway.new_cluster()
display(cluster)

Creating new cluster. Please wait for this to finish.


VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [4]:
# Lower and upper bounds on the number of workers for this cluster.
min_number_of_workers, max_number_of_workers = 1, 2

# Static scaling: request the minimum number of workers first.
cluster.scale(min_number_of_workers)

# Adaptive scaling: then let the cluster adapt between the two bounds.
cluster.adapt(minimum=min_number_of_workers, maximum=max_number_of_workers)

In [8]:
# Obtain a Dask client from the cluster object.
client = cluster.get_client()
# Wait for workers before continuing, using the same minimum that was passed to scale().
client.wait_for_workers(n_workers=min_number_of_workers)
# Bare last expression: the client summary is rendered as the cell output.
client


+---------+--------+-----------+---------+
| Package | client | scheduler | workers |
+---------+--------+-----------+---------+
| pandas  | 1.4.1  | 1.3.4     | 1.3.4   |
+---------+--------+-----------+---------+


0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: https://hub.csiro.easi-eo.solutions/services/dask-gateway/clusters/easihub.f886b2d4ca3843ee9fe46ae29da3595d/status,


In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the dataset from a CSV file given by a relative path.
water = pd.read_csv('water_MLset.csv')
                    
# Bare last expression: the loaded frame is rendered as the cell output.
water

Unnamed: 0,sample_id,cluster,Country,General_Vegetation,Weather,Hydrogeomorphology,Intermittent_or_Perennial,Sediment,US.or.DS.of.Gauge,Precipitation.Within.Week,...,NPOC_mg.L.asC,del2H_permil,del18O_permil,Cl_mgL,SO4_mgL,NO3_mgL,NO2_mgL,F_mgL,Richness observed,Simpson's Diversity Index
0,SW_S19S.0003_U_1,Average,Canada,Not vegetated,Partly cloudy,Single-channel straight,Perennial,Silt/mud (<0.0625mm),Downstream,Yes,...,1.27,-140.0,-18.75,0.30,9.80,0.19,0.04,0.07,2335,0.999572
1,SW_S19S.0003_U_2,Average,Canada,Not vegetated,Partly cloudy,Single-channel straight,Perennial,Silt/mud (<0.0625mm),Downstream,Yes,...,1.80,,,0.31,8.03,0.23,0.04,0.07,2560,0.999609
2,SW_S19S.0003_U_3,Most,Canada,Not vegetated,Partly cloudy,Single-channel straight,Perennial,Silt/mud (<0.0625mm),Downstream,Yes,...,1.15,,,0.31,8.62,0.23,0.04,0.06,2782,0.999641
3,SW_S19S.0004_U_1,Most,Canada,Grass,Overcast. Raining,Single-channel straight,Perennial,Gravel/cobble (>2mm),Upstream,Yes,...,1.96,-129.3,-17.13,1.83,6.53,0.12,0.04,0.07,2823,0.999646
4,SW_S19S.0004_U_2,Most,Canada,Grass,Overcast. Raining,Single-channel straight,Perennial,Gravel/cobble (>2mm),Upstream,Yes,...,1.93,,,2.00,6.66,0.12,0.04,0.08,2869,0.999651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,SW_S19S.0099_U_1,Most,USA,Broadleaf deciduous tree. Grass,Sunny,Single-channel meandering,Intermittent,Gravel/cobble (>2mm),Upstream,No,...,2.30,-122.8,-16.08,70.55,47.79,4.43,0.04,0.33,2676,0.999626
261,SW_S19S.0099_U_2,Most,USA,Broadleaf deciduous tree. Grass,Sunny,Single-channel meandering,Intermittent,Gravel/cobble (>2mm),Upstream,No,...,0.45,,,68.35,45.59,4.64,0.04,0.29,3217,0.999689
262,SW_S19S.0099_U_3,Most,USA,Broadleaf deciduous tree. Grass,Sunny,Single-channel meandering,Intermittent,Gravel/cobble (>2mm),Upstream,No,...,1.62,,,66.16,44.37,4.75,0.04,0.30,2423,0.999587
263,SW_S19S.0100_U_1,Least,USA,Shrub. Grass,Sunny,Single-channel meandering,Perennial,Gravel/cobble (>2mm),Downstream,Yes,...,0.55,-43.3,-6.53,0.22,34.62,0.12,0.04,0.09,1289,0.999224


In [3]:
# Target: the 'cluster' column.
y = water['cluster']
# Features: every remaining column except 'sample_id'.
X = water.drop(columns=['cluster', 'sample_id'])

X

Unnamed: 0,Country,General_Vegetation,Weather,Hydrogeomorphology,Intermittent_or_Perennial,Sediment,US.or.DS.of.Gauge,Precipitation.Within.Week,Dam.Upstream.Site,Stream_Order,...,NPOC_mg.L.asC,del2H_permil,del18O_permil,Cl_mgL,SO4_mgL,NO3_mgL,NO2_mgL,F_mgL,Richness observed,Simpson's Diversity Index
0,Canada,Not vegetated,Partly cloudy,Single-channel straight,Perennial,Silt/mud (<0.0625mm),Downstream,Yes,No,7.0,...,1.27,-140.0,-18.75,0.30,9.80,0.19,0.04,0.07,2335,0.999572
1,Canada,Not vegetated,Partly cloudy,Single-channel straight,Perennial,Silt/mud (<0.0625mm),Downstream,Yes,No,7.0,...,1.80,,,0.31,8.03,0.23,0.04,0.07,2560,0.999609
2,Canada,Not vegetated,Partly cloudy,Single-channel straight,Perennial,Silt/mud (<0.0625mm),Downstream,Yes,No,7.0,...,1.15,,,0.31,8.62,0.23,0.04,0.06,2782,0.999641
3,Canada,Grass,Overcast. Raining,Single-channel straight,Perennial,Gravel/cobble (>2mm),Upstream,Yes,No,6.0,...,1.96,-129.3,-17.13,1.83,6.53,0.12,0.04,0.07,2823,0.999646
4,Canada,Grass,Overcast. Raining,Single-channel straight,Perennial,Gravel/cobble (>2mm),Upstream,Yes,No,6.0,...,1.93,,,2.00,6.66,0.12,0.04,0.08,2869,0.999651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,USA,Broadleaf deciduous tree. Grass,Sunny,Single-channel meandering,Intermittent,Gravel/cobble (>2mm),Upstream,No,No,7.0,...,2.30,-122.8,-16.08,70.55,47.79,4.43,0.04,0.33,2676,0.999626
261,USA,Broadleaf deciduous tree. Grass,Sunny,Single-channel meandering,Intermittent,Gravel/cobble (>2mm),Upstream,No,No,7.0,...,0.45,,,68.35,45.59,4.64,0.04,0.29,3217,0.999689
262,USA,Broadleaf deciduous tree. Grass,Sunny,Single-channel meandering,Intermittent,Gravel/cobble (>2mm),Upstream,No,No,7.0,...,1.62,,,66.16,44.37,4.75,0.04,0.30,2423,0.999587
263,USA,Shrub. Grass,Sunny,Single-channel meandering,Perennial,Gravel/cobble (>2mm),Downstream,Yes,No,2.0,...,0.55,-43.3,-6.53,0.22,34.62,0.12,0.04,0.09,1289,0.999224


In [4]:
# Inspect the dtype of every column in X before encoding

X.dtypes

Country                               object
General_Vegetation                    object
Weather                               object
Hydrogeomorphology                    object
Intermittent_or_Perennial             object
Sediment                              object
US.or.DS.of.Gauge                     object
Precipitation.Within.Week             object
Dam.Upstream.Site                     object
Stream_Order                         float64
River_Gradient                        object
Distance_DS.and.MS_meters            float64
Distance_MS.and.US_meters            float64
Primary.Sources.Flow.Variation        object
pH                                    object
Temp_degC                             object
Approx.Distance.From.Gauge_meters     object
Number.Days.Since.Precip             float64
NPOC_mg.L.asC                        float64
del2H_permil                         float64
del18O_permil                        float64
Cl_mgL                               float64
SO4_mgL   

In [9]:
# X contains object (string) columns, which can't be passed to an estimator's fit().
# Replace every object column with integer category codes (label encoding).
#
# Why not OneHotEncoder here: OneHotEncoder.fit_transform() requires 2-D input and
# returns one output column per category, so calling it on a single Series raises
# and its result could not be assigned back to one DataFrame column anyway.
# LabelEncoder accepts a 1-D column and returns exactly one integer per row.
# NOTE(review): integer codes imply an ordering between categories; if a
# downstream model is sensitive to that, one-hot encode the nominal columns
# instead (which adds columns rather than replacing them).

from sklearn import preprocessing

le = preprocessing.LabelEncoder()  # refit on each column below
for column_name in X.columns:
    if X[column_name].dtype == object:
        # fit_transform() refits the encoder, so each column gets its own code mapping.
        X[column_name] = le.fit_transform(X[column_name])

X

Unnamed: 0,Country,General_Vegetation,Weather,Hydrogeomorphology,Intermittent_or_Perennial,Sediment,US.or.DS.of.Gauge,Precipitation.Within.Week,Dam.Upstream.Site,Stream_Order,...,NPOC_mg.L.asC,del2H_permil,del18O_permil,Cl_mgL,SO4_mgL,NO3_mgL,NO2_mgL,F_mgL,Richness observed,Simpson's Diversity Index
0,0,15,3,2,1,3,0,1,0,7.0,...,1.27,-140.0,-18.75,0.30,9.80,0.19,0.04,0.07,2335,0.999572
1,0,15,3,2,1,3,0,1,0,7.0,...,1.80,,,0.31,8.03,0.23,0.04,0.07,2560,0.999609
2,0,15,3,2,1,3,0,1,0,7.0,...,1.15,,,0.31,8.62,0.23,0.04,0.06,2782,0.999641
3,0,7,1,2,1,1,1,1,0,6.0,...,1.96,-129.3,-17.13,1.83,6.53,0.12,0.04,0.07,2823,0.999646
4,0,7,1,2,1,1,1,1,0,6.0,...,1.93,,,2.00,6.66,0.12,0.04,0.08,2869,0.999651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,6,2,7,1,0,1,1,0,0,7.0,...,2.30,-122.8,-16.08,70.55,47.79,4.43,0.04,0.33,2676,0.999626
261,6,2,7,1,0,1,1,0,0,7.0,...,0.45,,,68.35,45.59,4.64,0.04,0.29,3217,0.999689
262,6,2,7,1,0,1,1,0,0,7.0,...,1.62,,,66.16,44.37,4.75,0.04,0.30,2423,0.999587
263,6,17,7,1,1,1,0,1,0,2.0,...,0.55,-43.3,-6.53,0.22,34.62,0.12,0.04,0.09,1289,0.999224


In [11]:
# Re-check the column dtypes of X after the encoding step

X.dtypes # all "object" converted to "int64" 

Country                                int64
General_Vegetation                     int64
Weather                                int64
Hydrogeomorphology                     int64
Intermittent_or_Perennial              int64
Sediment                               int64
US.or.DS.of.Gauge                      int64
Precipitation.Within.Week              int64
Dam.Upstream.Site                      int64
Stream_Order                         float64
River_Gradient                         int64
Distance_DS.and.MS_meters            float64
Distance_MS.and.US_meters            float64
Primary.Sources.Flow.Variation         int64
pH                                     int64
Temp_degC                              int64
Approx.Distance.From.Gauge_meters      int64
Number.Days.Since.Precip             float64
NPOC_mg.L.asC                        float64
del2H_permil                         float64
del18O_permil                        float64
Cl_mgL                               float64
SO4_mgL   

In [23]:
# Put the sample ids, the target labels and the encoded features back together
# column-wise, then save the result as the preprocessed dataset for the ML model.
combined = pd.concat(
    [water['sample_id'], y, X],
    axis=1,  # 1 joins columns side by side; 0 would stack rows
    ignore_index=False,
    sort=False,
)

# Export to CSV; index=False leaves the row index out of the file.
combined.to_csv("water_preprocesseddata.csv", index=False)