In [None]:
import numpy as np
import cv2
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import geopandas as gpd
import keras
import shap


In [None]:
# Load the sample points created in the previous article.
# If the points were exported as a pickle instead, read them with
# pd.read_pickle("points_data.pkl") (only unpickle files you trust).
points_path = "points_data.shp"
df = gpd.read_file(points_path)
df.head()

In [None]:
# Count the missing (no-data) values in every column; all counts should be 0.
# If some are not, rows with missing values can be removed with df.dropna().
null_counts = df.isnull().sum()
print(null_counts)

In [None]:
# Understand the data: count the samples in each class.
# The dataset is expected to be balanced (equal numbers of flooded and
# non-flooded locations).
# Label 0 = not flooded, Label 1 = flooded.
sns.countplot(data=df, x="Label")

In [None]:
# Show the correlation matrix of the dataset.
# Only numeric columns are correlated: the GeoDataFrame also carries a
# "geometry" column, which cannot be converted to floats and makes
# DataFrame.corr() raise on pandas versions where numeric_only defaults to False.
corrMatrix = df.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(10, 10))  # figure size in inches
sns.heatmap(corrMatrix, annot=True, linewidths=.5, ax=ax)
ax.set_title("Feature correlation matrix")

In [None]:
# Dependent variable (the label the model has to predict).
Y = df["Label"].values

# Independent variables: every column except the label and the geometry.
X = df.drop(columns=["Label", "geometry"])

# Keep the feature names so their importance can be ranked later.
features_list = X.columns.tolist()

In [None]:
# Array version of the feature table, for training the neural network.
X_arr = X.to_numpy()

In [None]:
# Split the data into train (60 %), validation (20 %) and test (20 %) sets:
#   - training data fits the model,
#   - validation data is used for hyperparameter tuning,
#   - test data verifies the accuracy of the fitted model.

from sklearn.model_selection import train_test_split

# First hold out 20 % of all samples as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# ... then take 25 % of the remaining 80 % (= 20 % of all samples) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

NAME = "DNN"

# Size the input layer from the training data rather than hard-coding it,
# so the network keeps working when predictive features are added or removed.
n_features = X_train.shape[1]

# Network layers: fully connected ReLU layers with dropout in between and a
# single sigmoid unit that outputs the probability of the "flooded" class.
model = Sequential([
    Dense(9, activation='relu', input_shape=(n_features,)),
    Dropout(0.1),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Save the full model whenever the validation loss improves.
# The deprecated `period=1` argument is dropped: checkpointing once per epoch
# is already the default, and newer Keras releases reject the argument.
checkpoint = ModelCheckpoint(
    "DNN.h5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
)

# Stop training when the validation loss has not improved for 20 epochs.
early = EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=1, mode='auto')

# Log the training and validation losses of every epoch for TensorBoard.
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train the model.
# Validate on the dedicated validation set (X_val, y_val) created by the
# train/validation/test split. Using validation_split here would carve a
# second validation subset out of X_train, shrinking the training data and
# leaving X_val / y_val unused.
history = model.fit(
    X_train,
    y_train,
    epochs=500,
    verbose=2,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint, early, tensorboard],
)

In [None]:
# Plot the training and validation loss recorded at each epoch.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)  # 1-based epoch numbers for the x axis

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'y', label='Training loss')
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
# Plot the training and validation accuracy recorded at each epoch.
# `epochs` is the epoch range built in the loss-plot cell.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'y', label='Training acc')
ax.plot(epochs, val_acc, 'r', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Evaluate the model on the held-out test set; evaluate() returns the loss
# followed by the accuracy metric the model was compiled with.
_, acc = model.evaluate(X_test, y_test)
accuracy_percent = acc * 100.0
print("Accuracy = ", accuracy_percent, "%")


In [None]:
# ROC
# The Receiver Operating Characteristic (ROC) curve shows the performance of
# a binary classifier as the decision threshold is varied. It helps to pick a
# threshold that balances true/false positives and true/false negatives.

from sklearn.metrics import roc_curve

# Predicted scores for the test samples, flattened to a 1-D array.
y_preds = model.predict(X_test).ravel()

fpr, tpr, thresholds = roc_curve(y_test, y_preds)

fig, ax = plt.subplots(num=1)
ax.plot([0, 1], [0, 1], 'y--')  # diagonal reference line (random classifier)
ax.plot(fpr, tpr, marker='.')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
plt.show()


In [None]:
# AUC
# The area under the ROC curve summarises how well the classifier separates
# the positive and negative classes (1.0 = perfect, 0.5 = random guessing).

from sklearn.metrics import auc

auc_value = auc(fpr, tpr)
print("Area under curve, AUC = ", auc_value)

## Map the whole study area

In [None]:
# Read the shapefile that covers the whole study area.
study_area_path = "Study_area.shp"
df_SA = gpd.read_file(study_area_path)

# Make sure the columns are arranged exactly as in the training dataset.
df_SA.head()



In [None]:
# Keep only the predictive features: drop the geometry, and also the "FSM"
# result column if a previous run of this notebook already added it to df_SA.
# Without the second drop, re-running this cell would feed the model one
# extra column.
X_SA = df_SA.drop(columns=["geometry"]).drop(columns=["FSM"], errors="ignore")
X_SA.head()


In [None]:
# Run the model on every study-area location. The output layer is a sigmoid,
# so each prediction is a score between 0 (not flooded) and 1 (flooded).
study_area_features = X_SA.to_numpy()
prediction_SA = model.predict(study_area_features)



In [None]:
# To map flood susceptibility we need the probability of being flooded at
# every location of the study area.
# The network ends in a single sigmoid unit, so model.predict() already
# returns that probability as one column per sample. Sequential.predict_proba()
# has been removed from TensorFlow/Keras, and there is no second column to
# index with [:, 1].
prediction_prob = model.predict(X_SA.to_numpy()).ravel()

# Attach the flood probability to its point.
df_SA['FSM'] = prediction_prob

In [None]:
# Save the dataframe to a shapefile, in case the points are converted to a
# raster with QGIS or ArcMap instead.
fsm_shapefile = "FSM.shp"
df_SA.to_file(fsm_shapefile)

In [None]:
# Convert the point shapefile to a raster, using the model prediction
# (column "FSM" of df_SA) as the raster values.
from geocube.api.core import make_geocube
import rasterio as rio

# For most CRS the negative value comes first in the resolution tuple.
out_grid = make_geocube(
    vector_data=df_SA,
    measurements=["FSM"],
    resolution=(-1, 1),
)
fsm_raster = out_grid["FSM"]
fsm_raster.rio.to_raster("Flood_susceptibility.tif")