# 🌊 Flood Prediction - Modelling

# 📚 1. Import Libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.float_format', lambda x:'%.3f' % x)
import warnings
warnings.simplefilter(action="ignore", category=Warning) # Ignore warnings

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sb
# Configure default settings for plots
sb.set(style='ticks')
sb.set_palette('Paired')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_validate
from sklearn.model_selection import GridSearchCV
# Data Processing Libraries
from sklearn.preprocessing import MinMaxScaler

## Algorithms
# Ensemble Methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Non-linear Methods
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor
# Linear Methods

# 📖 2. Load Data

In [5]:
# Read the processed training set from a path relative to the notebook's
# working directory, then show its (rows, columns) shape as the cell output.
raw_data = pd.read_csv('./Data/Processed/train_no_id.csv')
raw_data.shape

(1117957, 21)

In [6]:
# Work on a copy so that `raw_data` stays untouched and this cell can be
# re-run to restore `df` without reading the CSV again.
df = raw_data.copy()

# 🧹 3. Data Preparation

In [7]:
# Name of the column the models are trained to predict.
target = 'FloodProbability'

# Model inputs: every column except the first and the last one, as a plain list.
# NOTE(review): the slice skips column 0 as well as the final column —
# presumably column 0 is not a feature and the last column is the target.
# The file name suggests the id column was already removed, so please verify
# against the CSV header that no real feature is being dropped here.
features = df.columns[1:-1].tolist()

## 3.1. Transform Dataset

In [8]:
def transform_data(df, target, num_features, test_size=0.2, random_state=101):
    """Split ``df`` into train/test sets and min-max scale the feature columns.

    The scaler is fitted on the training split only and then applied to both
    splits, so the scaling parameters are not influenced by the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target : str
        Name of the column to predict.
    num_features : list of str
        Names of the columns used as model inputs.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``; share of rows held out for testing.
    random_state : int or None, default 101
        Seed forwarded to ``train_test_split`` so the split is identical on
        every run. Pass ``None`` for a different random split on each call.

    Returns
    -------
    tuple
        ``(x_train_transformed, x_test_transformed, y_train, y_test)`` where
        the first two items are the outputs of ``MinMaxScaler.transform`` and
        the last two are the matching slices of ``df[target]``.
    """
    # Assign features and labels
    x = df[num_features]
    y = df[target]

    # Split the dataset into train and test sets; the seed keeps the split
    # reproducible across kernel restarts.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse the fitted
    # min/max values for the test rows.
    mmx_scaler = MinMaxScaler()
    x_train_transformed = mmx_scaler.fit_transform(x_train)
    x_test_transformed = mmx_scaler.transform(x_test)

    return x_train_transformed, x_test_transformed, y_train, y_test

In [9]:
# Build the scaled train/test feature matrices and the matching target
# slices from the working copy of the data.
x_train, x_test, y_train, y_test = transform_data(df, target, features)

# 🧱 4. Model Building

## 4.1. Comparing Different Algorithms
In this section we'll compare non-linear and ensemble algorithms using k-fold cross-validation and then decide on the candidate algorithm for hyperparameter tuning based on the resulting scores (R2, MAE and RMSE).

In [10]:
def model_comparison(x, y, models, n_splits=10, n_jobs=None):
    """Cross-validate several regressors and report R2, MAE and RMSE.

    Every model is evaluated with the same shuffled K-fold splitter. The mean
    and standard deviation of each metric are printed as a table indexed by
    algorithm name, and the per-fold scores are shown as three box plots.

    Parameters
    ----------
    x : array-like
        Feature matrix passed to ``cross_validate``.
    y : array-like
        Target values passed to ``cross_validate``.
    models : iterable of (str, estimator)
        Pairs of display name and estimator to evaluate.
    n_splits : int, default 10
        Number of folds for the ``KFold`` splitter.
    n_jobs : int or None, default None
        Forwarded to ``cross_validate`` to run folds in parallel.

    Returns
    -------
    None
        The results are printed and plotted, not returned.
    """
    scoring = ['r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']
    summary_columns = ['Algorithm', 'R2 Mean', 'R2 STD', 'RMSE Mean', 'RMSE STD',
                       'MAE Mean', 'MAE STD']

    # The splitter does not depend on the model, so build it once; the fixed
    # seed gives every model the same folds.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=101)

    names = []
    summary_rows = []
    results_r2 = []
    results_mae = []
    results_rmse = []

    for name, model in models:
        names.append(name)
        result = cross_validate(model, x, y, cv=kfold, scoring=scoring, n_jobs=n_jobs)

        # The error scorers are reported negated ("neg_..."), so take the
        # absolute value to get the errors themselves.
        r2_scores = result['test_r2']
        mae_scores = abs(result['test_neg_mean_absolute_error'])
        rmse_scores = abs(result['test_neg_root_mean_squared_error'])

        summary_rows.append({
            'Algorithm': name,
            'R2 Mean': r2_scores.mean(), 'R2 STD': r2_scores.std(),
            'RMSE Mean': rmse_scores.mean(), 'RMSE STD': rmse_scores.std(),
            'MAE Mean': mae_scores.mean(), 'MAE STD': mae_scores.std(),
        })
        results_r2.append(r2_scores)
        results_mae.append(mae_scores)
        results_rmse.append(rmse_scores)

    # Build the table once from the collected rows instead of concatenating
    # inside the loop.
    df_results = pd.DataFrame(summary_rows, columns=summary_columns).set_index('Algorithm')

    # Display the mean and standard deviation of all metrics for all algorithms,
    # with three decimals applied only while printing this table.
    with pd.option_context('display.float_format', lambda value: '%.3f' % value):
        print(df_results)

    # Display the overall results in a boxplot graph, one panel per metric
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))
    panels = (('R2', results_r2), ('MAE', results_mae), ('RMSE', results_rmse))
    for ax, (title, fold_scores) in zip(axes, panels):
        ax.boxplot(fold_scores)
        ax.set_title(title, fontsize=14)
        ax.set_xticklabels(names, rotation=30)
    plt.tight_layout()
    plt.show()

### 4.1.1. Ensemble Methods

In [11]:
# Candidate ensemble regressors as (label, estimator) pairs.
# All three estimators are stochastic, so a fixed random_state keeps the
# cross-validation scores reproducible across kernel restarts.
ens_models = [
    ('RFR', RandomForestRegressor(random_state=101)),
    ('ADR', AdaBoostRegressor(random_state=101)),
    ('GBR', GradientBoostingRegressor(random_state=101)),
]

In [12]:
# Cross-validate the ensemble candidates on the training split.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# run was stopped before it finished. Consider evaluating on a sample of the
# training rows first so the notebook can be re-run end to end.
model_comparison(x_train, y_train, ens_models)

KeyboardInterrupt: 