In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/Flood Prediction Dataset/train.csv")

In [3]:
df.columns

Index(['id', 'MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability'],
      dtype='object')

In [4]:
df.dtypes

id                                   int64
MonsoonIntensity                     int64
TopographyDrainage                   int64
RiverManagement                      int64
Deforestation                        int64
Urbanization                         int64
ClimateChange                        int64
DamsQuality                          int64
Siltation                            int64
AgriculturalPractices                int64
Encroachments                        int64
IneffectiveDisasterPreparedness      int64
DrainageSystems                      int64
CoastalVulnerability                 int64
Landslides                           int64
Watersheds                           int64
DeterioratingInfrastructure          int64
PopulationScore                      int64
WetlandLoss                          int64
InadequatePlanning                   int64
PoliticalFactors                     int64
FloodProbability                   float64
dtype: object

In [5]:
df.isna().mean()

id                                 0.0
MonsoonIntensity                   0.0
TopographyDrainage                 0.0
RiverManagement                    0.0
Deforestation                      0.0
Urbanization                       0.0
ClimateChange                      0.0
DamsQuality                        0.0
Siltation                          0.0
AgriculturalPractices              0.0
Encroachments                      0.0
IneffectiveDisasterPreparedness    0.0
DrainageSystems                    0.0
CoastalVulnerability               0.0
Landslides                         0.0
Watersheds                         0.0
DeterioratingInfrastructure        0.0
PopulationScore                    0.0
WetlandLoss                        0.0
InadequatePlanning                 0.0
PoliticalFactors                   0.0
FloodProbability                   0.0
dtype: float64

In [6]:
len(df)

1117957

In [23]:
X = df.drop(["FloodProbability", "id"], axis=1)
y = df["FloodProbability"]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [25]:
cb = CatBoostRegressor()

In [26]:
cb.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True, verbose=False);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [27]:
y_preds = cb.predict(X_test)

In [28]:
r2_score(y_test, y_preds)

0.8463182330426229

In [12]:
def objective(trial):

    X = df.drop(["FloodProbability", "id"], axis=1)
    y = df["FloodProbability"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    params = {
        "iterations": trial.suggest_int("iterations", 200, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
        "depth": trial.suggest_int("depth", 8, 16),
    }

    model = CatBoostRegressor(**params, verbose=False, early_stopping_rounds=100)
    model.fit(X_train, y_train, eval_set=(X_test, y_test))

    preds = model.predict(X_test)
    return r2_score(y_test, preds)

In [17]:
study = optuna.create_study(direction="maximize")

[I 2024-06-06 15:59:42,675] A new study created in memory with name: no-name-ecf696fc-0066-4333-900f-6bb4a71565ab


In [18]:
study.optimize(objective, n_trials=10)

[I 2024-06-06 16:01:31,212] Trial 0 finished with value: 0.8459376455173284 and parameters: {'iterations': 987, 'learning_rate': 0.04197680384724507, 'l2_leaf_reg': 3, 'depth': 10}. Best is trial 0 with value: 0.8459376455173284.
[I 2024-06-06 16:02:15,389] Trial 1 finished with value: 0.8372989360373077 and parameters: {'iterations': 245, 'learning_rate': 0.05514959825125708, 'l2_leaf_reg': 3, 'depth': 13}. Best is trial 0 with value: 0.8459376455173284.
[I 2024-06-06 16:03:39,072] Trial 2 finished with value: 0.836472154206552 and parameters: {'iterations': 467, 'learning_rate': 0.028866785595904214, 'l2_leaf_reg': 8, 'depth': 13}. Best is trial 0 with value: 0.8459376455173284.
[I 2024-06-06 16:07:38,186] Trial 3 finished with value: 0.8422142967215468 and parameters: {'iterations': 633, 'learning_rate': 0.04940643207304055, 'l2_leaf_reg': 6, 'depth': 15}. Best is trial 0 with value: 0.8459376455173284.
[I 2024-06-06 16:08:32,923] Trial 4 finished with value: 0.801811708388321 and p

In [19]:
study.best_value

0.8459808206179973

In [20]:
study.best_params

{'iterations': 621,
 'learning_rate': 0.06978678088217069,
 'l2_leaf_reg': 2,
 'depth': 10}