In [1]:
import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix, mean_squared_error, root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# --- File locations used throughout the notebook ---

# Artifacts written after training and read back at prediction time
FEATURES_FILE_NAME = "malaysia_flood_features.pkl"
MODEL_FILE_NAME = "malaysia_flood_model.pkl"

# Source dataset that the model is trained on
INPUT_CSV = "malaysia_flood_dataset.csv"

In [3]:
# Load the CSV named by INPUT_CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=INPUT_CSV)

In [4]:
# Replace null values with the column average.
# numeric_only=True keeps this cell working if the CSV contains any text
# column: DataFrame.mean() raises TypeError on non-numeric data in
# pandas >= 2.0. Non-numeric columns get no mean, so their nulls (if any)
# are left untouched.
if data.isnull().sum().any():
    data = data.fillna(data.mean(numeric_only=True))

In [5]:
# Separate the label column from the predictors
y = data['FLOOD']  # target (dependent variable)
x = data.drop(columns=['FLOOD'])  # features: every column except the target

In [6]:
# Hold out 15% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=420,
    test_size=0.15,
)

In [7]:
# Build the (not yet fitted) Random Forest Regressor:
# 100 trees, each limited to depth 10, with a fixed seed for repeatable fits
rf_model = RandomForestRegressor(
    max_depth=10,
    n_estimators=100,
    random_state=69,
)

In [8]:
# Fit the forest on the training split
rf_model.fit(X=X_train, y=y_train)

In [9]:
# Persist the fitted model, then the ordered list of feature column names,
# so that predictions can later be made from the files alone
joblib.dump(value=rf_model, filename=MODEL_FILE_NAME)
joblib.dump(value=x.columns.tolist(), filename=FEATURES_FILE_NAME)

['malaysia_flood_features.pkl']

In [10]:
# Test model accuracy
y_pred = rf_model.predict(X_test)

# Cut-off used to turn continuous values into 0/1 labels
CLASSIFICATION_THRESHOLD = 0.5

# Converting both continuous predictions and true values to binary for classification metrics
# (vectorised comparison; .tolist() keeps the results as plain Python lists of 0/1)
y_pred_binary = (y_pred >= CLASSIFICATION_THRESHOLD).astype(int).tolist()  # Thresholding predictions
y_test_binary = (y_test >= CLASSIFICATION_THRESHOLD).astype(int).tolist()  # Thresholding true values

# Metrics Calculation
f1 = f1_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
accuracy = accuracy_score(y_test_binary, y_pred_binary)

# Confusion Matrix to calculate specificity.
# labels=[0, 1] forces a 2x2 matrix even when only one class shows up in the
# test split; without it the matrix can be 1x1 and the 4-way unpacking fails.
tn, fp, fn, tp = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1]).ravel()

# Specificity = TN / (TN + FP); undefined (NaN) when the split has no negatives
negatives = tn + fp
specificity = tn / negatives if negatives > 0 else float("nan")

# Print out the scores
print(f"F1 Score: \t{round(f1, 2)}")
print(f"Recall:   \t{round(recall, 2)}")
print(f"Precision:\t{round(precision, 2)}")
print(f"Accuracy: \t{round(accuracy, 3)}")
print(f"Specificity: \t{round(specificity, 2)}")


F1 Score: 	0.65
Recall:   	0.67
Precision:	0.63
Accuracy: 	0.702
Specificity: 	0.73


In [11]:
# Regression-style error measures on the raw (un-thresholded) predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)  # square root of the MSE
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report each metric rounded to two decimals
print(f"MSE:      {round(mse, 2)}")
print(f"RMSE:     {round(rmse, 2)}")
print(f"R² Score: {round(r2, 2)}")

MSE:      0.19
RMSE:     0.44
R² Score: 0.22


In [12]:
# Wrapper function for running a prediction
def predict_flood_chance(input_data):
    """Predict the flood score for a single observation using the saved model.

    The model and its feature-name list are re-read from MODEL_FILE_NAME and
    FEATURES_FILE_NAME on every call, so the result reflects whatever was
    dumped most recently.

    Args:
        input_data: A mapping of feature name -> value (extra keys are
            ignored), or a sequence of values ordered like the saved
            feature list.

    Returns:
        The first (and only) element of ``model.predict`` for the one-row frame.

    Raises:
        ValueError: If ``input_data`` is a mapping that lacks one or more of
            the saved feature names.
    """
    from collections.abc import Mapping

    # Load the model and feature names
    # NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
    model = joblib.load(MODEL_FILE_NAME)
    feature_names = joblib.load(FEATURES_FILE_NAME)

    # A missing key would otherwise silently become a NaN column in the
    # DataFrame below, so fail loudly and name the features that are missing.
    if isinstance(input_data, Mapping):
        missing = [name for name in feature_names if name not in input_data]
        if missing:
            raise ValueError(f"Missing feature(s) in input_data: {missing}")

    # Convert input to a one-row DataFrame whose columns follow the training order
    custom_data = pd.DataFrame([input_data], columns=feature_names)
    return model.predict(custom_data)[0]

In [56]:
# Hand-written sample observation, one entry per feature passed to the model
example_input = {
    'STATE': 112,
    'DISTRICT': 112001,
    'YEAR': 2030,
    'JAN': 100,
    'FEB': 100,
    'MAR': 100,
    'APR': 100,
    'MAY': 100,
    'JUN': 100,
    'JUL': 100,
    'AUG': 100,
    'SEP': 100,
    'OCT': 100,
    'NOV': 100,
    'DEC': 100,
    'ANNUAL RAINFALL': 1200
}

# Predict flood probability for example input
flood_probability = predict_flood_chance(example_input)

# Scale to percent BEFORE rounding: round(p, 2) * 100 can expose binary-float
# artefacts (e.g. 0.57 * 100 == 56.99999999999999). The value is shown as a
# whole percent with one decimal place.
flood_percent = round(flood_probability * 100)
print(f"Flood Probability Prediction for Custom Input: {flood_percent:.1f} %")

Flood Probability Prediction for Custom Input: 59.0 %
