In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow.keras.backend as K
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score # Import r2_score



# Train test split

In [None]:
df = pd.read_csv("AllData_After_EDA.csv", parse_dates=["Date"], index_col="Date")
# Remove timezone info if it exists
df.index = df.index.tz_localize(None)
df.sort_index(inplace=True)

In [None]:
# Extract features (X) and target variable (y)
X = df.drop(columns=['flooded'])  # Drop Date & Target column
y = df['flooded']

# Split into initial train-test sets (before resampling)
train_size = 0.8  # 80% for training, 20% for testing
split_index = int(len(df) * train_size)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

In [None]:
import pandas as pd
import numpy as np

def windowed_resampling(X_train, y_train, window_size='15D', max_copies=1):
    """
    Oversamples the minority class using windowed resampling.

    Parameters:
    - X_train: Feature DataFrame with datetime index.
    - y_train: Target Series with datetime index.
    - window_size: Time delta defining the window size (e.g., '3D' for 3 days).
    - max_copies: Maximum number of times a sample can be duplicated.

    Returns:
    - Resampled X_train and y_train.
    """
    # Convert the index to DatetimeIndex if it's not already
    X_train.index = pd.to_datetime(X_train.index)
    y_train.index = pd.to_datetime(y_train.index)

    X_resampled = X_train.copy()
    y_resampled = y_train.copy()

    minority_class_idx = y_train[y_train == 1].index

    for idx in minority_class_idx:
        for _ in range(min(np.random.randint(1, max_copies + 1), max_copies)):
            window_start = idx - pd.Timedelta(window_size)
            window_end = idx + pd.Timedelta(window_size)

            sampled_row = X_train.loc[idx].copy()
            sampled_label = y_train.loc[idx]

            sampled_row.name = idx  # Keep the datetime index
            X_resampled = pd.concat([X_resampled, sampled_row.to_frame().T])
            y_resampled = pd.concat([y_resampled, pd.Series(sampled_label, index=[idx])])

    return X_resampled.sort_index(), y_resampled.sort_index()

# Example usage with your dataset
X_train_resampled, y_train_resampled = windowed_resampling(X_train, y_train, window_size='3D', max_copies=3)
print(y_train_resampled.value_counts())

0    2527
1    1193
Name: count, dtype: int64


# Direct Prediction

## Trying Supervised Models

## Logistic Regression

In [None]:
# Logistic Regression
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_logreg = logreg_model.predict(X_test)

# Evaluate the model
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))


Logistic Regression Accuracy: 0.8043775649794802
Confusion Matrix:
 [[527 143]
 [  0  61]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.79      0.88       670
           1       0.30      1.00      0.46        61

    accuracy                           0.80       731
   macro avg       0.65      0.89      0.67       731
weighted avg       0.94      0.80      0.85       731



## RF

In [None]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.893296853625171
Confusion Matrix:
 [[628  42]
 [ 36  25]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.94       670
           1       0.37      0.41      0.39        61

    accuracy                           0.89       731
   macro avg       0.66      0.67      0.67       731
weighted avg       0.90      0.89      0.90       731



In [None]:
from sklearn.model_selection import GridSearchCV

# Random Forest Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
print("Best Accuracy for Random Forest:", grid_search_rf.best_score_)


KeyboardInterrupt: 

In [None]:
# Get the best model
best_rf_model = grid_search_rf.best_estimator_

# Make predictions on the test set
y_pred_rf = best_rf_model.predict(X_test)

print("Test Accuracy for XGBoost:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Test Accuracy for XGBoost: 0.9042407660738714
Confusion Matrix:
 [[644  26]
 [ 44  17]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95       670
           1       0.40      0.28      0.33        61

    accuracy                           0.90       731
   macro avg       0.67      0.62      0.64       731
weighted avg       0.89      0.90      0.90       731



## XGBoost

In [None]:
# XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.8645690834473324
Confusion Matrix:
 [[579  91]
 [  8  53]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.86      0.92       670
           1       0.37      0.87      0.52        61

    accuracy                           0.86       731
   macro avg       0.68      0.87      0.72       731
weighted avg       0.93      0.86      0.89       731



In [None]:

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'scale_pos_weight': [5, 7, 9, 11]  # Adjusting for class imbalance
}

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)

cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='recall', cv=cv_strategy, verbose=2, n_jobs=-1)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters for XGBoost:", grid_search.best_params_)
print("Best F1-score for XGBoost:", grid_search.best_score_)



Fitting 3 folds for each of 8748 candidates, totalling 26244 fits
Best Parameters for XGBoost: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'scale_pos_weight': 7, 'subsample': 0.8}
Best F1-score for XGBoost: 1.0


In [None]:
# Get the best model
best_xgb_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred_xgb = best_xgb_model.predict(X_test)

print("Test Accuracy for XGBoost:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


Test Accuracy for XGBoost: 0.5129958960328317
Confusion Matrix:
 [[314 356]
 [  0  61]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.47      0.64       670
           1       0.15      1.00      0.26        61

    accuracy                           0.51       731
   macro avg       0.57      0.73      0.45       731
weighted avg       0.93      0.51      0.61       731



In [None]:

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'scale_pos_weight': [5, 7, 9, 11]  # Adjusting for class imbalance
}

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)

cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='roc_auc', cv=cv_strategy, verbose=2, n_jobs=-1)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters for XGBoost:", grid_search.best_params_)
print("Best F1-score for XGBoost:", grid_search.best_score_)


Fitting 3 folds for each of 8748 candidates, totalling 26244 fits
Best Parameters for XGBoost: {'colsample_bytree': 0.7, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 200, 'scale_pos_weight': 7, 'subsample': 1.0}
Best F1-score for XGBoost: 0.9999969268403617


In [None]:
# Get the best model
best_xgb_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred_xgb = best_xgb_model.predict(X_test)

print("Test Accuracy for XGBoost:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))


Test Accuracy for XGBoost: 0.8057455540355677
Confusion Matrix:
 [[537 133]
 [  9  52]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.80      0.88       670
           1       0.28      0.85      0.42        61

    accuracy                           0.81       731
   macro avg       0.63      0.83      0.65       731
weighted avg       0.92      0.81      0.84       731



### LSTM

In [None]:
def create_sequences(X, y, time_steps=10):
    X_seq, y_seq = [], []

    for i in range(len(X) - time_steps):
        X_seq.append(X[i:i+time_steps])  # Collect the last 5 days of data
        y_seq.append(y[i+time_steps])  # Predict the next day's flood status

    return np.array(X_seq), np.array(y_seq)

# Create sequences for training and testing
X_train_seq, y_train_seq = create_sequences(X_train, y_train)
X_test_seq, y_test_seq = create_sequences(X_test, y_test)


# Scaling the training data
scaler = StandardScaler()
X_train_seq_scaled = scaler.fit_transform(X_train_seq.reshape(-1, X_train_seq.shape[-1])).reshape(X_train_seq.shape)

# Apply the same scaling to the test data
X_test_seq_scaled = scaler.transform(X_test_seq.reshape(-1, X_test_seq.shape[-1])).reshape(X_test_seq.shape)

train_size_seq = int(len(X_train_seq_scaled) * 0.8)  # 70% for training, 30% for validation

X_train_seq_final, X_val_seq = X_train_seq_scaled[:train_size_seq], X_train_seq_scaled[train_size_seq:]
y_train_seq_final, y_val_seq = y_train_seq[:train_size_seq], y_train_seq[train_size_seq:]

  y_seq.append(y[i+time_steps])  # Predict the next day's flood status


In [None]:
y_train.sum() / y_train.count()

0.13547724940130004

In [None]:
def f1_score(y_true, y_pred):
    y_true = K.cast(y_true, 'float32')  # Convert labels to float32
    y_pred = K.round(y_pred)  # Convert probabilities to 0/1

    tp = K.sum(y_true * y_pred)  # True positives
    precision = tp / (K.sum(y_pred) + K.epsilon())  # Avoid division by zero
    recall = tp / (K.sum(y_true) + K.epsilon())  # Avoid division by zero
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

# Define focal loss function
def focal_loss(alpha=0.25, gamma=2.0):
    def loss(y_true, y_pred):
        bce = BinaryCrossentropy()(y_true, y_pred)
        p_t = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        return K.mean(alpha * K.pow((1 - p_t), gamma) * bce)
    return loss

In [None]:
# Define the LSTM model
model = Sequential()

# Add LSTM layer
model.add(LSTM(units=64, activation='relu', input_shape=(X_train_seq_scaled.shape[1], X_train_seq_scaled.shape[2]), return_sequences=False))

# Add a dropout layer for regularization
model.add(Dropout(0.2))

# Add output layer for binary classification (sigmoid activation)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss=focal_loss(), metrics=[f1_score])


In [None]:
#train the model
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train_seq), y=y_train_seq)
class_weights = dict(enumerate(class_weights))

history = model.fit(X_train_seq_final, y_train_seq_final, epochs=20, batch_size=32,  class_weight=class_weights,validation_data=(X_val_seq, y_val_seq), verbose=1)

Epoch 1/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - f1_score: 0.4938 - loss: 0.0144 - val_f1_score: 0.0000e+00 - val_loss: 0.0314
Epoch 2/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - f1_score: 0.6177 - loss: 0.0055 - val_f1_score: 0.0000e+00 - val_loss: 0.0836
Epoch 3/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - f1_score: 3.1739 - loss: 0.0042 - val_f1_score: 0.0000e+00 - val_loss: 0.0749
Epoch 4/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - f1_score: 3.7177 - loss: 0.0027 - val_f1_score: 0.0000e+00 - val_loss: 0.1570
Epoch 5/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - f1_score: 3.5676 - loss: 0.0024 - val_f1_score: 0.0000e+00 - val_loss: 0.5923
Epoch 6/20
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - f1_score: 3.6132 - loss: 0.0023 - val_f1_score: 0.0000e+00 - val_loss: 0.9581
Epoch 7/20


In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_seq_scaled)
y_pred = (y_pred > 0.5)  # Convert probabilities to binary classification

# Evaluate performance

print("Test Accuracy:", accuracy_score(y_test_seq, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_seq, y_pred))
print("Classification Report:\n", classification_report(y_test_seq, y_pred))



[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Test Accuracy: 0.9153952843273232
Confusion Matrix:
 [[660   0]
 [ 61   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96       660
           1       0.00      0.00      0.00        61

    accuracy                           0.92       721
   macro avg       0.46      0.50      0.48       721
weighted avg       0.84      0.92      0.87       721



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Isolation Forest  

In [None]:
from sklearn.ensemble import IsolationForest

# Fit the Isolation Forest model on the training data
model = IsolationForest(contamination=0.15)  # contamination is the expected proportion of anomalies
model.fit(X_train_seq_scaled.reshape(X_train_seq_scaled.shape[0], -1))  # Flattening the time-series data
y_pred = model.predict(X_test_seq_scaled.reshape(X_test_seq_scaled.shape[0], -1))

# Convert predictions: 1 for normal, -1 for anomalies (flooded)
y_pred = [1 if i == 1 else 0 for i in y_pred]  # Convert to 0 (non-flooded) and 1 (flooded)


In [None]:
print("Test Accuracy:", accuracy_score(y_test_seq, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_seq, y_pred))
print("Classification Report:\n", classification_report(y_test_seq, y_pred))

Test Accuracy: 0.30578512396694213
Confusion Matrix:
 [[184 481]
 [ 23  38]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.28      0.42       665
           1       0.07      0.62      0.13        61

    accuracy                           0.31       726
   macro avg       0.48      0.45      0.28       726
weighted avg       0.82      0.31      0.40       726



### SVM

In [None]:
from sklearn.svm import OneClassSVM

# Fit the One-Class SVM model on the training data
model = OneClassSVM(nu=0.1, kernel='rbf', gamma='scale')
model.fit(X_train_seq_scaled.reshape(X_train_seq_scaled.shape[0], -1))
y_pred = model.predict(X_test_seq_scaled.reshape(X_test_seq_scaled.shape[0], -1))

# Convert predictions: 1 for normal, -1 for anomalies (flooded)
y_pred = [1 if i == 1 else 0 for i in y_pred]  # Convert to 0 (non-flooded) and 1 (flooded)

In [None]:
print("Test Accuracy:", accuracy_score(y_test_seq, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_seq, y_pred))
print("Classification Report:\n", classification_report(y_test_seq, y_pred))

Test Accuracy: 0.6101928374655647
Confusion Matrix:
 [[434 231]
 [ 52   9]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.65      0.75       665
           1       0.04      0.15      0.06        61

    accuracy                           0.61       726
   macro avg       0.47      0.40      0.41       726
weighted avg       0.82      0.61      0.70       726



### LOF

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Fit the LOF model on the training data
model = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = model.fit_predict(X_test_seq_scaled.reshape(X_test_seq_scaled.shape[0], -1))

# Convert predictions: 1 for normal, -1 for anomalies (flooded)
y_pred = [1 if i == 1 else 0 for i in y_pred]  # Convert to 0 (non-flooded) and 1 (flooded)


In [None]:
print("Test Accuracy:", accuracy_score(y_test_seq, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test_seq, y_pred))
print("Classification Report:\n", classification_report(y_test_seq, y_pred))

Test Accuracy: 0.1790633608815427
Confusion Matrix:
 [[ 71 594]
 [  2  59]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.11      0.19       665
           1       0.09      0.97      0.17        61

    accuracy                           0.18       726
   macro avg       0.53      0.54      0.18       726
weighted avg       0.90      0.18      0.19       726

