<a href="https://colab.research.google.com/github/marwahfaraj/Smart-Water-Quality-Monitoring-Using-IoT-and-Machine-Learning/blob/suryaWaterPrediction/notebooks/04_water_quality_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Water Quality Predection Model

## AAI-530 Final Project - Machine Learning Method 3

This notebook implements a machine learning predection model to predict water quality status (Safe/Warning/Unsafe) based on sensor readings.

**Objective**: Predict water quality status using multiple sensor inputs




In [1]:
!git clone https://github.com/marwahfaraj/Smart-Water-Quality-Monitoring-Using-IoT-and-Machine-Learning.git

Cloning into 'Smart-Water-Quality-Monitoring-Using-IoT-and-Machine-Learning'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 72 (delta 11), reused 59 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (72/72), 31.01 MiB | 15.54 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [60]:
# Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import os
import warnings

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from tensorflow.keras.layers import GRU, Input, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

warnings.filterwarnings('ignore')
np.random.seed(42)

print("Libraries loaded successfully!")


Libraries loaded successfully!


## 1. Load and Prepare Data

In [26]:
# Load raw data and preprocess (if processed data not available)
DATA_DIR = "/content/Smart-Water-Quality-Monitoring-Using-IoT-and-Machine-Learning/outputs/water_quality_processed.csv"
# DATA_DIR = '../outputs/water_quality_processed.csv'

# Load data
df = pd.read_csv(DATA_DIR)

print(f"Loaded {len(df):,} records from {df['Station'].nunique()} stations")
print(f"Date range: {df['Timestamp'].min()} to {df['Timestamp'].max()}\n")

print(df.head())

Loaded 295,754 records from 11 stations
Date range: 2016-03-01 00:00:00 to 2020-04-01 23:00:00

             Timestamp   Q  Conductivity  NO3  Temp  Turbidity  Level  \
0  2017-05-11 14:00:00 NaN      13502.10  NaN   NaN      12.79    NaN   
1  2017-05-11 15:00:00 NaN      10304.00  NaN   NaN      11.93    NaN   
2  2017-05-11 16:00:00 NaN       5588.08  NaN   NaN      10.34    NaN   
3  2017-05-11 17:00:00 NaN      13937.00  NaN   NaN      24.02    NaN   
4  2017-05-11 18:00:00 NaN      44761.40  NaN   NaN      18.46    NaN   

   Dayofweek  Month                         Station  ...  \
0          3      5  Johnstone River Coquette Point  ...   
1          3      5  Johnstone River Coquette Point  ...   
2          3      5  Johnstone River Coquette Point  ...   
3          3      5  Johnstone River Coquette Point  ...   
4          3      5  Johnstone River Coquette Point  ...   

   Turbidity_rolling_mean_12h Turbidity_rolling_std_12h  \
0                   12.790000                

In [27]:
df.columns

Index(['Timestamp', 'Q', 'Conductivity', 'NO3', 'Temp', 'Turbidity', 'Level',
       'Dayofweek', 'Month', 'Station', 'Hour', 'Quality_Status', 'Year',
       'DayOfWeek', 'DayOfMonth', 'WeekOfYear', 'IsWeekend', 'Hour_sin',
       'Hour_cos', 'Month_sin', 'Month_cos', 'Turbidity_lag_1h',
       'Turbidity_lag_6h', 'Turbidity_lag_12h', 'Turbidity_lag_24h',
       'Conductivity_lag_1h', 'Conductivity_lag_6h', 'Conductivity_lag_12h',
       'Conductivity_lag_24h', 'Turbidity_rolling_mean_6h',
       'Turbidity_rolling_std_6h', 'Turbidity_rolling_mean_12h',
       'Turbidity_rolling_std_12h', 'Turbidity_rolling_mean_24h',
       'Turbidity_rolling_std_24h', 'Conductivity_rolling_mean_6h',
       'Conductivity_rolling_std_6h', 'Conductivity_rolling_mean_12h',
       'Conductivity_rolling_std_12h', 'Conductivity_rolling_mean_24h',
       'Conductivity_rolling_std_24h'],
      dtype='object')

## 2. Feature Selection and Sequence Creation

In [28]:
target_col = "Turbidity"
drop_cols = [
    "Timestamp",
    "Quality_Status",     # leakage / label-like
    "Turbidity",          # target
    "Dayofweek",          # drop one of the duplicates
    "Month",              # you already have Month_sin/cos (optional)
    "Hour",               # you already have Hour_sin/cos (optional)
    "DayOfWeek",          # keep only one of Dayofweek/DayOfWeek (choose one)
]

feature_cols = [c for c in df.columns if c not in drop_cols]
print("Num candidate features:", len(feature_cols))
print(feature_cols)


Num candidate features: 34
['Q', 'Conductivity', 'NO3', 'Temp', 'Level', 'Station', 'Year', 'DayOfMonth', 'WeekOfYear', 'IsWeekend', 'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'Turbidity_lag_1h', 'Turbidity_lag_6h', 'Turbidity_lag_12h', 'Turbidity_lag_24h', 'Conductivity_lag_1h', 'Conductivity_lag_6h', 'Conductivity_lag_12h', 'Conductivity_lag_24h', 'Turbidity_rolling_mean_6h', 'Turbidity_rolling_std_6h', 'Turbidity_rolling_mean_12h', 'Turbidity_rolling_std_12h', 'Turbidity_rolling_mean_24h', 'Turbidity_rolling_std_24h', 'Conductivity_rolling_mean_6h', 'Conductivity_rolling_std_6h', 'Conductivity_rolling_mean_12h', 'Conductivity_rolling_std_12h', 'Conductivity_rolling_mean_24h', 'Conductivity_rolling_std_24h']


In [29]:
df = df.sort_values("Timestamp").reset_index(drop=True)

n = len(df)
train_end = int(n*0.7)
val_end = int(n*0.85)

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

In [34]:
# Cleaning the data set for NA values.
train_clean = train_df.dropna(subset=[target_col]).copy()

X_train = train_clean[feature_cols].copy()

# hot encoding the column values because some columns may contain
# string values, where RFR will accept only the numerical values.
X_train = pd.get_dummies(X_train, drop_first=True)
y_train = train_clean[target_col].values

rf = RandomForestRegressor(
    n_estimators=80,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rf.fit(X_train, y_train)

importances = pd.Series(rf.feature_importances_, index = X_train.columns).sort_values(ascending=False)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 10.8min finished


In [35]:
# Top 20 features
importances.head(20)


Unnamed: 0,0
Turbidity_rolling_mean_6h,0.742123
Turbidity_lag_1h,0.147415
Turbidity_rolling_std_6h,0.021037
Turbidity_lag_24h,0.01472
Q,0.012781
Turbidity_rolling_std_12h,0.009229
Level,0.005743
Turbidity_lag_12h,0.004128
Conductivity,0.004072
Conductivity_lag_1h,0.003897


In [36]:
top_k = 20
top_features = importances.head(top_k).index.tolist()
top_features

['Turbidity_rolling_mean_6h',
 'Turbidity_lag_1h',
 'Turbidity_rolling_std_6h',
 'Turbidity_lag_24h',
 'Q',
 'Turbidity_rolling_std_12h',
 'Level',
 'Turbidity_lag_12h',
 'Conductivity',
 'Conductivity_lag_1h',
 'Turbidity_lag_6h',
 'Turbidity_rolling_mean_12h',
 'Conductivity_rolling_std_6h',
 'Turbidity_rolling_std_24h',
 'Conductivity_lag_6h',
 'Conductivity_lag_12h',
 'Turbidity_rolling_mean_24h',
 'Conductivity_lag_24h',
 'Conductivity_rolling_std_12h',
 'Hour_cos']

In [37]:
# Remove redundant features

corr = train_df[top_features].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

to_drop = [col for col in upper.columns if any(upper[col]> 0.95)]
final_features = [f for f in top_features if f not in to_drop]

print("Dropped due to high correlation:", to_drop)
print("Final Features:", final_features)

Dropped due to high correlation: ['Turbidity_lag_1h', 'Conductivity_lag_1h', 'Turbidity_rolling_mean_12h', 'Turbidity_rolling_mean_24h', 'Conductivity_lag_24h']
Final Features: ['Turbidity_rolling_mean_6h', 'Turbidity_rolling_std_6h', 'Turbidity_lag_24h', 'Q', 'Turbidity_rolling_std_12h', 'Level', 'Turbidity_lag_12h', 'Conductivity', 'Turbidity_lag_6h', 'Conductivity_rolling_std_6h', 'Turbidity_rolling_std_24h', 'Conductivity_lag_6h', 'Conductivity_lag_12h', 'Conductivity_rolling_std_12h', 'Hour_cos']


In [41]:
print(len(top_features), "->", len(final_features))
print("Any non-numeric in final_features?",
      train_df[final_features].select_dtypes(include=["object"]).columns.tolist())

20 -> 15
Any non-numeric in final_features? []


## 3. Sequence Creation

In [46]:
X = df[final_features]
y = df[target_col]

valid_idx = X.notna().all(axis=1) & y.notna()
X = X.loc[valid_idx]
y = y.loc[valid_idx]

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

In [47]:
def create_sequences(X, y, seq_len):
  Xs,ys = [],[]
  for i in range(len(X) - seq_len):
    Xs.append(X[i:i+seq_len])
    ys.append(y[i+seq_len])
  return np.array(Xs), np.array(ys)

SEQUENCE_LENGTH = 24

X_seq, y_seq = create_sequences(X_scaled, y_scaled, SEQUENCE_LENGTH)

print(X_seq.shape)
print(y_seq.shape)

(80203, 24, 15)
(80203, 1)


In [50]:
X_train, y_train = X_seq[:train_end], y_seq[:train_end]
X_val, y_val = X_seq[train_end:val_end], y_seq[train_end:val_end]
X_test, y_test = X_seq[val_end:], y_seq[val_end:]

print("Train:", X_train.shape, y_train.shape)
print("Val  :", X_val.shape, y_val.shape)
print("Test :", X_test.shape, y_test.shape)

Train: (80203, 24, 15) (80203, 1)
Val  : (0, 24, 15) (0, 1)
Test : (0, 24, 15) (0, 1)


## 4. GRU Model

In [61]:
def build_gru_model(sequence_length, n_features, units_1=64, units_2=32, dropout_rate=0.2, lr=0.001):
    """
    GRU model for time series regression (predict Turbidity).
    Input shape: (sequence_length, n_features)
    Output: single value (next turbidity)
    """
    model = Sequential([
        Input(shape=(sequence_length, n_features)),

        GRU(units_1, return_sequences=True),
        Dropout(dropout_rate),

        GRU(units_2, return_sequences=False),
        Dropout(dropout_rate),

        Dense(16, activation='relu'),
        Dense(1)  # regression output
    ])

    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss='mse',
        metrics=['mae']
    )
    return model


In [62]:
BATCH_SIZE = 64
EPOCHS = 50

callbacks = [
    EarlyStopping(
        monitor="val_loss",
        patience=8,
        restore_best_weights=True
    ),
    ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=3,
        min_lr=1e-6,
        verbose=1
    )
]

gru_model = build_gru_model(SEQUENCE_LENGTH, X_train.shape[2], units_1=64, units_2=32, dropout_rate=0.2, lr=0.001)

gru_model.summary()

gru_history = gru_model.fit(
    X_train, y_train,
    epochs = EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1
)

Epoch 1/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step - loss: 0.0037 - mae: 0.0395 - learning_rate: 0.0010
Epoch 2/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 16ms/step - loss: 0.0030 - mae: 0.0347 - learning_rate: 0.0010
Epoch 3/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 0.0031 - mae: 0.0347 - learning_rate: 0.0010
Epoch 4/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 0.0031 - mae: 0.0344 - learning_rate: 0.0010
Epoch 5/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 0.0030 - mae: 0.0342 - learning_rate: 0.0010
Epoch 6/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - loss: 0.0030 - mae: 0.0337 - learning_rate: 0.0010
Epoch 7/50
[1m1254/1254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - loss: 0.0030 - mae: 0.0338 - learning_rate: 0.0010
Epoc

In [63]:
# ===== Evaluate GRU (same inverse scaling approach as your LSTM) =====
y_pred_scaled_gru = gru_model.predict(X_test, verbose=0)

y_pred_gru = inverse_transform_target(
    y_pred_scaled_gru, scaler, target_idx, len(available_features)
)

y_test_original = inverse_transform_target(
    y_test.reshape(-1, 1), scaler, target_idx, len(available_features)
)

mse_gru = mean_squared_error(y_test_original, y_pred_gru)
mae_gru = mean_absolute_error(y_test_original, y_pred_gru)
r2_gru = r2_score(y_test_original, y_pred_gru)

print(f"GRU Test MSE: {mse_gru:.4f}")
print(f"GRU Test MAE: {mae_gru:.4f}")
print(f"GRU Test R² : {r2_gru:.4f}")

UnboundLocalError: cannot access local variable 'batch_outputs' where it is not associated with a value