# Water Quality Classification Model

## AAI-530 Final Project - Machine Learning Method 2

This notebook implements a machine learning classification model to predict water quality status (Safe/Warning/Unsafe) based on sensor readings.

**Objective**: Classify water quality status using multiple sensor inputs

**Why Classification?**
- Automated alerting system for water quality issues
- Helps stakeholders make quick decisions
- Can trigger real-time notifications in IoT systems

**Model**: Random Forest Classifier
- Handles non-linear relationships well
- Provides feature importance insights
- Robust to outliers (rows with missing values are dropped before training in this notebook)
- Different target variable from LSTM (status vs. turbidity value)


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, 
                            accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, roc_curve)
import joblib

# Silence every warning category for the rest of the session; this also hides
# any diagnostics emitted by the libraries imported above.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator so NumPy-based randomness is repeatable
np.random.seed(42)

print("Libraries loaded successfully!")


## 1. Load and Prepare Data


In [None]:
# Load raw data
# Directory (relative to the working directory) that is scanned for the
# per-station .csv files
DATA_DIR = '../archive'

def load_all_stations(data_dir):
    """Load and combine data from all monitoring stations.

    Every file in ``data_dir`` whose name ends with ``.csv`` is read, its
    ``Timestamp`` column is parsed to datetimes, and a ``Station`` column is
    added. The station name is derived from the file name: the ``.csv``
    extension and an optional ``_joined`` suffix are dropped, underscores
    become spaces, and the result is title-cased.

    Args:
        data_dir: Path of the directory containing the station CSV files.

    Returns:
        One DataFrame holding the rows of all files, re-indexed from 0.
        Files are processed in sorted name order so that the row order (and
        therefore any seeded split performed later) is reproducible.

    Raises:
        ValueError: If ``data_dir`` contains no ``.csv`` files.
    """
    csv_suffix = '.csv'
    joined_suffix = '_joined'

    # os.listdir returns entries in arbitrary order; sort for determinism
    csv_files = sorted(
        name for name in os.listdir(data_dir) if name.endswith(csv_suffix)
    )
    if not csv_files:
        raise ValueError(f"No CSV files found in {data_dir!r}")

    all_data = []
    for filename in csv_files:
        # Strip the extension first so files without '_joined' do not end up
        # with a trailing '.Csv' in the station name
        stem = filename[:-len(csv_suffix)]
        if stem.endswith(joined_suffix):
            stem = stem[:-len(joined_suffix)]
        station_name = stem.replace('_', ' ').title()

        df = pd.read_csv(os.path.join(data_dir, filename))
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df['Station'] = station_name
        all_data.append(df)
    return pd.concat(all_data, ignore_index=True)

# Read every station file under DATA_DIR into one combined frame
df = load_all_stations(DATA_DIR)

# Report how much data was loaded and which columns it carries
record_count = len(df)
station_count = df['Station'].nunique()
column_names = list(df.columns)
print(f"Loaded {record_count:,} records from {station_count} stations")
print(f"\nAvailable columns: {column_names}")


In [None]:
# Create water quality classification labels
def classify_water_quality(row):
    """
    Assign a water quality status to one record of sensor readings.

    The thresholds are described by the notebook as based on Australian water
    quality guidelines. Each reading is graded independently; a reading that
    is absent or NaN is skipped and contributes nothing to the result.

    Limits applied (strict comparisons, so a value exactly on a limit falls
    into the milder band):
    - Turbidity: Unsafe above 50 NTU, Warning above 5 NTU
    - Conductivity: Unsafe above 50000 µS/cm, Warning above 30000 µS/cm
    - Temp: Unsafe below 5°C or above 35°C, Warning below 10°C or above 30°C

    Args:
        row: Record supporting ``.get`` (e.g. a DataFrame row or a dict).

    Returns: 'Unsafe' if any reading is unsafe, otherwise 'Warning' if any
    reading is in a warning band, otherwise 'Safe'.
    """
    def _grade(column, is_unsafe, is_warning):
        # Grade a single reading; None means "nothing to report"
        reading = row.get(column)
        if not pd.notna(reading):
            return None
        if is_unsafe(reading):
            return 'Unsafe'
        if is_warning(reading):
            return 'Warning'
        return None

    grades = {
        _grade('Turbidity', lambda v: v > 50, lambda v: v > 5),
        _grade('Conductivity', lambda v: v > 50000, lambda v: v > 30000),
        _grade('Temp', lambda v: v < 5 or v > 35, lambda v: v < 10 or v > 30),
    }

    # The most severe grade across all readings wins
    if 'Unsafe' in grades:
        return 'Unsafe'
    if 'Warning' in grades:
        return 'Warning'
    return 'Safe'

# Label every record by running the row-wise classifier over the frame
print("Creating water quality labels...")
df['Quality_Status'] = df.apply(classify_water_quality, axis=1)

# Absolute and relative frequency of each status
status_counts = df['Quality_Status'].value_counts()
status_share = df['Quality_Status'].value_counts(normalize=True)

print("\nWater Quality Distribution:")
print(status_counts)
print("\nPercentages:")
print((status_share * 100).round(2))


In [None]:
# Feature engineering for classification
# Calendar components taken from each record's timestamp
stamp = df['Timestamp'].dt
df['Hour'] = stamp.hour
df['DayOfWeek'] = stamp.dayofweek
df['Month'] = stamp.month
df['Year'] = stamp.year
# Flag day-of-week codes 5 and 6 as weekend
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)

# Cyclical encoding: project hour (period 24) and month (period 12) onto a
# circle so the ends of each cycle sit next to each other
for source, period, sin_col, cos_col in (
    ('Hour', 24, 'Hour_sin', 'Hour_cos'),
    ('Month', 12, 'Month_sin', 'Month_cos'),
):
    angle = 2 * np.pi * df[source] / period
    df[sin_col] = np.sin(angle)
    df[cos_col] = np.cos(angle)

print("Added time-based features!")
print(f"Dataset shape: {df.shape}")


## 2. Prepare Features for Classification


In [None]:
# Select features for classification
# Candidates are the sensor readings plus the engineered time features.
# NOTE(review): Turbidity, Conductivity and Temp are the same readings the
# Quality_Status labels were thresholded from, so the classifier can recover
# the labelling rules directly from these inputs.
potential_features = [
    'Conductivity', 'NO3', 'Temp', 'Turbidity', 'Level', 'Q',
    'Hour_sin', 'Hour_cos', 'Month_sin', 'Month_cos', 'IsWeekend',
]

# Keep a candidate only if the column exists and more than 10% of it is populated
feature_cols = []
total_rows = len(df)
for name in potential_features:
    if name not in df.columns:
        continue
    non_null_pct = df[name].notna().sum() / total_rows * 100
    if non_null_pct > 10:
        feature_cols.append(name)
        print(f"  {name}: {non_null_pct:.1f}% non-null")

print(f"\nSelected {len(feature_cols)} features for classification")


In [None]:
# Create feature matrix and target vector
# Restrict to the modelling columns and discard any row with a missing value
modeling_columns = feature_cols + ['Quality_Status']
df_model = df[modeling_columns].dropna()

print(f"Dataset for modeling: {len(df_model):,} samples")

# Features and target as plain arrays
X = df_model[feature_cols].values
y = df_model['Quality_Status'].values

# Map the string status labels to integer codes
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

class_mapping = {name: code for code, name in enumerate(label_encoder.classes_)}
print(f"\nClass mapping: {class_mapping}")

# Per-class sample counts and share of the modelling dataset
print("\nClass distribution in final dataset:")
codes, counts = np.unique(y_encoded, return_counts=True)
total = len(y_encoded)
for name, count in zip(label_encoder.inverse_transform(codes), counts):
    print(f"  {name}: {count:,} ({count/total*100:.1f}%)")


In [None]:
# Split data into train and test sets
# 80/20 split with a fixed seed, stratified on the encoded labels to maintain
# the class distribution in both parts
split_kwargs = {'test_size': 0.2, 'random_state': 42, 'stratify': y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_kwargs)

# Scale features: the scaler is fitted on the training part only and then
# applied to both parts
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print("Data Split Summary:")
print(f"  Training set: {n_train:,} samples")
print(f"  Test set: {n_test:,} samples")


## 3. Train Random Forest Classifier


In [None]:
# Build and train Random Forest Classifier
rf_params = {
    'n_estimators': 100,          # Number of trees
    'max_depth': 15,              # Maximum depth of trees
    'min_samples_split': 5,       # Minimum samples to split node
    'min_samples_leaf': 2,        # Minimum samples in leaf node
    'class_weight': 'balanced',   # Handle class imbalance
    'random_state': 42,
    'n_jobs': -1,                 # Use all CPU cores
}
rf_model = RandomForestClassifier(**rf_params)

print("Training Random Forest Classifier...")
rf_model.fit(X_train_scaled, y_train)
print("Training completed!")

# 5-fold cross-validated accuracy on the training set
cv_scores = cross_val_score(rf_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # reported as two standard deviations
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")


## 4. Model Evaluation


In [None]:
# Make predictions on test set (hard labels and per-class probabilities)
y_pred = rf_model.predict(X_test_scaled)
y_pred_proba = rf_model.predict_proba(X_test_scaled)

# Headline metrics; precision, recall and F1 use the 'weighted' average
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

rule = "=" * 50
print("Classification Metrics (Test Set):")
print(rule)
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1-Score:  {f1:.4f}")
print(rule)

# Detailed classification report, with rows named after the original labels
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("\nDetailed Classification Report:")
print(report)


In [None]:
# Confusion Matrix
# Make sure the figure's destination exists before saving to it
os.makedirs('../outputs', exist_ok=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Raw counts
# Pin the label order to the encoder's codes so every known class gets a
# row/column; otherwise the matrix shrinks when a class is missing from both
# y_test and y_pred and no longer lines up with the tick labels below.
class_codes = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=class_codes)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
axes[0].set_xlabel('Predicted', fontsize=12)
axes[0].set_ylabel('Actual', fontsize=12)
axes[0].set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold')

# Normalized: each row divided by its actual-class total. Rows without any
# samples are left at 0 instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(cm.astype('float'), row_totals,
                          out=np.zeros(cm.shape, dtype=float),
                          where=row_totals != 0)
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues', ax=axes[1],
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
axes[1].set_xlabel('Predicted', fontsize=12)
axes[1].set_ylabel('Actual', fontsize=12)
axes[1].set_title('Confusion Matrix (Normalized)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/classification_confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Feature Importance
# Make sure the figure's destination exists before saving to it (a missing
# directory would otherwise make savefig fail after the plot is built)
os.makedirs('../outputs', exist_ok=True)

# One row per model input, sorted ascending so the largest bar is drawn on top
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
# Colour each bar by its importance relative to the most important feature
colors = plt.cm.RdYlGn(feature_importance['Importance'] / feature_importance['Importance'].max())
bars = ax.barh(feature_importance['Feature'], feature_importance['Importance'], color=colors)
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Feature Importance for Water Quality Classification', fontsize=14, fontweight='bold')

# Add value labels just to the right of each bar
for bar, imp in zip(bars, feature_importance['Importance']):
    ax.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2, 
            f'{imp:.3f}', va='center', fontsize=10)

plt.tight_layout()
plt.savefig('../outputs/classification_feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

# The frame is sorted ascending, so the last five rows reversed are the top five
print("\nTop 5 Most Important Features:")
for _, row in feature_importance.tail(5).iloc[::-1].iterrows():
    print(f"  {row['Feature']}: {row['Importance']:.4f}")


## 5. Export Results for Dashboard


In [None]:
# Create classification results dataframe for Tableau dashboard
# Make sure the export directory exists before writing to it
os.makedirs('../outputs', exist_ok=True)

# predict_proba columns follow rf_model.classes_ (the encoded labels the model
# was fitted on), so look columns up through it rather than assuming the
# encoded value equals the column position.
proba_column_of = {code: idx for idx, code in enumerate(rf_model.classes_)}


def _class_probability(class_name):
    """Return the predicted-probability column for class_name, or 0.0 if the class is absent."""
    if class_name not in label_encoder.classes_:
        return 0.0
    code = label_encoder.transform([class_name])[0]
    column = proba_column_of.get(code)
    return y_pred_proba[:, column] if column is not None else 0.0


# All three statuses get the same absent-class handling; previously only
# 'Unsafe' was guarded and a missing 'Safe' or 'Warning' class raised.
classification_results = pd.DataFrame({
    'Actual_Class': label_encoder.inverse_transform(y_test),
    'Predicted_Class': label_encoder.inverse_transform(y_pred),
    'Safe_Probability': _class_probability('Safe'),
    'Warning_Probability': _class_probability('Warning'),
    'Unsafe_Probability': _class_probability('Unsafe'),
})

# Add feature values for context (unscaled test-set values, one column per feature)
for i, col in enumerate(feature_cols):
    classification_results[col] = X_test[:, i]

# Add correct/incorrect flag (1 when the prediction matches the actual class)
classification_results['Correct_Prediction'] = (
    classification_results['Actual_Class'] == classification_results['Predicted_Class']
).astype(int)

# Save to CSV
classification_results.to_csv('../outputs/classification_results.csv', index=False)
print("Saved classification results to: ../outputs/classification_results.csv")
print(f"Results shape: {classification_results.shape}")
classification_results.head(10)


In [None]:
# Create summary metrics for dashboard
# Make sure the export directory exists before writing to it
os.makedirs('../outputs', exist_ok=True)


def _predicted_count(class_name):
    """Return how many test samples were predicted as class_name (0 if the class is absent)."""
    if class_name not in label_encoder.classes_:
        return 0
    return (y_pred == label_encoder.transform([class_name])[0]).sum()


# The three "Count" rows tally model predictions on the test set, not actual
# labels. Every status uses the same absent-class guard; previously only
# 'Unsafe' was guarded and a missing 'Safe' or 'Warning' class raised.
summary_metrics = {
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Total Samples', 
               'Safe Count', 'Warning Count', 'Unsafe Count'],
    'Value': [accuracy, precision, recall, f1, len(y_test),
              _predicted_count('Safe'),
              _predicted_count('Warning'),
              _predicted_count('Unsafe')]
}
summary_df = pd.DataFrame(summary_metrics)
summary_df.to_csv('../outputs/classification_summary.csv', index=False)
print("Saved classification summary to: ../outputs/classification_summary.csv")
summary_df


In [None]:
# Save the trained model
# Make sure the target directory exists; joblib.dump does not create it
os.makedirs('../models', exist_ok=True)

# Persist the classifier together with the fitted scaler and label encoder so
# the same preprocessing can be reapplied when the model is loaded
joblib.dump(rf_model, '../models/random_forest_classifier.joblib')
joblib.dump(scaler, '../models/scaler.joblib')
joblib.dump(label_encoder, '../models/label_encoder.joblib')
print("Saved model and preprocessing objects to ../models/")
