# Phase 6 - Step 1: Generate Water Filter Dataset

## What We're Building

A **Predictive Maintenance System** for Home RO Water Filters.

We'll generate 10,000 simulated sensor readings from 200 water filters (50 readings per filter, roughly one per week) covering about 12 months.

### Why Generate Data?
In a real project, this data would come from IoT sensors. Since we don't have actual sensors, we'll simulate the readings and build in realistic correlations between features (for example, older filters produce higher output TDS and lower flow rates).

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# Seed NumPy's global random generator. The cells below draw all of their
# randomness through np.random.*, so re-running this cell first and then the
# remaining cells in order reproduces the same dataset.
np.random.seed(42)  # Reproducible results

In [None]:
# --- Dataset dimensions ---
NUM_FILTERS = 200
READINGS_PER_FILTER = 50  # about 50 readings per filter across the year
TOTAL_RECORDS = NUM_FILTERS * READINGS_PER_FILTER  # 200 * 50 = 10,000

# --- Regions ---
# Base input TDS for each region; input water quality differs by region.
# The key order here is also the order the sampling weights below refer to.
region_tds = dict(North=350, South=500, East=650, West=250)

# Share of filters installed in each region (North, South, East, West).
region_share = [0.3, 0.25, 0.25, 0.2]

# Draw one region label per filter.
regions = np.random.choice(list(region_tds), size=NUM_FILTERS, p=region_share)

print(f"Generating {TOTAL_RECORDS} records for {NUM_FILTERS} filters...")

In [None]:
# Simulate READINGS_PER_FILTER readings for each of the NUM_FILTERS filters
# and collect them into `df` (one row per reading).
#
# Invariants the simulation maintains for every filter:
#   * filter_age_days and reading_date advance by the same number of days,
#     and no reading is dated before START_DATE.
#   * total_usage_liters is cumulative, so it never decreases over time.
#   * pressure_psi fluctuates around a per-filter baseline.
START_DATE = datetime(2025, 1, 1)
DAYS_BETWEEN_READINGS = 7
SEDIMENT_CYCLE_DAYS = 120  # sediment filter age wraps back to 0 after this

records = []

for i in range(NUM_FILTERS):
    filter_id = f'WF{i + 1:04d}'
    region = regions[i]
    base_tds_input = region_tds[region]

    # Each filter starts at a random age (some are new, some are old)
    initial_age = np.random.randint(0, 200)

    # Per-filter baseline pressure. Baseline spread (sd 8) plus per-reading
    # noise (sd 6, below) gives an overall spread of sqrt(8^2 + 6^2) = 10 psi.
    base_pressure_psi = 45 + np.random.randn() * 8

    # Running cumulative usage and the age at the previous reading
    total_usage = 0.0
    prev_age_days = 0

    for j in range(READINGS_PER_FILTER):
        # Days since START_DATE for this reading: weekly with +/-2 days of
        # jitter, clamped so the first reading cannot fall before START_DATE.
        # The same offset drives both the date and the filter age so the two
        # columns stay consistent with each other.
        day_offset = max(0, j * DAYS_BETWEEN_READINGS + np.random.randint(-2, 3))
        filter_age_days = initial_age + day_offset
        reading_date = START_DATE + timedelta(days=int(day_offset))

        # Temperature varies by season (month): peaks in June, lowest in December
        month = reading_date.month
        seasonal_temp = 20 + 10 * np.sin((month - 3) * np.pi / 6)
        temperature_c = seasonal_temp + np.random.randn() * 3
        temperature_c = np.clip(temperature_c, 10, 40)

        # TDS input: regional base * seasonal multiplier + noise.
        # The multiplier peaks in September and is lowest in March
        # (intended to model higher TDS during monsoon).
        seasonal_factor = 1.0 + 0.15 * np.sin((month - 6) * np.pi / 6)
        tds_input = base_tds_input * seasonal_factor + np.random.randn() * 50
        tds_input = np.clip(tds_input, 100, 900)

        # Pressure: per-reading noise around this filter's baseline
        pressure_psi = base_pressure_psi + np.random.randn() * 6
        pressure_psi = np.clip(pressure_psi, 25, 75)

        # Daily usage (liters) at the time of this reading
        daily_usage = np.random.uniform(8, 40)
        # Cumulative usage: add the usage for the days elapsed since the
        # previous reading. For the first reading this covers the filter's
        # whole prior life at the current daily rate.
        total_usage += daily_usage * (filter_age_days - prev_age_days)
        prev_age_days = filter_age_days

        # === KEY RELATIONSHIPS ===

        # TDS output: depends heavily on filter age.
        # New filter: roughly 15-50 ppm, rising with age.
        age_factor = filter_age_days / 365  # 0 to ~1.5
        degradation = age_factor ** 1.5 * 80  # super-linear (power-law) growth
        tds_output = 25 + degradation + np.random.randn() * 8
        # Higher input TDS makes output worse too
        tds_output += (tds_input - 400) * 0.03
        # Output can never exceed 80% of the input
        tds_output = np.clip(tds_output, 10, tds_input * 0.8)

        # Flow rate: decreases with age (clogging)
        flow_rate = 2.2 - age_factor * 1.2 + np.random.randn() * 0.15
        # Low pressure also reduces flow
        flow_rate += (pressure_psi - 45) * 0.01
        flow_rate = np.clip(flow_rate, 0.2, 2.8)

        # Sediment filter age: restarts every SEDIMENT_CYCLE_DAYS days, with
        # symmetric +/-5 days of jitter (randint's upper bound is exclusive).
        sediment_filter_age = (
            filter_age_days % SEDIMENT_CYCLE_DAYS + np.random.randint(-5, 6)
        )
        sediment_filter_age = np.clip(sediment_filter_age, 0, 150)

        # Membrane status: deterministic when young, probabilistic afterwards.
        # str() converts NumPy's string scalar to a plain Python string.
        if filter_age_days < 150:
            membrane_status = 'good'
        elif filter_age_days < 280:
            membrane_status = str(
                np.random.choice(['good', 'degraded'], p=[0.6, 0.4])
            )
        else:
            membrane_status = str(
                np.random.choice(['degraded', 'needs_replacement'], p=[0.4, 0.6])
            )

        # TARGET: maintenance_needed
        # Additive score over the individual warning signs; the two TDS and
        # the two flow thresholds stack on purpose (worse reading, more points).
        maintenance_score = 0
        if tds_output > 80:
            maintenance_score += 2
        if tds_output > 120:
            maintenance_score += 2
        if flow_rate < 1.0:
            maintenance_score += 2
        if flow_rate < 0.7:
            maintenance_score += 1
        if filter_age_days > 300:
            maintenance_score += 1
        if membrane_status == 'needs_replacement':
            maintenance_score += 3
        elif membrane_status == 'degraded':
            maintenance_score += 1
        if sediment_filter_age > 100:
            maintenance_score += 1

        # Add some randomness so the label is not a pure function of features
        maintenance_score += np.random.randn() * 0.5
        maintenance_needed = 1 if maintenance_score >= 3 else 0

        # TDS alert: output exceeds the 100 ppm limit
        tds_alert = 1 if tds_output > 100 else 0

        records.append({
            'filter_id': filter_id,
            'region': region,
            'reading_date': reading_date.strftime('%Y-%m-%d'),
            'filter_age_days': int(filter_age_days),
            'tds_input': round(tds_input, 1),
            'tds_output': round(tds_output, 1),
            'flow_rate_lpm': round(flow_rate, 2),
            'pressure_psi': round(pressure_psi, 1),
            'temperature_c': round(temperature_c, 1),
            'daily_usage_liters': round(daily_usage, 1),
            'total_usage_liters': round(total_usage, 0),
            'sediment_filter_age_days': int(sediment_filter_age),
            'membrane_status': membrane_status,
            'maintenance_needed': maintenance_needed,
            'tds_alert': tds_alert,
        })

df = pd.DataFrame(records)
print(f"Generated {len(df)} records!")
df.head(10)

In [None]:
# Quick sanity check of the generated dataset: size, column types,
# how often each target flag is set, and summary statistics.
n_rows, n_cols = df.shape
maintenance_flags = df['maintenance_needed']
alert_flags = df['tds_alert']

print("=== Shape ===")
print(f"{n_rows} rows x {n_cols} columns")

print("\n=== Data Types ===")
print(df.dtypes)

# Both targets are 0/1 columns, so sum() is the count and mean() the rate.
print("\n=== Target Distribution ===")
print(f"Maintenance needed: {maintenance_flags.sum()} ({maintenance_flags.mean():.1%})")
print(f"TDS alerts: {alert_flags.sum()} ({alert_flags.mean():.1%})")

# Last expression of the cell, so the notebook renders the summary table.
print("\n=== Numeric Summary ===")
df.describe().round(1)

In [None]:
# Save the generated dataset to CSV for the ML project notebook.
import os

OUTPUT_PATH = '../data/water_filter_readings.csv'

# to_csv does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# index=False: the row index carries no information, keep it out of the file.
df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved to {OUTPUT_PATH}")
print("File ready for the ML project notebook!")