In [4]:
import pandas as pd
import sys, os

# Add project root to sys.path so we can import src/*
sys.path.append(os.path.abspath(".."))
from src.ml.anomaly import AnomalyDetector

# Load the cleaned building-replay dataset (Parquet).
df = pd.read_parquet("../data/processed/building_replay")

# Initialize detector
detector = AnomalyDetector(window_size=50, contamination=0.02)

# Numeric sensor columns that are cleaned and validated below.
features = ["temp", "humidity", "co2", "light", "movement"]

# Encode the movement flag as 1/0.
# Besides "true"/"false" we also accept the numeric spellings ("1", "0",
# "1.0", "0.0") and ignore surrounding whitespace: Series.map() turns every
# key it does not know into NaN, and those rows are dropped further down, so
# an already-numeric column would otherwise silently wipe out the dataset.
movement_codes = {"true": 1, "false": 0, "1": 1, "0": 0, "1.0": 1, "0.0": 0}
df["movement"] = (
    df["movement"].astype(str).str.strip().str.lower().map(movement_codes)
)

# Convert all feature columns to numeric (in case they are strings);
# anything unparseable becomes NaN because of errors="coerce".
df[features] = df[features].apply(pd.to_numeric, errors="coerce")

# Drop rows with a missing value in any feature column, then pin movement to
# an integer dtype: map() yields float64 as soon as one value was unmapped,
# and after dropna() the column only holds whole 0/1 values, so the cast is
# lossless and keeps the dtype independent of the input data.
df = df.dropna(subset=features).astype({"movement": "int64"})

# Double check types
print(df[features].dtypes)
print(f"✅ Cleaned dataset ready with {len(df)} samples")

# Fit the detector on the cleaned frame.
detector.fit(df)

# Detect anomalies
df_anomalies = detector.detect(df)

# View the first ten rows flagged as anomalous (anomaly == 1).
print(df_anomalies[df_anomalies["anomaly"] == 1].head(10))

# Summary by room
summary = detector.get_anomaly_summary(df_anomalies)
print(summary.head(10))

# Optional: Save (the DataFrame index is not written because index=False).
df_anomalies.to_csv("../data/processed/anomaly_detected.csv", index=False)


info
temp        float64
humidity    float64
co2         float64
light       float64
movement      int64
dtype: object
✅ Cleaned dataset ready with 2010077 samples


  df = df.dropna(subset=["date_time", "room_id"]).fillna(method="ffill")


✅ Model trained on 2010077 samples and 5 features.


  df = df.dropna(subset=["date_time", "room_id"]).fillna(method="ffill")
  X = df[self.features].fillna(method="ffill")


info             date_time   co2  humidity  light  movement    nodeId  \
index                                                                   
3735   2019-02-09 17:32:06  55.0      67.0    0.0         1  52ABEF43   
3829   2019-02-09 17:33:10  55.0      67.0    0.0         1  52ABEF43   
3931   2019-02-09 17:34:14  55.0      67.0    0.0         1  52ABEF43   
13237  2019-02-09 19:12:41  55.0      66.0    0.0         1  52ABEF43   
13341  2019-02-09 19:13:45  55.0      66.0    0.0         1  52ABEF43   
13438  2019-02-09 19:14:49  55.0      66.0    0.0         1  52ABEF43   
13541  2019-02-09 19:15:54  55.0      66.0    0.0         1  52ABEF43   
19980  2019-02-09 20:23:19  55.0      65.0    0.0         1  52ABEF43   
20083  2019-02-09 20:24:23  55.0      65.0    0.0         1  52ABEF43   
20189  2019-02-09 20:25:27  55.0      65.0    0.0         1  52ABEF43   

info  pressure roomNum   rssi   snr   temp voltage room_id  co2_z    temp_z  \
index                                       

  df.groupby("room_id")["anomaly"]
