
# Anomaly Detection POC (Synthetic, Binder-ready) — hack2025

This notebook demonstrates anomaly detection on **synthetic time-series** data using:
- **Baseline**: Z-score thresholding
- **Model**: Isolation Forest (multivariate)
- *(Optional)* LSTM Autoencoder (installs `torch` on demand)

**Sections**
1. Setup
2. Generate synthetic time series
3. Visualize series & true anomalies
4. Baseline: Z-score thresholding
5. Isolation Forest (multivariate) + metrics & scores plot
6. Prepare data for Streamlit app (persist CSV)
7. Create the Streamlit app file
8. Launch Streamlit in Binder (proxied)


## 1) Setup

In [None]:

# Setup cell: all imports used by the cells below, plus one-time environment prep.
import os, warnings
# NOTE: this silences *every* warning category for the rest of the session,
# including deprecation/convergence warnings from numpy, pandas and scikit-learn.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Create the 'outputs' directory if it does not exist yet (no error if it does).
# NOTE(review): none of the cells visible here write into 'outputs' - confirm it is still needed.
os.makedirs('outputs', exist_ok=True)
print('Environment ready.')


## 2) Generate Synthetic Time Series

In [None]:

# Reproducible hourly timeline: 90 days starting 2024-01-01.
np.random.seed(42)
start = datetime(2024, 1, 1)
periods = 24 * 90  # 90 days hourly
index = [start + timedelta(hours=step) for step in range(periods)]

# Base signal = constant level + linear trend + 24-hour sine cycle + Gaussian noise.
hour_of_day = np.arange(periods) % 24
day_phase = 2 * np.pi * hour_of_day / 24
trend = np.linspace(0, 10, periods)
daily = 5 * np.sin(day_phase)
noise = np.random.normal(0, 0.8, periods)
value = 50 + trend + daily + noise

# Point anomalies: 25 distinct positions (away from both edges) receive a
# +/-15 spike with some extra jitter. The draw order (positions, sign, jitter)
# is what makes the seeded output reproducible, so it must not be reordered.
n_point_anomalies = 25
anomaly_idx = np.random.choice(np.arange(50, periods - 50), size=n_point_anomalies, replace=False)
spike_level = np.random.choice([15, -15], size=n_point_anomalies)
spike_jitter = np.random.normal(0, 3, n_point_anomalies)
value[anomaly_idx] += spike_level + spike_jitter

# Ground-truth labels: point anomalies plus the two contextual regime shifts
# (48-hour windows lifted by +8), labelled in the same loop that applies them.
labels = np.zeros(periods, dtype=int)
labels[anomaly_idx] = 1
for start_shift in (1000, 2000):
    shift_window = slice(start_shift, start_shift + 48)
    value[shift_window] += 8
    labels[shift_window] = 1

# Extra features: a daily temperature-like cycle and a noisy load that tracks
# the (already anomalous) value.
feature_temp = 20 + 10 * np.sin(day_phase) + np.random.normal(0, 1, periods)
feature_load = 0.3 * value + np.random.normal(0, 2, periods)

df = pd.DataFrame(dict(
    timestamp=index,
    value=value,
    is_anomaly=labels,
    feature_temp=feature_temp,
    feature_load=feature_load,
))

print(df.shape)
df.head()


## 3) Visualize Series & True Anomalies

In [None]:

# Draw the full series and overlay the ground-truth anomalies as red markers.
is_true_anomaly = df['is_anomaly'] == 1

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['timestamp'], df['value'], label='value')
ax.scatter(
    df.loc[is_true_anomaly, 'timestamp'],
    df.loc[is_true_anomaly, 'value'],
    s=18,
    color='red',
    label='true anomalies',
)
ax.set_title('Synthetic Time Series with True Anomalies')
ax.legend()
fig.tight_layout()
plt.show()


## 4) Baseline: Z-score Thresholding

In [None]:

# Baseline detector: standardize the raw value over the whole series and flag
# every point whose absolute z-score is strictly above a fixed cutoff.
y_true = df['is_anomaly'].to_numpy().astype(int)
series = df['value'].to_numpy().astype(float)

z_thresh = 3.0
z = StandardScaler().fit_transform(series.reshape(-1, 1)).ravel()
pred_baseline = (np.abs(z) > z_thresh).astype(int)

# Binary precision / recall / F1 against the synthetic labels
# (zero_division=0 yields 0 instead of a warning when nothing is predicted).
p_b, r_b, f_b, _ = precision_recall_fscore_support(
    y_true,
    pred_baseline,
    average='binary',
    zero_division=0,
)
print(dict(precision=p_b, recall=r_b, f1=f_b))


## 5) Isolation Forest (Multivariate) + Metrics & Score Plot

In [None]:

# Multivariate detector: Isolation Forest over the value and both extra features.
feature_cols = ['value', 'feature_temp', 'feature_load']
X = df[feature_cols].to_numpy()

# Expected anomaly share = labelled rate plus one percentage point, floored at 0.1%.
contam = max(1e-3, y_true.mean() + 0.01)
iso = IsolationForest(n_estimators=200, contamination=contam, random_state=42).fit(X)

# score_samples is higher for normal points; negate so larger = more anomalous.
scores = -iso.score_samples(X)

# Threshold using contamination percentile
cutoff_percentile = 100 - 100*contam
thresh = np.percentile(scores, cutoff_percentile)
pred_iso = (scores >= thresh).astype(int)

p_i, r_i, f_i, _ = precision_recall_fscore_support(
    y_true,
    pred_iso,
    average='binary',
    zero_division=0,
)
print(dict(precision=p_i, recall=r_i, f1=f_i))

# Plot scores
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['timestamp'], scores, label='IF anomaly score')
ax.axhline(thresh, color='orange', linestyle='--', label='threshold')
ax.set_title('Isolation Forest Anomaly Scores')
ax.legend()
fig.tight_layout()
plt.show()


### Optional: LSTM Autoencoder (on-demand)
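The cell below is fully commented out: uncomment its contents and run it if you want to try a simple LSTM autoencoder. It installs `torch` in-session (not required for the Binder build) and reuses `series`, `y_true`, and `contam` from sections 4 and 5, so run those cells first.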


In [None]:

# Optional LSTM autoencoder, disabled by default: every line below is commented
# out, so running this cell as-is does nothing.
# Prerequisites once uncommented: `series` and `y_true` (z-score cell) and
# `contam` (Isolation Forest cell) must already exist in the kernel.
# Uncomment to run (can take a few minutes on Binder)
# import sys
# try:
#     import torch, torch.nn as nn
# except Exception:
#     !{sys.executable} -m pip -q install torch
#     import torch, torch.nn as nn
# --- Sliding windows of 24 consecutive samples, normalized with the global
# --- mean/std of the whole window matrix (not per window).
# window = 24
# Xw = np.array([series[i:i+window] for i in range(len(series)-window)])
# Xw = (Xw - Xw.mean()) / (Xw.std() + 1e-6)
# Xw_t = torch.tensor(Xw, dtype=torch.float32).unsqueeze(-1)
# --- Model: the encoder's last time-step output is repeated across the window
# --- and fed to a decoder LSTM whose hidden size is 1 (one value per step).
# class LSTMAE(nn.Module):
#     def __init__(self, hidden=16):
#         super().__init__()
#         self.encoder = nn.LSTM(input_size=1, hidden_size=hidden, batch_first=True)
#         self.decoder = nn.LSTM(input_size=hidden, hidden_size=1, batch_first=True)
#     def forward(self, x):
#         z,_ = self.encoder(x)
#         z_last = z[:,-1,:].unsqueeze(1).repeat(1, x.size(1), 1)
#         out,_ = self.decoder(z_last)
#         return out
# --- Training: 20 full-batch Adam steps (all windows at once) on MSE
# --- reconstruction loss.
# model = LSTMAE(hidden=16)
# opt = torch.optim.Adam(model.parameters(), lr=1e-3)
# loss_fn = nn.MSELoss()
# model.train()
# for epoch in range(20):
#     opt.zero_grad(); out = model(Xw_t)
#     loss = loss_fn(out, Xw_t); loss.backward(); opt.step()
# --- Scoring: mean squared reconstruction error per window. The error of the
# --- window starting at i is stored at position i+window; the first `window`
# --- positions stay 0 and are excluded from the percentile threshold.
# model.eval()
# with torch.no_grad():
#     rec = model(Xw_t)
#     err = ((rec - Xw_t)**2).mean(dim=(1,2)).cpu().numpy()
# err_series = np.zeros_like(series); err_series[window:] = err
# e_thr = np.percentile(err_series[window:], 100 - 100*contam)
# pred_lstm = (err_series >= e_thr).astype(int)[:len(y_true)]
# p_l, r_l, f_l, _ = precision_recall_fscore_support(y_true, pred_lstm, average='binary', zero_division=0)
# print({'precision': p_l, 'recall': r_l, 'f1': f_l})


## 6) Prepare data for Streamlit app

In [None]:

# Persist the series together with both detector scores so the Streamlit app
# can load them from disk. Everything is recomputed from `df` alone, so this
# cell only requires the setup and data-generation cells to have run - not the
# z-score or Isolation Forest cells.
os.makedirs('data', exist_ok=True)

# Recompute z-score defensively
z_score = StandardScaler().fit_transform(df['value'].values.reshape(-1,1)).ravel()

# Recompute IF scores defensively. The contamination is derived here from the
# labels in `df` (same formula as the Isolation Forest cell) instead of reading
# the `contam` variable, which would raise NameError if that cell was skipped.
contam_ = max(1e-3, df['is_anomaly'].to_numpy().astype(int).mean() + 0.01)
X_ = df[['value', 'feature_temp', 'feature_load']].values
iso_ = IsolationForest(n_estimators=200, contamination=contam_, random_state=42)
iso_.fit(X_)
# score_samples is higher for normal points; negate so larger = more anomalous.
if_scores = -iso_.score_samples(X_)

# Persist (on a copy, so `df` itself is left untouched)
df_out = df.copy()
df_out['z_score'] = z_score
df_out['if_score'] = if_scores
csv_path = 'data/anomaly_results.csv'
df_out.to_csv(csv_path, index=False)
print('Saved', csv_path)


## 7) Create the Streamlit app

In [None]:

# Source code of the Streamlit dashboard, kept as a single triple-quoted literal
# so it is readable and cannot be broken by a stray line wrap. The app reads
# data/anomaly_results.csv, lets the user pick a detector and threshold in the
# sidebar, shows precision/recall/F1 against `is_anomaly`, and charts the
# series with predicted (orange) and true (red) anomalies.
# The two non-ASCII characters are written as \u escapes (em dash, >= sign).
STREAMLIT_APP_SOURCE = """
import streamlit as st
import pandas as pd
import numpy as np
import altair as alt
from sklearn.metrics import precision_recall_fscore_support

st.set_page_config(page_title='Anomaly Detection POC', layout='wide')
st.title('Anomaly Detection \u2014 Synthetic Time Series')

@st.cache_data
def load_data():
    return pd.read_csv('data/anomaly_results.csv', parse_dates=['timestamp'])

df = load_data()

with st.sidebar:
    st.header('Controls')
    model = st.selectbox('Model', ['Isolation Forest', 'Z-score'])
    if model == 'Isolation Forest':
        default_pct = float(max(0.1, (df['is_anomaly'].mean()+0.01)*100))
        pct = st.slider('Anomaly rate (percentile threshold)', min_value=0.1, max_value=20.0, value=round(default_pct,2), step=0.1)
        thr = np.percentile(df['if_score'], 100 - pct)
        pred = (df['if_score'] >= thr).astype(int)
        st.caption(f"IF threshold = {thr:.3f} (percentile {100-pct:.1f})")
    else:
        z_thr = st.slider('Z-score threshold (abs)', min_value=1.0, max_value=5.0, value=3.0, step=0.1)
        pred = (np.abs(df['z_score']) >= z_thr).astype(int)
        st.caption(f"Z-score |z| \u2265 {z_thr:.2f}")

    show_true = st.checkbox('Show true anomalies', value=True)

# Metrics
y_true = df['is_anomaly'].astype(int)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, pred, average='binary', zero_division=0)
col1, col2, col3 = st.columns(3)
col1.metric('Precision', f"{precision:.3f}")
col2.metric('Recall', f"{recall:.3f}")
col3.metric('F1', f"{f1:.3f}")

# Chart
plot_df = df.copy()
plot_df['predicted'] = pred
base = alt.Chart(plot_df).encode(x='timestamp:T')
line = base.mark_line(color='#1f77b4').encode(y='value:Q')
pred_chart = alt.Chart(plot_df[plot_df['predicted']==1]).mark_point(color='orange', size=30, opacity=0.75).encode(x='timestamp:T', y='value:Q')
layers = [line, pred_chart]
if show_true:
    true_chart = alt.Chart(plot_df[plot_df['is_anomaly']==1]).mark_point(color='red', size=50, opacity=0.75).encode(x='timestamp:T', y='value:Q')
    layers.append(true_chart)
chart = alt.layer(*layers).resolve_scale(y='shared').properties(height=420)
st.altair_chart(chart, use_container_width=True)

st.download_button('Download data (CSV)', data=df.to_csv(index=False), file_name='anomaly_results.csv', mime='text/csv')
"""

# Write as UTF-8 explicitly: the source contains non-ASCII characters, and
# Python reads .py files as UTF-8 by default, so relying on the platform's
# locale encoding can fail on write or produce an unreadable script.
with open('streamlit_app.py', 'w', encoding='utf-8') as f:
    f.write(STREAMLIT_APP_SOURCE)
print('Created streamlit_app.py')


## 8) Launch Streamlit (Binder)
Run the cell below, then open: **[/proxy/8501/](/proxy/8501/)**

> If the app returns 404 at first, wait ~10–20 seconds and refresh.


In [None]:

# Launch the Streamlit app as a background process of the notebook kernel and
# wait a few seconds so it has time to bind to port 8501.
import os, sys, subprocess, time

# If this cell was run before, stop the earlier server first; otherwise the new
# process would compete with it for port 8501.
_previous_proc = globals().get('proc')
if isinstance(_previous_proc, subprocess.Popen) and _previous_proc.poll() is None:
    print('Stopping previous Streamlit process...')
    _previous_proc.terminate()
    try:
        _previous_proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Did not exit after terminate(); force it.
        _previous_proc.kill()

print('Starting Streamlit on port 8501...')
# Use the kernel's own interpreter so the same environment (and its installed
# streamlit) is used; listen on all interfaces so the proxy can reach it.
proc = subprocess.Popen([sys.executable, '-m', 'streamlit', 'run', 'streamlit_app.py', '--server.port', '8501', '--server.address', '0.0.0.0'])
for i in range(10):
    time.sleep(1)
    if proc.poll() is not None:
        # The server already exited (e.g. streamlit missing or app file absent).
        break
    print('.', end='', flush=True)

if proc.poll() is None:
    print('\nOpen the app at /proxy/8501/')
else:
    print(f'\nStreamlit exited early with code {proc.returncode}; check the output above.')
