---
title: "Advanced Sensor Data Analysis"
format: html
---

In [None]:
# ─── 1. Imports ─────────────────────────────────────────────────
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from scipy.fft import fft, fftfreq
from scipy.signal import welch, butter, filtfilt
from scipy.stats import entropy as scipy_entropy
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
print('✓ All imports loaded.')

In [None]:
# ─── 2. Synthetic 16-channel gas-sensor array ────────────────
def make_sensor_data(n=20000, n_ch=16, seed=42):
    """Generate a synthetic multi-channel sensor recording.

    Every channel is the sum of three random sinusoids, a low-frequency
    drift shared by all channels, noise shared within each group of four
    adjacent channels, independent noise, and 40 Gaussian-shaped pulses
    that each hit one randomly chosen group.

    Parameters
    ----------
    n : int
        Number of samples. Must exceed 200 because pulse start indices are
        drawn from ``[100, n - 100)``.
    n_ch : int
        Number of sensor channels. Channels are grouped in fours; a trailing
        group may hold fewer than four channels.
    seed : int
        Seed for ``numpy.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sensor_00`` … ``Sensor_{n_ch-1}`` plus a ``Time`` column at
        one-minute spacing starting 2024-01-01.

    Raises
    ------
    ValueError
        If ``n <= 200`` or ``n_ch < 1``.
    """
    cluster_size = 4
    if n <= 200:
        raise ValueError('n must be greater than 200 to place pulses away from the edges')
    if n_ch < 1:
        raise ValueError('n_ch must be at least 1')
    # Ceil division: a partial trailing cluster still counts as a cluster.
    n_clusters = -(-n_ch // cluster_size)

    rng = np.random.default_rng(seed)
    t = np.linspace(0, 200, n)

    # Each sensor: sum of 3 sinusoids (frequencies are per unit of t, not per sample)
    base_freqs = rng.uniform(0.05, 0.8, (n_ch, 3))
    base_amps = rng.uniform(0.2, 1.5, (n_ch, 3))
    phases = rng.uniform(0, 2*np.pi, (n_ch, 3))

    signals = np.zeros((n, n_ch))
    for ch in range(n_ch):
        for k in range(3):
            signals[:, ch] += base_amps[ch, k] * np.sin(
                2*np.pi*base_freqs[ch, k]*t + phases[ch, k])

    # Shared low-freq drift (environmental), added to every channel
    env_drift = np.sin(t / 40) * 0.4 + np.sin(t / 17 + 1) * 0.2
    signals += env_drift[:, None]

    # Cross-correlated noise: one shared series per cluster of channels.
    # Slicing past the last column is safe, so a partial cluster just works.
    for cluster_start in range(0, n_ch, cluster_size):
        shared = rng.normal(0, 0.12, n)
        signals[:, cluster_start:cluster_start+cluster_size] += shared[:, None]

    # Independent noise
    signals += rng.normal(0, 0.06, (n, n_ch))

    # Inject transient events (gas pulses) — correlated across one cluster.
    # The cluster is drawn from the clusters that actually exist; for the
    # default n_ch=16 this is the same draw as the former hard-coded (0, 4).
    for _ in range(40):
        idx = rng.integers(100, n-100)
        clust = rng.integers(0, n_clusters)
        width = rng.integers(30, 120)
        amp = rng.uniform(0.8, 2.5)
        pulse = amp * \
            np.exp(-0.5 * ((np.arange(width) - width//2) / (width/6))**2)
        end = min(idx + width, n)   # clip pulses that would run off the end
        pw = end - idx
        first_ch = clust * cluster_size
        last_ch = min(first_ch + cluster_size, n_ch)
        for ch in range(first_ch, last_ch):
            # Per-channel gain so cluster members see the same pulse at different strengths
            signals[idx:end, ch] += pulse[:pw] * rng.uniform(0.5, 1.0)

    cols = [f'Sensor_{i:02d}' for i in range(n_ch)]
    df = pd.DataFrame(signals, columns=cols)
    df['Time'] = pd.date_range('2024-01-01', periods=n, freq='1min')
    return df


# Build the dataset with the generator's default arguments; later cells read `df`.
df = make_sensor_data()
print(f'Shape: {df.shape}')
# Trailing bare expression: its value is what a notebook shows as the cell output.
df.head()

<!-- 1. Power Spectral Density (PSD) — Welch's Method

Welch's method segments the signal, windows each segment, computes per-segment FFTs, and averages — reducing spectral leakage and variance compared to a raw periodogram. -->


In [None]:
# ─── 3. PSD via Welch for every sensor ──────────────────────
sensor_cols = [c for c in df.columns if c.startswith('Sensor')]
fs = 1.0  # 1 sample / min → frequencies below are in cycles/min

# The generator builds its sinusoids on t = linspace(0, 200, n), i.e. about
# 0.01 t-units per sample for the default 20000 rows, so its 0.05–0.8 tones
# land at roughly 0.0005–0.008 cycles/sample. A 512-sample segment resolves
# only ~0.002 and a "f > 0.01" mask discards every bin that holds them, so use
# a longer segment and exclude nothing but the DC bin.
NPERSEG = min(4096, len(df))
F_PLOT_MAX = 0.02   # x-axis limit; the band above carries only noise/pulse energy

psd_matrix = {}   # sensor → (freqs, Pxx)
for col in sensor_cols:
    f, Pxx = welch(df[col].values, fs=fs, nperseg=NPERSEG,
                   noverlap=NPERSEG // 2, detrend='linear', scaling='density')
    psd_matrix[col] = (f, Pxx)

# Plot first 8 sensors
fig, axes = plt.subplots(2, 4, figsize=(18, 7), sharex=True, sharey=False)
for ax, col in zip(axes.flatten(), sensor_cols[:8]):
    f, Pxx = psd_matrix[col]
    ax.semilogy(f, Pxx, lw=0.9, color='steelblue')
    ax.set_xlim(0, F_PLOT_MAX)
    ax.set_title(col, fontsize=10)
    ax.set_xlabel('Frequency (cycles/min)')
    ax.set_ylabel('PSD')
    # Mark dominant frequency (exclude DC only)
    mask = f > 0
    dom_idx = np.argmax(Pxx[mask])
    dom_f = f[mask][dom_idx]
    ax.axvline(dom_f, color='crimson', ls='--', lw=0.8, alpha=0.7)
    # Offset the label by a fraction of the visible span so it stays next to the line
    ax.text(dom_f + 0.02 * F_PLOT_MAX, Pxx[mask][dom_idx]*0.5,
            f'{dom_f:.4f}', fontsize=7, color='crimson')

plt.suptitle('Power Spectral Density — Welch (8 Sensors)', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# ─── 4. Dominant-frequency heatmap across all 16 sensors ──
# For each sensor take the frequency bin with the largest PSD. Only the DC
# bin is excluded: the generated tones sit below 0.01 cycles/min, so the
# former "f > 0.01" mask threw away every bin that contained them.
dom_freqs = {}
for col in sensor_cols:
    f, Pxx = psd_matrix[col]
    mask = f > 0
    dom_freqs[col] = f[mask][np.argmax(Pxx[mask])]

freq_df = pd.DataFrame(
    {'Sensor': sensor_cols, 'Dominant Freq (cycles/min)': list(dom_freqs.values())})
freq_df = freq_df.set_index('Sensor').sort_values(
    'Dominant Freq (cycles/min)', ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
# Four decimals: the values are of order 1e-3, so '.3f' would collapse most of them
sns.heatmap(freq_df, annot=True, fmt='.4f', cmap='YlOrRd',
            ax=ax, cbar_kws={'label': 'Freq'})
ax.set_title('Dominant Frequency per Sensor', fontsize=13)
plt.tight_layout()
plt.show()

<!-- 2. Discrete Wavelet Transform — Time–Frequency Energy Map

DWT decomposes a signal into approximation (low-freq) and detail (high-freq) coefficients at multiple levels. We compute the **energy** at each level across sliding windows to build a 2-D time–frequency energy map. -->


In [None]:
# ─── 5. Manual DWT (Haar) — no pywt dependency ─────────────
def haar_dwt_1level(x):
    """Single Haar analysis step on a 1-D array.

    An odd-length input has its last sample dropped. Returns the pair
    ``(approx, detail)``: scaled sums and differences of adjacent samples,
    each half the (even-truncated) input length.
    """
    usable = (len(x) // 2) * 2          # largest even length not exceeding len(x)
    evens = x[0:usable:2]
    odds = x[1:usable:2]
    root2 = np.sqrt(2)                  # orthonormal Haar scaling
    return (evens + odds) / root2, (evens - odds) / root2


def haar_dwt_multi(x, levels=5):
    """Repeated Haar decomposition of ``x``.

    Applies ``haar_dwt_1level`` ``levels`` times, each time to the previous
    approximation. Returns ``(approx, details)`` where ``details`` is ordered
    from the first (finest) level to the last. The input is copied first.
    """
    coarse = x.copy()
    detail_bands = []
    for _level in range(levels):
        # Keep this level's detail band and decompose the approximation further
        coarse, band = haar_dwt_1level(coarse)
        detail_bands.append(band)
    return coarse, detail_bands   # approx, [D1, D2, ..., D_levels]


# Compute DWT for Sensor_00
LEVELS = 6
approx, details = haar_dwt_multi(df['Sensor_00'].values, levels=LEVELS)

# Energy in sliding windows per level.
# Each level has half as many coefficients as the one before it, so a fixed
# window length would give rows of different lengths (which imshow cannot
# draw) covering different time spans. Halve the window at each level instead:
# every column then spans the same stretch of the original signal.
WIN = 128   # window length in level-1 coefficients
energy_rows = []
for lvl, d in enumerate(details):
    win = max(WIN >> lvl, 2)    # coefficients per window at this level
    hop = max(win // 2, 1)      # 50 % overlap
    energy_rows.append([np.sum(d[i:i+win]**2)
                        for i in range(0, len(d) - win + 1, hop)])

# Odd-length truncation can leave the rows a window apart; trim to a rectangle.
n_windows = min(len(row) for row in energy_rows)
energy_map = np.array([row[:n_windows] for row in energy_rows])

# Plot
fig, ax = plt.subplots(figsize=(16, 5))
img = ax.imshow(energy_map, aspect='auto', cmap='inferno', origin='lower')
# Rows are 0-indexed in the image; label them with the 1-based level number
ax.set_yticks(range(LEVELS))
ax.set_yticklabels(range(1, LEVELS + 1))
ax.set_ylabel('Decomposition Level (1=high freq)')
ax.set_xlabel('Time Window Index')
ax.set_title('Wavelet Energy Map — Sensor_00 (Haar DWT)', fontsize=13)
plt.colorbar(img, ax=ax, label='Energy')
plt.tight_layout()
plt.show()

<!-- 3. Transfer Entropy — Causal Information Flow Between Sensors

Transfer entropy (TE) quantifies **directed** information flow: how much knowing the past of sensor A reduces uncertainty about the future of sensor B, beyond what B's own past already explains. This goes beyond correlation to detect **asymmetric causality**. -->

In [None]:
# ─── 6. Discretised Transfer Entropy (binned estimator) ──
def transfer_entropy(x, y, lag=3, n_bins=8):
    """
    Estimate TE(X → Y) in bits with a binned plug-in estimator.

    TE(X → Y) = Σ p(x_p, y_p, y_f) · log2[ p(x_p, y_p, y_f) · p(y_p)
                                           / (p(y_p, y_f) · p(x_p, y_p)) ]

    where x_p = x[t], y_p = y[t] and y_f = y[t + lag]. Each series is cut
    into ``n_bins`` equal-width bins over its own range.

    Parameters
    ----------
    x, y : 1-D array-like of equal length
        Source and target series.
    lag : int
        Distance in samples between "past" and "future".
    n_bins : int
        Number of histogram bins per variable.

    Returns
    -------
    float
        Transfer entropy in bits. The marginals are derived from the same
        joint counts, so the estimate is non-negative up to rounding.

    Raises
    ------
    ValueError
        If the series is not longer than ``lag``.
    """
    from collections import Counter

    x = np.asarray(x)
    y = np.asarray(y)
    n = len(x) - lag
    if n <= 0:
        raise ValueError('series must be longer than lag')

    # Build joint vectors
    x_past = x[:n]
    y_past = y[:n]
    y_fut = y[lag:lag+n]

    # Discretise: only the interior edges are passed, so codes run 0..n_bins-1
    def digitise(arr):
        return np.digitize(arr, np.histogram_bin_edges(arr, bins=n_bins)[1:-1])

    xp = digitise(x_past).tolist()
    yp = digitise(y_past).tolist()
    yf = digitise(y_fut).tolist()

    # Count the joint states once, then derive every marginal from those
    # counts so that all four tables are mutually consistent.
    joint = Counter(zip(xp, yp, yf))
    c_yp_yf = Counter()
    c_xp_yp = Counter()
    c_yp = Counter()
    for (a, b, c), cnt in joint.items():
        c_yp_yf[(b, c)] += cnt
        c_xp_yp[(a, b)] += cnt
        c_yp[b] += cnt

    # Sum over distinct states, each weighted once by its probability cnt / n.
    # (Summing p·log over samples would weight every state by p twice.)
    # The 1/n factors cancel inside the log, leaving a ratio of raw counts.
    te = 0.0
    for (a, b, c), cnt in joint.items():
        ratio = cnt * c_yp[b] / (c_yp_yf[(b, c)] * c_xp_yp[(a, b)])
        te += (cnt / n) * np.log2(ratio)
    return float(te)


# Compute TE for the 4-sensor cluster (0-3) — subsample for speed.
# Take a contiguous block of rows: transfer entropy relates a sample to the
# one `lag` steps later, so the rows must stay evenly spaced. (Randomly
# chosen, sorted indices would put an irregular gap behind each "lag".)
SUB = 3000
sub = df[sensor_cols[:4]].iloc[:SUB].values

print('Computing Transfer Entropy (4 sensors, subsample) …')
n_s = 4
te_matrix = np.zeros((n_s, n_s))   # te_matrix[i, j] = TE(sensor i → sensor j); diagonal stays 0
for i in range(n_s):
    for j in range(n_s):
        if i != j:
            te_matrix[i, j] = transfer_entropy(
                sub[:, i], sub[:, j], lag=5, n_bins=6)
            print(
                f'  TE({sensor_cols[i]} → {sensor_cols[j]}) = {te_matrix[i, j]:.4f}')

te_df = pd.DataFrame(te_matrix, index=sensor_cols[:4], columns=sensor_cols[:4])
print('\nTransfer Entropy Matrix:')
te_df.round(4)

In [None]:
# ─── 7. Visualise TE as a directed heatmap ───────────────
# Rows are the source sensor, columns the target, matching te_df's layout.
fig, ax = plt.subplots(figsize=(7, 5.5))
sns.heatmap(
    te_df,
    ax=ax,
    cmap='YlGnBu',
    annot=True,
    fmt='.4f',
    linewidths=0.5,
    cbar_kws={'label': 'Transfer Entropy (bits)'},
)
ax.set_title('Transfer Entropy (row → col)', fontsize=13)
ax.set_ylabel('Source Sensor')
ax.set_xlabel('Target Sensor')
plt.tight_layout()
plt.show()

<!-- 4. Granger Causality — VAR-based Pairwise Test

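Granger causality tests whether lagged values of sensor X provide a **statistically significant** improvement in predicting sensor Y, using an F-test that compares two nested autoregressive models: Y regressed on its own lags only, versus Y regressed on its own lags plus the lags of X. -->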


In [None]:
# ─── 8. Granger Causality (manual VAR F-test) ────────────
from scipy.stats import f as f_dist
from numpy.linalg import lstsq


def granger_f_test(y, x, max_lag=8):
    """
    F statistic for "x Granger-causes y".

    Two least-squares regressions of y_t are fitted, both with an intercept:
      restricted:  y_t on y_{t-1} … y_{t-p}
      full:        the same plus x_{t-1} … x_{t-p}
    with p = max_lag. The returned value is
      F = ((RSS_r - RSS_f) / p) / (RSS_f / (T - k)),
    where T = len(y) - p rows are used and k = 2p + 1 is the number of
    parameters in the full model. Only F is returned; no p-value is computed.
    """
    n_obs = len(y)
    p = max_lag
    target = y[p:]
    intercept = np.ones(len(target))

    def lagged(series):
        # Column k holds series[t-k-1] for every t in p .. n_obs-1
        return [series[p - k - 1:n_obs - k - 1] for k in range(p)]

    own_lags = np.column_stack(lagged(y))
    design_r = np.column_stack([own_lags, intercept])
    design_f = np.column_stack([own_lags, *lagged(x), intercept])

    def rss(design):
        # Residual sum of squares recomputed from the fitted coefficients
        coef = lstsq(design, target, rcond=None)[0]
        return np.sum((target - design @ coef)**2)

    rss_r = rss(design_r)
    rss_f = rss(design_f)

    n_restrictions = p                  # the p coefficients on x's lags
    n_params = design_f.shape[1]        # 2p lag terms + intercept
    n_rows = len(target)
    return ((rss_r - rss_f) / n_restrictions) / (rss_f / (n_rows - n_params))


# Use first 5000 rows for speed
sub5 = df[sensor_cols[:6]].values[:5000]
n_sens = 6
gran_F = np.full((n_sens, n_sens), np.nan)   # diagonal stays NaN (no self-test)
gran_p = np.full((n_sens, n_sens), np.nan)
LAG = 8

# Degrees of freedom must match what granger_f_test used to build F:
# it regresses on T = len - LAG rows with k = 2*LAG + 1 parameters,
# so the denominator is T - k (not len - k).
df_num = LAG
df_den = (len(sub5) - LAG) - (2*LAG + 1)

print('Running Granger F-tests (6 sensors) …')
for i in range(n_sens):
    for j in range(n_sens):
        if i != j:
            # Row i is the candidate cause, column j the predicted sensor
            F_val = granger_f_test(sub5[:, j], sub5[:, i], max_lag=LAG)
            gran_F[i, j] = F_val
            # Survival function = 1 - cdf, without cancellation for tiny p-values
            gran_p[i, j] = f_dist.sf(F_val, df_num, df_den)

gran_F_df = pd.DataFrame(
    gran_F, index=sensor_cols[:6], columns=sensor_cols[:6])
gran_p_df = pd.DataFrame(
    gran_p, index=sensor_cols[:6], columns=sensor_cols[:6])
print('\nGranger F-statistic matrix (row → col):')
gran_F_df.round(2)

In [None]:
# ─── 9. Granger significance heatmap (α = 0.05) ─────────
# 1 where the p-value is below 0.05, else 0 (NaN diagonal compares False → 0).
sig = gran_p_df.lt(0.05).astype(int)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_stat, ax_sig = axes

# Left panel: F-statistic
sns.heatmap(
    gran_F_df.round(1),
    ax=ax_stat,
    annot=True,
    fmt='.1f',
    cmap='Reds',
    linewidths=0.5,
    cbar_kws={'label': 'F-statistic'},
)

# Right panel: significance map
sns.heatmap(
    sig,
    ax=ax_sig,
    annot=True,
    fmt='d',
    cmap='RdYlGn',
    vmin=0,
    vmax=1,
    linewidths=0.5,
    cbar_kws={'label': 'Significant (α<0.05)'},
)

# Both panels share the same axis wording; only the title differs.
for panel, heading in ((ax_stat, 'Granger F-statistic (row → col)'),
                       (ax_sig, 'Significant Causal Links')):
    panel.set_title(heading)
    panel.set_xlabel('Target')
    panel.set_ylabel('Source')

plt.suptitle('Granger Causality Analysis', fontsize=14, y=1.03)
plt.tight_layout()
plt.show()

<!-- 5. EWMA Control Chart — Adaptive Threshold

Unlike fixed 3σ limits, EWMA **exponentially weights recent observations**, making it highly sensitive to small persistent shifts. We overlay it on a sensor with an injected gradual drift. -->

In [None]:
# ─── 10. Inject a subtle drift and apply EWMA control ────
n = len(df)
signal = df['Sensor_00'].values.copy()

# Inject a linear ramp reaching +0.8 at the end, starting at index 12000
drift_start = 12000
signal[drift_start:] += np.linspace(0, 0.8, n - drift_start)

# EWMA parameters; the in-control mean/std come from the pre-drift segment
lam = 0.1      # smoothing factor (lower = more memory)
mu = np.mean(signal[:drift_start])
sigma = np.std(signal[:drift_start])
L = 2.7      # control-limit width factor

# EWMA recursion, started from the in-control mean so that the first point
# is an ordinary update and has a matching control limit.
ewma = np.zeros(n)
prev = mu
for i in range(n):
    prev = lam * signal[i] + (1 - lam) * prev
    ewma[i] = prev

# Control limits for every sample, including index 0 (leaving them at 0
# there would flag the first point as an alert almost unconditionally).
# The EWMA variance grows with the number of samples seen, then saturates.
steps = np.arange(1, n + 1)
var_ewma = (sigma**2 * lam / (2 - lam)) * (1 - (1 - lam)**(2*steps))
ucl = mu + L * np.sqrt(var_ewma)
lcl = mu - L * np.sqrt(var_ewma)

alerts = (ewma > ucl) | (ewma < lcl)

fig, axes = plt.subplots(2, 1, figsize=(16, 7), sharex=True)

# Raw + EWMA
axes[0].plot(signal, lw=0.5, color='steelblue', alpha=0.5, label='Raw')
axes[0].plot(ewma,   lw=1.2, color='darkblue',            label='EWMA')
axes[0].axvline(drift_start, color='red', ls=':',
                lw=1.5, label='Drift injected')
axes[0].set_title('EWMA-Smoothed Signal vs Raw', fontsize=12)
axes[0].set_ylabel('Response')
axes[0].legend(loc='upper left')

# EWMA with control limits
axes[1].plot(ewma, lw=1.2, color='darkblue', label='EWMA')
axes[1].fill_between(range(n), lcl, ucl, color='green',
                     alpha=0.08, label='Control band')
axes[1].plot(ucl, color='red',   ls='--', lw=0.8)
axes[1].plot(lcl, color='red',   ls='--', lw=0.8)
axes[1].scatter(np.where(alerts)[0], ewma[alerts], s=12,
                color='red', zorder=5, label=f'Alerts ({alerts.sum()})')
axes[1].axvline(drift_start, color='red', ls=':', lw=1.5)
axes[1].set_title('EWMA Control Chart (adaptive limits)', fontsize=12)
axes[1].set_ylabel('EWMA')
axes[1].set_xlabel('Sample Index')
axes[1].legend(loc='upper left')

plt.tight_layout()
plt.show()

# Detection lag is measured from the first alert at or after the drift;
# alerts before it are false alarms and are reported separately.
# (np.argmax over all alerts would return 0 both for an early false alarm
# and when there are no alerts at all.)
post_drift_alerts = np.where(alerts[drift_start:])[0]
false_alarms = int(alerts[:drift_start].sum())
if post_drift_alerts.size:
    first_alert = drift_start + int(post_drift_alerts[0])
    print(
        f'First alert at index {first_alert} vs drift at {drift_start} → detection lag = {first_alert-drift_start} samples')
else:
    print(f'No alert raised after drift at {drift_start}')
print(f'Alerts before drift (false alarms): {false_alarms}')

<!-- 6. Sensor Fusion — PCA + t-SNE Joint State Embedding

We define operating states by sliding-window statistics (mean, std, skewness, kurtosis) across all 16 sensors, then project into 2-D using t-SNE to visualise cluster structure. -->


In [None]:
# ─── 11. Sliding-window feature extraction ──────────────
# For every window of WIN rows (advancing by STEP) compute four summary
# statistics per sensor; each window becomes one row of feat_df.
WIN = 200
STEP = 100
rows = []
# "+ 1" so the final full window (the one ending on the last row) is included
for start in range(0, len(df) - WIN + 1, STEP):
    chunk = df[sensor_cols].iloc[start:start+WIN]
    feat = {'window_start': start}
    for col in sensor_cols:
        s = chunk[col]
        feat[f'{col}_mean'] = s.mean()
        feat[f'{col}_std'] = s.std()
        feat[f'{col}_skew'] = s.skew()
        feat[f'{col}_kurt'] = s.kurtosis()
    rows.append(feat)

feat_df = pd.DataFrame(rows)
print(f'Feature matrix: {feat_df.shape}')
feat_df.head()

In [None]:
# ─── 12. PCA → t-SNE two-stage embedding ─────────────────
feat_cols = [c for c in feat_df.columns if c != 'window_start']
scaler = StandardScaler()
X_s = scaler.fit_transform(feat_df[feat_cols])

# PCA to 30 dims first (speeds up t-SNE dramatically)
pca30 = PCA(n_components=30, random_state=0)
X_pca = pca30.fit_transform(X_s)
print(
    f'PCA 30-d explained variance: {pca30.explained_variance_ratio_.sum()*100:.1f}%')

# t-SNE to 2-D.
# The iteration count is left at the library default (1000, the value that
# used to be passed explicitly): the keyword was renamed from `n_iter` to
# `max_iter` in newer scikit-learn releases, so naming either one breaks on
# some installed version.
tsne = TSNE(n_components=2, perplexity=40, learning_rate=200,
            random_state=0, metric='euclidean')
X_tsne = tsne.fit_transform(X_pca)

# Colour by the mean windowed std of the first four sensors (proxy for activity level)
activity = feat_df[[f'{s}_std' for s in sensor_cols[:4]]].mean(axis=1).values

fig, ax = plt.subplots(figsize=(10, 8))
sc = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=activity, cmap='plasma',
                s=15, alpha=0.7, edgecolors='none')
plt.colorbar(sc, ax=ax, label='Mean Activity (Std of top-4 sensors)')
ax.set_title('t-SNE Embedding of 16-Sensor Window Features', fontsize=13)
ax.set_xlabel('t-SNE dim 1')
ax.set_ylabel('t-SNE dim 2')
plt.tight_layout()
plt.show()

In [None]:
# ─── 13. K-Means clustering on the embedding ─────────────
from sklearn.cluster import KMeans

# Cluster the 2-D t-SNE coordinates (not the original feature space).
K = 5
km = KMeans(n_clusters=K, n_init=10, random_state=0)
labels = km.fit_predict(X_tsne)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_embed, ax_bars = axes

# Left: t-SNE coloured by cluster
scatter = ax_embed.scatter(
    X_tsne[:, 0], X_tsne[:, 1],
    c=labels, cmap='tab10',
    s=15, alpha=0.7, edgecolors='none',
)
ax_embed.set_title(f't-SNE with K-Means (K={K})')
ax_embed.set_xlabel('t-SNE 1')
ax_embed.set_ylabel('t-SNE 2')
plt.colorbar(scatter, ax=ax_embed, label='Cluster')

# Right: mean activity per cluster, annotated with the cluster size
cluster_activity = pd.DataFrame({'Cluster': labels, 'Activity': activity})
activity_by_cluster = cluster_activity.groupby('Cluster')['Activity']
ca_mean = activity_by_cluster.agg(['mean', 'count'])
ca_mean['mean'].plot.bar(ax=ax_bars, color='steelblue', edgecolor='white')
ax_bars.set_title('Mean Activity per Cluster')
ax_bars.set_xlabel('Cluster')
ax_bars.set_ylabel('Activity Level')
ax_bars.tick_params(axis='x', rotation=0)
# Bars are drawn at positions 0..K-1 in index order; put the count just above each
for i, (m, c) in enumerate(zip(ca_mean['mean'], ca_mean['count'])):
    ax_bars.text(i, m+0.002, f'n={int(c)}', ha='center', fontsize=9)

plt.tight_layout()
plt.show()

---
## Summary & Portfolio Takeaways

| Technique | Insight Gained |
|---|---|
| **Welch PSD** | Each sensor has 1-3 dominant frequencies; cluster members share spectral signatures |
| **Haar DWT** | Transient gas-pulse events appear as high-energy bursts at detail levels 2-4 |
| **Transfer Entropy** | Asymmetric causal links within sensor clusters; identifies driver sensors |
| **Granger Causality** | Statistically significant directed relationships validated with F-tests |
| **EWMA Control** | Detects subtle drifts within ~20 samples — far earlier than 3σ charts |
| **PCA + t-SNE + KMeans** | Sensor network has 4-5 distinct operating regimes; activity-level clustering |

These techniques form the analytical backbone for **predictive maintenance**, **sensor-network health monitoring**, and **root-cause isolation** in industrial IoT systems.
