# Glance
A graphical anomaly detection engine for real world sensor data.

This is a new approach to a common problem in manufacturing and industrial control. Remote sensors provide monitoring o fprocess parameters but it can be a challenge to identify anomalies which occur rarely and can be subtle in real world noisy data. By approaching this classification as an image classification with CNNs we can leverage some of the powerful human-like visual intuition that experts in the field often have, but in an automated way.

In [106]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Imports

In [2]:
from pprint import pprint
import random
import datetime

In [3]:
from IPython.core.debugger import set_trace

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Read in data subset for training test
Set mean and std of training set for normalization

In [7]:
import dill as pickle

with open('data/scratch/data_slim.p', 'rb') as file:
    data = pickle.load(file)

with open('data/scratch/data_val.p', 'rb') as file:
    data_val = pickle.load(file)

In [8]:
data.shape, data_val.shape

((1000, 6), (250, 6))

In [9]:
# Remove any duplicate indices in data
data = data.drop_duplicates(data.columns.drop('data'))  # leave out data column b/c unhashable
data_val = data_val.drop_duplicates(data_val.columns.drop('data'))  # leave out data column b/c unhashable

In [10]:
data.shape, data_val.shape

((985, 6), (249, 6))

In [21]:
def make_plot(vector):
    plt.figure(num=None, figsize=(2.24, 2.24), dpi=100,
               facecolor='w', edgecolor='k')
    plt.plot(vector, 'k')
    plt.axis('off')

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    im = Image.open(buf)
    pix = np.array(im)
    buf.close()
    plt.close()
    return pix[:, :, :3]

In [22]:
for n, i in enumerate(np.random.choice(data.index, 100)):
    pix = make_plot(data.data[i])
    pix_mean = pix[:, :, 0].mean()
    pix_std = pix[:, :, 0].std()
    if n == 0:
        mean = pix_mean
        std = pix_mean
    else:
        mean += pix_mean
        mean /= 2
        std += pix_std
        std /= 2

In [23]:
mean, std

(217.87001988686944, 75.8441425486478)

#### Use a torch imagefolder

In [24]:
import os
from torchvision.datasets.folder import ImageFolder, default_loader
from torchvision.datasets.utils import download_url, check_integrity

################################################################################
# Helpers
def attr(obj):
    """
    Return all public attributes of an object.
    """
    return [x for x in dir(obj) if not x.startswith("_")]


################################################################################
# PyTorch
class SensorDataset(ImageFolder):

    def __init__(self,
                 root: str,
                 suffix: str,
                 transform=None,
                 target_transform=None,
                 loader=default_loader,
                 download=False):
        self.root = os.path.expanduser(root)

        path = os.path.join(self.root, suffix)
        print(f"Loading data from {path}.")
        assert os.path.isdir(path), f"'{suffix}' is not valid."

        super().__init__(path, transform, target_transform, loader)


In [27]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [28]:
# Training helpers
def get_trainable(model_params):
    return (p for p in model_params if p.requires_grad)


def get_frozen(model_params):
    return (p for p in model_params if not p.requires_grad)


def all_trainable(model_params):
    return all(p.requires_grad for p in model_params)


def all_frozen(model_params):
    return all(not p.requires_grad for p in model_params)


def freeze_all(model_params):
    for param in model_params:
        param.requires_grad = False

## Transforms

In [46]:
from torchvision import transforms

IMG_SIZE = 224
_mean = [mean] * 3
_std = [std] * 3

train_trans = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_mean, _std),
])
val_trans = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_mean, _std),
])

## Dataset

In [47]:
BATCH_SIZE = 100
n_classes = 14

In [48]:
train_ds = SensorDataset("data", "sample/train", transform=train_trans)
val_ds = SensorDataset("data", "sample/valid", transform=val_trans)

Loading data from data/sample/train.
Loading data from data/sample/valid.


In [49]:
print(train_ds)
print(val_ds)

Dataset SensorDataset
    Number of datapoints: 985
    Root Location: data/sample/train
    Transforms (if any): Compose(
                             Resize(size=256, interpolation=PIL.Image.BILINEAR)
                             RandomCrop(size=(224, 224), padding=0)
                             ToTensor()
                             Normalize(mean=[217.87001988686944, 217.87001988686944, 217.87001988686944], std=[75.8441425486478, 75.8441425486478, 75.8441425486478])
                         )
    Target Transforms (if any): None
Dataset SensorDataset
    Number of datapoints: 268
    Root Location: data/sample/valid
    Transforms (if any): Compose(
                             Resize(size=256, interpolation=PIL.Image.BILINEAR)
                             CenterCrop(size=(224, 224))
                             ToTensor()
                             Normalize(mean=[217.87001988686944, 217.87001988686944, 217.87001988686944], std=[75.8441425486478, 75.8441425486478, 75.844142548

## DataLoader
Batch loading for datasets with multi-processing and different sample strategies.

In [50]:
from torch.utils.data import DataLoader

train_dl = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
val_dl = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

# The Model
PyTorch offers quite a few [pre-trained networks](https://pytorch.org/docs/stable/torchvision/models.html) for you to use:
- AlexNet
- VGG
- ResNet
- SqueezeNet
- DenseNet
- Inception v3

And there are more available via [pretrained-models.pytorch](https://github.com/Cadene/pretrained-models.pytorch)
- NASNet,
- ResNeXt,
- InceptionV4,
- InceptionResnetV2, 
- Xception, 
- DPN,
- ...

In [51]:
from torchvision import models

model = models.resnet18(pretrained=True)

In [52]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Co

In [54]:
# Freeze all parameters
freeze_all(model.parameters())
assert all_frozen(model.parameters())

Replace the last layer with a linear layer. New layers have `requires_grad = True`.

In [55]:
model.fc = nn.Linear(512, n_classes)

In [56]:
all_frozen(model.parameters())

False

In [57]:
model = model.to(device)

# The Loss

In [58]:
criterion = nn.CrossEntropyLoss()

# The Optimizer

In [59]:
optimizer = torch.optim.Adam(
    get_trainable(model.parameters()),
    lr=0.001,
)

# The Train Loop

In [62]:
N_EPOCHS = 10

[autoreload of create_plots failed: Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 368, in superreload
    module = reload(module)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/imp.py", line 315, in reload
    return importlib.reload(module)
  File "/home/ubuntu/anaconda3/envs/pytorch_p36/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 618, in _exec
  File "<frozen importlib._bootstrap_external>", line 678, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/ubuntu/github/glance/create_plots.py", line 16, in <module>
    PATH = Path('~/github/glance/dat

In [63]:
for epoch in range(N_EPOCHS):
    print(f"Epoch {epoch+1}/{N_EPOCHS}")
    
    # Train
    model.train()  # IMPORTANT
    
    running_loss, correct = 0.0, 0
    for X, y in train_dl:
        X, y = X.to(device), y.to(device)
        
        optimizer.zero_grad()
        y_ = model(X)
        
        loss = criterion(y_, y)

        loss.backward()
        optimizer.step()
        
        # Statistics
        print(f"    batch loss: {loss.item():0.3f}")
        _, y_label_ = torch.max(y_, 1)
        correct += (y_label_ == y).sum().item()
        running_loss += loss.item() * X.shape[0]
    
    print(f"  Train Loss: {running_loss / data.shape[0]}")
    print(f"  Train Acc:  {correct / data.shape[0]}")
    
    
    # Eval
    model.eval()  # IMPORTANT - turns off training tricks like dropout
    
    running_loss, correct = 0.0, 0
    with torch.no_grad():  # IMPORTANT - prevents calculation of gradients (extra compute)
        for X, y in val_dl:
            X, y = X.to(device), y.to(device)
                    
            y_ = model(X)
            
            _, y_label_ = torch.max(y_, 1)
            correct += (y_label_ == y).sum().item()
            
            loss = criterion(y_, y)
            running_loss += loss.item() * X.shape[0]
    
    print(f"  Valid Loss: {running_loss / data_val.shape[0]}")
    print(f"  Valid Acc:  {correct / data_val.shape[0]}")
    print()

Epoch 1/10
    batch loss: 2.082
    batch loss: 2.118
    batch loss: 2.122
    batch loss: 2.106
    batch loss: 1.985
    batch loss: 2.024
    batch loss: 2.004
    batch loss: 1.949
    batch loss: 1.876
    batch loss: 1.940
  Train Loss: 2.021755153757667
  Train Acc:  0.566497461928934
  Valid Loss: 2.9851943364583824
  Valid Acc:  0.11646586345381527

Epoch 2/10
    batch loss: 1.921
    batch loss: 1.814
    batch loss: 1.878
    batch loss: 1.783
    batch loss: 1.757
    batch loss: 1.777
    batch loss: 1.816
    batch loss: 1.765
    batch loss: 1.690
    batch loss: 1.653
  Train Loss: 1.787619188957408
  Train Acc:  0.6446700507614214
  Valid Loss: 2.908298289440722
  Valid Acc:  0.07228915662650602

Epoch 3/10
    batch loss: 1.676
    batch loss: 1.596
    batch loss: 1.540
    batch loss: 1.579
    batch loss: 1.609
    batch loss: 1.655
    batch loss: 1.533
    batch loss: 1.594
    batch loss: 1.515
    batch loss: 1.503
  Train Loss: 1.5811270554053602
  Train Ac

In [68]:
os.mkdir('models')

In [69]:
ls

create_data.py   [0m[34;42mdata[0m/    [01;34mmodels[0m/       README.md
create_plots.py  LICENSE  [01;34m__pycache__[0m/  train_model.ipynb


In [70]:
cd models

/home/ubuntu/github/glance/models


In [79]:
with open('190122-CNN.p', 'wb') as file:
    torch.save(model, file)

In [72]:
ls

In [73]:
!touch 

touch: missing file operand
Try 'touch --help' for more information.


In [74]:
!touch '190122-CNN.P'

In [75]:
!mv '190122-CNN.P' '190122-CNN.p'

In [76]:
ls

190122-CNN.p


In [80]:
with open('190122-CNN.p', 'rb') as file:
    test = torch.load(file)

In [81]:
test

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Co

In [88]:
X_test = list(val_dl)[0]

In [112]:
X_test.shape

torch.Size([100, 3, 224, 224])

In [111]:
y_test = model(y)

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 3, 7, 7], but got 1-dimensional input of size [68] instead

In [94]:
data_val

Unnamed: 0,cycle,data,description,sample_rate,sensor,unit
10292,1472,"[9.696, 9.707, 9.727, 9.741, 9.75, 9.737, 9.71...",Pressure,100 Hz,PS5,bar
26958,498,"[52.477, 52.461000000000006, 52.46899999999999...",Temperature,1 Hz,TS4,C
17168,1733,"[8.061, 0.677, 0.527, 0.003, 0.004, 0.0, 0.003...",Volume flow,10 Hz,FS1,l/min
27846,1386,"[39.664, 39.66, 39.676, 39.676, 39.656, 39.676...",Temperature,1 Hz,TS4,C
24433,178,"[55.586000000000006, 55.574, 55.578, 55.578, 5...",Temperature,1 Hz,TS3,C
15762,327,"[7.517, 0.9079999999999999, 0.505, 0.033, 0.00...",Volume flow,10 Hz,FS1,l/min
21897,2052,"[36.187, 36.18, 36.094, 36.102, 36.008, 36.02,...",Temperature,1 Hz,TS1,C
10597,1777,"[9.833, 9.825, 9.832, 9.849, 9.871, 9.877, 9.8...",Pressure,100 Hz,PS5,bar
23058,1008,"[50.754, 50.727, 50.695, 50.656000000000006, 5...",Temperature,1 Hz,TS2,C
8926,106,"[8.573, 8.59, 8.611, 8.626, 8.618, 8.611, 8.59...",Pressure,100 Hz,PS5,bar


In [101]:
def create_plot(vector):
    plt.figure(num=None, figsize=(2.24, 2.24), dpi=100,
               facecolor='w', edgecolor='k')
    plt.plot(vector, 'k')
    plt.axis('off')

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    im = Image.open(buf)
    buf.close()
    plt.close()
    return im

def make_plot(vector):
    im = create_plot(vector)
    pix = np.array(im)
    return pix[:, :, :3]

In [109]:
im = create_plot(data_val.data.iloc[0])
im.show()