In [271]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GCNConv

from sklearn.mixture import GaussianMixture 

import json

import matplotlib.pyplot as plt
%matplotlib inline

In [272]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['date'] = pd.to_datetime(data.date)
    print('Days: ',len(set(data.date)))
    return data

In [273]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='amount', index=['date'],
                    columns=['start_id','end_id'], aggfunc=np.sum, fill_value=0)
    return table

In [274]:
dataDir = '/home/urwa/Documents/Projects/AnomalyDetection/Pipeline/data/'
dataFile = '20190402_TaipeiEdgesDatewise.csv'
events_data =dataDir+'TaipeiEvents.csv'

In [275]:
file = dataDir + dataFile
dataRaw = loadData(file)

Raw shape:  (7374816, 5)
Days:  638


In [276]:
dataIn =  pd.pivot_table(dataRaw, values='amount', index=['date'],
                    columns=['end_id'], aggfunc=np.sum, fill_value=0)
dataIn.head(1)

end_id,BL01,BL02,BL03,BL04,BL05,BL06,BL07,BL08,BL09,BL10,...,R20,R21,R22,R22A,R23,R24,R25,R26,R27,R28
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,6326,11444,5696,16834,12256,25559,45609,27781,18745,33240,...,5597,5937,15733,12194,2972,2067,9518,8439,6473,73538


In [277]:
dataOut =  pd.pivot_table(dataRaw, values='amount', index=['date'],
                    columns=['start_id'], aggfunc=np.sum, fill_value=0)
dataOut.head(1)

start_id,BL01,BL02,BL03,BL04,BL05,BL06,BL07,BL08,BL09,BL10,...,R20,R21,R22,R22A,R23,R24,R25,R26,R27,R28
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,6292,10268,5591,15834,12955,26456,42135,29606,18572,34387,...,5734,6801,14919,12373,2943,1957,8887,8603,6870,76751


In [278]:
matrix = np.stack((dataOut.values, dataIn.values),-1)
matrix = matrix.astype(float)
matrix.shape

(638, 108, 2)

In [279]:
# for i in range(matrix.shape[1]):
#     for j in range(matrix.shape[2]):
#         matrix[:, i,j] = (matrix[:, i,j] - np.mean(matrix[:, i,j])) / (np.std(matrix[:, i,j]))
#         #matrix[:, i,j] = (matrix[:, i,j] - matrix[:, i,j].min()) / (matrix[:, i,j].max() - matrix[:, i,j].min())

In [280]:
for i in range(matrix.shape[0]):
    for j in range(matrix.shape[2]):
        matrix[i,:,j] = (matrix[i,:,j] - np.mean(matrix[i,:,j])) / (np.std(matrix[i,:,j]))
        #matrix[:, i,j] = (matrix[:, i,j] - matrix[:, i,j].min()) / (matrix[:, i,j].max() - matrix[:, i,j].min())

In [281]:
with open('../topology.json') as json_file:
    topology = json.load(json_file)

In [282]:
stationToIdx = dict(zip(dataOut.columns.values,range(len(dataOut.columns.values))))

In [283]:
edge_index = []
for t in topology:
    edge_index.append([stationToIdx[t[0]], stationToIdx[t[1]]])

In [284]:
stations = list(set(dataRaw.start_id))
n= len(stations)
#edge_index = [[a//n,a%n] for a in range(n*n)]
edge_index = torch.tensor(edge_index, dtype=torch.long)

In [285]:
dates = list(dataOut.index)
DOW = list(pd.to_datetime(dataOut.index.values).dayofweek)
DOW = ((np.array(DOW) == 5) | (np.array(DOW) == 6)).astype(int)
DOW[:10]

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0])

In [286]:
dataList = []

for i in range(len(DOW)):
    x = torch.tensor(matrix[i], dtype=torch.float)
    y = torch.tensor(np.array([DOW[i]]), dtype=torch.long)
    data = Data(x=x, edge_index=edge_index.t().contiguous(),y=y)
    dataList.append(data)

dataList[0]

Data(edge_index=[2, 232], x=[108, 2], y=[1])

In [287]:
n_classes = 2
featureDim = 2
hiddenDim = [500,100,20]

In [288]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(featureDim, featureDim)
        self.conv2 = GCNConv(featureDim, featureDim)
        self.conv3 = GCNConv(featureDim, featureDim)
        self.conv4 = GCNConv(featureDim, 1)
        self.linear1 = nn.Linear(n*1, hiddenDim[0])
        self.linear2 = nn.Linear(hiddenDim[0], hiddenDim[1])
        self.linear3 = nn.Linear(hiddenDim[1], hiddenDim[2])
        self.linear4 = nn.Linear(hiddenDim[2], n_classes)

        self.init_weights(self.linear1)
        self.init_weights(self.linear2)
        self.init_weights(self.linear3)
        self.init_weights(self.linear4)
        
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.conv4(x, edge_index)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = x.view(-1,n*1)
        x = self.linear1(x)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.linear2(x)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.linear3(x)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.linear4(x)
        return F.log_softmax(x, dim=1)
        
    def init_weights(self,m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
    
    def representation(self, x):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.conv4(x, edge_index)
        x = F.relu(x)
        x = x.view(-1,n*1)
        x = self.linear1(x)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.linear2(x)
        x = F.relu(x)
        #x = F.dropout(x, training=self.training)
        x = self.linear3(x)
        return x
            

In [289]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
model = Net().to(device)
#data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

In [290]:
device

device(type='cuda')

In [291]:
#optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=5e-4)
num_epochs = 60

In [292]:
model.train()

for epoch in range(num_epochs):
    epochLoss = 0
    for data in dataList:
        #data.view(1,-1,-1)
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
#         print(out.shape)
#         print(data.y.shape)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        
        epochLoss += loss.item()
    
#     if epoch == 500:
#         optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=5e-4)
        
    print('epoch [{}/{}], loss:{:.4f}'
    .format(epoch + 1, num_epochs, np.mean(epochLoss)))

epoch [1/60], loss:297.6051
epoch [2/60], loss:150.4654
epoch [3/60], loss:110.5402
epoch [4/60], loss:103.6780
epoch [5/60], loss:99.9160
epoch [6/60], loss:96.9636
epoch [7/60], loss:94.4279
epoch [8/60], loss:92.0624
epoch [9/60], loss:89.8478
epoch [10/60], loss:87.9251
epoch [11/60], loss:87.1518
epoch [12/60], loss:84.8776
epoch [13/60], loss:82.8071
epoch [14/60], loss:80.8272
epoch [15/60], loss:79.3079
epoch [16/60], loss:78.0616
epoch [17/60], loss:76.6479
epoch [18/60], loss:75.4471
epoch [19/60], loss:74.3898
epoch [20/60], loss:73.4513
epoch [21/60], loss:72.5256
epoch [22/60], loss:71.3905
epoch [23/60], loss:70.5251
epoch [24/60], loss:69.5283
epoch [25/60], loss:68.6413
epoch [26/60], loss:68.0824
epoch [27/60], loss:67.0830
epoch [28/60], loss:66.5410
epoch [29/60], loss:65.8555
epoch [30/60], loss:65.0184
epoch [31/60], loss:64.4599
epoch [32/60], loss:63.8344
epoch [33/60], loss:63.3014
epoch [34/60], loss:62.8560
epoch [35/60], loss:62.0515
epoch [36/60], loss:61.61

In [293]:
model.eval()
with torch.no_grad():
    y = DOW
    predList = []
    for data in dataList:
        data = data.to(device)
        pred = model(data).cpu().numpy()
        predList.append(np.argmax(pred))
    
    print('Accuracy: {:.4f}'.format( np.sum(np.array(predList) == np.array(y)) / len(predList) ))

Accuracy: 0.9702


In [294]:
representation = np.empty((matrix.shape[0],hiddenDim[2]))
prediction = np.empty((matrix.shape[0],n_classes))
model.eval()
with torch.no_grad():
    for i,data in enumerate(dataList):
        data = data.to(device)
        representation[i] = model.representation(data).cpu().numpy()
        prediction[i] = model(data).cpu().numpy()
        predList.append(np.argmax(pred))

In [295]:
representation.shape

(638, 20)

In [296]:
prediction.shape

(638, 2)

In [297]:
# import events data
events_data =dataDir+'TaipeiEvents.csv'
df_events = pd.read_csv(events_data, encoding = "ISO-8859-1", parse_dates=['Date'], infer_datetime_format=True)


In [298]:
df_events.head()

Unnamed: 0,Type,Name,Date
0,National holiday,Republic Day/New Year's Day observed,2017-01-02
1,National holiday,Chinese New Year's Eve,2017-01-27
2,National holiday,Chinese New Year's Day,2017-01-28
3,National holiday,Chinese New Year Holiday 1,2017-01-29
4,National holiday,Chinese New Year Holiday 2,2017-01-30


In [299]:
holidayDates = df_events[df_events.Type == 'National holiday'].Date

In [300]:
holidayDates = [str(d.date()) for d in holidayDates]
dates = [str(d.date()) for d in dates]

In [301]:
anomalyIndex = [i for i,d in enumerate(dates) if d in holidayDates]
len(anomalyIndex)
indexBool = np.array([i in anomalyIndex for i in list(range(matrix.shape[0]))])

In [302]:
def anomalyDetection(y,pval = 0.2,iterN=5,n_com=1):
    #index of regular (non-outlier points)
    #rind=y[:,0]>-10 
    rind = np.array(range(y.shape[0]))
    
    #clustering model
    gm=GaussianMixture(n_components=n_com, n_init=100, max_iter=1000,random_state=0) 
    for i in range(iterN): #iterate
        print('Iteration {}'.format(i+1))  
        clustering=gm.fit(y[rind,:]) #fit EM clustering model excluding outliers
        l=clustering.score_samples(y) #estimate likelihood for each point
        Lthres=sorted(l)[int(len(l)*pval)] #anomaly threshold
        rind0=0+rind
        rind=l>Lthres #non-anomalous points
        if all(rind==rind0):
            print('Convergence in {} iterations'.format(i+1))
            break
    return l < Lthres

In [303]:
def getResults(reducedMatrix,threshHolds,iterN=5,n_com=1):
    results = []
    for th in threshHolds:
        #th = thres/100
        print("Threshhold: ",th)
        outliers = anomalyDetection(reducedMatrix,th,iterN,n_com)

        tpr = sum(outliers & indexBool)/sum(indexBool)
        fpr = sum(outliers & ~indexBool)/sum(~indexBool)
        precision = sum(outliers & indexBool)/sum(outliers)

        F1 = 2 * (precision * tpr) / (precision + tpr)

        res = {'Cat':'Global', 'th':th, 'TPR':tpr, 'FPR':fpr, 'F1':F1, 'Precision':precision}
        results.append(res)

    resDf = pd.DataFrame(results)    
    return resDf

In [304]:
threshHolds = [0.01, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [305]:
Res1 = getResults(representation,threshHolds,iterN=5,n_com=1)

Threshhold:  0.01
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.03
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.04
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.05
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.06
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.07
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.08
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.1
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.2
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.3
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.4
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.5
Iteration 1
Iteration

In [306]:
Res1

Unnamed: 0,Cat,F1,FPR,Precision,TPR,th
0,Global,0.342857,0.0,1.0,0.206897,0.01
1,Global,0.583333,0.00821,0.736842,0.482759,0.03
2,Global,0.703704,0.009852,0.76,0.655172,0.04
3,Global,0.7,0.01642,0.677419,0.724138,0.05
4,Global,0.656716,0.026273,0.578947,0.758621,0.06
5,Global,0.60274,0.036125,0.5,0.758621,0.07
6,Global,0.575,0.045977,0.45098,0.793103,0.08
7,Global,0.5,0.065681,0.365079,0.793103,0.1
8,Global,0.333333,0.165846,0.204724,0.896552,0.2
9,Global,0.245455,0.269294,0.141361,0.931034,0.3


In [307]:
representation.shape

(638, 20)