In [1]:
#add necessary libraries
import networkx as nx #library supporting networks
import matplotlib.pyplot as plt #plotting
import pandas as pd
import numpy as np
import scipy.stats as stat
from scipy import optimize
#make sure plots are embedded into the notebook
%pylab inline 
#import statsmodels.formula.api as smf
#import shapefile as shp
#from shapely.geometry.polygon import Polygon
#from descartes import PolygonPatch
import os
from networkx.algorithms import community
from sklearn.mixture import GaussianMixture 
import json
from sklearn.cluster import KMeans
#import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


In [2]:
threshHolds = [0.03, 0.04, 0.05, 0.06, 0.07]

In [3]:
def anomalyDetection(y,pval = 0.2,iterN=5,n_com=1):
    #index of regular (non-outlier points)
    #rind=y[:,0]>-10 
    rind = np.array(range(y.shape[0]))
    
    #clustering model
    gm=GaussianMixture(n_components=n_com, n_init=100, max_iter=1000,random_state=0) 
    for i in range(iterN): #iterate
        print('Iteration {}'.format(i+1))  
        clustering=gm.fit(y[rind,:]) #fit EM clustering model excluding outliers
        l=clustering.score_samples(y) #estimate likelihood for each point
        Lthres=sorted(l)[int(len(l)*pval)] #anomaly threshold
        rind0=0+rind
        rind=l>Lthres #non-anomalous points
        if all(rind==rind0):
            print('Convergence in {} iterations'.format(i+1))
            break
    return l < Lthres

In [4]:
def getResults(reducedMatrix,threshHolds,iterN=5,n_com=1):
    results = []
    for th in threshHolds:
        #th = thres/100
        print("Threshhold: ",th)
        outliers = anomalyDetection(reducedMatrix,th,iterN,n_com)

        tpr = sum(outliers & indexBool)/sum(indexBool)
        fpr = sum(outliers & ~indexBool)/sum(~indexBool)
        precision = sum(outliers & indexBool)/sum(outliers)

        F1 = 2 * (precision * tpr) / (precision + tpr)

        res = {'Cat':'Global', 'th':th, 'TPR':tpr, 'FPR':fpr, 'F1':F1, 'Precision':precision}
        results.append(res)

    resDf = pd.DataFrame(results)    
    return resDf

In [5]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['date'] = pd.to_datetime(data.date)
    print('Days: ',len(set(data.date)))
    return data

In [6]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='amount', index=['date'],
                    columns=['start_id','end_id'], aggfunc=np.sum, fill_value=0)
    return table,table.index.values

In [7]:
def getMatrixData():
    file = dataDir + dataFile
    dataRaw = loadData(file)
    dataTs,dates = getTimeSeries(dataRaw)
    matrix = dataTs.values
    print('Matrix Shape: ',matrix.shape)
    return matrix,dates

In [8]:
def getInOutMatrix(matrix):
    
    columnInfo = pd.read_csv('columnInfo.csv')
    columnInfo['amount']=np.sum(matrix, axis=0)
    columnInfo = columnInfo[['start_id','end_id','amount']]

    stations = list(set(columnInfo.start_id))
    partition = dict(zip(stations, list(range(len(stations)))))

    columnInfo['start_community'] = columnInfo['start_id'].apply(lambda x: partition[x])
    columnInfo['end_community'] = columnInfo['end_id'].apply(lambda x: partition[x])
    
    n_com = len(set(columnInfo.start_community))
    newmatrix = np.zeros((matrix.shape[0],n_com))
    for i in range(columnInfo.shape[0]):
        s_com = columnInfo.iloc[i]['start_community']
        e_com = columnInfo.iloc[i]['end_community']

        newmatrix[:,s_com] += matrix[:,i]
        newmatrix[:,e_com] += matrix[:,i] 

    print("Newmatrix: ",newmatrix.shape)
    
    return newmatrix

In [9]:
dataDir = '/home/urwa/Documents/Projects/AnomalyDetection/Pipeline/data/'
dataFile = '20190402_TaipeiEdgesDatewise.csv'
events_data =dataDir+'TaipeiEvents.csv'

In [10]:
matrix,dates = getMatrixData()

Raw shape:  (7374816, 5)
Days:  638
Matrix Shape:  (638, 11664)


In [11]:
matrix = getInOutMatrix(matrix)

Newmatrix:  (638, 108)


In [12]:
# import events data
df_events = pd.read_csv(events_data, encoding = "ISO-8859-1", parse_dates=['Date'], infer_datetime_format=True)

holidayDates = df_events[df_events.Type == 'National holiday'].Date
holidayDates = [str(d.date()) for d in holidayDates]
dates = [str(d.date()) for d in pd.to_datetime(dates)]

anomalyIndex = [i for i,d in enumerate(dates) if d in holidayDates]
len(anomalyIndex)
indexBool = np.array([i in anomalyIndex for i in list(range(matrix.shape[0]))])

In [13]:
threshHolds = [0.01, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [14]:
results = getResults(matrix,threshHolds,iterN=5,n_com=1)

Threshhold:  0.01
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.03
Iteration 1


  if sys.path[0] == '':


Iteration 2
Convergence in 2 iterations
Threshhold:  0.04
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.05
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.06
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.07
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.08
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.1
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.2
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.3
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.4
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.5
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.6
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.7
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.8
Iteration 1
Iteration 2
Convergence in

In [15]:
results

Unnamed: 0,Cat,F1,FPR,Precision,TPR,th
0,Global,,0.009852,0.0,0.0,0.01
1,Global,0.166667,0.024631,0.210526,0.137931,0.03
2,Global,0.222222,0.031199,0.24,0.206897,0.04
3,Global,0.2,0.041051,0.193548,0.206897,0.05
4,Global,0.208955,0.050903,0.184211,0.241379,0.06
5,Global,0.191781,0.060755,0.159091,0.241379,0.07
6,Global,0.225,0.068966,0.176471,0.310345,0.08
7,Global,0.195652,0.08867,0.142857,0.310345,0.1
8,Global,0.294872,0.170772,0.181102,0.793103,0.2
9,Global,0.236364,0.270936,0.136126,0.896552,0.3


In [16]:
from sklearn.decomposition import PCA
pca = PCA(n_components=40)
pca.fit(matrix)

reducedMatrixPCA = pca.transform(matrix)
print(reducedMatrixPCA.shape)

(638, 40)


In [17]:
resultsPca = getResults(reducedMatrixPCA,threshHolds,iterN=5,n_com=1)

Threshhold:  0.01
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.03
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.04
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.05
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.06
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.07
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.08
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.1
Iteration 1
Iteration 2
Iteration 3
Convergence in 3 iterations
Threshhold:  0.2
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.3
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.4
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.5
Iteration 1
Iteration 2
Iteration

In [18]:
resultsPca

Unnamed: 0,Cat,F1,FPR,Precision,TPR,th
0,Global,0.114286,0.006568,0.333333,0.068966,0.01
1,Global,0.166667,0.024631,0.210526,0.137931,0.03
2,Global,0.259259,0.029557,0.28,0.241379,0.04
3,Global,0.333333,0.034483,0.322581,0.344828,0.05
4,Global,0.41791,0.039409,0.368421,0.482759,0.06
5,Global,0.383562,0.049261,0.318182,0.482759,0.07
6,Global,0.475,0.052545,0.372549,0.655172,0.08
7,Global,0.413043,0.07225,0.301587,0.655172,0.1
8,Global,0.358974,0.162562,0.220472,0.965517,0.2
9,Global,0.254545,0.267652,0.146597,0.965517,0.3


In [19]:
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
pca.fit(matrix)

reducedMatrixPCA = pca.transform(matrix)
print(reducedMatrixPCA.shape)

(638, 20)


In [20]:
resultsPca = getResults(reducedMatrixPCA,threshHolds,iterN=5,n_com=1)

Threshhold:  0.01
Iteration 1
Iteration 2
Convergence in 2 iterations
Threshhold:  0.03
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.04
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.05
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.06
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Convergence in 4 iterations
Threshhold:  0.07
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.08
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.1
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.2
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.3
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Convergence in 5 iterations
Threshhold:  0.4
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Threshhold:  0.5
Iteration 1
Iteration 2
Iteration 3
Iteration 4
I

In [21]:
resultsPca

Unnamed: 0,Cat,F1,FPR,Precision,TPR,th
0,Global,0.114286,0.006568,0.333333,0.068966,0.01
1,Global,0.333333,0.018062,0.421053,0.275862,0.03
2,Global,0.333333,0.026273,0.36,0.310345,0.04
3,Global,0.3,0.036125,0.290323,0.310345,0.05
4,Global,0.298507,0.045977,0.263158,0.344828,0.06
5,Global,0.328767,0.052545,0.272727,0.413793,0.07
6,Global,0.3,0.064039,0.235294,0.413793,0.08
7,Global,0.282609,0.082102,0.206349,0.448276,0.1
8,Global,0.320513,0.167488,0.19685,0.862069,0.2
9,Global,0.254545,0.267652,0.146597,0.965517,0.3


In [22]:
## Autoencoder

In [23]:
import torch
import torch.nn as nn
import torch.utils.data

In [24]:
class autoencoder(nn.Module):
    def __init__(self,inputD,encoding_dim):
        super(autoencoder, self).__init__()
        
        self.encoder = nn.Sequential()
        
        self.encoder.add_module("enc_0", nn.Linear(inputD,encoding_dim[0]))
        self.encoder.add_module("relu_0", nn.ReLU())
          
        for l in range(1,len(encoding_dim)):
            self.encoder.add_module("enc_"+str(l), nn.Linear(encoding_dim[l-1],encoding_dim[l]))
            self.encoder.add_module("encrelu_"+str(l), nn.ReLU())
                                    
        self.decoder = nn.Sequential()
        
        for l in range(len(encoding_dim)-1,0,-1):
            self.decoder.add_module("dec_"+str(l), nn.Linear(encoding_dim[l],encoding_dim[l-1]))
            self.decoder.add_module("decrelu_"+str(l), nn.ReLU())
            
        self.decoder.add_module("dec_0", nn.Linear(encoding_dim[0],inputD))
        self.decoder.add_module("decrelu_0", nn.Sigmoid())
        
        self.encoder.apply(self.init_weights)
        self.decoder.apply(self.init_weights)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def init_weights(self,m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
            m.bias.data.fill_(0.01)
    
    def representation(self, x):
        x = self.encoder(x)
        return x