In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skmultiflow.anomaly_detection import HalfSpaceTrees
import glob
from collections import deque
import dask.dataframe as da
import math
import itertools
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import classification_report
import time
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score
import warnings
warnings.filterwarnings("ignore")

# Load data


In [4]:
# Load the complete labelled dataset in one go; the cells below use it for
# label and shape sanity checks before the same file is streamed in chunks
dat1 = pd.read_csv('../datasets/revised_sub20_data_Ishu_modification.csv')

In [5]:
# Activity types that carry the outlier label (only 'sit' and 'std' appear)
dat1.loc[dat1['outlier'] == 1, 'type'].unique()

array(['sit', 'std'], dtype=object)

In [6]:
# About 1% of the rows are labelled as outliers (their share has been reduced in this revised dataset)
dat1['outlier'].value_counts() 

0    32900
1      332
Name: outlier, dtype: int64

# Read Data chunk by chunk

In [7]:
import re
# Matches a run of digits. The capturing group makes `split` keep the digit
# runs in its result instead of discarding them.
numbers = re.compile(r'(\d+)')

In [8]:
def numericalSort(value):
    """Return a natural-sort key for ``value``.

    The string is split on digit runs with the module-level ``numbers``
    pattern. Because that pattern has a capturing group, the digit runs are
    kept and sit at the odd positions of the result; they are converted to
    ``int`` so that, for example, "file10" sorts after "file2".
    """
    tokens = numbers.split(value)
    for pos in range(1, len(tokens), 2):
        tokens[pos] = int(tokens[pos])
    return tokens

In [11]:
# Files to stream, in natural (numeric-aware) order. The pattern currently
# matches a single file, as the output of the `all_files` cell below shows.
all_files = sorted(glob.glob('../datasets/revised_sub20_data_Ishu_modification.csv'), key=numericalSort)
# NOTE(review): `li` is not referenced by any cell visible in this notebook
li = []

In [13]:
# (rows, columns) of the full dataset
dat1.shape

(33232, 19)

In [14]:
# Show which files the glob pattern picked up
all_files

['../datasets/revised_sub20_data_Ishu_modification.csv']

In [15]:
window_len = 1000  # number of rows per chunk
buffer = deque()   # one DataFrame per chunk, in file/row order
for file in all_files:
    # Stream the file in window_len-row chunks rather than loading it whole.
    # Fix: read the current `file`; the previous code always read
    # all_files[0], so with several files the first one was read repeatedly.
    dfs = pd.read_csv(file, iterator=True, chunksize=window_len)
    for df in dfs:
        buffer.append(df)

In [16]:
# Number of chunks held in the buffer
len(buffer)

34

In [17]:
# Shape of the final chunk; it is shorter than window_len when the row count
# is not an exact multiple of window_len
buffer[-1].shape

(232, 19)

In [18]:
# `buffer` is a deque of pandas DataFrames. It is used like a list here; a deque was chosen so that, if we later need to append or pop at either end, those operations stay O(1).

In [19]:
# Sanity check: every file should contribute ceil(rows / window_len) chunks
(len(buffer) == len(all_files) * math.ceil(len(dat1) / window_len), len(buffer))

(True, 34)

In [20]:
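# The check above confirms that the number of chunks in the buffer equals ceil(total rows / window size) per file; the last chunk is shorter because the rows do not divide exactly by the window size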

In [21]:
# In the subject 1 data, 'sit' starts at 9479 and 'std' starts at 19771

In [22]:
reference_end = 3         # number of leading chunks that make up the initial reference
outlier_buffer = deque()
# Initial training window: chunks 0 .. reference_end-1 stacked into one DataFrame
reference = pd.concat(list(buffer)[:reference_end])

In [23]:
# Inspect the initial reference window (the first reference_end chunks concatenated)
reference

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,type,row_num,outlier
0,0,0,0,0,1.172703,-1.217849,0.633634,0.318633,0.938358,-0.134000,-1.221689,0.095745,0.650700,0.597170,0.081509,-0.071948,dws,0,0
1,1,1,1,1,1.230989,-1.218272,0.702610,0.325523,0.938504,-0.115079,-0.724416,-0.825562,0.181063,0.140922,2.183841,-0.698596,dws,1,0
2,2,2,2,2,1.220374,-1.217347,0.695971,0.325099,0.938184,-0.118826,0.424864,0.286293,0.057343,0.091450,-0.118314,0.177435,dws,2,0
3,3,3,3,3,1.196626,-1.215197,0.674119,0.324063,0.937438,-0.127249,0.289479,-0.389842,-0.170267,-0.001020,0.076020,0.182624,dws,3,0
4,4,4,4,4,1.183103,-1.220570,0.669148,0.317645,0.939295,-0.129714,-0.193776,-0.518626,-0.312347,0.015118,-0.019785,0.194854,dws,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,2995,759,759,0.575103,-1.080960,0.040832,0.255905,0.882410,-0.394798,-1.236159,-1.000644,-0.801171,-1.700119,0.092459,1.011589,dws,2995,0
2996,2996,2996,760,760,0.497197,-1.075108,0.002675,0.226861,0.879642,-0.418048,1.222504,-1.723262,-0.445513,0.163031,-0.387912,0.543246,dws,2996,0
2997,2997,2997,761,761,0.485638,-1.072850,-0.004095,0.222941,0.878565,-0.422398,-0.745496,0.017073,0.009596,0.711355,0.090398,0.632999,dws,2997,0
2998,2998,2998,762,762,0.496527,-1.102632,0.012019,0.214963,0.892398,-0.396757,-1.595081,-0.344315,-0.520372,0.314654,0.753812,0.348081,dws,2998,0


In [24]:
# Strategy for updating the reference window the model is trained on:
#   0 -> static:  keep the initial reference for the whole run
#   1 -> sliding: replace the reference with the chunk that was just scored,
#                 so each chunk is scored by a model trained on the previous chunk
#   2 -> growing: append every scored chunk to the reference; training may get
#                 slower but this sometimes gives better performance
change_reference = 1

In [25]:
# Per-chunk results, filled in by the evaluation loop
accuracy = []                # share of rows whose prediction matches the label
accuracy_only_outliers = []  # share of true outliers detected; -1 if the chunk has none

In [26]:
# Positional column indices (for .iloc) that may be fed to the model
columns_to_use_numerical_all = list(range(4, 16))
# A parsimonious subset: these 3 columns perform better than passing all of them
columns_to_use_numerical_better = [5, 7, 8]
# Columns actually used for training and prediction
columns_to_use = columns_to_use_numerical_better

In [27]:
# Score every chunk after the initial reference with a LocalOutlierFactor
# novelty detector trained on `reference`, then update `reference` according
# to `change_reference`.
#
# Side effects: adds 'pred' and 'matched' columns to each scored chunk in
# `buffer`, appends to `accuracy` / `accuracy_only_outliers`, and rebinds
# `reference`. Re-create `reference` and the two accuracy lists before
# re-running this cell, otherwise results from the previous run leak in.

# Fail fast on an unsupported mode. The previous `raise("...")` raised a plain
# string, which is itself a TypeError in Python 3 and hid the real message.
if change_reference not in (0, 1, 2):
    raise NotImplementedError("Unimplemented error for change_reference flag meaning")

start = time.time()
lof_novelty = None
for i in range(reference_end, len(buffer)):  # chunks before reference_end are the initial reference
    chunk = buffer[i]  # same object as buffer[i]; new columns are stored in the buffer

    # Train on the reference. With a static reference (mode 0) one fit is enough.
    if lof_novelty is None or change_reference != 0:
        lof_novelty = LocalOutlierFactor(n_neighbors=10, novelty=True).fit(reference.iloc[:, columns_to_use])

    # predict() returns -1 for outliers and 1 for inliers; map that to the
    # label convention of the 'outlier' column (1 = outlier, 0 = normal)
    raw_prediction = lof_novelty.predict(chunk.iloc[:, columns_to_use])
    chunk['pred'] = np.where(raw_prediction == -1, 1, 0)
    # 1 where the prediction agrees with the label, 0 otherwise
    chunk['matched'] = np.where(chunk['outlier'] == chunk['pred'], 1, 0)

    print("**************************************************************")
    print(f"Results for idx = {i} and rows from {chunk.index.min()} to {chunk.index.max()} is")

    # Accuracy = matched rows / all rows in the chunk
    acc = float(chunk['matched'].mean())
    print('Accuracy is ' + str(acc))
    accuracy.append(acc)

    # Detection rate on the true outliers only; -1 marks chunks without any
    outlier_rows = chunk[chunk['outlier'] == 1]
    if len(outlier_rows) > 0:
        acc_only_outliers = float(outlier_rows['matched'].mean())
        print('Outlier accuracy is ' + str(acc_only_outliers))
        accuracy_only_outliers.append(acc_only_outliers)
    else:
        print("No outliers in this buffer chunk")
        accuracy_only_outliers.append(-1)

    # Update the reference for the next chunk (mode 0 leaves it untouched)
    if change_reference == 1:
        reference = chunk
    elif change_reference == 2:
        reference = pd.concat([reference, chunk])

end = time.time()
total_time = end - start

**************************************************************
Results for idx = 3 and rows from 3000 to 3999 is
Accuracy is 0.991
No outliers in this buffer chunk
**************************************************************
Results for idx = 4 and rows from 4000 to 4999 is
Accuracy is 0.988
No outliers in this buffer chunk
**************************************************************
Results for idx = 5 and rows from 5000 to 5999 is
Accuracy is 0.865
No outliers in this buffer chunk
**************************************************************
Results for idx = 6 and rows from 6000 to 6999 is
Accuracy is 0.987
No outliers in this buffer chunk
**************************************************************
Results for idx = 7 and rows from 7000 to 7999 is
Accuracy is 0.853
No outliers in this buffer chunk
**************************************************************
Results for idx = 8 and rows from 8000 to 8999 is
Accuracy is 0.951
No outliers in this buffer chunk
****************

In [28]:
# Unweighted mean of the per-chunk accuracies. The final chunk is shorter than
# the others, so this is not exactly the row-level accuracy.
np.mean(accuracy)

0.8538509454949944

In [29]:
# Mean detection rate over the chunks that contain outliers; chunks without
# outliers were recorded with the sentinel -1 and are left out
np.mean([rate for rate in accuracy_only_outliers if rate != -1])

0.8113878080415046

In [30]:
# Wall-clock duration of the evaluation loop (model fitting, prediction and printing)
print(f"Total time taken is {total_time} seconds")

Total time taken is 0.4248628616333008 seconds


In [31]:
# Flatten labels and predictions of all scored chunks (those after the
# initial reference) into two parallel lists for the global metrics
truth, prediction = [], []
for scored_chunk in list(buffer)[reference_end:]:
    truth.extend(scored_chunk.outlier)
    prediction.extend(scored_chunk.pred)

In [32]:
# Rows are the true labels, columns the predicted labels (0 = normal, 1 = outlier)
print(pd.DataFrame(confusion_matrix(truth,prediction)))

       0     1
0  25405  4495
1     29   303


In [33]:
# F1 for the outlier class; '{:.5}' keeps 5 significant digits, not 5 decimal places
print('F1 Score is {:.5}'.format(f1_score(truth,prediction)))

F1 Score is 0.11813


In [34]:
# Recall: share of the true outliers that were flagged as outliers
print('Recall is {:.5}'.format(recall_score(truth,prediction)))

Recall is 0.91265


In [35]:
# Precision: share of the rows flagged as outliers that really are outliers
print('Precision is {:.5}'.format(precision_score(truth,prediction)))

Precision is 0.063151
