# Anomaly detection - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
random_state = 5
cv = 10

In [6]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id', 'feature_type']

## Prepare datasets

In [7]:
labelled_path = '../downloads/anomaly-detection/labelled/'

In [8]:
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Sort the dataset randomly.
labelled = labelled.sample(labelled.shape[0], random_state=random_state)
labelled.sample(10)

(2272, 14)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_type,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
89,48394822,1,29015240,way,0,1,0,2,0,0,0,0,71.5,0.01
1945,46604435,1,2642938,relation,0,1,0,2,0,0,0,0,18100.0,0.0
483,47540344,1,432464028,way,0,1,0,3,0,0,0,0,5190.0,0.06
484,47540338,1,173095696,way,0,1,0,5,0,0,0,0,3270.0,0.04
36,48676544,1,10375182,way,0,1,0,4,0,0,0,0,2920.0,0.06
650,47526660,1,1588547871,node,0,1,0,4,0,1,3,-1,0.0,0.0
2254,44956930,1,98736078,way,0,1,0,9,0,0,-4,0,331000.0,0.11
665,47516493,1,246286774,way,0,1,0,3,0,0,0,0,255.0,0.03
790,47510050,1,333492762,way,0,1,0,5,0,0,0,0,1120000.0,0.11
732,47510935,1,35609160,way,0,1,0,8,0,0,0,0,7520.0,0.06


In [9]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2272, 14)
Shape after dropping duplicates: (2272, 14)


In [10]:
labelled.describe()

Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
count,2270.0,2272.0,2270.0,2270.0,2272.0,2272.0,2272.0,2272.0,2272.0,2272.0,2272.0,2270.0,2272.0
mean,47300000.0,0.95,275000000.0,0.00264,1.0,0.0,6.07,0.04,0.02,-0.14,0.04,16600000.0,0.16
std,679000.0,0.31,594000000.0,0.0513,0.05,0.0,5.32,0.2,0.13,6.91,0.24,324000000.0,1.07
min,44800000.0,-1.0,173000.0,0.0,0.0,0.0,2.0,0.0,0.0,-41.0,-1.0,0.0,0.0
25%,47300000.0,1.0,28900000.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,391.0,0.02
50%,47400000.0,1.0,148000000.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,3240.0,0.06
75%,47500000.0,1.0,334000000.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,22500.0,0.11
max,49700000.0,1.0,4850000000.0,1.0,1.0,0.0,62.0,1.0,1.0,39.0,3.0,12800000000.0,30.89


In [11]:
labelled.sum(numeric_only=True)

changeset_id                 1.08e+11
changeset_harmful            2.16e+03
feature_id                   6.24e+11
action_create                6.00e+00
action_modify                2.27e+03
action_delete                0.00e+00
feature_version              1.38e+04
highway_tag_created          9.40e+01
highway_tag_deleted          3.70e+01
highway_value_difference    -3.29e+02
primary_tags_difference      8.00e+01
area_of_feature_bbox         3.78e+10
length_of_longest_segment    3.62e+02
dtype: float64

In [12]:
inliers = labelled[labelled['changeset_harmful'] == 1]
print('Total inliers: {}'.format(inliers.shape))

outliers = labelled[labelled['changeset_harmful'] == -1]
print('Total outliers: {}'.format(outliers.shape))

Total inliers: (2217, 14)
Total outliers: (55, 14)


In [13]:
total = labelled.shape[0]

# 80% of the inliers will be used for training.
training = inliers.iloc[0:round(0.8 * total)]
training = training.reset_index(drop=True)
print('Training dataset (only inliers): {}'.format(training.shape))

# 20% of the inliers will be used for validation.
validation = inliers.iloc[round(0.8 * total):]
validation = validation.reset_index(drop=True)
print('Validation dataset (only inliers): {}'.format(validation.shape))

# 100% of the outliers will be used for validation too.
validation = pd.concat([validation, outliers])
validation = validation.reset_index(drop=True)
print('Validation dataset (inliers + outliers): {}'.format(validation.shape))

Training dataset (only inliers): (1818, 14)
Validation dataset (only inliers): (399, 14)
Validation dataset (inliers + outliers): (454, 14)


## Model training

In [14]:
X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [15]:
model = IsolationForest(random_state=random_state)

In [16]:
model = IsolationForest(random_state=random_state)
model.fit(X)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=5,
        verbose=0)

In [17]:
training['prediction'] = model.predict(X)
training.to_csv(labelled_path + 'training-review.csv', index=False)

In [18]:
from sklearn.externals import joblib

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path, compress=True)

['../gabbar/trained/model.pkl']

In [19]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,0,0
Labelled good,182,1636


In [20]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.90      0.95      1818

avg / total       1.00      0.90      0.95      1818



## Model validation

In [21]:
validation.groupby('changeset_harmful').size()

changeset_harmful
-1     55
 1    399
dtype: int64

In [22]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [23]:
validation['prediction'] = model.predict(vX)
validation.to_csv(labelled_path + 'validation-review.csv', index=False)

In [24]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,40,15
Labelled good,41,358


In [25]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.49      0.73      0.59        55
          1       0.96      0.90      0.93       399

avg / total       0.90      0.88      0.89       454



## Model testing

In [26]:
unlabelled_path = testing_path = '../downloads/anomaly-detection/unlabelled/'

In [27]:
unlabelled = pd.read_csv(unlabelled_path + 'attributes.csv')
print(unlabelled.shape)
unlabelled.sample(10)

(121, 14)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_type,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
116,49176015,,491569013,way,0,1,0,2,0,0,6,0,3452.44,0.09
45,49178701,,470737498,way,0,1,0,3,0,0,0,0,2498.07,0.05
82,49177236,,497501477,way,1,0,0,1,1,0,0,1,5745.54,0.03
57,49178100,,441496260,way,0,1,0,2,0,0,0,0,905.09,0.05
96,49176660,,103705779,way,0,1,0,4,0,0,0,0,91.4,0.01
103,49176385,,447119456,way,0,1,0,3,0,0,0,0,20420.57,0.05
8,49180363,,497517834,way,1,0,0,2,1,0,0,1,852.83,0.03
93,49176720,,125299795,way,0,1,0,6,0,0,0,0,1066.76,0.17
53,49178304,,5484749,way,0,1,0,6,0,0,0,0,8611.57,0.28
90,49176815,,180687803,way,0,1,0,7,0,0,-19,0,70.27,0.03


In [28]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(unlabelled.shape))
unlabelled = unlabelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(unlabelled.shape))

Shape before dropping duplicates: (121, 14)
Shape after dropping duplicates: (121, 14)


In [29]:
# Using all of the unlabelled dataset for testing.
testing = unlabelled.sample(unlabelled.shape[0])

In [30]:
testing.describe()

Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
count,121.0,0.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0,121.0
mean,49200000.0,,434000000.0,0.03,0.96,0.00826,4.15,0.07,0.04,0.07,0.02,4720000.0,0.11
std,1410.0,,835000000.0,0.18,0.2,0.0909,2.9,0.25,0.2,10.32,0.33,38000000.0,0.24
min,49200000.0,,4400000.0,0.0,0.0,0.0,1.0,0.0,0.0,-36.0,-1.0,0.0,0.0
25%,49200000.0,,104000000.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,427.0,0.03
50%,49200000.0,,333000000.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,6220.0,0.05
75%,49200000.0,,451000000.0,0.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,37800.0,0.1
max,49200000.0,,4890000000.0,1.0,1.0,1.0,18.0,1.0,1.0,38.0,1.0,404000000.0,1.74


In [31]:
testing.sum(numeric_only=True)

changeset_id                 5.95e+09
changeset_harmful            0.00e+00
feature_id                   5.25e+10
action_create                4.00e+00
action_modify                1.16e+02
action_delete                1.00e+00
feature_version              5.02e+02
highway_tag_created          8.00e+00
highway_tag_deleted          5.00e+00
highway_value_difference     8.00e+00
primary_tags_difference      3.00e+00
area_of_feature_bbox         5.72e+08
length_of_longest_segment    1.37e+01
dtype: float64

In [32]:
tX = testing.drop(non_training_attributes, axis=1)
ty = testing['changeset_harmful']

In [33]:
testing['prediction'] = model.predict(tX)
testing.to_csv(unlabelled_path + 'testing-review.csv', index=False)

In [34]:
tharmful_count = testing[testing['prediction'] == -1].shape[0]
tnot_harmful_count = testing[testing['prediction'] == 1].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 98
Predicted harmful: 23
Percentage harmful: 19.01%
