# Anomaly detection - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
random_state = 5
cv = 10

In [6]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id', 'feature_type']

## Prepare datasets

In [7]:
labelled_path = '../downloads/anomaly-detection/labelled/'

In [8]:
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Sort the dataset randomly.
labelled = labelled.sample(labelled.shape[0], random_state=random_state)
labelled.sample(10)

(2272, 14)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_type,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
648,47527246,1,22650068,way,0,1,0,7,0,0,0,0,221.36,0.0034
911,47496095,1,28798331,way,0,1,0,21,0,0,0,0,728.45,0.0531
230,47974373,1,460606347,way,0,1,0,4,0,0,0,0,2541.26,0.0595
1041,47448859,1,157719746,way,0,1,0,3,0,0,0,0,215734.15,0.12
587,47533332,1,37534833,way,0,1,0,9,0,0,0,0,16166.94,0.193
545,47538950,1,286494106,way,0,1,0,2,0,0,0,0,4896.89,0.0799
1072,47438116,1,207807965,way,0,1,0,3,0,0,0,0,7780.09,0.0509
1386,47375160,1,147200516,way,0,1,0,3,0,0,0,0,5380.96,0.0
323,47690703,1,485660593,way,0,1,0,2,0,0,0,0,3170.62,0.048
1820,47027745,1,481692436,way,0,1,0,4,0,0,0,0,1516.47,0.0099


In [9]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2272, 14)
Shape after dropping duplicates: (2272, 14)


In [36]:
labelled.describe?

In [10]:
labelled.describe()

Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
count,2270.0,2272.0,2270.0,2270.0,2272.0,2272.0,2272.0,2272.0,2272.0,2272.0,2272.0,2270.0,2272.0
mean,47300000.0,0.95,275000000.0,0.00264,1.0,0.0,6.07,0.04,0.02,-0.14,0.04,16600000.0,0.16
std,679000.0,0.31,594000000.0,0.0513,0.05,0.0,5.32,0.2,0.13,6.91,0.24,324000000.0,1.07
min,44800000.0,-1.0,173000.0,0.0,0.0,0.0,2.0,0.0,0.0,-41.0,-1.0,0.0,0.0
25%,47300000.0,1.0,28900000.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,391.0,0.02
50%,47400000.0,1.0,148000000.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,3240.0,0.06
75%,47500000.0,1.0,334000000.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,22500.0,0.11
max,49700000.0,1.0,4850000000.0,1.0,1.0,0.0,62.0,1.0,1.0,39.0,3.0,12800000000.0,30.89


In [11]:
labelled.sum()

changeset_id                                                      107510630884
changeset_harmful                                                         2162
feature_id                                                        624409661355
feature_type                 waynodewaywaywaywaywaywaywaywaywaywaywaywayway...
action_create                                                                6
action_modify                                                             2266
action_delete                                                                0
feature_version                                                          13795
highway_tag_created                                                         94
highway_tag_deleted                                                         37
highway_value_difference                                                  -329
primary_tags_difference                                                     80
area_of_feature_bbox                                

In [12]:
inliers = labelled[labelled['changeset_harmful'] == 1]
print('Total inliers: {}'.format(inliers.shape))

outliers = labelled[labelled['changeset_harmful'] == -1]
print('Total outliers: {}'.format(outliers.shape))

Total inliers: (2217, 14)
Total outliers: (55, 14)


In [13]:
total = labelled.shape[0]

# 80% of the inliers will be used for training.
training = inliers.iloc[0:round(0.8 * total)]
training = training.reset_index(drop=True)
print('Training dataset (only inliers): {}'.format(training.shape))

# 20% of the inliers will be used for validation.
validation = inliers.iloc[round(0.8 * total):]
validation = validation.reset_index(drop=True)
print('Validation dataset (only inliers): {}'.format(validation.shape))

# 100% of the outliers will be used for validation too.
validation = pd.concat([validation, outliers])
validation = validation.reset_index(drop=True)
print('Validation dataset (inliers + outliers): {}'.format(validation.shape))

Training dataset (only inliers): (1818, 14)
Validation dataset (only inliers): (399, 14)
Validation dataset (inliers + outliers): (454, 14)


## Model training

In [14]:
X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [15]:
model = IsolationForest(random_state=random_state)

In [16]:
scores = cross_val_score(model, X, y, cv=cv, scoring='precision')
print('Precision on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='recall')
print('Recall on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
print('F1 score on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on training: 1.0 (0.0)
Recall on training: 0.9 (0.03)
F1 score on training: 0.95 (0.02)


In [17]:
model = IsolationForest(random_state=random_state)
model.fit(X)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=5,
        verbose=0)

In [18]:
training['prediction'] = model.predict(X)
training.to_csv(labelled_path + 'training-review.csv', index=False)

In [19]:
from sklearn.externals import joblib

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path, compress=True)

['../gabbar/trained/model.pkl']

In [20]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,0,0
Labelled good,182,1636


In [21]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.90      0.95      1818

avg / total       1.00      0.90      0.95      1818



## Model validation

In [22]:
validation.groupby('changeset_harmful').size()

changeset_harmful
-1     55
 1    399
dtype: int64

In [23]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [24]:
validation['prediction'] = model.predict(vX)
validation.to_csv(labelled_path + 'validation-review.csv', index=False)

In [25]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,40,15
Labelled good,41,358


In [26]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.49      0.73      0.59        55
          1       0.96      0.90      0.93       399

avg / total       0.90      0.88      0.89       454



## Model testing

In [27]:
unlabelled_path = testing_path = '../downloads/anomaly-detection/unlabelled/'

In [28]:
unlabelled = pd.read_csv(unlabelled_path + 'attributes.csv')
print(unlabelled.shape)
unlabelled.sample(10)

(26639, 14)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_type,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
11270,49177985,,15692915,way,0,1,0,3,0,0,0,0,932000.0,0.51
20126,49177468,,237577115,way,0,1,0,20,0,0,0,0,546000.0,0.2
21386,49177209,,45067449,way,0,1,0,6,0,0,0,0,68.5,0.01
10202,49178037,,497505366,way,1,0,0,1,1,0,0,1,5860.0,0.04
21583,49177170,,497501191,way,1,0,0,1,1,0,0,1,335.0,0.01
13337,49177985,,15745847,way,0,1,0,8,0,0,0,0,3860.0,0.09
5941,49179427,,176230427,way,0,1,0,7,0,0,0,0,369000.0,0.21
11953,49177985,,15709559,way,0,1,0,7,0,0,0,0,24800.0,0.19
15716,49177985,,15768512,way,0,1,0,4,0,0,0,0,31500000.0,1.58
22415,49176975,,42998630,way,0,1,0,8,0,0,0,0,7090.0,0.06


In [29]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(unlabelled.shape))
unlabelled = unlabelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(unlabelled.shape))

Shape before dropping duplicates: (26639, 14)
Shape after dropping duplicates: (26578, 14)


In [30]:
# Using all of the unlabelled dataset for testing.
testing = unlabelled.sample(unlabelled.shape[0])

In [31]:
testing.describe()

Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
count,26600.0,15.0,26600.0,26578.0,26578.0,26578.0,26578.0,26578.0,26578.0,26578.0,26578.0,26600.0,26578.0
mean,49200000.0,1.0,350000000.0,0.28,0.71,0.02,3.64,0.28,0.02,0.42,0.27,2530000.0,0.16
std,1290.0,0.0,688000000.0,0.45,0.45,0.13,3.45,0.45,0.13,5.21,0.48,45100000.0,0.27
min,49200000.0,1.0,2950000.0,0.0,0.0,0.0,1.0,0.0,0.0,-39.0,-2.0,0.0,0.0
25%,49200000.0,1.0,15800000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,720.0,0.04
50%,49200000.0,1.0,232000000.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,6770.0,0.08
75%,49200000.0,1.0,497000000.0,1.0,1.0,0.0,4.0,1.0,0.0,0.0,1.0,65400.0,0.17
max,49200000.0,1.0,4890000000.0,1.0,1.0,1.0,60.0,1.0,1.0,41.0,3.0,4990000000.0,7.28


In [32]:
testing.sum()

changeset_id                                                     1307058994515
changeset_harmful                                                           15
feature_id                                                       9295031220250
feature_type                 waywaywaywaywaywaywaywaywaywaywaywaywaywaywayw...
action_create                                                             7330
action_modify                                                            18817
action_delete                                                              431
feature_version                                                          96682
highway_tag_created                                                       7553
highway_tag_deleted                                                        458
highway_value_difference                                                 11175
primary_tags_difference                                                   7120
area_of_feature_bbox                                

In [33]:
tX = testing.drop(non_training_attributes, axis=1)
ty = testing['changeset_harmful']

In [34]:
testing['prediction'] = model.predict(tX)
testing.to_csv(unlabelled_path + 'testing-review.csv', index=False)

In [35]:
tharmful_count = testing[testing['prediction'] == -1].shape[0]
tnot_harmful_count = testing[testing['prediction'] == 1].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 16375
Predicted harmful: 10203
Percentage harmful: 38.39%
