# Anomaly detection - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
random_state = 5
cv = 10

In [6]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id']

## Prepare datasets

In [7]:
labelled_path = '../downloads/anomaly-detection/labelled/'

In [8]:
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Sort the dataset randomly.
labelled = labelled.sample(labelled.shape[0], random_state=random_state)
labelled.sample(10)

(2272, 13)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,geometry_type_node,geometry_type_way,geometry_type_relation,feature_version,highway_tag_deleted,highway_value_difference,primary_tags_difference
925,47492943,1,484439171,0,1,0,0,1,0,2,0,0,0
1497,47335473,1,222773875,0,1,0,0,1,0,2,0,0,0
1750,47202543,1,227044070,0,1,0,0,1,0,4,0,0,0
2186,45543420,1,240300038,0,1,0,0,1,0,3,0,-3,0
1232,47403175,1,38846765,0,1,0,0,1,0,5,0,0,0
1780,47118073,1,173851207,0,1,0,0,1,0,6,0,0,0
53,48578093,1,148898384,0,1,0,0,1,0,8,0,-2,0
2139,46332174,1,24788407,0,1,0,0,1,0,11,0,0,0
1796,47067457,1,13224558,0,1,0,0,1,0,7,0,0,0
286,47902615,1,27520214,0,1,0,0,1,0,14,0,0,0


In [9]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2272, 13)
Shape after dropping duplicates: (2272, 13)


In [10]:
labelled.sum()

changeset_id                107510630884
changeset_harmful                   2162
feature_id                  624409661355
action_create                          6
action_modify                       2266
action_delete                          0
geometry_type_node                   120
geometry_type_way                   2146
geometry_type_relation                 6
feature_version                    13795
highway_tag_deleted                   37
highway_value_difference            -498
primary_tags_difference               80
dtype: int64

In [11]:
inliers = labelled[labelled['changeset_harmful'] == 1]
print('Total inliers: {}'.format(inliers.shape))

outliers = labelled[labelled['changeset_harmful'] == -1]
print('Total outliers: {}'.format(outliers.shape))

Total inliers: (2217, 13)
Total outliers: (55, 13)


In [12]:
total = labelled.shape[0]

# 80% of the inliers will be used for training.
training = inliers.iloc[0:round(0.8 * total)]
training = training.reset_index(drop=True)
print('Training dataset (only inliers): {}'.format(training.shape))

# 20% of the inliers will be used for validation.
validation = inliers.iloc[round(0.8 * total):]
validation = validation.reset_index(drop=True)
print('Validation dataset (only inliers): {}'.format(validation.shape))

# 100% of the outliers will be used for validation too.
validation = pd.concat([validation, outliers])
validation = validation.reset_index(drop=True)
print('Validation dataset (inliers + outliers): {}'.format(validation.shape))

Training dataset (only inliers): (1818, 13)
Validation dataset (only inliers): (399, 13)
Validation dataset (inliers + outliers): (454, 13)


## Model training

In [13]:
X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [14]:
model = IsolationForest(random_state=random_state)
model.fit(X)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=5,
        verbose=0)

In [15]:
training['prediction'] = model.predict(X)
training.to_csv(labelled_path + 'training-review.csv', index=False)

In [16]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,0,0
Labelled good,183,1635


In [17]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.90      0.95      1818

avg / total       1.00      0.90      0.95      1818



In [18]:
scores = cross_val_score(model, X, y, cv=cv, scoring='precision')
print('Precision on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='recall')
print('Recall on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
print('F1 score on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on training: 1.0 (0.0)
Recall on training: 0.9 (0.03)
F1 score on training: 0.95 (0.01)


## Model validation

In [19]:
validation.groupby('changeset_harmful').size()

changeset_harmful
-1     55
 1    399
dtype: int64

In [20]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [21]:
validation['prediction'] = model.predict(vX)
validation.to_csv(labelled_path + 'validation-review.csv', index=False)

In [22]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,31,24
Labelled good,47,352


In [23]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.40      0.56      0.47        55
          1       0.94      0.88      0.91       399

avg / total       0.87      0.84      0.85       454



In [24]:
scores = cross_val_score(model, vX, vy, cv=cv, scoring='precision')
print('Precision on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, vX, vy, cv=cv, scoring='recall')
print('Recall on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, vX, vy, cv=cv, scoring='f1')
print('F1 score on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on validation: 0.88 (0.3)
Recall on validation: 0.82 (0.28)
F1 score on validation: 0.85 (0.29)


## Model testing

In [25]:
unlabelled_path = testing_path = '../downloads/anomaly-detection/unlabelled/'

In [26]:
unlabelled = pd.read_csv(unlabelled_path + 'attributes.csv')
print(unlabelled.shape)
unlabelled.sample(10)

(121, 13)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,geometry_type_node,geometry_type_way,geometry_type_relation,feature_version,highway_tag_deleted,highway_value_difference,primary_tags_difference
38,49178847,,491867965,0,1,0,0,1,0,2,0,0,0
60,49177921,,497504279,0,1,0,0,1,0,2,0,0,0
12,49180089,,384997943,0,1,0,0,1,0,2,0,0,0
49,49178600,,412811196,0,1,0,0,1,0,2,0,0,0
94,49176681,,416258662,0,0,1,0,1,0,2,1,36,-1
46,49178667,,450504413,0,1,0,0,1,0,3,0,2,0
116,49176015,,491569013,0,1,0,0,1,0,2,0,6,0
89,49176920,,469859613,0,1,0,0,1,0,2,0,0,0
55,49178258,,408806086,0,1,0,0,1,0,2,0,0,0
66,49177815,,1657992650,0,1,0,1,0,0,4,0,0,0


In [27]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(unlabelled.shape))
unlabelled = unlabelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(unlabelled.shape))

Shape before dropping duplicates: (121, 13)
Shape after dropping duplicates: (121, 13)


In [28]:
# Using all of the unlabelled dataset for testing.
testing = unlabelled.sample(unlabelled.shape[0])

In [29]:
tX = testing.drop(non_training_attributes, axis=1)
ty = testing['changeset_harmful']

In [30]:
testing['prediction'] = model.predict(tX)
testing.to_csv(unlabelled_path + 'testing-review.csv', index=False)

In [31]:
tharmful_count = testing[testing['prediction'] == -1].shape[0]
tnot_harmful_count = testing[testing['prediction'] == 1].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 101
Predicted harmful: 20
Percentage harmful: 16.53%
