# Anomaly detection - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
random_state = 5
cv = 10

In [6]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id']

## Prepare datasets

In [7]:
labelled_path = '../downloads/anomaly-detection/labelled/'

In [8]:
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Sort the dataset randomly.
labelled = labelled.sample(labelled.shape[0], random_state=random_state)
labelled.sample(10)

(2272, 13)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
1635,47303089,1,26764775,0,1,0,6,0,0,0,0,5320.0,0.0344
9,49489274,1,113139810,0,1,0,3,0,0,0,0,5700.0,0.147
1109,47431991,1,103917208,0,1,0,3,0,0,0,0,481.0,0.0726
1270,47400565,1,205660934,0,1,0,4,0,0,0,0,18.4,0.0095
1274,47400560,1,101201583,0,1,0,4,0,0,0,0,13.4,0.0072
1414,47364755,1,196929888,0,1,0,3,0,0,0,0,4090.0,0.0612
1604,47304701,1,157756559,0,1,0,4,0,0,0,0,15300.0,0.0821
1726,47215839,1,95253507,0,1,0,2,0,0,0,0,223.0,0.0818
68,48461675,1,11421619,0,1,0,5,0,0,36,0,8420000.0,0.304
1191,47409448,1,30609404,0,1,0,2,0,0,0,0,1860.0,0.0


In [9]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2272, 13)
Shape after dropping duplicates: (2272, 13)


In [10]:
labelled.sum()

changeset_id                 1.08e+11
changeset_harmful            2.16e+03
feature_id                   6.24e+11
action_create                6.00e+00
action_modify                2.27e+03
action_delete                0.00e+00
feature_version              1.38e+04
highway_tag_created          9.40e+01
highway_tag_deleted          3.70e+01
highway_value_difference    -3.29e+02
primary_tags_difference      8.00e+01
area_of_feature_bbox         3.78e+10
length_of_longest_segment    3.62e+02
dtype: float64

In [11]:
inliers = labelled[labelled['changeset_harmful'] == 1]
print('Total inliers: {}'.format(inliers.shape))

outliers = labelled[labelled['changeset_harmful'] == -1]
print('Total outliers: {}'.format(outliers.shape))

Total inliers: (2217, 13)
Total outliers: (55, 13)


In [12]:
total = labelled.shape[0]

# 80% of the inliers will be used for training.
training = inliers.iloc[0:round(0.8 * total)]
training = training.reset_index(drop=True)
print('Training dataset (only inliers): {}'.format(training.shape))

# 20% of the inliers will be used for validation.
validation = inliers.iloc[round(0.8 * total):]
validation = validation.reset_index(drop=True)
print('Validation dataset (only inliers): {}'.format(validation.shape))

# 100% of the outliers will be used for validation too.
validation = pd.concat([validation, outliers])
validation = validation.reset_index(drop=True)
print('Validation dataset (inliers + outliers): {}'.format(validation.shape))

Training dataset (only inliers): (1818, 13)
Validation dataset (only inliers): (399, 13)
Validation dataset (inliers + outliers): (454, 13)


## Model training

In [13]:
X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [14]:
model = IsolationForest(random_state=random_state)
model.fit(X)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=5,
        verbose=0)

In [15]:
training['prediction'] = model.predict(X)
training.to_csv(labelled_path + 'training-review.csv', index=False)

In [16]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,0,0
Labelled good,182,1636


In [17]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.90      0.95      1818

avg / total       1.00      0.90      0.95      1818



In [18]:
scores = cross_val_score(model, X, y, cv=cv, scoring='precision')
print('Precision on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='recall')
print('Recall on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
print('F1 score on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on training: 1.0 (0.0)
Recall on training: 0.9 (0.03)
F1 score on training: 0.95 (0.02)


## Model validation

In [19]:
validation.groupby('changeset_harmful').size()

changeset_harmful
-1     55
 1    399
dtype: int64

In [20]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [21]:
validation['prediction'] = model.predict(vX)
validation.to_csv(labelled_path + 'validation-review.csv', index=False)

In [22]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,40,15
Labelled good,41,358


In [23]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.49      0.73      0.59        55
          1       0.96      0.90      0.93       399

avg / total       0.90      0.88      0.89       454



In [24]:
scores = cross_val_score(model, vX, vy, cv=cv, scoring='precision')
print('Precision on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, vX, vy, cv=cv, scoring='recall')
print('Recall on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, vX, vy, cv=cv, scoring='f1')
print('F1 score on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on validation: 0.88 (0.3)
Recall on validation: 0.85 (0.28)
F1 score on validation: 0.86 (0.29)


## Model testing

In [25]:
unlabelled_path = testing_path = '../downloads/anomaly-detection/unlabelled/'

In [26]:
unlabelled = pd.read_csv(unlabelled_path + 'attributes.csv')
print(unlabelled.shape)
unlabelled.sample(10)

(121, 13)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment
99,49176540,,497497987,0,1,0,2,0,0,0,0,19518.47,0.11
36,49178949,,496777385,0,1,0,2,0,0,2,0,419.06,0.14
65,49177839,,406374434,0,1,0,4,0,0,0,0,93962.93,0.05
49,49178600,,412811196,0,1,0,2,0,0,0,0,1791.33,0.03
25,49179465,,379184853,0,1,0,3,0,0,0,0,16350.73,0.16
44,49178718,,475566443,0,1,0,2,0,0,0,0,6955.76,0.04
52,49178363,,309626776,0,1,0,3,0,0,-1,0,284.28,0.03
20,49179690,,38301337,0,1,0,7,0,0,-3,0,27907.9,0.16
97,49176643,,198877204,0,1,0,4,0,0,0,0,20681.42,0.12
109,49176189,,100190055,0,1,0,3,0,0,-13,0,58657.6,0.07


In [27]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(unlabelled.shape))
unlabelled = unlabelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(unlabelled.shape))

Shape before dropping duplicates: (121, 13)
Shape after dropping duplicates: (121, 13)


In [28]:
# Using all of the unlabelled dataset for testing.
testing = unlabelled.sample(unlabelled.shape[0])

In [29]:
tX = testing.drop(non_training_attributes, axis=1)
ty = testing['changeset_harmful']

In [30]:
testing['prediction'] = model.predict(tX)
testing.to_csv(unlabelled_path + 'testing-review.csv', index=False)

In [31]:
tharmful_count = testing[testing['prediction'] == -1].shape[0]
tnot_harmful_count = testing[testing['prediction'] == 1].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 98
Predicted harmful: 23
Percentage harmful: 19.01%
