# Anomaly detection - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
random_state = 5
cv = 10

In [6]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id']

## Prepare datasets

In [7]:
labelled_path = '../downloads/anomaly-detection/labelled/'

In [8]:
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Sort the dataset randomly.
labelled = labelled.sample(labelled.shape[0], random_state=random_state)
labelled.head()

(2272, 12)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,geometry_type_node,geometry_type_way,geometry_type_relation,feature_version,highway_tag_deleted,highway_value_difference
1503,47332261,1,22753759,0,1,0,0,1,0,5,0,0
2162,46038962,1,4072253881,0,1,0,1,0,0,2,0,-9
544,47538955,1,39383065,0,1,0,0,1,0,4,0,0
1002,47455929,1,22944171,0,1,0,0,1,0,25,0,0
1022,47451595,1,173303722,0,1,0,0,1,0,3,0,0


In [9]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2272, 12)
Shape after dropping duplicates: (2272, 12)


In [10]:
inliers = labelled[labelled['changeset_harmful'] == 1]
print('Total inliers: {}'.format(inliers.shape))

outliers = labelled[labelled['changeset_harmful'] == -1]
print('Total outliers: {}'.format(outliers.shape))

Total inliers: (2217, 12)
Total outliers: (55, 12)


In [11]:
total = labelled.shape[0]

# 80% of the inliers will be used for training.
training = inliers.iloc[0:round(0.8 * total)]
training = training.reset_index(drop=True)
print('Training dataset (only inliers): {}'.format(training.shape))

# 20% of the inliers will be used for validation.
validation = inliers.iloc[round(0.8 * total):]
validation = validation.reset_index(drop=True)
print('Validation dataset (only inliers): {}'.format(validation.shape))

# 100% of the outliers will be used for validation too.
validation = pd.concat([validation, outliers])
validation = validation.reset_index(drop=True)
print('Validation dataset (inliers + outliers): {}'.format(validation.shape))

Training dataset (only inliers): (1818, 12)
Validation dataset (only inliers): (399, 12)
Validation dataset (inliers + outliers): (454, 12)


## Model training

In [12]:
X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [13]:
model = IsolationForest()
model.fit(X)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=100, n_jobs=1, random_state=None,
        verbose=0)

In [14]:
training['prediction'] = model.predict(X)
training.to_csv(labelled_path + 'training-review.csv', index=False)

In [15]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,0,0
Labelled good,182,1636


In [16]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.90      0.95      1818

avg / total       1.00      0.90      0.95      1818



In [17]:
scores = cross_val_score(model, X, y, cv=cv, scoring='precision')
print('Precision on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='recall')
print('Recall on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
print('F1 score on training: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on training: 1.0 (0.0)
Recall on training: 0.9 (0.03)
F1 score on training: 0.95 (0.01)


## Model validation

In [18]:
validation.groupby('changeset_harmful').size()

changeset_harmful
-1     55
 1    399
dtype: int64

In [19]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [20]:
validation['prediction'] = model.predict(vX)
validation.to_csv(labelled_path + 'validation-review.csv', index=False)

In [21]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,27,28
Labelled good,46,353


In [22]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

         -1       0.37      0.49      0.42        55
          1       0.93      0.88      0.91       399

avg / total       0.86      0.84      0.85       454



In [23]:
scores = cross_val_score(model, vX, vy, cv=cv, scoring='precision')
print('Precision on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, vX, vy, cv=cv, scoring='recall')
print('Recall on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

scores = cross_val_score(model, vX, vy, cv=cv, scoring='f1')
print('F1 score on validation: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Precision on validation: 0.88 (0.3)
Recall on validation: 0.82 (0.28)
F1 score on validation: 0.85 (0.29)


## Model testing

In [24]:
unlabelled_path = testing_path = '../downloads/anomaly-detection/unlabelled/'

In [25]:
unlabelled = pd.read_csv(unlabelled_path + 'attributes.csv')
print(unlabelled.shape)
unlabelled.head()

(121, 11)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,action_create,action_modify,action_delete,geometry_type_node,geometry_type_way,geometry_type_relation,feature_version,highway_tag_deleted
0,49180736,,497516862,0,1,0,0,1,0,2,0
1,49180683,,30613789,0,1,0,0,1,0,4,0
2,49180628,,103763048,0,1,0,0,1,0,11,0
3,49180608,,4482158804,0,1,0,1,0,0,2,0
4,49180580,,374883613,0,1,0,0,1,0,2,0


In [26]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(unlabelled.shape))
unlabelled = unlabelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(unlabelled.shape))

Shape before dropping duplicates: (121, 11)
Shape after dropping duplicates: (121, 11)


In [27]:
# Using all of the unlabelled dataset for testing.
testing = unlabelled.sample(unlabelled.shape[0])

In [28]:
tX = testing.drop(non_training_attributes, axis=1)
ty = testing['changeset_harmful']

In [29]:
testing['prediction'] = model.predict(tX)
testing.to_csv(unlabelled_path + 'testing-review.csv', index=False)

ValueError: Number of features of the model must match the input. Model n_features is 9 and input n_features is 8 

In [None]:
tharmful_count = testing[testing['prediction'] == -1].shape[0]
tnot_harmful_count = testing[testing['prediction'] == 1].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))