# Anomaly detection - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('ticks')

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
random_state = 5
cv = 10

In [6]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'feature_id', 'new_tags', 'old_tags']

## Labelled dataset

In [7]:
labelled_path = '../downloads/highway-classifier/labelled/'

In [8]:
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Sort the dataset randomly.
labelled = labelled.sample(labelled.shape[0], random_state=random_state)
labelled.head()

(2732, 16)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_version,action_create,action_modify,action_delete,geometry_type_node,geometry_type_way,geometry_type_relation,geometry_line_distance,geometry_kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
2024,47069264,0,4748048048,2,0,1,0,1,0,0,0.0,0,11,11,,{operator=SPTrans}
2311,46527869,0,62794308,3,0,1,0,1,0,0,0.0,0,187,579,,{highway=turning_loop}
1585,47361809,0,23651285,6,0,1,0,0,1,0,0.16,0,1281,10,,{surface=asphalt}
2166,46892332,0,45105818,6,0,1,0,0,1,0,0.03,0,1,606,{highway=footway},{highway=cycleway}
1323,47419169,0,73058399,18,0,1,0,0,1,0,0.25,0,477,1498,,{surface=asphalt}


In [9]:
def one_to_minus_one(number):
    if number == 1: return -1
    else: return number

In [10]:
def zero_to_one(number):
    if number == 0: return 1
    else: return number

In [11]:
labelled['changeset_harmful'] = labelled['changeset_harmful'].apply(one_to_minus_one)
labelled['changeset_harmful'] = labelled['changeset_harmful'].apply(zero_to_one)

In [12]:
# Drop all duplicate samples.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled.drop_duplicates(subset=['changeset_id', 'feature_id'])
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2732, 16)
Shape after dropping duplicates: (2732, 16)


In [13]:
# Fill null values in tags with empty string.
labelled['old_tags'] = labelled['old_tags'].fillna('')
labelled['new_tags'] = labelled['new_tags'].fillna('')

In [14]:
# Distribution of good and harmful changesets?
labelled.groupby('changeset_harmful').size()

changeset_harmful
-1      77
 1    2655
dtype: int64

In [15]:
X = labelled.drop(non_training_attributes, axis=1)
y = labelled['changeset_harmful']

In [16]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [18]:
model = IsolationForest(max_samples=100, random_state=random_state)
model.fit(X_train)

IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples=100, n_estimators=100, n_jobs=1, random_state=5,
        verbose=0)

In [19]:
y_model = model.predict(X_test)

In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_model))

             precision    recall  f1-score   support

         -1       0.06      0.32      0.10        19
          1       0.98      0.88      0.93       801

avg / total       0.96      0.87      0.91       820



In [21]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_model)
matrix = pd.DataFrame(matrix, index=['Labelled harmful', 'Labelled good'], columns=['Predicted harmful', 'Predicted good'])
matrix

Unnamed: 0,Predicted harmful,Predicted good
Labelled harmful,6,13
Labelled good,94,707
