# Local outlier factor - Gabbar

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request the 'retina' figure format from the inline backend.
%config InlineBackend.figure_format = 'retina'

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported below -- confirm
# that a blanket filter is intended.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# The vendored alias sklearn.externals.joblib was deprecated in scikit-learn
# 0.21 and removed in 0.23. Prefer the standalone joblib package and fall back
# to the vendored copy only on old installs that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [4]:
# Display settings: two decimal places for floats, and never truncate columns
# when rendering a wide frame.
pd.options.display.precision = 2
pd.options.display.max_columns = None

In [5]:
# random_state: seed passed to the sampling calls so the shuffle is repeatable.
# cv: presumably the number of cross-validation folds -- it is not used in the
# cells visible here, confirm against the rest of the notebook.
random_state, cv = 5, 10

In [6]:
# Columns removed from the frame before it is handed to the model
# (identifiers, the label, and the non-numeric feature type).
non_training_attributes = [
    'changeset_id',
    'changeset_harmful',
    'feature_id',
    'feature_type',
]

## Prepare datasets

In [7]:
# Directory holding the labelled dataset. The trailing slash is required:
# the file name is appended to this value by plain string concatenation.
labelled_path = '../downloads/anomaly-detection/labelled/'

In [8]:
# Load the labelled samples and report how many rows/columns were read.
labelled = pd.read_csv(labelled_path + 'attributes.csv')
print(labelled.shape)

# Shuffle the rows reproducibly. frac=1 draws every row exactly once, which
# yields the same permutation as sampling labelled.shape[0] rows with this seed.
labelled = labelled.sample(frac=1, random_state=random_state)

# Preview a handful of rows. The preview is seeded as well, so the rows shown
# are the same on every Restart & Run All instead of changing per execution.
labelled.sample(10, random_state=random_state)

(2152, 15)


Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_type,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment,feature_name_touched
819,47509417,1,173213864,way,0,1,0,5,0,0,0,0,3074.38,0.0808,0
1957,46428224,1,28084835,way,0,1,0,7,0,0,0,0,81860.36,0.0776,0
22,48793277,1,493973632,way,0,1,0,2,0,0,0,0,83641.75,0.229,0
616,47531648,1,4409709,way,0,1,0,9,0,0,0,0,9303.95,0.125,0
1563,47303259,1,39560764,way,0,1,0,5,0,0,0,0,1216.48,0.0795,0
1340,47375444,1,147200518,way,0,1,0,3,0,0,0,0,5954.32,0.0,0
1645,47271076,1,235175103,way,0,1,0,6,0,0,0,0,27.42,0.0086,0
1507,47307997,1,332091899,way,0,1,0,2,0,0,0,0,37047.15,0.274,0
816,47509435,1,54378535,way,0,1,0,7,0,0,0,0,160485.22,0.191,0
930,47480390,1,39473445,way,0,1,0,6,0,0,0,0,616.93,0.0254,0


In [9]:
# Remove repeated (changeset_id, feature_id) pairs, keeping the first
# occurrence of each, and report the frame shape on either side of the filter.
print('Shape before dropping duplicates: {}'.format(labelled.shape))
labelled = labelled[~labelled.duplicated(subset=['changeset_id', 'feature_id'])]
print('Shape after dropping duplicates: {}'.format(labelled.shape))

Shape before dropping duplicates: (2152, 15)
Shape after dropping duplicates: (2152, 15)


In [10]:
# Split by label: rows with changeset_harmful == 1 are treated as inliers,
# rows with changeset_harmful == -1 as outliers.
inliers = labelled.loc[labelled['changeset_harmful'].eq(1)]
outliers = labelled.loc[labelled['changeset_harmful'].eq(-1)]

print('Total inliers: {}'.format(inliers.shape))
print('Total outliers: {}'.format(outliers.shape))

Total inliers: (2099, 15)
Total outliers: (53, 15)


## Model training

In [11]:
# Build the neighbour index over every labelled row (inliers and outliers
# alike), using only the training columns.
# NOTE(review): the features are passed in unscaled; with the default
# Euclidean metric a large-magnitude column such as area_of_feature_bbox is
# likely to dominate the distances -- confirm whether scaling is wanted.
model = NearestNeighbors().fit(
    labelled.drop(non_training_attributes, axis=1)
)
model

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

## Nearest neighbours of a labelled outlier

In [12]:
# First few rows labelled -1 (the outliers).
labelled.loc[labelled['changeset_harmful'] == -1].head()

Unnamed: 0,changeset_id,changeset_harmful,feature_id,feature_type,action_create,action_modify,action_delete,feature_version,highway_tag_created,highway_tag_deleted,highway_value_difference,primary_tags_difference,area_of_feature_bbox,length_of_longest_segment,feature_name_touched
2140,44929925,-1,21868906,way,0,1,0,5,0,0,16,0,1080.0,0.14,0
126,48255854,-1,490323405,way,0,1,0,2,0,0,0,1,94800.0,0.05,0
2130,45018087,-1,247378462,way,0,1,0,15,0,0,11,0,5190000.0,0.08,0
2131,45017819,-1,456458523,way,0,1,0,4,0,0,1,0,10100000.0,0.13,0
73,48388526,-1,491268465,way,0,1,0,2,0,0,-1,0,17200.0,0.13,0


In [13]:
# Query the fitted model with the row(s) of changeset 44929925. The result is
# a pair of arrays: neighbour distances and the positions of those neighbours
# in the frame the model was fitted on.
model.kneighbors(
    labelled
    .loc[labelled['changeset_id'] == 44929925]
    .drop(non_training_attributes, axis=1)
)

(array([[  0.        ,  16.01992315,  16.05116334,  17.69844641,
          18.80077688]]), array([[ 114, 1447, 1441, 1986, 1661]]))

In [18]:
# Show the farthest of the neighbours returned for changeset 44929925.
# The position is taken from the kneighbors() result rather than being typed
# in by hand, so this cell stays correct if the data, the seed, or the
# execution order changes.
outlier_features = (
    labelled[labelled.changeset_id == 44929925]
    .drop(non_training_attributes, axis=1)
)
_, neighbour_positions = model.kneighbors(outlier_features)

# kneighbors() reports positions within the fitted frame, which has the same
# row order as `labelled`, so positional (.iloc) lookup is the right accessor.
labelled.iloc[neighbour_positions[0, -1]]

changeset_id                  47432011
changeset_harmful                    1
feature_id                   343328885
feature_type                       way
action_create                        0
action_modify                        1
action_delete                        0
feature_version                      3
highway_tag_created                  0
highway_tag_deleted                  0
highway_value_difference             0
primary_tags_difference              0
area_of_feature_bbox           1.1e+03
length_of_longest_segment        0.038
feature_name_touched                 0
Name: 1071, dtype: object