# Bag of Tags - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import json

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
path = '../downloads/bag-of-tags/'

## Training dataset

In [6]:
# Loading the training dataset.
training = pd.read_csv(path + 'training.csv')

In [7]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(training.shape))
training = training.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(training.shape))

Shape before dropping duplicates: (1897, 14)
Shape after dropping duplicates: (1897, 14)


In [8]:
# How many good and how many problematic changesets?
training.groupby('changeset_harmful').size()

changeset_harmful
False    1831
True       66
dtype: int64

In [9]:
good = training[training['changeset_harmful'] == 0]
harmful = training[training['changeset_harmful'] == 1]

In [10]:
items = harmful.shape[0]
training = pd.concat([good, harmful])
# Reset index to start from zero.
training = training.reset_index(drop=True)
print(training.shape)

(1897, 14)


In [11]:
# Handling missing values.
training['new_tags'] = training['new_tags'].fillna('')
training['old_tags'] = training['old_tags'].fillna('')

In [12]:
# View some good changesets
training[training['changeset_harmful'] == 0].head()

Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
0,47934716,False,0,1,0,0,1,0,0.227,0,33,1,,{access=no}
1,47451579,False,0,1,0,0,1,0,0.0237,0,2813,1498,,{surface=asphalt}
2,46467355,False,0,1,0,0,1,0,0.00807,0,643,643,,
3,47921554,False,0,1,0,0,1,0,0.741,0,3,3,,{maxspeed=25 mph}
4,46415466,False,0,1,0,1,0,0,0.0,0,758,7,,{bench=no} {shelter=no}


In [13]:
# View some harmful changesets
training[training['changeset_harmful'] == 1].head()

Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
1831,46569558,True,0,1,0,0,1,0,4.63,0,1169,7,,{highway=footway}
1832,47569786,True,0,1,0,1,0,0,0.0,0,876,36,,
1833,44882430,True,0,1,0,0,1,0,0.87,0,9,4,{highway=residential},{bicycle=yes} {horse=yes} {highway=footway} {f...
1834,45529453,True,0,1,0,0,1,0,3.63,40,2,2,,{surface=unpaved}
1835,45067856,True,0,1,0,0,1,0,0.2,0,17,32,,


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
new_vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=False,
    token_pattern=r'\{([^=]+=[^=]+)\}',  # One or more non-whitespace character.
)
new_vectorizer.fit(training[training['changeset_harmful'] == 1]['new_tags'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\{([^=]+=[^=]+)\\}',
        tokenizer=None, vocabulary=None)

In [16]:
# Looking at some feature names.
print('Number of vectors in new: {}'.format(len(new_vectorizer.get_feature_names())))
new_vectorizer.get_feature_names()

Number of vectors in new: 45


['bicycle=no',
 'bicycle=yes',
 'bridge=yes',
 'construction=motorway',
 'foot=yes',
 'footway=sidewalk',
 'highway=footway',
 'highway=path',
 'highway=secondary',
 'highway=service',
 'highway=tertiary',
 'highway=track',
 'highway=unclassified',
 'horse=no',
 'horse=yes',
 'landuse=forest',
 'landuse=recreation_ground',
 'landuse_1=park',
 'landuse_1=recreation_ground',
 'landuse_2=festival area',
 'landuse_3=dog park',
 'landuse_3=recreation',
 'landuse_4=recreation_ground',
 'landuse_5=water_park',
 'lanes=1',
 'layer=1',
 'leisure=park',
 'lit=yes',
 'maxspeed=10 mph',
 'maxspeed=20',
 'natural=footway',
 'natural=tree_row',
 'noname=yes',
 'oneway=no',
 'oneway=yes',
 'surface=asphalt',
 'surface=dirt',
 'surface=gravel',
 'surface=unpaved',
 'surface_1=asphalt',
 'surface_1=ground',
 'surface_2=unpaved',
 'surface_2=wood',
 'tourism=attraction',
 'tracktype=grade3']

In [17]:
# Convert to a Pandas DataFrame to make merging easier.
new_vectorized = pd.DataFrame(
    new_vectorizer.transform(training['new_tags']).toarray(),
    columns=new_vectorizer.get_feature_names()
)

In [18]:
# Add "new" prefix to column names.
new_vectorized.columns = ['new_{}'.format(item) for item in new_vectorized.columns]
new_vectorized.head()

Unnamed: 0,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=path,new_highway=secondary,new_highway=service,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_horse=yes,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=park,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=dog park,new_landuse_3=recreation,new_landuse_4=recreation_ground,new_landuse_5=water_park,new_lanes=1,new_layer=1,new_leisure=park,new_lit=yes,new_maxspeed=10 mph,new_maxspeed=20,new_natural=footway,new_natural=tree_row,new_noname=yes,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=asphalt,new_surface_1=ground,new_surface_2=unpaved,new_surface_2=wood,new_tourism=attraction,new_tracktype=grade3
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
old_vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=False,
    token_pattern=r'\{([^=]+=[^=]+)\}',  # One or more non-whitespace character.
)
old_vectorizer.fit(training[training['changeset_harmful'] == 1]['old_tags'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\{([^=]+=[^=]+)\\}',
        tokenizer=None, vocabulary=None)

In [20]:
# Looking at some feature names.
print('Number of vectors in old: {}'.format(len(old_vectorizer.get_feature_names())))
old_vectorizer.get_feature_names()

Number of vectors in old: 12


['construction=path',
 'embankment=yes',
 'highway=construction',
 'highway=pedestrian',
 'highway=primary',
 'highway=residential',
 'highway=service',
 'highway=tertiary',
 'highway=unclassified',
 'lit=no',
 'oneway=yes',
 'width=0']

In [21]:
# Convert to a Pandas DataFrame to make merging easier.
old_vectorized = pd.DataFrame(
    old_vectorizer.transform(training['old_tags']).toarray(),
    columns=old_vectorizer.get_feature_names()
)

In [22]:
# Add "old" prefix to column names.
old_vectorized.columns = ['old_{}'.format(item) for item in old_vectorized.columns]
old_vectorized.head()

Unnamed: 0,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_lit=no,old_oneway=yes,old_width=0
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
# Concatenate both initial training data and vectorized data.
training = pd.concat([training, new_vectorized, old_vectorized], axis=1)
print(training.shape)
training.head()

(1897, 71)


Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=path,new_highway=secondary,new_highway=service,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_horse=yes,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=park,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=dog park,new_landuse_3=recreation,new_landuse_4=recreation_ground,new_landuse_5=water_park,new_lanes=1,new_layer=1,new_leisure=park,new_lit=yes,new_maxspeed=10 mph,new_maxspeed=20,new_natural=footway,new_natural=tree_row,new_noname=yes,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=asphalt,new_surface_1=ground,new_surface_2=unpaved,new_surface_2=wood,new_tourism=attraction,new_tracktype=grade3,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_lit=no,old_oneway=yes,old_width=0
0,47934716,False,0,1,0,0,1,0,0.227,0,33,1,,{access=no},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,47451579,False,0,1,0,0,1,0,0.0237,0,2813,1498,,{surface=asphalt},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,46467355,False,0,1,0,0,1,0,0.00807,0,643,643,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,47921554,False,0,1,0,0,1,0,0.741,0,3,3,,{maxspeed=25 mph},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,46415466,False,0,1,0,1,0,0,0.0,0,758,7,,{bench=no} {shelter=no},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Add good way to check if all properties are ok.
training.sum()

changeset_id                                                       89614776915
changeset_harmful                                                           66
created                                                                      2
modified                                                                  1895
deleted                                                                      0
type_node                                                                   99
type_way                                                                  1793
type_relation                                                                5
line_length                                                            1.8e+03
kinks                                                                      302
old_user_mapping_days                                                  1236436
new_user_mapping_days                                                   553131
old_tags                     {highway=residential}{o

In [25]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'new_tags', 'old_tags']

X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [26]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid)

grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1, param_grid={'random_state': [5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [28]:
print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9630996309963099
Best params: {'random_state': 5}


In [29]:
# Set model to the best estimator.
model = grid.best_estimator_

# What attributes are important?
importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
8,old_user_mapping_days,0.26
6,line_length,0.25
9,new_user_mapping_days,0.14
23,new_horse=no,0.04
22,new_highway=unclassified,0.03
29,new_landuse_2=festival area,0.03
11,new_bicycle=yes,0.03
60,old_highway=residential,0.03
16,new_highway=footway,0.02
36,new_leisure=park,0.02


In [30]:
training['prediction'] = model.predict(X)
training.to_csv(path + 'training-review.csv', index=False)

In [31]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,1831,0
Labelled harmful,2,64


In [32]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

      False       1.00      1.00      1.00      1831
       True       1.00      0.97      0.98        66

avg / total       1.00      1.00      1.00      1897



In [33]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.64 (0.05)


## Validation dataset

In [34]:
validation = pd.read_csv(path + 'validation.csv')
validation.head()

Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
0,46533291,False,0,1,0,0,1,0,0.08,0,1916,1916,{access=no},
1,48450065,False,0,1,0,0,1,0,1.32,0,2482,2482,{designation=restricted_byway} {is_in:county=C...,{designation=byway_open_to_all_traffic} {sourc...
2,48019635,False,0,1,0,0,1,0,0.04,0,38,1,,{surface=paved}
3,47328556,False,0,1,0,0,1,0,0.02,0,1146,4,,{surface=asphalt}
4,47396444,False,0,1,0,0,1,0,0.1,0,174,21,,{surface=asphalt}


In [35]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(validation.shape))
validation = validation.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(validation.shape))

Shape before dropping duplicates: (814, 14)
Shape after dropping duplicates: (814, 14)


In [36]:
validation.groupby('changeset_harmful').size()

changeset_harmful
False    793
True      21
dtype: int64

In [37]:
# Handling missing values.
validation['new_tags'] = validation['new_tags'].fillna('')
validation['old_tags'] = validation['old_tags'].fillna('')

In [38]:
# View some good changesets
validation[validation['changeset_harmful'] == 1].head()

Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
20,46494527,True,0,1,0,0,1,0,0.19,0,2,1,,
117,46480391,True,0,1,0,0,1,0,0.16,0,639,2,{highway=service},{highway=residential}
159,47633670,True,0,1,0,0,1,0,0.76,0,280,2,,{highway=footway} {landuse=recreation_ground}
194,48740042,True,0,1,0,0,1,0,46.93,0,124,16,{highway=tertiary},{highway=unclassified}
198,48655954,True,0,1,0,0,1,0,62.06,0,532,2,{natural=coastline},{highway=coastline}


In [39]:
# View some harmful changesets
validation[validation['changeset_harmful'] == 1].head()

Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
20,46494527,True,0,1,0,0,1,0,0.19,0,2,1,,
117,46480391,True,0,1,0,0,1,0,0.16,0,639,2,{highway=service},{highway=residential}
159,47633670,True,0,1,0,0,1,0,0.76,0,280,2,,{highway=footway} {landuse=recreation_ground}
194,48740042,True,0,1,0,0,1,0,46.93,0,124,16,{highway=tertiary},{highway=unclassified}
198,48655954,True,0,1,0,0,1,0,62.06,0,532,2,{natural=coastline},{highway=coastline}


In [40]:
new_vectorized = pd.DataFrame(new_vectorizer.transform(validation['new_tags']).toarray(), columns=new_vectorizer.get_feature_names())
new_vectorized.columns = ['new_{}'.format(item) for item in new_vectorized.columns]

old_vectorized = pd.DataFrame(old_vectorizer.transform(validation['old_tags']).toarray(), columns=old_vectorizer.get_feature_names())
old_vectorized.columns = ['old_{}'.format(item) for item in old_vectorized.columns]

In [41]:
# Concatenate both initial validation data and vectorized data.
validation = pd.concat([validation, new_vectorized, old_vectorized], axis=1)
print(validation.shape)
validation.head()

(814, 71)


Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=path,new_highway=secondary,new_highway=service,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_horse=yes,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=park,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=dog park,new_landuse_3=recreation,new_landuse_4=recreation_ground,new_landuse_5=water_park,new_lanes=1,new_layer=1,new_leisure=park,new_lit=yes,new_maxspeed=10 mph,new_maxspeed=20,new_natural=footway,new_natural=tree_row,new_noname=yes,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=asphalt,new_surface_1=ground,new_surface_2=unpaved,new_surface_2=wood,new_tourism=attraction,new_tracktype=grade3,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_lit=no,old_oneway=yes,old_width=0
0,46533291,False,0,1,0,0,1,0,0.08,0,1916,1916,{access=no},,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,48450065,False,0,1,0,0,1,0,1.32,0,2482,2482,{designation=restricted_byway} {is_in:county=C...,{designation=byway_open_to_all_traffic} {sourc...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,48019635,False,0,1,0,0,1,0,0.04,0,38,1,,{surface=paved},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,47328556,False,0,1,0,0,1,0,0.02,0,1146,4,,{surface=asphalt},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,47396444,False,0,1,0,0,1,0,0.1,0,174,21,,{surface=asphalt},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
# Add good way to check if all properties are ok.
validation.sum()

changeset_id                                                       38485879979
changeset_harmful                                                           21
created                                                                      4
modified                                                                   810
deleted                                                                      0
type_node                                                                   49
type_way                                                                   764
type_relation                                                                1
line_length                                                            7.2e+02
kinks                                                                       28
old_user_mapping_days                                                   551597
new_user_mapping_days                                                   239794
old_tags                     {access=no}{designation

In [43]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [44]:
validation['prediction'] = model.predict(vX)
validation.to_csv(path + 'validation-review.csv', index=False)

In [45]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,778,15
Labelled harmful,14,7


In [46]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

      False       0.98      0.98      0.98       793
       True       0.32      0.33      0.33        21

avg / total       0.97      0.96      0.96       814



In [47]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, vX, vy, cv=5, scoring='roc_auc')

print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.59 (0.09)


## Testing dataset

In [48]:
testing_path = '../downloads/unlabelled/'

In [49]:
testing = pd.read_csv(testing_path + 'attributes.csv')
testing.head()

Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
0,49180736,,0,1,0,0,1,0,0.03,0,780,780,,{access=no} {foot=permissive} {surface=fine_gr...
1,49180683,,0,1,0,0,1,0,0.04,0,662,1,,{motor_vehicle=permissive}
2,49180666,,0,1,0,0,1,0,0.91,0,837,226,,{oneway=no}
3,49180628,,0,1,0,0,1,0,4.88,0,150,150,,
4,49180608,,0,1,0,1,0,0,0.0,0,11,11,,{highway=turning_circle}


In [50]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(testing.shape))
testing = testing.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(testing.shape))

Shape before dropping duplicates: (149, 14)
Shape after dropping duplicates: (149, 14)


In [51]:
# Handling missing values.
testing['new_tags'] = testing['new_tags'].fillna('')
testing['old_tags'] = testing['old_tags'].fillna('')

In [52]:
new_vectorized = pd.DataFrame(new_vectorizer.transform(testing['new_tags']).toarray(), columns=new_vectorizer.get_feature_names())
new_vectorized.columns = ['new_{}'.format(item) for item in new_vectorized.columns]

old_vectorized = pd.DataFrame(old_vectorizer.transform(testing['old_tags']).toarray(), columns=old_vectorizer.get_feature_names())
old_vectorized.columns = ['old_{}'.format(item) for item in old_vectorized.columns]

In [53]:
# Concatenate both initial validation data and vectorized data.
testing = pd.concat([testing, new_vectorized, old_vectorized], axis=1)
print(testing.shape)
testing.head()

(149, 71)


Unnamed: 0,changeset_id,changeset_harmful,created,modified,deleted,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=path,new_highway=secondary,new_highway=service,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_horse=yes,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=park,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=dog park,new_landuse_3=recreation,new_landuse_4=recreation_ground,new_landuse_5=water_park,new_lanes=1,new_layer=1,new_leisure=park,new_lit=yes,new_maxspeed=10 mph,new_maxspeed=20,new_natural=footway,new_natural=tree_row,new_noname=yes,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=asphalt,new_surface_1=ground,new_surface_2=unpaved,new_surface_2=wood,new_tourism=attraction,new_tracktype=grade3,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_lit=no,old_oneway=yes,old_width=0
0,49180736,,0,1,0,0,1,0,0.03,0,780,780,,{access=no} {foot=permissive} {surface=fine_gr...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,49180683,,0,1,0,0,1,0,0.04,0,662,1,,{motor_vehicle=permissive},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,49180666,,0,1,0,0,1,0,0.91,0,837,226,,{oneway=no},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,49180628,,0,1,0,0,1,0,4.88,0,150,150,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,49180608,,0,1,0,1,0,0,0.0,0,11,11,,{highway=turning_circle},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
testing.sum()

changeset_id                                                        7327523837
changeset_harmful            nullnullnullnullnullnullnullnullnullnullnullnu...
created                                                                      4
modified                                                                   145
deleted                                                                      0
type_node                                                                   14
type_way                                                                   135
type_relation                                                                0
line_length                                                            1.9e+02
kinks                                                                        0
old_user_mapping_days                                                    60437
new_user_mapping_days                                                    39082
old_tags                     {maxspeed=45 mph}{maxsp

In [55]:
tX = testing.drop(non_training_attributes, axis=1)

In [56]:
testing['prediction'] = model.predict(tX)
testing.to_csv(testing_path + 'testing-review.csv', index=False)

In [57]:
tharmful_count = testing[testing['prediction'] == True].shape[0]
tnot_harmful_count = testing[testing['prediction'] == False].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 141
Predicted harmful: 8
Percentage harmful: 5.37%


In [58]:
# # Visualizing a DecisionTreeClassifier
# import pydotplus
# from sklearn import tree
# from IPython.display import Image
# dot_data = tree.export_graphviz(
#     model,
#     out_file=None,
#     feature_names=new_vectorizer.get_feature_names(),
#     class_names=['good', 'problematic'],
#     filled=True,
#     rounded=True,
#     special_characters=True
# )
# graph = pydotplus.graph_from_dot_data(dot_data)
# Image(graph.create_png())