# Bag of Tags - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import json

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
path = '../downloads/bag-of-tags/'

## Training dataset

In [6]:
# Loading the training dataset.
training = pd.read_csv(path + 'training.csv')

In [7]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(training.shape))
training = training.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(training.shape))

Shape before dropping duplicates: (1805, 11)
Shape after dropping duplicates: (1805, 11)


In [8]:
# How many good and how many problematic changesets?
training.groupby('changeset_harmful').size()

changeset_harmful
False    1755
True       50
dtype: int64

In [9]:
good = training[training['changeset_harmful'] == 0]
harmful = training[training['changeset_harmful'] == 1]

In [10]:
items = harmful.shape[0]
training = pd.concat([good, harmful])
# Reset index to start from zero.
training = training.reset_index(drop=True)
print(training.shape)

(1805, 11)


In [11]:
# Handling missing values.
training['new_tags'] = training['new_tags'].fillna('')
training['old_tags'] = training['old_tags'].fillna('')

In [12]:
# View some good changesets
training[training['changeset_harmful'] == 0].head()

Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags
0,47379432,False,0,1,0,0,0,0,1516,,
1,47974373,False,0,1,0,0,0,0,676,{cycleway:left=track},{cycleway:left=lane}
2,47480425,False,0,1,0,0,0,0,945,,{surface=asphalt}
3,47510849,False,0,1,0,0,0,0,148,{surface=asphalt},
4,48456618,False,0,1,0,0,0,0,10,{highway=residential},{highway=steps}


In [13]:
# View some harmful changesets
training[training['changeset_harmful'] == 1].head()

Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags
1755,48255884,True,0,1,0,0,0,0,27,,{landuse=forest}
1756,46569558,True,0,1,0,0,0,0,1169,,{highway=footway}
1757,45228888,True,0,1,0,0,0,0,17,,
1758,48104772,True,0,1,0,0,0,0,746,,{oneway=yes}
1759,48314028,True,0,1,0,0,0,0,2401,{highway=residential} {maxspeed=30},{highway=living_street} {maxspeed=8}


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
new_vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=False,
    token_pattern=r'\{([^=]+=[^=]+)\}',  # One or more non-whitespace character.
)
new_vectorizer.fit(training[training['changeset_harmful'] == 1]['new_tags'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\{([^=]+=[^=]+)\\}',
        tokenizer=None, vocabulary=None)

In [16]:
# Looking at some feature names.
print('Number of vectors in new: {}'.format(len(new_vectorizer.get_feature_names())))
new_vectorizer.get_feature_names()

Number of vectors in new: 40


['bicycle=no',
 'bicycle=yes',
 'bridge=yes',
 'construction=motorway',
 'foot=yes',
 'footway=sidewalk',
 'highway=footway',
 'highway=living_street',
 'highway=motorway',
 'highway=path',
 'highway=primary',
 'highway=residential',
 'highway=road',
 'highway=secondary',
 'highway=tertiary',
 'highway=track',
 'highway=unclassified',
 'horse=no',
 'landuse=forest',
 'landuse=recreation_ground',
 'landuse_1=recreation_ground',
 'landuse_2=festival area',
 'landuse_3=recreation',
 'lanes=1',
 'lanes=2',
 'layer=1',
 'maxspeed=10 mph',
 'maxspeed=20',
 'maxspeed=8',
 'natural=footway',
 'natural=tree_row',
 'oneway=no',
 'oneway=yes',
 'surface=asphalt',
 'surface=dirt',
 'surface=gravel',
 'surface=unpaved',
 'surface_1=ground',
 'surface_2=unpaved',
 'tracktype=grade3']

In [17]:
# Convert to a Pandas DataFrame to make merging easier.
new_vectorized = pd.DataFrame(
    new_vectorizer.transform(training['new_tags']).toarray(),
    columns=new_vectorizer.get_feature_names()
)

In [18]:
# Add "new" prefix to column names.
new_vectorized.columns = ['new_{}'.format(item) for item in new_vectorized.columns]
new_vectorized.head()

Unnamed: 0,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=living_street,new_highway=motorway,new_highway=path,new_highway=primary,new_highway=residential,new_highway=road,new_highway=secondary,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=recreation,new_lanes=1,new_lanes=2,new_layer=1,new_maxspeed=10 mph,new_maxspeed=20,new_maxspeed=8,new_natural=footway,new_natural=tree_row,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=ground,new_surface_2=unpaved,new_tracktype=grade3
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
old_vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=False,
    token_pattern=r'\{([^=]+=[^=]+)\}',  # One or more non-whitespace character.
)
old_vectorizer.fit(training[training['changeset_harmful'] == 1]['old_tags'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\{([^=]+=[^=]+)\\}',
        tokenizer=None, vocabulary=None)

In [20]:
# Looking at some feature names.
print('Number of vectors in old: {}'.format(len(old_vectorizer.get_feature_names())))
old_vectorizer.get_feature_names()

Number of vectors in old: 11


['construction=path',
 'embankment=yes',
 'highway=construction',
 'highway=pedestrian',
 'highway=primary',
 'highway=residential',
 'highway=service',
 'highway=tertiary',
 'highway=unclassified',
 'maxspeed=30',
 'oneway=yes']

In [21]:
# Convert to a Pandas DataFrame to make merging easier.
old_vectorized = pd.DataFrame(
    old_vectorizer.transform(training['old_tags']).toarray(),
    columns=old_vectorizer.get_feature_names()
)

In [22]:
# Add "old" prefix to column names.
old_vectorized.columns = ['old_{}'.format(item) for item in old_vectorized.columns]
old_vectorized.head()

Unnamed: 0,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_maxspeed=30,old_oneway=yes
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0


In [23]:
# Concatenate both initial training data and vectorized data.
training = pd.concat([training, new_vectorized, old_vectorized], axis=1)
print(training.shape)
training.head()

(1805, 62)


Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=living_street,new_highway=motorway,new_highway=path,new_highway=primary,new_highway=residential,new_highway=road,new_highway=secondary,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=recreation,new_lanes=1,new_lanes=2,new_layer=1,new_maxspeed=10 mph,new_maxspeed=20,new_maxspeed=8,new_natural=footway,new_natural=tree_row,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=ground,new_surface_2=unpaved,new_tracktype=grade3,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_maxspeed=30,old_oneway=yes
0,47379432,False,0,1,0,0,0,0,1516,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,47974373,False,0,1,0,0,0,0,676,{cycleway:left=track},{cycleway:left=lane},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,47480425,False,0,1,0,0,0,0,945,,{surface=asphalt},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,47510849,False,0,1,0,0,0,0,148,{surface=asphalt},,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,48456618,False,0,1,0,0,0,0,10,{highway=residential},{highway=steps},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [24]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'new_tags', 'old_tags']

X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [25]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid)

grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1, param_grid={'random_state': [5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [27]:
print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9695290858725761
Best params: {'random_state': 5}


In [28]:
# Set model to the best estimator.
model = grid.best_estimator_

# What attributes are important?
importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
6,old_user_mapping_days,0.47
23,new_highway=unclassified,0.04
25,new_landuse=forest,0.04
26,new_landuse=recreation_ground,0.04
22,new_highway=track,0.04
53,old_highway=service,0.04
17,new_highway=primary,0.03
55,old_highway=unclassified,0.03
8,new_bicycle=yes,0.03
19,new_highway=road,0.02


In [29]:
training['prediction'] = model.predict(X)
training.to_csv(path + 'training-review.csv', index=False)

In [30]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,1752,3
Labelled harmful,3,47


In [31]:
from sklearn.metrics import classification_report

report = classification_report(y, training['prediction'])
print(report)

             precision    recall  f1-score   support

      False       1.00      1.00      1.00      1755
       True       0.94      0.94      0.94        50

avg / total       1.00      1.00      1.00      1805



In [32]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.7 (0.08)


## Validation dataset

In [33]:
validation = pd.read_csv(path + 'validation.csv')
validation.head()

Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags
0,47427013,False,0,1,0,0,0,0,1045,,{surface=asphalt}
1,47247583,False,0,1,0,0,0,0,484,,{sidewalk=both}
2,46380240,False,0,1,0,0,0,0,98,,
3,46692547,True,0,1,0,0,0,0,579,{highway=residential},{highway=footway}
4,47510909,False,0,1,0,0,0,0,782,,{surface=asphalt}


In [34]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(validation.shape))
validation = validation.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(validation.shape))

Shape before dropping duplicates: (775, 11)
Shape after dropping duplicates: (775, 11)


In [35]:
validation.groupby('changeset_harmful').size()

changeset_harmful
False    742
True      33
dtype: int64

In [36]:
# Handling missing values.
validation['new_tags'] = validation['new_tags'].fillna('')
validation['old_tags'] = validation['old_tags'].fillna('')

In [37]:
# View some good changesets
validation[validation['changeset_harmful'] == 1].head()

Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags
3,46692547,True,0,1,0,0,0,0,579,{highway=residential},{highway=footway}
19,47569786,True,1,0,0,0,0,0,876,,
24,45040942,True,0,1,0,0,0,0,17,,
50,45529453,True,0,1,0,0,0,0,2,,{surface=unpaved}
68,47625854,True,0,1,0,0,0,0,2,{lit=no} {width=0},{highway=footway} {lit=yes}


In [38]:
# View some harmful changesets
validation[validation['changeset_harmful'] == 1].head()

Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags
3,46692547,True,0,1,0,0,0,0,579,{highway=residential},{highway=footway}
19,47569786,True,1,0,0,0,0,0,876,,
24,45040942,True,0,1,0,0,0,0,17,,
50,45529453,True,0,1,0,0,0,0,2,,{surface=unpaved}
68,47625854,True,0,1,0,0,0,0,2,{lit=no} {width=0},{highway=footway} {lit=yes}


In [39]:
new_vectorized = pd.DataFrame(new_vectorizer.transform(validation['new_tags']).toarray(), columns=new_vectorizer.get_feature_names())
new_vectorized.columns = ['new_{}'.format(item) for item in new_vectorized.columns]

old_vectorized = pd.DataFrame(old_vectorizer.transform(validation['old_tags']).toarray(), columns=old_vectorizer.get_feature_names())
old_vectorized.columns = ['old_{}'.format(item) for item in old_vectorized.columns]

In [40]:
# Concatenate both initial validation data and vectorized data.
validation = pd.concat([validation, new_vectorized, old_vectorized], axis=1)
print(validation.shape)
validation.head()

(775, 62)


Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,new_user_mapping_days,old_user_mapping_days,old_tags,new_tags,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=living_street,new_highway=motorway,new_highway=path,new_highway=primary,new_highway=residential,new_highway=road,new_highway=secondary,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=recreation,new_lanes=1,new_lanes=2,new_layer=1,new_maxspeed=10 mph,new_maxspeed=20,new_maxspeed=8,new_natural=footway,new_natural=tree_row,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=ground,new_surface_2=unpaved,new_tracktype=grade3,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_maxspeed=30,old_oneway=yes
0,47427013,False,0,1,0,0,0,0,1045,,{surface=asphalt},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,47247583,False,0,1,0,0,0,0,484,,{sidewalk=both},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,46380240,False,0,1,0,0,0,0,98,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,46692547,True,0,1,0,0,0,0,579,{highway=residential},{highway=footway},0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,47510909,False,0,1,0,0,0,0,782,,{surface=asphalt},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [42]:
validation['prediction'] = model.predict(vX)
validation.to_csv(path + 'validation-review.csv', index=False)

In [43]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,737,5
Labelled harmful,18,15


In [44]:
from sklearn.metrics import classification_report

report = classification_report(vy, validation['prediction'])
print(report)

             precision    recall  f1-score   support

      False       0.98      0.99      0.98       742
       True       0.75      0.45      0.57        33

avg / total       0.97      0.97      0.97       775



In [45]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, vX, vy, cv=5, scoring='roc_auc')

print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.65 (0.1)


## Testing dataset

In [46]:
testing_path = '../downloads/unlabelled/'

In [47]:
testing = pd.read_csv(testing_path + 'attributes.csv')
testing.head()

Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags
0,49180736,,0,1,0,0,0,780,0,,{access=no} {foot=permissive} {surface=fine_gr...
1,49180683,,0,1,0,0,0,662,0,,{motor_vehicle=permissive}
2,49180666,,0,1,0,0,0,837,0,,{oneway=no}
3,49180628,,0,1,0,0,0,150,0,,
4,49180580,,0,1,0,0,0,621,0,,{access=private}


In [48]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(testing.shape))
testing = testing.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(testing.shape))

Shape before dropping duplicates: (97, 11)
Shape after dropping duplicates: (97, 11)


In [49]:
# Handling missing values.
testing['new_tags'] = testing['new_tags'].fillna('')
testing['old_tags'] = testing['old_tags'].fillna('')

In [50]:
new_vectorized = pd.DataFrame(new_vectorizer.transform(testing['new_tags']).toarray(), columns=new_vectorizer.get_feature_names())
new_vectorized.columns = ['new_{}'.format(item) for item in new_vectorized.columns]

old_vectorized = pd.DataFrame(old_vectorizer.transform(testing['old_tags']).toarray(), columns=old_vectorizer.get_feature_names())
old_vectorized.columns = ['old_{}'.format(item) for item in old_vectorized.columns]

In [51]:
# Concatenate both initial validation data and vectorized data.
testing = pd.concat([testing, new_vectorized, old_vectorized], axis=1)
print(testing.shape)
testing.head()

(97, 62)


Unnamed: 0,changeset_id,changeset_harmful,type_node,type_way,type_relation,line_length,kinks,old_user_mapping_days,new_user_mapping_days,old_tags,new_tags,new_bicycle=no,new_bicycle=yes,new_bridge=yes,new_construction=motorway,new_foot=yes,new_footway=sidewalk,new_highway=footway,new_highway=living_street,new_highway=motorway,new_highway=path,new_highway=primary,new_highway=residential,new_highway=road,new_highway=secondary,new_highway=tertiary,new_highway=track,new_highway=unclassified,new_horse=no,new_landuse=forest,new_landuse=recreation_ground,new_landuse_1=recreation_ground,new_landuse_2=festival area,new_landuse_3=recreation,new_lanes=1,new_lanes=2,new_layer=1,new_maxspeed=10 mph,new_maxspeed=20,new_maxspeed=8,new_natural=footway,new_natural=tree_row,new_oneway=no,new_oneway=yes,new_surface=asphalt,new_surface=dirt,new_surface=gravel,new_surface=unpaved,new_surface_1=ground,new_surface_2=unpaved,new_tracktype=grade3,old_construction=path,old_embankment=yes,old_highway=construction,old_highway=pedestrian,old_highway=primary,old_highway=residential,old_highway=service,old_highway=tertiary,old_highway=unclassified,old_maxspeed=30,old_oneway=yes
0,49180736,,0,1,0,0,0,780,0,,{access=no} {foot=permissive} {surface=fine_gr...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,49180683,,0,1,0,0,0,662,0,,{motor_vehicle=permissive},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,49180666,,0,1,0,0,0,837,0,,{oneway=no},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,49180628,,0,1,0,0,0,150,0,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,49180580,,0,1,0,0,0,621,0,,{access=private},0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
tX = testing.drop(non_training_attributes, axis=1)

In [53]:
testing['prediction'] = model.predict(tX)
testing.to_csv(testing_path + 'testing-review.csv', index=False)

In [54]:
tharmful_count = testing[testing['prediction'] == True].shape[0]
tnot_harmful_count = testing[testing['prediction'] == False].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 94
Predicted harmful: 3
Percentage harmful: 3.09%


In [55]:
# # Visualizing a DecisionTreeClassifier
# import pydotplus
# from sklearn import tree
# from IPython.display import Image
# dot_data = tree.export_graphviz(
#     model,
#     out_file=None,
#     feature_names=new_vectorizer.get_feature_names(),
#     class_names=['good', 'problematic'],
#     filled=True,
#     rounded=True,
#     special_characters=True
# )
# graph = pydotplus.graph_from_dot_data(dot_data)
# Image(graph.create_png())