# Bag of Tags - Gabbar

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import json

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [4]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [5]:
path = '../downloads/bag-of-tags/'

## Training dataset

In [6]:
# Loading the training dataset.
training = pd.read_csv(path + 'training.csv')

In [7]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(training.shape))
training = training.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(training.shape))

Shape before dropping duplicates: (1605, 3)
Shape after dropping duplicates: (1605, 3)


In [8]:
training.head()

Unnamed: 0,changeset_id,changeset_harmful,tags
0,46417194,False,source maxspeed highway name oneway
1,47364836,False,cycleway highway lanes name sidewalk surface t...
2,47514474,False,highway name oneway surface
3,47668474,False,highway local_ref name naptan:AtcoCode naptan:...
4,47507198,False,highway


In [9]:
# How many good and how many problematic changesets?
training.groupby('changeset_harmful').size()

changeset_harmful
False    1573
True       32
dtype: int64

In [10]:
good = training[training['changeset_harmful'] == 0]
harmful = training[training['changeset_harmful'] == 1]

In [11]:
# Train with eqaul number of good and harmful changesets.
# NOTE: Using all harmful changesets and only a subset of good changesets
items = harmful.shape[0]
training = pd.concat([good.sample(items), harmful.sample(items)])
print(training.shape)
training = training.reset_index(drop=True)
training.head(10)

(64, 3)


Unnamed: 0,changeset_id,changeset_harmful,tags
0,46360924,False,bicycle cycleway highway oneway
1,47658480,False,highway name
2,47403178,False,abutters highway maxspeed name oneway surface
3,48316803,False,access bicycle foot highway horse maxspeed name
4,47034621,False,highway name oneway
5,47474093,False,highway lanes lit maxspeed name oneway surface...
6,47427014,False,description highway lanes lit maxspeed oneway ...
7,47539009,False,bicycle foot highway lit maxspeed name oneway ...
8,47346761,False,highway name surface
9,47531669,False,highway maxspeed name surface


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    lowercase=False,
    token_pattern=r'\S+',  # One or more non-whitespace character.
)

vectorizer.fit(training[training['changeset_harmful'] == 1]['tags'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\S+', tokenizer=None,
        vocabulary=None)

In [13]:
# Transform tags into a numericals.
X = vectorizer.transform(training['tags'])

In [14]:
# Looking at some vectorized values.
X.toarray()[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [15]:
# Looking at some feature names.
print('Number of vectors: {}'.format(len(vectorizer.get_feature_names())))
vectorizer.get_feature_names()

Number of vectors: 67


['STOP_id',
 'access',
 'addr:city',
 'addr:housenumber',
 'addr:postcode',
 'addr:state',
 'addr:street',
 'area',
 'bench',
 'bicycle',
 'bus',
 'description',
 'ele',
 'email',
 'foot',
 'footway',
 'gnis:county_id',
 'gnis:created',
 'gnis:feature_id',
 'gnis:state_id',
 'highway',
 'horse',
 'landuse',
 'landuse_1',
 'landuse_2',
 'landuse_3',
 'landuse_4',
 'landuse_5',
 'lanes',
 'lcn_ref',
 'leisure',
 'lit',
 'maxspeed',
 'motor_vehicle',
 'name',
 'name:en',
 'name:ru',
 'natural',
 'network',
 'note',
 'old_name',
 'oneway',
 'operator',
 'park',
 'phone',
 'public_transport',
 'route_ref',
 'service',
 'shelter',
 'smoothness',
 'source',
 'source:maxspeed',
 'source_ref',
 'surface',
 'surface_1',
 'surface_2',
 'tiger:cfcc',
 'tiger:county',
 'tiger:name_base',
 'tiger:name_type',
 'tiger:reviewed',
 'tiger:zip_left',
 'tiger:zip_right',
 'tourism',
 'turn:lanes:forward',
 'website',
 'width']

In [16]:
# Convert to a Pandas DataFrame to make merging easier.
vectorized = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
print(vectorized.shape)
vectorized.head()

(64, 67)


Unnamed: 0,STOP_id,access,addr:city,addr:housenumber,addr:postcode,addr:state,addr:street,area,bench,bicycle,bus,description,ele,email,foot,footway,gnis:county_id,gnis:created,gnis:feature_id,gnis:state_id,highway,horse,landuse,landuse_1,landuse_2,landuse_3,landuse_4,landuse_5,lanes,lcn_ref,leisure,lit,maxspeed,motor_vehicle,name,name:en,name:ru,natural,network,note,old_name,oneway,operator,park,phone,public_transport,route_ref,service,shelter,smoothness,source,source:maxspeed,source_ref,surface,surface_1,surface_2,tiger:cfcc,tiger:county,tiger:name_base,tiger:name_type,tiger:reviewed,tiger:zip_left,tiger:zip_right,tourism,turn:lanes:forward,website,width
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Concatenate both initial training data and vectorized data.
training = pd.concat([training, vectorized], axis=1)
print(training.shape)
training.head()

(64, 70)


Unnamed: 0,changeset_id,changeset_harmful,tags,STOP_id,access,addr:city,addr:housenumber,addr:postcode,addr:state,addr:street,area,bench,bicycle,bus,description,ele,email,foot,footway,gnis:county_id,gnis:created,gnis:feature_id,gnis:state_id,highway,horse,landuse,landuse_1,landuse_2,landuse_3,landuse_4,landuse_5,lanes,lcn_ref,leisure,lit,maxspeed,motor_vehicle,name,name:en,name:ru,natural,network,note,old_name,oneway,operator,park,phone,public_transport,route_ref,service,shelter,smoothness,source,source:maxspeed,source_ref,surface,surface_1,surface_2,tiger:cfcc,tiger:county,tiger:name_base,tiger:name_type,tiger:reviewed,tiger:zip_left,tiger:zip_right,tourism,turn:lanes:forward,website,width
0,46360924,False,bicycle cycleway highway oneway,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,47658480,False,highway name,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,47403178,False,abutters highway maxspeed name oneway surface,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,48316803,False,access bicycle foot highway horse maxspeed name,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,47034621,False,highway name oneway,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
non_training_attributes = ['changeset_id', 'changeset_harmful', 'tags']

X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

In [19]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid)

grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1, param_grid={'random_state': [5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [21]:
print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.453125
Best params: {'random_state': 5}


In [22]:
# Set model to the best estimator.
model = grid.best_estimator_

# What attributes are important?
importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
32,maxspeed,0.12
35,name:en,0.11
56,tiger:cfcc,0.08
41,oneway,0.08
1,access,0.08
22,landuse,0.07
14,foot,0.05
50,source,0.04
53,surface,0.04
49,smoothness,0.03


In [23]:
training['prediction'] = model.predict(X)
training.to_csv(path + 'training-review.csv', index=False)

In [24]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y, training['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,32,0
Labelled harmful,6,26


In [25]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.39 (0.17)


## Validation dataset

In [26]:
validation = pd.read_csv(path + 'validation.csv')
validation.head()

Unnamed: 0,changeset_id,changeset_harmful,tags
0,47380180,False,cycleway:right highway maxspeed name oneway su...
1,46636971,False,highway name surface
2,47246586,False,cycleway highway lit maxspeed name oneway oper...
3,46855406,False,highway name source source_ref
4,46989023,False,bicycle foot highway name


In [27]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(validation.shape))
validation = validation.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(validation.shape))

Shape before dropping duplicates: (689, 3)
Shape after dropping duplicates: (689, 3)


In [28]:
validation.groupby('changeset_harmful').size()

changeset_harmful
False    677
True      12
dtype: int64

In [29]:
vX = vectorizer.transform(validation['tags'])

# Convert to a Pandas DataFrame to make merging easier.
vectorized = pd.DataFrame(vX.toarray(), columns=vectorizer.get_feature_names())

In [30]:
# Concatenate both initial validation data and vectorized data.
validation = pd.concat([validation, vectorized], axis=1)
print(validation.shape)
validation.head()

(689, 70)


Unnamed: 0,changeset_id,changeset_harmful,tags,STOP_id,access,addr:city,addr:housenumber,addr:postcode,addr:state,addr:street,area,bench,bicycle,bus,description,ele,email,foot,footway,gnis:county_id,gnis:created,gnis:feature_id,gnis:state_id,highway,horse,landuse,landuse_1,landuse_2,landuse_3,landuse_4,landuse_5,lanes,lcn_ref,leisure,lit,maxspeed,motor_vehicle,name,name:en,name:ru,natural,network,note,old_name,oneway,operator,park,phone,public_transport,route_ref,service,shelter,smoothness,source,source:maxspeed,source_ref,surface,surface_1,surface_2,tiger:cfcc,tiger:county,tiger:name_base,tiger:name_type,tiger:reviewed,tiger:zip_left,tiger:zip_right,tourism,turn:lanes:forward,website,width
0,47380180,False,cycleway:right highway maxspeed name oneway su...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,46636971,False,highway name surface,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,47246586,False,cycleway highway lit maxspeed name oneway oper...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,46855406,False,highway name source source_ref,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,46989023,False,bicycle foot highway name,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
# TODO: Replace with a pipeline
vX = validation.drop(non_training_attributes, axis=1)
vy = validation['changeset_harmful']

In [None]:
validation['prediction'] = model.predict(vX)
validation.to_csv(path + 'validation-review.csv', index=False)

In [None]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, validation['prediction'])
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,499,178
Labelled harmful,8,4


## Testing dataset

In [None]:
testing = pd.read_csv(path + 'testing.csv')
testing.head()

In [None]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(testing.shape))
testing = testing.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(testing.shape))

In [None]:
tX = vectorizer.transform(testing['tags'])

# Convert to a Pandas DataFrame to make merging easier.
vectorized = pd.DataFrame(tX.toarray(), columns=vectorizer.get_feature_names())

In [None]:
# Concatenate both initial testing data and vectorized data.
testing = pd.concat([testing, vectorized], axis=1)
print(testing.shape)
testing.head()

In [None]:
tX = testing.drop(non_training_attributes, axis=1)

In [None]:
testing['prediction'] = model.predict(tX)

In [None]:
tharmful_count = testing[testing['prediction'] == True].shape[0]
tnot_harmful_count = testing[testing['prediction'] == False].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

In [None]:
testing.to_csv(path + 'testing-review.csv', index=False)

In [None]:
# import pydotplus
# from sklearn import tree
# from IPython.display import Image
# dot_data = tree.export_graphviz(
#     model,
#     out_file=None,
#     feature_names=vectorizer.get_feature_names(),
#     class_names=['good', 'problematic'],
#     filled=True,
#     rounded=True,
#     special_characters=True
# )
# graph = pydotplus.graph_from_dot_data(dot_data)
# Image(graph.create_png())