# Highway classifier - Gabbar

## Step 1. Prepare problem

#### 1a. Load libraries

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [4]:
path = '../downloads/highway-classifier/'

#### 1b. Load dataset

In [5]:
# Loading the training dataset.
training = pd.read_csv(path + 'training.csv')

In [6]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(training.shape))
training = training.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(training.shape))

Shape before dropping duplicates: (1631, 107)
Shape after dropping duplicates: (1631, 107)


In [7]:
# Creating a smaller sample to speed up workflow.
# training = training[:500]

In [8]:
training.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
0,47341487,0.0,192.0,845,7390,291,2753.0,0,1,0,4,8,0,3,0,1,0,0.03,0.03,3,0.01,0.00231,0,289.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,1.81,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
1,47276287,0.0,9150000.0,92193,2135664,518,745.0,0,1,0,6,5,0,0,0,1,0,5.91,6.05,37,0.17,0.226,0,12100000.0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,1.81,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
2,47509309,0.0,3250.0,1502,10167,148,691.0,0,1,0,2,5,0,0,0,1,0,0.07,0.07,2,0.07,0.0,0,2140.0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,32.55,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,47870896,0.0,0.0,645,1617,34,35.0,0,1,0,2,5,0,0,1,0,0,0.0,0.0,1,0.0,0.0,0,0.0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,1.64,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
4,47276294,0.0,5850000.0,92193,2135664,518,745.0,0,1,0,9,3,0,0,0,1,0,3.95,3.95,14,0.3,0.17,0,7800000.0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.72,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## 2. Summarize data

#### 2a. Descriptive statistics

In [9]:
print('Shape: {}'.format(training.shape))

Shape: (1631, 107)


In [10]:
training.dtypes.head()

changeset_id             int64
changeset_harmful      float64
changeset_bbox_area    float64
user_changesets          int64
user_features            int64
dtype: object

In [11]:
training.describe()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
count,1630.0,1631.0,1630.0,1631.0,1630.0,1631.0,1630.0,1630.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1630.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1630.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1630.0,1631.0,1631.0,1630.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0
mean,47400000.0,0.02,3690000.0,7243.35,306000.0,297.45,1067.19,0.00184,1.0,0.0,5.79,5.74,0.00736,0.27,0.05,0.93,0.02,0.62,0.78,12.52,0.06,0.03,0.02,5860000.0,0.31,0.13,0.01,0.02,0.0,0.00552,0.0,0.000613,0.0,0.0,0.0,0.000215,0.0,0.11,0.0,0.0,0.0,0.0,14.16,0.000699,0.03,0.01,0.01,0.0,0.0,0.0,0.00011,0.0,0.55,0.0,0.000797,0.0,0.000644,0.03,0.0,0.35,0.04,0.02,0.06,0.04,0.02,0.08,0.07,0.00429,0.1,0.03,0.00123,0.17,0.8,0.15,0.6,0.1,0.11,0.06,0.43,0.1,0.34,0.16,0.1,0.14,0.02,0.00184,0.00184,0.01,0.04,0.00123,0.0,0.07,0.00123,0.07,0.07,0.08,0.0,0.0,0.0,0.0,0.03,0.04,0.09,0.00429,0.02,0.02,2.09
std,494000.0,0.14,46800000.0,22321.72,878000.0,500.84,1179.92,0.0429,0.04,0.0,4.74,3.31,0.0855,0.87,0.22,0.26,0.15,4.54,5.44,50.34,0.4,0.28,0.3,92400000.0,0.46,0.34,0.11,0.14,0.0,0.0741,0.0,0.0248,0.0,0.0,0.0,0.00867,0.0,3.13,0.0,0.0,0.0,0.0,13.99,0.0163,0.73,0.41,0.39,0.0,0.0,0.0,0.00315,0.0,5.24,0.0,0.0322,0.0,0.015,0.51,0.0,0.48,0.2,0.14,0.25,0.2,0.14,0.28,0.25,0.0654,0.29,0.18,0.035,0.38,0.4,0.36,0.49,0.3,0.31,0.23,0.49,0.3,0.47,0.37,0.3,0.34,0.13,0.0429,0.0429,0.1,0.2,0.035,0.0,0.26,0.035,0.26,0.26,0.27,0.0,0.0,0.0,0.0,0.16,0.2,0.29,0.0654,0.15,0.14,1.76
min,44800000.0,0.0,0.0,0.0,0.0,0.0,-877.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,47200000.0,0.0,434.0,20.0,81.0,4.0,6.0,0.0,1.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.03,0.04,2.0,0.01,0.0,0.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,47400000.0,0.0,3270.0,571.0,6940.0,50.0,691.0,0.0,1.0,0.0,4.0,5.0,0.0,0.0,0.0,1.0,0.0,0.11,0.13,4.0,0.03,0.01,0.0,3440.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,47500000.0,0.0,20400.0,2479.0,59400.0,278.0,1818.75,0.0,1.0,0.0,7.0,7.0,0.0,0.0,0.0,1.0,0.0,0.26,0.3,9.0,0.05,0.03,0.0,22400.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,48800000.0,1.0,866000000.0,283525.0,14800000.0,3420.0,4222.0,1.0,1.0,0.0,40.0,29.0,1.0,8.0,1.0,1.0,1.0,101.83,110.35,1311.0,15.88,10.62,4.0,2890000000.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.35,0.0,89.37,0.0,0.0,0.0,0.0,32.55,0.38,17.12,16.57,12.25,0.0,0.0,0.0,0.09,0.0,50.83,0.0,1.3,0.0,0.35,8.62,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,27.0


In [12]:
training.groupby('changeset_harmful').size()

changeset_harmful
0.0    1596
1.0      35
dtype: int64

In [13]:
training.corr(method='pearson').head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
changeset_id,1.0,0.03,0.114,-0.0395,-0.0658,-0.104,-0.1,0.0101,-0.0101,,0.0202,0.03,-0.0165,0.0174,-0.09,0.07,0.000525,0.0859,0.0857,0.0466,0.0629,0.06,0.0067,0.0746,-0.13,-0.19,0.0278,-0.077,,-0.079,,0.0685,,,,-0.0477,,-0.0327,,,,,0.01,0.00997,0.0905,0.0578,-0.0204,,,,0.00254,,-0.0879,,-0.0225,,0.0619,-0.0641,,0.02,-0.0674,-0.09,0.0672,-0.00548,-0.0316,0.0201,0.04,-0.00895,0.08,-0.06,-0.023,-0.06,0.05,0.01,0.21,-0.00425,-0.00972,0.00713,0.0595,-0.000447,0.0546,-0.0204,0.00798,0.05,-0.03,0.0279,0.0279,-0.03,0.0326,0.0119,,-0.00843,0.0237,-0.00422,0.02,0.0135,,,,,-0.00533,-0.02,-0.00278,-0.00895,-0.05,-0.0275,-0.0469
changeset_harmful,0.03,1.0,0.142,-0.0422,-0.0178,-0.0512,-0.1,-0.00636,0.00636,,0.0539,-0.04,0.0863,0.0176,0.03,-0.02,0.00387,0.112,0.21,0.253,0.00436,0.02,0.0163,0.0577,0.15,-0.02,0.0604,-0.0213,,-0.011,,-0.00367,,,,-0.00367,,-0.00529,,,,,-0.02,-0.00636,0.29,-0.00367,-0.00516,,,,-0.00519,,0.0254,,-0.00367,,-0.00636,-0.00903,,-0.03,0.00988,0.04,-0.00471,0.0961,0.00956,-0.0298,-0.02,0.055,-0.03,0.02,-0.00519,0.01,-0.07,0.02,-0.09,0.0212,0.016,0.00087,-0.0592,-0.00759,-0.0518,-0.0302,-0.00529,-0.06,0.01,-0.00636,-0.00636,0.03,-0.0105,-0.00519,,0.00614,-0.00519,-0.00805,-0.02,-0.0289,,,,,-0.0247,-0.03,0.0107,0.055,-0.02,0.0104,0.0216
changeset_bbox_area,0.11,0.14,1.0,-0.00384,-0.00535,-0.00303,-0.01,-0.00338,0.00338,,0.107,-0.03,-0.00677,0.00121,-0.02,0.02,-0.0125,0.938,0.906,0.364,0.527,0.65,-0.00637,0.882,0.05,0.03,-0.00879,-0.0111,,-0.00587,,0.456,,,,-0.00195,,-0.00277,,,,,-0.06,-0.00338,-0.00325,-0.00194,-0.00265,,,,-0.00276,,-0.00831,,-0.000938,,-0.0029,-0.00475,,-0.06,-0.0146,0.04,0.0141,0.0452,-0.0108,0.00767,-0.02,-0.00518,0.04,-0.01,-0.00276,0.02,-0.12,0.08,-0.07,0.0104,0.00766,0.0286,-0.0437,0.0101,-0.0517,-0.0333,0.0114,0.04,-0.01,-0.00338,-0.00338,0.06,-0.0145,-0.00276,,-0.0222,-0.00276,-0.0217,-0.02,-0.0229,,,,,-0.013,-0.02,-0.0248,-0.00518,-0.01,-0.00475,0.00208
user_changesets,-0.04,-0.04,-0.00384,1.0,0.757,0.36,0.1,-0.0135,0.0135,,-0.0359,-0.1,-0.00992,-0.0497,-0.04,0.05,-0.0269,0.0077,-0.000725,-0.0238,0.0571,0.02,-0.023,-0.00642,-0.16,0.58,-0.0342,-0.00825,,0.00619,,0.0251,,,,-0.00804,,0.00186,,,,,-0.12,0.00489,-0.0138,-0.00804,-0.00948,,,,0.0289,,-0.0202,,-0.00569,,-0.012,-0.0153,,-0.11,0.0713,-0.03,-0.0143,-0.0101,-0.000932,-0.0153,-0.03,-0.016,-0.01,-0.04,-0.0114,0.2,-0.22,-0.03,-0.23,-0.0437,-0.0391,-0.0543,0.0671,-0.0489,-0.0691,-0.0195,-0.0455,-0.03,-0.03,-0.0139,-0.0139,-0.02,0.00886,-0.0114,,-0.052,-0.0114,-0.0593,0.01,0.00481,,,,,0.139,0.29,-0.0298,-0.016,-0.04,-0.0388,-0.0388
user_features,-0.07,-0.02,-0.00535,0.757,1.0,0.544,0.28,-0.0149,0.0149,,0.00609,-0.04,0.0167,-0.0189,-0.05,0.05,-0.00892,-0.00242,-0.000866,-0.00498,0.044,0.03,-0.0216,-0.00805,-0.16,0.43,-0.0387,0.0359,,0.0149,,0.0244,,,,-0.00864,,0.0867,,,,,-0.1,0.0256,-0.0149,-0.00864,-0.00837,,,,0.00248,,-0.0274,,-0.00357,,-0.00404,-0.0155,,-0.1,0.0655,-0.05,0.0154,-0.0209,0.00534,0.00182,0.02,-0.0206,-0.01,-0.06,-0.0122,0.13,-0.15,-0.03,-0.17,-0.0107,-0.00741,-0.0295,0.00274,-0.0138,-0.00361,0.00315,-0.00856,0.04,-0.02,-0.015,-0.015,-0.03,0.00771,-0.0122,,-0.0173,-0.0122,-0.0374,-0.02,-3.94e-05,,,,,0.0978,0.2,0.0576,-0.0206,-0.04,-0.0466,-0.004


In [14]:
skew = training.skew()
skew.sort(ascending=False)
skew.head()

route                         40.39
amenity                       40.39
leisure                       40.39
Merkaartor                    40.39
feature_node_distance_mean    37.98
dtype: float64

#### 2b. Data visualizations

In [15]:
# Histograms.
# training.hist(figsize=(20, 20));

In [16]:
# Density plot.
# training.plot.density(figsize=(20, 50), subplots=True, sharex=False);

In [17]:
# training.plot.box(layout=(5, 4), figsize=(20, 25), subplots=True, sharex=False, sharey=False);

In [18]:
# sns.heatmap(training.corr(method='pearson'));

## 3. Prepare data

#### 3a. Data cleaning

#### 3b. Feature selection

In [19]:
non_training_attributes = ['changeset_id', 'changeset_harmful']
training_attributes = list(set(training.columns) - set(non_training_attributes))

# Drop rows with any null in training attributes.
training = training.dropna(subset=training_attributes)

X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

#### 3c. Data transforms

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [21]:
# Estimate importance of all features.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X_scaled, y)

importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head()

Unnamed: 0,feature,importance
4,user_days_since_first_edit,0.06
17,feature_node_count,0.06
16,feature_line_distance,0.06
0,changeset_bbox_area,0.06
1,user_changesets,0.05


## 4. Evaluate algorithms

#### 4a. Split-out validation dataset
- We have a seperate validation dataset. Yay!

#### 4b. Spot Check Algorithms
- Running algorithms in a loop below.

#### 4c. Compare Algorithms

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

results = []
names = []
for (name, model) in models:
    kfold = KFold(n_splits=2, random_state=7)
    result = cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')
    results.append(result)
    names.append(name)
    
    print('{}: {} ({})'.format(name, round(result.mean(), 2), round(result.std(), 2)))

LR: 0.78 (0.05)
CART: 0.6 (0.01)
KNN: 0.66 (0.01)
SVM: 0.81 (0.03)
RFC: 0.67 (0.03)
GBC: 0.76 (0.02)


In [23]:
# NOTE: Temporarily turning off the graph.
# fig, ax = plt.subplots(1, 1)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# ax.set_ylabel('roc_auc')
# ax.set_ylim((0, 1));

## 5. Improve Accuracy

#### 5a. Algorithm Tuning

In [24]:
harmful_count = training[training['changeset_harmful'] == True].shape[0]
not_harmful_count = training[training['changeset_harmful'] == False].shape[0]

# Making this a float to prevent truncation due to integer division.
count = 1.0 * (harmful_count + not_harmful_count)

# Calculate sample weights based on number of occourances.
sample_weight = [count if 10 else 1 for l in y]

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1,  0.5],
    'max_features': ['log2'],
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, fit_params={'sample_weight': sample_weight})
grid.fit(X_scaled, y)

print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9785276073619632
Best params: {'learning_rate': 0.05, 'random_state': 5, 'max_features': 'log2', 'n_estimators': 300, 'max_depth': 3}


In [26]:
model = grid.best_estimator_
y_model = model.predict(X_scaled)

In [27]:
training['prediction'] = model.predict(X_scaled)
training.to_csv(path + 'training-review.csv', index=False)

#### 5b. Test options and evaluation metric

In [28]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y, y_model)

matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,1595,0
Labelled harmful,12,23


In [29]:
total = matrix.sum().sum()
matrix / total

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,0.979,0.0
Labelled harmful,0.00736,0.01


In [30]:
from sklearn.metrics import classification_report

report = classification_report(y, y_model)
print(report)

             precision    recall  f1-score   support

        0.0       0.99      1.00      1.00      1595
        1.0       1.00      0.66      0.79        35

avg / total       0.99      0.99      0.99      1630



In [31]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')

# An area of 0.5 represents a model that is as good as random.
# An area of 1.0 represents a model that made all predictions perfectly.
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.83 (0.02)


#### 5c. Ensembles
- Todo

## 6. Finalize model

#### 6a. Predictions on validation dataset

In [32]:
validation = pd.read_csv(path + 'validation.csv')
validation.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
0,46415124,0.0,29.06,10678,444702,1062,1819.0,0,1,0,2,7,0,0,0,1,0,0.01,0.01,2,0.01,0.0,0,17.81,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,4.13,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,3
1,47668474,0.0,0.0,1239,21161,426,674.0,0,1,0,4,10,0,0,1,0,0,0.0,0.0,1,0.0,0.0,0,0.0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,1.64,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,50.83,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9
2,47307997,0.0,27943.33,106026,2794751,737,972.0,0,1,0,2,2,0,0,0,1,0,0.27,0.27,2,0.27,0.0,0,37047.15,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.51,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,47516500,0.0,123.41,9784,828415,1486,3441.0,0,1,0,2,6,0,0,0,1,0,0.02,0.02,2,0.02,0.0,0,76.76,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,4.13,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2
4,46340958,0.0,20539.0,23,30,1,0.0,0,1,0,7,11,0,3,0,1,0,0.56,0.59,21,0.03,0.02,0,133466.89,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,4.13,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1


In [33]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(validation.shape))
validation = validation.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(validation.shape))

# Drop rows with any null in training attributes.
validation = validation.dropna(subset=training_attributes)

Shape before dropping duplicates: (700, 107)
Shape after dropping duplicates: (700, 107)


In [34]:
validation.groupby('changeset_harmful').size()

changeset_harmful
0.0    690
1.0     10
dtype: int64

In [35]:
# TODO: Replace with a pipeline
vX = validation.drop(non_training_attributes, axis=1)
vX_scaled = scaler.transform(vX)

vy = validation['changeset_harmful']

In [36]:
vy_model = model.predict(vX_scaled)

In [37]:
validation['prediction'] = model.predict(vX_scaled)
validation.to_csv(path + 'validation-review.csv', index=False)

In [38]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, vy_model)
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,689,1
Labelled harmful,9,1


In [39]:
from sklearn.metrics import classification_report

report = classification_report(vy, vy_model)
print(report)

             precision    recall  f1-score   support

        0.0       0.99      1.00      0.99       690
        1.0       0.50      0.10      0.17        10

avg / total       0.98      0.99      0.98       700



In [40]:
scores = cross_val_score(model, vX_scaled, vy, cv=kfold, scoring='roc_auc')
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.87 (0.06)


#### 6b. Create standalone model on entire training dataset
- Todo

#### 6c. Predictions on testing dataset

In [41]:
testing = pd.read_csv(path + 'testing.csv')
testing.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
0,48345326,,192132.11,3486,287581,619,2744.0,0,1,0,4,3,0,0,0,1,0,0.56,0.56,22,0.03,0.0195,0,131409.26,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,11.37,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,48346788,,137.23,11155,229879,1091,1356.0,0,1,0,2,4,0,0,0,1,0,0.01,0.01,2,0.01,0.0,0,95.83,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,4.56,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,2
2,48310515,,1180.01,132,1068,50,865.0,0,1,0,3,6,0,0,0,1,0,0.05,0.05,4,0.02,0.00457,0,797.24,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,2.7,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,48309850,,142922.68,19,41,11,1866.0,0,1,0,2,6,0,0,0,1,0,0.44,0.44,2,0.44,0.0,0,86795.81,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,6.2,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3
4,48437377,,1389.89,46227,494734,229,370.0,0,1,0,2,1,0,0,0,1,0,0.11,0.21,8,0.03,0.028,0,9370.47,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,32.55,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [42]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(testing.shape))
testing = testing.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(testing.shape))

Shape before dropping duplicates: (6700, 107)
Shape after dropping duplicates: (6700, 107)


In [43]:
testing = testing.dropna(subset=training_attributes)
print('After samples rows with null: {}'.format(testing.shape))

After samples rows with null: (6700, 107)


In [44]:
# TODO: Replace with a pipeline
tX = testing.drop(non_training_attributes, axis=1)
tX_scaled = scaler.transform(tX)
# ty = testing['changeset_harmful']

In [45]:
testing['prediction'] = model.predict(tX_scaled)

In [46]:
tharmful_count = testing[testing['prediction'] == True].shape[0]
tnot_harmful_count = testing[testing['prediction'] == False].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 6694
Predicted harmful: 6
Percentage harmful: 0.09%


In [47]:
testing.to_csv(path + 'testing-review.csv', index=False)

#### 6d. Save model for later use

In [48]:
from sklearn.externals import joblib

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path)

['../gabbar/trained/model.pkl']