# Highway classifier - Gabbar

## Step 1. Prepare problem

#### 1a. Load libraries

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [4]:
path = '../downloads/highway-classifier/'

#### 1b. Load dataset

In [5]:
# Loading the training dataset.
training = pd.read_csv(path + 'training.csv')

In [6]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(training.shape))
training = training.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(training.shape))

Shape before dropping duplicates: (1631, 108)
Shape after dropping duplicates: (1631, 108)


In [7]:
# Creating a smaller sample to speed up workflow.
# training = training[:500]

In [8]:
training.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,feature_name_profanity,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
0,47403163,0.0,62.02,9784,828415,1486,3438.0,0,1,0,4,3,0,0,0,1,0,0.00987,0.00987,2,0.00987,0.0,0,38.59,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,8.71,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,47305802,0.0,11209.44,58,573,10,0.0,0,1,0,2,3,0,0,0,1,0,0.251,0.255,4,0.0851,0.07,0,19413.46,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,15.06,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,46887225,0.0,0.0,19,63,4,933.0,0,1,0,2,3,0,0,1,0,0,0.0,0.0,1,0.0,0.0,0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,1.64,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,47308005,0.0,16205.88,106026,2794751,737,972.0,0,1,0,2,2,0,0,0,1,0,0.265,0.265,2,0.265,0.0,0,21475.87,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.51,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,46472786,0.0,5168.01,11670,375496,1903,2394.0,0,1,0,9,6,0,0,0,1,0,0.155,0.155,2,0.155,0.0,0,8132.71,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,32.55,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## 2. Summarize data

#### 2a. Descriptive statistics

In [9]:
print('Shape: {}'.format(training.shape))

Shape: (1631, 108)


In [10]:
training.dtypes.head()

changeset_id             int64
changeset_harmful      float64
changeset_bbox_area    float64
user_changesets          int64
user_features            int64
dtype: object

In [11]:
training.describe()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,feature_name_profanity,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
count,1630.0,1631.0,1630.0,1631.0,1630.0,1631.0,1630.0,1630.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1630.0,1631.0,1631.0,1631.0,1630.0,1630.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1630.0,1631.0,1631.0,1630.0,1630.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1630.0,1630.0,1631.0,1630.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1631.0,1630.0,1631.0,1631.0,1631.0
mean,47400000.0,0.02,3690000.0,7642.74,314000.0,312.37,1107.75,0.00245,1.0,0.0,5.8,5.71,0.00674,0.27,0.05,0.93,0.02,0.58,0.81,14.08,0.06,0.03,0.02,6130000.0,0.00307,0.29,0.13,0.01,0.02,0.0,0.00674,0.0,0.000613,0.0,0.0,0.0,0.000215,0.000472,0.11,0.0,0.0,0.00443,0.0,14.14,0.00116,0.02,0.04,0.0,0.0,0.000711,0.0,5.52e-05,0.0,0.65,0.00318,0.000797,0.0,0.000429,0.02,0.0,0.35,0.05,0.02,0.07,0.04,0.02,0.08,0.07,0.00368,0.1,0.03,0.00123,0.16,0.79,0.15,0.6,0.1,0.11,0.06,0.42,0.1,0.33,0.17,0.09,0.15,0.02,0.00184,0.00184,0.0092,0.04,0.00123,0.0,0.07,0.00123,0.07,0.07,0.09,0.0,0.0,0.0,0.0,0.03,0.04,0.1,0.00429,0.02,0.02,2.06
std,479000.0,0.14,46400000.0,24217.77,990000.0,521.48,1191.6,0.0495,0.05,0.0,4.89,3.23,0.0819,0.87,0.21,0.25,0.14,4.53,5.82,65.04,0.4,0.28,0.26,94700000.0,0.0553,0.46,0.34,0.1,0.15,0.0,0.0819,0.0,0.0248,0.0,0.0,0.0,0.00867,0.0191,3.13,0.0,0.0,0.179,0.0,13.92,0.021,0.6,0.82,0.0,0.0,0.0287,0.0,0.00223,0.0,5.67,0.129,0.0322,0.0,0.0123,0.43,0.0,0.48,0.21,0.15,0.25,0.21,0.14,0.28,0.25,0.0606,0.3,0.17,0.035,0.37,0.41,0.36,0.49,0.3,0.31,0.23,0.49,0.3,0.47,0.37,0.29,0.36,0.14,0.0429,0.0429,0.0955,0.2,0.035,0.0,0.26,0.035,0.25,0.26,0.29,0.0,0.0,0.0,0.0,0.17,0.2,0.3,0.0654,0.15,0.13,1.6
min,44800000.0,0.0,0.0,0.0,0.0,0.0,-27.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,47200000.0,0.0,412.0,23.0,82.0,5.0,6.25,0.0,1.0,0.0,3.0,3.0,0.0,0.0,0.0,1.0,0.0,0.03,0.04,2.0,0.01,0.0,0.0,400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,47400000.0,0.0,3250.0,663.0,8030.0,79.0,691.0,0.0,1.0,0.0,4.0,5.0,0.0,0.0,0.0,1.0,0.0,0.11,0.12,4.0,0.03,0.01,0.0,3270.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,47500000.0,0.0,18500.0,2763.0,67200.0,291.0,2009.0,0.0,1.0,0.0,7.0,7.0,0.0,0.0,0.0,1.0,0.0,0.24,0.29,9.0,0.05,0.03,0.0,19100.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,48800000.0,1.0,866000000.0,283525.0,14800000.0,3420.0,4222.0,1.0,1.0,0.0,62.0,24.0,1.0,8.0,1.0,1.0,1.0,101.83,110.35,1562.0,15.88,10.62,4.0,2890000000.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.35,0.77,89.37,0.0,0.0,7.22,0.0,32.55,0.38,17.12,16.57,0.0,0.0,1.16,0.0,0.09,0.0,50.83,5.19,1.3,0.0,0.35,8.62,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0


In [12]:
training.groupby('changeset_harmful').size()

changeset_harmful
0.0    1599
1.0      32
dtype: int64

In [13]:
training.corr(method='pearson').head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,feature_name_profanity,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
changeset_id,1.0,0.02,0.125,-0.0415,-0.0572,-0.0952,-0.08,0.023,-0.023,,-0.03,0.00545,-0.00543,0.00888,-0.08,0.06,0.02,0.0843,0.103,0.0825,0.0629,0.07,0.00564,0.0882,0.112,-0.14,-0.191,-0.00809,-0.0324,,-0.0602,,0.0707,,,,-0.049,0.00189,-0.0336,,,-0.0368,,0.02,0.014,0.0818,0.0351,,,0.0143,,0.0526,,-0.0726,0.00072,-0.0231,,0.0525,-0.0476,,0.03,-0.09,-0.09,0.0697,0.0312,-0.00887,-0.0194,0.0374,0.0248,0.0661,-0.0634,-0.0235,-0.05,0.04,0.0118,0.23,-0.0109,-0.0206,-0.0145,0.03,-0.00922,0.0488,-0.0296,0.00667,0.03,-0.04,0.0287,0.0287,0.00764,-0.00145,0.0125,,0.01,0.0242,0.01,-0.0206,0.0107,,,,,-0.02,-0.02,-0.03,0.0414,-0.05,-0.00143,-0.0784
changeset_harmful,0.02,1.0,0.151,-0.0403,-0.0192,-0.0473,-0.1,-0.00701,0.00701,,0.05,-0.049,0.0424,-0.00818,0.03,-0.07,0.07,0.116,0.208,0.197,0.00453,0.02,0.0232,0.0593,0.0721,0.15,-0.017,0.0697,-0.0213,,-0.0117,,-0.0035,,,,-0.0035,-0.0035,-0.00503,,,-0.0035,,-0.05,-0.00784,0.252,0.172,,,0.175,,-0.0035,,0.0235,-0.0035,-0.0035,,-0.00496,0.0823,,-0.06,0.01,0.04,-0.00211,0.119,0.0119,-0.0265,-0.0382,0.0644,-0.0174,0.000487,-0.00496,0.03,-0.08,0.00264,-0.11,-0.00235,-0.00566,0.00453,-0.06,-0.0322,-0.0623,-0.0275,-0.0306,-0.06,0.01,-0.00607,-0.00607,0.0327,-0.00777,-0.00496,,-0.02,-0.00496,-0.02,-0.0397,-0.0289,,,,,-0.02,-0.03,-0.02,0.0583,-0.02,0.0153,0.0474
changeset_bbox_area,0.13,0.15,1.0,-0.00304,0.00106,-0.00319,-0.02,-0.00394,0.00394,,0.11,-0.0254,-0.00653,0.0011,-0.02,0.02,-0.01,0.932,0.902,0.385,0.528,0.67,-0.00592,0.885,-0.00439,0.07,0.00871,-0.0084,-0.0117,,-0.00655,,0.461,,,,-0.00196,-0.00197,-0.00279,,,-0.00197,,-0.06,-0.00441,-0.00281,-0.00393,,,-0.00197,,-0.00197,,-0.00907,-0.00195,-0.000945,,-0.00221,-0.00397,,-0.06,-0.02,0.03,0.0139,0.0634,-0.0105,-0.0217,-0.00193,-0.00484,0.0414,-0.0142,-0.00279,0.02,-0.1,0.11,-0.07,0.0109,0.00868,0.0292,-0.04,0.0107,-0.0538,-0.0344,0.0121,0.05,-0.01,-0.00341,-0.00341,0.0554,-0.0143,-0.00279,,-0.02,-0.00279,-0.02,-0.0216,-0.0238,,,,,-0.01,-0.02,-0.03,-0.00522,-0.01,-0.00956,0.0103
user_changesets,-0.04,-0.04,-0.00304,1.0,0.809,0.413,0.12,-0.0153,0.0153,,-0.05,-0.0833,0.00311,-0.0476,-0.04,0.05,-0.03,0.0093,-0.00179,-0.0176,0.0478,0.02,-0.0196,-0.00467,0.0189,-0.16,0.557,-0.0301,-0.00753,,0.00294,,0.0227,,,,-0.00782,-0.00781,0.00117,,,0.00311,,-0.09,0.0049,-0.0112,-0.0125,,,-0.0078,,0.0434,,-0.0243,-0.00575,-0.00566,,-0.00974,-0.0126,,-0.11,0.15,-0.03,-0.0121,-0.0056,-0.0036,-0.0149,-0.0393,-0.0163,0.00854,-0.0417,-0.011,0.13,-0.22,-0.0434,-0.19,-0.0454,-0.0401,-0.0363,0.03,-0.0456,-0.0563,-0.00656,-0.0432,-0.03,0.05,-0.0135,-0.0135,-0.0164,0.0706,-0.011,,-0.04,-0.0111,-0.05,0.0239,0.0188,,,,,0.15,0.25,-0.02,-0.0161,-0.04,-0.0336,-0.0324
user_features,-0.06,-0.02,0.00106,0.809,1.0,0.567,0.27,-0.0156,0.0156,,-0.02,-0.048,0.0116,-0.032,-0.05,0.05,-0.02,0.00802,0.00576,0.00389,0.0365,0.02,-0.0165,0.00114,-0.0042,-0.15,0.386,-0.0317,0.0372,,0.0132,,0.0215,,,,-0.00786,-0.00785,0.0767,,,0.00328,,-0.1,0.0288,-0.0113,-0.0127,,,-0.00785,,0.00889,,-0.0314,-0.00753,-0.00336,,0.000508,-0.0128,,-0.12,0.16,-0.03,0.0176,-0.0165,0.0028,0.00565,0.0312,-0.0184,-0.00486,-0.0499,-0.0111,0.08,-0.16,-0.0317,-0.14,-0.0324,-0.0318,-0.0223,-0.02,-0.0329,-0.00234,0.0211,-0.0296,0.03,0.06,-0.0136,-0.0136,-0.0263,0.0647,-0.0111,,-0.03,-0.0111,-0.05,-0.00634,-0.00911,,,,,0.09,0.16,0.05,-0.0199,-0.04,-0.0354,-0.00999


In [14]:
skew = training.skew()
skew.sort(ascending=False)
skew.head()

place      40.39
natural    40.39
route      40.39
barrier    40.39
railway    40.39
dtype: float64

#### 2b. Data visualizations

In [15]:
# Histograms.
# training.hist(figsize=(20, 20));

In [16]:
# Density plot.
# training.plot.density(figsize=(20, 50), subplots=True, sharex=False);

In [17]:
# training.plot.box(layout=(5, 4), figsize=(20, 25), subplots=True, sharex=False, sharey=False);

In [18]:
# sns.heatmap(training.corr(method='pearson'));

## 3. Prepare data

#### 3a. Data cleaning

#### 3b. Feature selection

In [19]:
non_training_attributes = ['changeset_id', 'changeset_harmful']
training_attributes = list(set(training.columns) - set(non_training_attributes))

# Drop rows with any null in training attributes.
training = training.dropna(subset=training_attributes)

X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

#### 3c. Data transforms

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [21]:
# Estimate importance of all features.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X_scaled, y)

importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head()

Unnamed: 0,feature,importance
16,feature_line_distance,0.07
43,landuse,0.05
0,changeset_bbox_area,0.05
17,feature_node_count,0.05
1,user_changesets,0.05


## 4. Evaluate algorithms

#### 4a. Split-out validation dataset
- We have a seperate validation dataset. Yay!

#### 4b. Spot Check Algorithms
- Running algorithms in a loop below.

#### 4c. Compare Algorithms

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

results = []
names = []
for (name, model) in models:
    kfold = KFold(n_splits=2, random_state=7)
    result = cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')
    results.append(result)
    names.append(name)
    
    print('{}: {} ({})'.format(name, round(result.mean(), 2), round(result.std(), 2)))

LR: 0.79 (0.03)
CART: 0.59 (0.03)
KNN: 0.62 (0.0)
SVM: 0.8 (0.0)
RFC: 0.68 (0.05)
GBC: 0.84 (0.01)


In [23]:
# NOTE: Temporarily turning off the graph.
# fig, ax = plt.subplots(1, 1)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# ax.set_ylabel('roc_auc')
# ax.set_ylim((0, 1));

## 5. Improve Accuracy

#### 5a. Algorithm Tuning

In [24]:
harmful_count = training[training['changeset_harmful'] == True].shape[0]
not_harmful_count = training[training['changeset_harmful'] == False].shape[0]

# Making this a float to prevent truncation due to integer division.
count = 1.0 * (harmful_count + not_harmful_count)

# Calculate sample weights based on number of occourances.
sample_weight = [count if 10 else 1 for l in y]

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1,  0.5],
    'max_features': ['log2'],
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, fit_params={'sample_weight': sample_weight})
grid.fit(X_scaled, y)

print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9815950920245399
Best params: {'max_features': 'log2', 'n_estimators': 300, 'random_state': 5, 'learning_rate': 0.05, 'max_depth': 7}


In [26]:
model = grid.best_estimator_
y_model = model.predict(X_scaled)

In [27]:
training['prediction'] = model.predict(X_scaled)
training.to_csv(path + 'training-review.csv', index=False)

#### 5b. Test options and evaluation metric

In [28]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y, y_model)

matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,1598,0
Labelled harmful,0,32


In [29]:
total = matrix.sum().sum()
matrix / total

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,0.98,0.0
Labelled harmful,0.0,0.02


In [30]:
from sklearn.metrics import classification_report

report = classification_report(y, y_model)
print(report)

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      1598
        1.0       1.00      1.00      1.00        32

avg / total       1.00      1.00      1.00      1630



In [31]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')

# An area of 0.5 represents a model that is as good as random.
# An area of 1.0 represents a model that made all predictions perfectly.
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.84 (0.02)


#### 5c. Ensembles
- Todo

## 6. Finalize model

#### 6a. Predictions on validation dataset

In [32]:
validation = pd.read_csv(path + 'validation.csv')
validation.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,feature_name_profanity,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
0,46399560,0.0,5082.23,7,1233,2,321.0,0,1,0,2,7,0,0,0,1,0,0.0,0.81,76,0.01,0.00553,0,32800.0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,6.2,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,2
1,47480404,0.0,671.61,17,21,3,-5.0,0,1,0,6,5,0,0,0,1,0,0.04,0.04,4,0.01,0.00716,0,442.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,1.81,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,47921583,0.0,5377.24,41,103,3,0.0,0,1,0,7,8,0,2,0,1,0,1.18,1.18,26,0.05,0.0384,0,8060.0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,4.13,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
3,46472513,0.0,606143.65,85,1961,9,95.0,0,1,0,6,6,0,0,0,1,0,1.68,3.09,87,0.04,0.0234,4,1790000.0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,6.2,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3
4,47308236,0.0,13051.06,106026,2794751,737,972.0,0,1,0,2,2,0,0,0,1,0,0.23,0.23,2,0.23,0.0,0,17300.0,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.51,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [33]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(validation.shape))
validation = validation.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(validation.shape))

# Drop rows with any null in training attributes.
validation = validation.dropna(subset=training_attributes)

Shape before dropping duplicates: (700, 108)
Shape after dropping duplicates: (700, 108)


In [34]:
validation.groupby('changeset_harmful').size()

changeset_harmful
0.0    687
1.0     13
dtype: int64

In [35]:
# TODO: Replace with a pipeline
vX = validation.drop(non_training_attributes, axis=1)
vX_scaled = scaler.transform(vX)

vy = validation['changeset_harmful']

In [36]:
vy_model = model.predict(vX_scaled)

In [37]:
validation['prediction'] = model.predict(vX_scaled)
validation.to_csv(path + 'validation-review.csv', index=False)

In [38]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, vy_model)
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,686,1
Labelled harmful,11,2


In [39]:
from sklearn.metrics import classification_report

report = classification_report(vy, vy_model)
print(report)

             precision    recall  f1-score   support

        0.0       0.98      1.00      0.99       687
        1.0       0.67      0.15      0.25        13

avg / total       0.98      0.98      0.98       700



In [40]:
scores = cross_val_score(model, vX_scaled, vy, cv=kfold, scoring='roc_auc')
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.74 (0.05)


#### 6b. Create standalone model on entire training dataset
- Todo

#### 6c. Predictions on testing dataset

In [41]:
testing = pd.read_csv(path + 'testing.csv')
testing.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_bbox_area,user_changesets,user_features,user_mapping_days,user_days_since_first_edit,feature_action_create,feature_action_modify,feature_action_delete,feature_version,feature_total_tags,feature_personal_count,feature_similar_tags_count,feature_point,feature_linestring,feature_polygon,feature_distance,feature_line_distance,feature_node_count,feature_node_distance_mean,feature_node_distance_stddev,feature_kinks,feature_area,feature_name_profanity,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,residential,service,track,unclassified,footway,path,tertiary,secondary,crossing,primary,bus_stop,turning_circle,other,name,source,surface,tiger:cfcc,tiger:county,tiger:reviewed,oneway,tiger:name_base,maxspeed,lanes,tiger:name_type,ref,service.1,tiger:source,tiger:tlid,tracktype,access,tiger:upload_uuid,yh:WIDTH,tiger:zip_left,tiger:separated,tiger:zip_right,foot,bicycle,yh:TOTYUMONO,yh:WIDTH_RANK,yh:STRUCTURE,yh:TYPE,bridge,layer,lit,crossing.1,tiger:name_direction_prefix,width,other.1
0,48458577,,1680000.0,352,15033,55,597.0,0,1,0,7,1,0,0,0,1,0,1.53,2.07,47,0.0449,0.05,0,1180000.0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,11.37,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,48300306,,1640.0,39,60,7,119.0,0,1,0,2,5,0,0,0,1,0,0.051,0.051,2,0.051,0.0,0,1130.0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,15.06,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1
2,48392159,,0.252,3943,155606,73,90.0,0,1,0,2,1,0,0,0,1,0,0.00611,0.00611,2,0.00611,0.0,0,0.953,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,15.06,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,48295980,,426000.0,25973,976168,1933,2878.0,0,1,0,4,7,0,2,0,1,0,0.834,2.61,58,0.0459,0.03,0,616000.0,0,0,1,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,11.37,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,48348606,,10400000.0,12,198,1,0.0,0,1,0,2,1,0,0,0,1,0,5.34,5.58,34,0.169,0.21,0,11200000.0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.28,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [42]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(testing.shape))
testing = testing.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(testing.shape))

Shape before dropping duplicates: (6700, 108)
Shape after dropping duplicates: (6700, 108)


In [43]:
testing = testing.dropna(subset=training_attributes)
print('After samples rows with null: {}'.format(testing.shape))

After samples rows with null: (6700, 108)


In [44]:
# TODO: Replace with a pipeline
tX = testing.drop(non_training_attributes, axis=1)
tX_scaled = scaler.transform(tX)
# ty = testing['changeset_harmful']

In [45]:
testing['prediction'] = model.predict(tX_scaled)

In [46]:
tharmful_count = testing[testing['prediction'] == True].shape[0]
tnot_harmful_count = testing[testing['prediction'] == False].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 6692
Predicted harmful: 8
Percentage harmful: 0.12%


In [47]:
testing.to_csv(path + 'testing-review.csv', index=False)

#### 6d. Save model for later use

In [48]:
from sklearn.externals import joblib

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path)

['../gabbar/trained/model.pkl']