# Feature classifier - Gabbar

## Step 1. Prepare problem

#### 1a. Load libraries

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

#### 1b. Load dataset

In [4]:
# Loading the training dataset.
attributes = pd.read_csv('../downloads/feature-classifier-2017-05-25/training/attributes.csv')

In [5]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(attributes.shape))
attributes = attributes.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(attributes.shape))

Shape before dropping duplicates: (5268, 141)
Shape after dropping duplicates: (5268, 141)


In [6]:
# Creating a smaller sample to speed up workflow.
# attributes = attributes[:500]

In [7]:
attributes.head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,aerialway_modified,aeroway_modified,amenity_modified,barrier_modified,boundary_modified,building_modified,craft_modified,emergency_modified,geological_modified,highway_modified,historic_modified,landuse_modified,leisure_modified,man_made_modified,military_modified,natural_modified,office_modified,place_modified,power_modified,public_transport_modified,railway_modified,route_modified,shop_modified,sport_modified,tourism_modified,waterway_modified,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
0,48269805,0,0,1,0,0,0,0,0,10504,0,0,0,0,1,1,1,0,0,0,0,2145,9123,213,15,7,7,0,95,2,9350,4,2,0,0,0,0,1,0,1,0,0,2,9350,4,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0035,0.81,0.407,0.403
1,48269170,1,0,1,0,1,0,1,0,0,0,0,0,0,2405,125105,400,0,0,0,0,2405,125105,400,0,0,3,0,934,2,0,10,7,0,0,0,1,6,1,8,0,0,1,0,10,8,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0032,0.0043,0.0037,0.0005
2,48267592,1,0,1,0,1,0,4,0,628386,0,0,0,0,11,48,2,0,0,0,0,4364,1115802,1783,16,30,3,0,743,1,853363,1,0,0,0,0,1,0,1,2,0,0,1,853363,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0091,0.0091,0.0091,0.0
3,48266008,0,0,1,0,1,0,2,0,2217,0,0,0,0,20,163,2,0,1,0,0,662,45987,90,0,0,2,0,2147,2,754,4,1,0,0,0,4,0,1,5,0,0,1,754,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0714,0.294,0.183,0.111
4,48265965,0,0,1,0,1,0,2,0,0,0,0,0,0,100,1599,5,0,1,0,0,4393,332332,517,1145,28,37,0,39,1,0,12,8,0,1,1,0,3,0,3,0,0,1,0,12,8,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0221,0.0221,0.0221,0.0
5,48264238,0,0,1,0,1,0,5,0,0,0,0,0,1,43032,2458846,992,8,20,0,0,5,18,3,0,0,8,0,149,1,0,8,5,0,0,0,0,1,0,1,0,0,1,0,8,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0221,0.0221,0.0221,0.0
6,48260457,1,0,1,0,1,0,3,0,8602,0,0,0,0,2,15,1,0,0,0,0,2,15,1,0,0,2,0,0,1,16918,6,1,0,0,0,1,0,0,1,0,0,1,16918,5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0298,0.0298,0.0298,0.0
7,48256321,0,0,1,0,1,0,12,0,4281722,0,0,0,0,1174,320767,118,2,4,0,0,1174,320767,118,2,4,2,0,0,1,0,2,0,0,0,0,0,0,1,1,0,0,1,0,3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0871,0.0871,0.0871,0.0
8,48255884,1,0,1,0,1,0,4,0,72287,0,0,0,0,59,1021,13,0,0,0,0,59,1021,13,0,0,2,0,0,2,0,4,0,0,0,0,1,0,0,1,0,0,1,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.062,0.171,0.117,0.0546
9,48255854,1,0,1,0,1,0,4,0,173848,0,0,0,0,59,1021,13,0,0,0,0,59,1021,13,0,0,2,0,0,2,0,4,0,0,0,0,1,0,0,1,0,0,1,0,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.062,0.171,0.117,0.0546


## 2. Summarize data

#### 2a. Descriptive statistics

In [8]:
print('Shape: {}'.format(attributes.shape))

Shape: (5268, 141)


In [9]:
attributes.dtypes.head(10)

changeset_id                             int64
changeset_harmful                        int64
changeset_features_created               int64
changeset_features_modified              int64
changeset_features_deleted               int64
changeset_has_imagery_used               int64
changeset_has_source                     int64
changeset_comment_number_of_words        int64
changeset_comment_naughty_words_count    int64
changeset_bbox_area                      int64
dtype: object

In [10]:
attributes.describe()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,aerialway_modified,aeroway_modified,amenity_modified,barrier_modified,boundary_modified,building_modified,craft_modified,emergency_modified,geological_modified,highway_modified,historic_modified,landuse_modified,leisure_modified,man_made_modified,military_modified,natural_modified,office_modified,place_modified,power_modified,public_transport_modified,railway_modified,route_modified,shop_modified,sport_modified,tourism_modified,waterway_modified,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
count,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5270.0,5270.0,5268.0,5270.0,5268.0,5268.0,5270.0,5268.0,5268.0,5268.0,5270.0,5268.0,5268.0,5270.0,5268.0,5268.0,5268.0,5268.0,5270.0,5268.0,5268.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5268.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5270.0,5270.0,5268.0,5268.0,5270.0,5268.0,5270.0,5268.0,5268.0,5270.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5270.0,5268.0,5270.0,5268.0,5270.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5268.0,5270.0,5268.0,5270.0,5268.0,5268.0,5270.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5270.0,5268.0,5270.0,5268.0,5270.0,5270.0,5270.0,5268.0,5268.0,5268.0,5268.0,5270.0,5268.0,5268.0,5270.0,5270.0,5270.0,5268.0,5268.0,5268.0,5268.0,5268.0,5268.0,5270.0,5270.0,5268.0,5268.0,5270.0,5270.0,5270.0,5268.0,5268.0,5270.0,5268.0,5270.0,5270.0,5270.0,5270.0,5268.0,5268.0,5268.0,5268.0
mean,47300000.0,0.08,0.0,1.0,0.0,0.47,0.37,4.21,0.00114,12800000000.0,0.000759,0.84,0.00019,0.37,5034.71,257000.0,260.57,23.54,108.37,0.00171,0.44,6703.41,1350000.0,559.31,51.36,20.74,6.27,0.0038,639.46,1.15,15400000000.0,6.57,1.55,0.11,0.06,0.08,1.33,0.35,0.19,1.86,0.01,0.00171,1.09,15200000000.0,5.43,1.35,0.07,0.05,0.06,0.00702,0.47,0.12,0.09,0.02,0.0,0.00569,0.00266,0.00019,0.0,0.0,0.00152,0.16,0.00721,0.01,0.18,0.00171,0.00171,0.0,0.4,0.01,0.03,0.08,0.00475,0.000759,0.03,0.00968,0.05,0.00114,0.00892,0.01,0.01,0.08,0.02,0.04,0.00399,0.0,0.00133,0.14,0.0055,0.01,0.18,0.00171,0.00133,0.0,0.4,0.01,0.03,0.07,0.00475,0.000569,0.03,0.00797,0.05,0.00114,0.00835,0.00664,0.01,0.07,0.02,0.04,0.00399,0.0,0.0,0.00513,0.000569,0.00019,0.02,0.0,0.0,0.0,0.02,0.0,0.00418,0.00702,0.0,0.0,0.00114,0.000949,0.00569,0.0,0.0,0.00038,0.0,0.00532,0.00854,0.000949,0.00019,0.14,0.2,0.17,0.03
std,627000.0,0.27,0.0,0.0,0.0,0.5,0.48,5.04,0.0337,557000000000.0,0.0275,2.68,0.0138,0.95,17595.43,1090000.0,490.9,165.57,539.82,0.0413,1.03,20317.76,7360000.0,660.22,374.7,201.56,16.82,0.0645,747.77,0.4,730000000000.0,7.23,5.81,0.31,0.24,0.27,1.85,0.95,0.76,2.2,0.15,0.0457,0.42,730000000000.0,7.18,5.69,0.26,0.22,0.25,0.118,0.5,0.32,0.28,0.15,0.0,0.0753,0.0515,0.0138,0.0,0.0,0.0389,0.36,0.0846,0.1,0.38,0.0413,0.0413,0.0,0.49,0.11,0.17,0.28,0.0687,0.0275,0.17,0.0979,0.21,0.0337,0.094,0.1,0.11,0.27,0.14,0.19,0.063,0.0,0.0364,0.35,0.074,0.1,0.38,0.0413,0.0364,0.0,0.49,0.1,0.17,0.25,0.0687,0.0239,0.17,0.0889,0.21,0.0337,0.091,0.0812,0.11,0.26,0.13,0.18,0.063,0.0,0.0,0.0714,0.0239,0.0138,0.14,0.0,0.0,0.0,0.13,0.0,0.0645,0.0835,0.0,0.0,0.0337,0.0308,0.0753,0.0,0.0,0.0195,0.0,0.0727,0.092,0.0308,0.0138,0.19,0.25,0.2,0.1
min,44800000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-1645.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47100000.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,15.0,37.0,3.0,0.0,0.0,0.0,0.0,297.0,10200.0,48.75,0.0,0.0,2.0,0.0,40.0,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.02,0.02,0.0
50%,47400000.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1190.0,0.0,0.0,0.0,0.0,221.0,1400.0,26.0,0.0,0.0,0.0,0.0,1790.0,128000.0,286.0,1.0,1.0,3.0,0.0,369.0,1.0,0.0,5.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.09,0.08,0.0
75%,47600000.0,0.0,0.0,1.0,0.0,1.0,1.0,4.0,0.0,12400.0,0.0,0.0,0.0,0.0,2558.0,36800.0,223.0,4.0,4.0,0.0,0.0,5868.0,826000.0,790.0,10.0,7.0,6.0,0.0,1068.0,1.0,644.0,8.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,630.0,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.33,0.33,0.0
max,48300000.0,1.0,0.0,1.0,0.0,1.0,1.0,246.0,1.0,35300000000000.0,1.0,34.0,1.0,8.0,282581.0,31600000.0,3406.0,4412.0,2919.0,1.0,6.0,282581.0,185000000.0,3406.0,6307.0,2919.0,526.0,2.0,3868.0,4.0,41200000000000.0,252.0,241.0,1.0,1.0,1.0,56.0,52.0,20.0,56.0,5.0,2.0,4.0,41200000000000.0,249.0,238.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.89,0.89,0.89,0.44


In [11]:
attributes.groupby('changeset_harmful').size()

changeset_harmful
0    4851
1     417
dtype: int64

In [12]:
attributes.corr(method='pearson').head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,aerialway_modified,aeroway_modified,amenity_modified,barrier_modified,boundary_modified,building_modified,craft_modified,emergency_modified,geological_modified,highway_modified,historic_modified,landuse_modified,leisure_modified,man_made_modified,military_modified,natural_modified,office_modified,place_modified,power_modified,public_transport_modified,railway_modified,route_modified,shop_modified,sport_modified,tourism_modified,waterway_modified,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
changeset_id,1.0,-0.46,,,,-0.183,0.16,0.108,0.0143,-0.00209,0.00772,0.00286,-0.00951,0.048,-0.0233,0.021,0.00269,-0.0187,-0.752,0.0189,0.0509,0.0288,0.0317,0.0566,-0.00344,-0.263,-0.0176,0.0325,-0.00989,0.0187,-0.00285,0.051,0.00024,-0.0108,0.0457,0.02,0.0231,0.0203,0.0353,0.0405,0.0211,0.0146,-0.0086,-0.00306,0.0492,0.0172,0.00651,0.0531,0.04,0.0111,-0.18,-0.0165,0.0487,0.0124,,-0.0231,0.00292,-0.0108,,,0.00506,-0.00298,0.00591,-0.0088,0.05,-0.0022,0.00925,,-0.14,0.00282,0.0458,0.0962,-0.0117,0.00324,-0.000945,0.00391,0.0865,0.000222,-0.0125,-0.00286,-0.0191,0.03,0.00363,0.000607,0.000229,,-0.000353,-0.00614,0.00747,-0.00982,0.04,-0.0022,0.00931,,-0.13,0.000541,0.0541,0.0572,-0.0136,0.00626,0.0138,0.0158,0.0831,0.000222,-0.0188,-0.00834,-0.0192,0.0354,-0.00376,0.00332,-0.00148,,,-0.0138,0.0137,-0.00262,0.0226,,,,0.00176,,0.0407,0.0609,,,0.00168,-0.0138,0.0499,,,-0.00202,,0.00697,-0.0105,0.0114,0.00919,-0.0606,-0.0539,-0.0624,-0.0101
changeset_harmful,-0.46,1.0,,,,0.249,-0.22,-0.0335,0.0109,-0.00674,0.0174,-0.0616,-0.00404,-0.0582,-0.0522,-0.0587,-0.133,-0.0361,0.653,0.0049,-0.0439,-0.0265,-0.0242,-0.099,-0.00377,0.232,-0.00477,0.059,-0.0156,0.0338,0.0335,-0.0328,0.0464,-0.0502,-0.0322,-0.04,-0.0176,0.0262,0.0289,0.00652,0.105,0.00443,-0.0254,0.0336,-0.0254,0.0299,-0.0381,-0.0199,-0.03,0.0662,0.252,-0.0968,-0.0271,-0.0321,,-0.0222,-0.0151,-0.00404,,,0.00662,-0.0679,-0.00837,-0.00138,-0.02,-0.0121,0.0049,,0.05,-0.0258,0.0757,0.116,-0.0202,-0.00808,0.0505,-0.00745,-0.00209,-0.0099,-0.00539,-0.0232,-0.034,-0.0486,-0.00733,-0.0226,0.00377,,-0.0107,-0.0622,-0.00281,-0.00138,-0.03,-0.0121,0.00861,,0.05,-0.0229,0.0645,0.0516,-0.0202,-0.007,0.0325,-0.0105,0.00283,-0.0099,-0.00373,-0.0153,-0.0338,-0.0463,-0.0186,-0.0177,0.00377,,,-0.0112,-0.007,-0.00404,-0.00472,,,,-0.014,,0.0246,0.0343,,,0.0109,-0.00904,0.0245,,,-0.00571,,-0.0214,-0.0272,0.0138,-0.00404,0.0607,0.0462,0.0574,-0.000659
changeset_features_created,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_features_modified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_features_deleted,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_has_imagery_used,-0.183,0.249,,,,1.0,-0.71,0.109,0.0131,0.00279,0.0153,-0.0161,-0.0131,0.128,-0.171,-0.131,-0.236,-0.0997,0.199,0.0068,0.0265,-0.0203,-0.00874,-0.136,-0.00395,0.0717,-0.0398,0.0267,-0.0342,0.14,0.019,0.0258,0.0152,0.0907,0.0262,0.03,0.094,0.166,0.0599,0.172,0.0586,0.00615,0.064,0.0193,0.00805,0.00368,0.0862,0.021,0.04,0.0404,0.99,-0.348,-0.296,-0.148,,0.0647,-0.0489,-0.0131,,,0.0216,0.0417,0.036,-0.0346,0.12,-0.0024,-0.0208,,-0.27,0.0216,0.102,0.237,0.0175,0.0291,0.0604,0.0538,0.0172,-0.00947,-0.00506,-0.000143,0.0394,-0.0136,0.12,0.0294,0.0305,,0.0176,0.0276,0.0168,-0.0308,0.11,-0.0024,-0.0137,,-0.27,0.00919,0.102,0.196,0.00644,0.0252,0.0643,0.0304,0.0262,-0.00947,-0.0118,0.0348,0.0412,-0.0153,0.109,0.036,0.0245,,,0.0278,0.00924,-0.0131,0.122,,,,0.0313,,0.027,0.075,,,0.0243,0.0202,0.0444,,,0.00104,,0.0457,0.0979,0.00781,0.0145,0.000255,0.0914,0.0562,0.116
changeset_has_source,0.164,-0.217,,,,-0.713,1.0,-0.0556,-0.0144,-0.0133,-0.00698,0.0586,0.0179,-0.133,0.212,0.158,0.184,0.0462,-0.147,-0.0129,-0.0177,0.0447,0.0344,0.154,-0.0168,-0.0545,0.0373,-0.0332,0.0401,-0.187,-0.013,-0.0495,-0.0255,-0.181,-0.0563,-0.1,-0.124,-0.192,-0.0538,-0.207,-0.0435,-0.0202,-0.108,-0.0134,-0.0234,-0.011,-0.145,-0.0604,-0.07,-0.0293,-0.723,0.386,-0.24,-0.121,,0.0669,-0.0398,-0.0106,,,-0.03,-0.169,-0.0285,0.00496,-0.21,-0.0129,0.00616,,0.47,-0.045,-0.0808,-0.196,-0.0475,-0.0212,-0.0494,-0.0601,-0.0507,0.0206,-0.0355,0.00972,-0.0482,-0.0631,-0.0998,-0.116,-0.03,,-0.0281,-0.16,-0.0202,0.00103,-0.2,-0.0129,-0.00654,,0.48,-0.0394,-0.0793,-0.161,-0.0418,-0.0184,-0.0499,-0.0426,-0.0584,0.0206,-0.0319,-0.0388,-0.0508,-0.0519,-0.0926,-0.113,-0.0238,,,-0.0278,-0.0184,0.0179,-0.0985,,,,-0.015,,-0.0134,-0.0601,,,-0.026,-0.0237,-0.027,,,0.00514,,-0.0455,-0.0715,-0.011,-0.0106,-0.0242,-0.151,-0.104,-0.169
changeset_comment_number_of_words,0.108,-0.0335,,,,0.109,-0.06,1.0,0.0667,0.0079,0.0645,0.28,-0.00604,0.0126,0.104,0.0218,-0.0224,0.0209,-0.119,-0.000808,-0.00526,-0.00639,-0.0156,-0.0351,0.00377,-0.0412,-0.0129,-0.00711,-0.0172,-0.00168,-0.0108,0.00767,-0.0163,0.0109,-0.00569,0.01,0.0403,0.0645,0.0492,0.0789,0.0217,-0.0065,0.00406,-0.0108,0.00253,-0.0106,0.016,-0.00962,0.01,7.88e-05,0.111,0.107,-0.0769,-0.0205,,-0.0177,-0.0146,0.0295,,,-0.0103,-0.00656,0.0183,0.0256,0.02,-0.0117,0.00193,,-0.05,-0.0247,0.0347,0.026,0.00425,-0.00525,-0.0115,0.00666,0.0258,0.0176,0.0145,-0.00501,0.0119,0.00377,0.000421,-0.0212,0.0231,,-0.00772,-0.00224,0.0203,0.0207,0.02,-0.0117,0.013,,-0.06,-0.0274,0.0401,0.0152,0.0179,-0.00573,0.001,0.0157,0.0206,0.0176,0.0148,0.0179,0.0124,0.00285,0.0054,-0.0235,0.0189,,,0.026,0.0353,-0.00604,0.00829,,,,0.0192,,0.0218,0.00958,,,0.00418,0.0183,0.0159,,,-0.00854,,0.0575,-0.0174,-0.00984,-0.000573,-0.0182,-0.0172,-0.0188,-0.00423
changeset_comment_naughty_words_count,0.0143,0.0109,,,,0.0131,-0.01,0.0667,1.0,-0.000777,-0.000931,-4.78e-05,-0.000465,-0.00118,-0.00912,-0.00401,-0.00866,-0.00467,-0.00672,-0.0014,-0.00878,-0.00825,-0.00272,-0.00837,-0.00229,-0.00236,0.0112,-0.00199,0.0111,-0.0129,-0.000714,0.0768,0.0811,0.00674,-0.0085,0.01,0.0214,0.00535,-0.000886,0.02,0.114,0.122,-0.00756,-0.000704,0.0717,0.084,-0.0094,-0.00782,0.01,-0.00202,0.0134,0.00499,0.00927,-0.00529,,-0.00256,-0.00174,-0.000465,,,-0.00132,0.000989,-0.00288,-0.0034,-0.02,-0.0014,-0.0014,,-0.02,-0.00372,-0.00596,0.0102,-0.00233,-0.000931,0.0268,-0.00334,0.0191,-0.00114,-0.0032,-0.00347,-0.00392,0.0112,-0.00486,-0.00679,-0.00214,,-0.00123,0.00249,-0.00251,-0.0034,-0.02,-0.0014,-0.00123,,-0.02,-0.00344,-0.00576,0.0136,-0.00233,-0.000806,0.0264,-0.00303,0.0195,-0.00114,-0.0031,-0.00276,-0.00389,0.0124,-0.00458,-0.00644,-0.00214,,,-0.00242,-0.000806,-0.000465,-0.0047,,,,-0.00462,,-0.00219,-0.00284,,,-0.00114,-0.00104,-0.00256,,,-0.000658,,-0.00247,-0.00313,-0.00104,-0.000465,-9.24e-05,-0.0083,-0.00514,-0.0105
changeset_bbox_area,-0.00209,-0.00674,,,,0.00279,-0.01,0.0079,-0.000777,1.0,-0.000634,-0.000645,-0.000317,0.019,-0.00279,4.82e-05,0.0153,-0.000589,-0.00433,-0.000952,0.00357,-0.00547,-0.00386,-0.00657,-0.00252,-0.00218,0.103,-0.00135,-0.0191,-0.0087,0.551,0.415,0.505,-0.00406,0.0537,0.05,0.00764,-0.00688,0.00396,0.00482,-0.00162,-0.000861,-0.0082,0.55,0.416,0.509,0.00217,0.0575,0.05,-0.00137,0.00302,0.0247,-0.00718,-0.0036,,-0.00174,-0.00119,-0.000317,,,-0.000897,-0.0099,-0.00196,0.129,-0.01,-0.000952,0.156,,-0.02,-0.00253,-0.00405,-0.00678,-0.00159,-0.00056,-0.00286,-0.00228,0.00953,-0.000774,-0.00218,-0.00236,-0.00163,-0.00669,-0.00331,-0.00463,-0.00145,,-0.000839,-0.00933,-0.00171,0.119,-0.01,-0.000952,0.177,,-0.02,-0.00234,-0.00392,-0.00604,-0.00159,-0.000463,-0.00292,-0.00206,0.00846,-0.000774,-0.00211,-0.00188,-0.00161,-0.00641,-0.00312,-0.00439,-0.00146,,,-0.00165,-0.000549,-0.000176,-0.0032,,,,-0.00315,,-0.00149,-0.00175,,,-0.000769,-0.000709,-0.00174,,,-0.000449,,-0.00168,-0.00214,-0.000709,-0.000317,0.0753,0.0516,0.0685,-0.00715


In [13]:
skew = attributes.skew()
skew.sort(ascending=False)
skew.head(10)

Merkaartor                       72.58
user_name_naughty_words_count    72.58
boundary_modified                72.58
waterway_modified                72.58
changeset_bbox_area              55.83
feature_area_old                 51.92
feature_area                     51.88
railway_modified                 51.31
military_old                     41.88
barrier_modified                 41.88
dtype: float64

#### 2b. Data visualizations

In [14]:
# Histograms.
# attributes.hist(figsize=(20, 20));

In [15]:
# Density plot.
# attributes.plot.density(figsize=(20, 50), subplots=True, sharex=False);

In [16]:
# attributes.plot.box(layout=(5, 4), figsize=(20, 25), subplots=True, sharex=False, sharey=False);

In [17]:
# sns.heatmap(attributes.corr(method='pearson'));

## 3. Prepare data

#### 3a. Data cleaning

#### 3b. Feature selection

In [18]:
non_training_attributes = ['changeset_id', 'changeset_harmful']
X = attributes.drop(non_training_attributes, axis=1)
y = attributes['changeset_harmful']

#### 3c. Data transforms

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
Xscaled = scaler.transform(X)

In [20]:
from sklearn.cross_validation import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y, random_state=42, train_size=0.66)



In [21]:
# Estimate importance of all features.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)

importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
16,user_changesets_with_discussions_count,0.28
23,old_user_changesets_with_discussions_count,0.05
48,iD,0.03
12,user_changesets_count,0.03
136,tag_values_popularity_max,0.03
5,changeset_comment_number_of_words,0.02
14,user_mapping_days_count,0.02
26,feature_days_since_last_edit,0.02
7,changeset_bbox_area,0.02
28,feature_area,0.02


## 4. Evaluate algorithms

#### 4a. Split-out validation dataset
- We have a seperate validation dataset. Yay!

#### 4b. Spot Check Algorithms
- Running algorithms in a loop below.

#### 4c. Compare Algorithms

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

results = []
names = []
for (name, model) in models:
    kfold = KFold(n_splits=10, random_state=7)
    result = cross_val_score(model, Xscaled, y, cv=kfold, scoring='roc_auc')
    results.append(result)
    names.append(name)
    
    print('{}: {} ({})'.format(name, round(result.mean(), 2), round(result.std(), 2)))

LR: 0.79 (0.08)
CART: 0.58 (0.05)
KNN: 0.62 (0.08)
SVM: 0.73 (0.08)
RFC: 0.74 (0.08)
GBC: 0.87 (0.04)


In [23]:
# NOTE: Temporarily turning off the graph.
# fig, ax = plt.subplots(1, 1)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# ax.set_ylabel('roc_auc')
# ax.set_ylim((0, 1));

## 5. Improve Accuracy

#### 5a. Algorithm Tuning

In [24]:
harmful_count = attributes[attributes['changeset_harmful'] == True].shape[0]
not_harmful_count = attributes[attributes['changeset_harmful'] == False].shape[0]

# Making this a float to prevent truncation due to integer division.
count = 1.0 * (harmful_count + not_harmful_count)

# Calculate sample weights based on number of occourances.
sample_weight = [10 if l else 1 for l in ytrain]

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1,  0.5],
    'max_features': ['log2'],
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, fit_params={'sample_weight': sample_weight})
grid.fit(Xtrain, ytrain)

print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9559838895281934
Best params: {'learning_rate': 0.1, 'max_features': 'log2', 'random_state': 5, 'max_depth': 7, 'n_estimators': 300}


In [26]:
model = grid.best_estimator_
ymodel = model.predict(Xtest)

#### 5b. Test options and evaluation metric

In [27]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(ytest, ymodel)

matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,1641,14
Labelled harmful,56,81


In [28]:
total = matrix.sum().sum()
matrix / total

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,0.92,0.00781
Labelled harmful,0.03,0.0452


In [29]:
from sklearn.metrics import classification_report

report = classification_report(ytest, ymodel)
print(report)

             precision    recall  f1-score   support

          0       0.97      0.99      0.98      1655
          1       0.85      0.59      0.70       137

avg / total       0.96      0.96      0.96      1792



In [30]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, Xscaled, y, cv=kfold, scoring='roc_auc')

# An area of 0.5 represents a model that is as good as random.
# An area of 1.0 represents a model that made all predictions perfectly.
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.85 (0.03)


#### 5c. Ensembles
- Todo

## 6. Finalize model

#### 6a. Predictions on validation dataset

In [31]:
vattributes = pd.read_csv('../downloads/feature-classifier-2017-05-25/validation/attributes.csv')
vattributes.head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,aerialway_modified,aeroway_modified,amenity_modified,barrier_modified,boundary_modified,building_modified,craft_modified,emergency_modified,geological_modified,highway_modified,historic_modified,landuse_modified,leisure_modified,man_made_modified,military_modified,natural_modified,office_modified,place_modified,power_modified,public_transport_modified,railway_modified,route_modified,shop_modified,sport_modified,tourism_modified,waterway_modified,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
0,48917139,0,0,1,0,1,0,1,0,16,0,0,0,3,42,297,8,0,0,0,3,42,297,8,0,0,5,0,0,1,0,5,2,0,0,0,0,0,0,0,0,0,1,0,5,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0221,0.0221,0.0221,0.0
1,48901413,0,0,1,0,1,0,5,0,0,0,0,0,0,4,4,1,0,0,0,0,4,4,1,0,0,2,0,0,1,0,3,1,0,0,0,1,0,0,1,0,0,0,0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.166,0.166,0.166,0.0
2,48900803,0,0,1,0,0,1,2,0,0,0,0,0,0,507,83653,109,2,6,0,0,507,83653,109,2,6,5,0,419,1,0,6,4,0,0,0,3,0,0,3,0,0,1,0,3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0221,0.0221,0.0221,0.0
3,48899023,0,0,1,0,1,0,5,0,4974,0,0,0,4,3,12,1,0,0,0,0,45080,3570101,1248,8,11,4,0,494,1,4397,4,1,0,0,0,2,0,1,3,0,0,1,4397,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.166,0.166,0.166,0.0
4,48891092,0,0,1,0,0,1,4,0,25183777914,0,4,0,1,1089,1226,31,0,2,0,0,115,11681,50,6,0,26,0,81,1,22095166073,61,57,0,1,1,46,0,0,46,0,0,1,22095166073,15,11,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.29,0.29,0.29,0.0
5,48891008,1,0,1,0,1,0,1,0,0,0,0,0,0,40,492,3,0,2,0,0,40,492,3,0,2,3,0,0,2,0,5,1,0,0,0,0,0,1,1,0,0,2,0,6,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0714,0.294,0.183,0.11
6,48891000,0,0,1,0,0,1,2,0,0,0,3,0,5,93891,3402703,790,62,40,0,0,1,4,1,0,0,20,0,0,1,0,22,11,0,1,1,1,0,0,1,0,0,0,0,21,11,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0043,0.0043,0.0043,0.0
7,48890991,1,0,1,0,1,0,1,0,0,0,0,0,0,40,492,3,0,2,0,0,40,492,3,0,2,3,0,0,2,0,4,1,0,0,0,0,0,1,1,0,0,2,0,5,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.214,0.294,0.254,0.04
8,48890966,0,0,1,0,1,0,3,0,405037,0,0,0,0,5,26,1,0,0,0,0,2351,80975,500,0,0,7,0,3254,1,115628,1,0,0,0,0,0,1,1,2,0,0,1,115628,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.166,0.166,0.166,0.0
9,48884156,0,0,1,0,0,0,5,0,0,0,3,0,0,10247,443817,1049,1059,39,0,0,120,847,37,0,0,14,0,7,1,0,16,12,0,1,1,1,0,1,2,0,0,1,0,16,12,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0043,0.0043,0.0043,0.0


In [32]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(vattributes.shape))
vattributes = vattributes.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(vattributes.shape))

Shape before dropping duplicates: (537, 141)
Shape after dropping duplicates: (537, 141)


In [33]:
vattributes.groupby('changeset_harmful').size()

changeset_harmful
0    449
1     88
dtype: int64

In [34]:
# TODO: Replace with a pipeline
vX = vattributes.drop(non_training_attributes, axis=1)
vXscaled = scaler.transform(vX)

vy = vattributes['changeset_harmful']

In [35]:
vymodel = model.predict(vXscaled)

In [36]:
vattributes['prediction'] = model.predict(vXscaled)
vattributes.to_csv('../downloads/feature-classifier-2017-05-25/validation/review.csv', index=False)

In [37]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, vymodel)
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,428,21
Labelled harmful,66,22


In [38]:
from sklearn.metrics import classification_report

report = classification_report(vy, vymodel)
print(report)

             precision    recall  f1-score   support

          0       0.87      0.95      0.91       449
          1       0.51      0.25      0.34        88

avg / total       0.81      0.84      0.81       537



In [39]:
scores = cross_val_score(model, vXscaled, vy, cv=kfold, scoring='roc_auc')
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.79 (0.11)


#### 6b. Create standalone model on entire training dataset
- Todo

#### 6c. Predictions on testing dataset

In [40]:
tattributes = pd.read_csv('../downloads/feature-classifier-2017-05-25/testing/attributes.csv')
tattributes.head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,aerialway_modified,aeroway_modified,amenity_modified,barrier_modified,boundary_modified,building_modified,craft_modified,emergency_modified,geological_modified,highway_modified,historic_modified,landuse_modified,leisure_modified,man_made_modified,military_modified,natural_modified,office_modified,place_modified,power_modified,public_transport_modified,railway_modified,route_modified,shop_modified,sport_modified,tourism_modified,waterway_modified,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
0,48320582,0,0,1,0,1,0,5,0,421,0,0,0,2,2503,131397,239,0,1,0,0,2245,374811,512,0,0,5,0,412,2,628,10,1,0,0,0,0,2,0,2,0,0,2,628,10,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0032,0.81,0.407,0.4
1,48320566,0,0,1,0,1,0,2,0,0,0,0,0,0,1785,5983,272,0,2,0,0,1785,5983,272,0,2,4,0,13,1,0,3,1,0,0,0,0,1,0,1,0,0,1,0,3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0642,0.0642,0.0642,0.0
2,48320564,0,0,1,0,0,0,4,0,53072,0,0,0,0,2465,44250,259,0,1,0,0,2465,44250,259,0,1,2,0,0,1,0,9,0,0,0,0,1,0,0,1,0,0,1,0,8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.062,0.062,0.062,0.0
3,48320557,0,0,1,0,1,0,1,0,0,0,0,0,4,3161,14475,232,0,0,0,0,3359,111051,1019,13,1,2,0,513,1,0,4,1,0,0,0,0,1,0,1,0,0,1,0,4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0642,0.0642,0.0642,0.0
4,48320539,0,0,1,0,1,0,1,0,29773,0,0,0,0,525,17829,27,0,5,0,0,12044,1242695,1534,1988,56,2,0,802,2,51529,4,1,0,0,0,0,1,0,1,0,0,2,51529,4,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0017,0.126,0.0638,0.06
5,48320533,0,0,1,0,1,0,1,0,0,0,0,0,4,3161,14475,232,0,0,0,0,3359,111051,1019,13,1,3,0,698,1,0,3,1,0,0,0,0,1,0,1,0,0,1,0,3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0018,0.0018,0.0018,0.0
6,48320532,0,0,1,0,0,0,3,0,1169,0,0,0,0,6471,2528927,824,6,3,0,0,6471,2528927,824,6,3,6,0,985,2,531,14,1,1,0,0,0,1,0,1,0,0,2,531,14,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0642,0.81,0.437,0.37
7,48320524,0,0,1,0,0,1,3,0,910,0,0,0,0,6471,2528927,824,6,3,0,0,6471,2528927,824,6,3,4,0,1228,2,336,11,1,1,0,0,2,0,0,2,0,0,2,336,9,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0642,0.81,0.437,0.37
8,48320519,0,0,1,0,1,0,8,0,4327288,0,1,0,0,1633,23457,145,3,3,0,0,1633,23457,145,3,3,2,0,0,1,0,2,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0003,0.0003,0.0003,0.0
9,48320502,0,0,1,0,1,0,1,0,849,0,0,0,0,173,7183,32,0,1,0,0,173,7183,32,0,1,2,0,5,2,608,8,1,0,0,0,3,1,0,4,0,0,1,608,5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0783,0.81,0.444,0.37


In [41]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(tattributes.shape))
tattributes = tattributes.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(tattributes.shape))

Shape before dropping duplicates: (3704, 141)
Shape after dropping duplicates: (3704, 141)


In [42]:
# TODO: Replace with a pipeline
tX = tattributes.drop(non_training_attributes, axis=1)
tXscaled = scaler.transform(tX)

ty = tattributes['changeset_harmful']

In [43]:
tattributes['prediction'] = model.predict(tXscaled)

In [44]:
print('Predicted good: {}'.format(tattributes[tattributes['prediction'] == False].shape[0]))
print('Predicted harmful: {}'.format(tattributes[tattributes['prediction'] == True].shape[0]))

Predicted good: 3676
Predicted harmful: 28


In [45]:
tattributes.to_csv('../downloads/feature-classifier-2017-05-25/testing/review.csv', index=False)

#### 6d. Save model for later use

In [46]:
from sklearn.externals import joblib

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path)

['../gabbar/trained/model.pkl']