# Feature classifier - Gabbar

## Step 1. Prepare problem

#### 1a. Load libraries

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

In [4]:
path = '../downloads/old-feature-classifier/'

#### 1b. Load dataset

In [5]:
# Loading the training dataset.
training = pd.read_csv(path + 'training.csv')

In [6]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(training.shape))
training = training.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(training.shape))

Shape before dropping duplicates: (6035, 115)
Shape after dropping duplicates: (5302, 115)


In [7]:
# Creating a smaller sample to speed up workflow.
# training = training[:1000]

In [8]:
training.tail()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
6030,48422721,0,0,1,0,1,0,5,0,23,0,10,0,0,17,729,1,0,0,0,0,13,486,1,0,0,2,0,0,1,20,1,0,0,0,0,1,0,1,2,0,0,0,20,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,81.02,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.81,0.81,0.81,0.0
6031,48407981,0,0,1,0,1,0,5,0,24,0,10,0,0,37,2478,1,0,0,0,0,37,2478,1,0,0,2,0,0,1,25,1,0,0,0,0,1,0,1,2,0,0,0,25,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,81.02,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.81,0.81,0.81,0.0
6032,48344660,0,0,1,0,1,0,9,0,17,0,11,0,0,79,1024,1,0,0,0,0,79,1024,1,0,0,2,0,0,1,13,1,0,0,0,0,1,0,1,2,0,0,0,13,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,81.02,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.81,0.81,0.81,0.0
6033,48422923,0,0,1,0,1,0,3,0,20,0,9,0,0,562,6273,1,0,0,0,0,562,6273,1,0,0,2,0,0,1,17,1,0,0,0,0,1,0,1,2,0,0,0,17,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,81.02,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.81,0.81,0.81,0.0
6034,48409432,0,0,1,0,1,0,5,0,102,0,10,0,0,3,81,1,0,0,0,0,3,81,1,0,0,2,0,0,1,88,1,0,0,0,0,1,0,1,2,0,0,0,88,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,81.02,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.81,0.81,0.81,0.0


## 2. Summarize data

#### 2a. Descriptive statistics

In [9]:
print('Shape: {}'.format(training.shape))

Shape: (5302, 115)


In [10]:
training.dtypes.head()

changeset_id                   int64
changeset_harmful              int64
changeset_features_created     int64
changeset_features_modified    int64
changeset_features_deleted     int64
dtype: object

In [11]:
training.describe()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
count,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5300.0,5300.0,5302.0,5300.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5300.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5300.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0,5302.0
mean,47300000.0,0.08,0.0,1.0,0.0,0.52,0.31,4.25,0.00132,30600000000.0,0.000754,0.85,0.000377,0.39,4514.74,272000.0,261.86,25.93,82.22,0.00151,0.44,6740.79,1360000.0,543.94,59.54,20.29,6.81,0.00453,590.11,1.17,26800000000.0,7.19,2.12,0.11,0.09,0.11,1.5,0.37,0.22,2.09,0.01,0.00245,1.1,26500000000.0,5.92,1.74,0.07,0.08,0.1,0.00679,0.51,0.14,0.1,0.02,0.0,0.00717,0.00622,0.000377,0.0,0.01,0.02,0.65,0.1,0.58,9.46,0.00675,0.07,0.0,4.9,0.11,0.24,1.54,0.02,0.00115,0.72,0.1,0.39,0.02,0.27,0.15,0.21,0.32,0.12,0.37,0.11,0.01,0.02,0.6,0.09,0.58,10.1,0.00638,0.07,0.0,4.89,0.1,0.21,1.07,0.02,0.000615,0.74,0.08,0.4,0.01,0.27,0.08,0.21,0.31,0.09,0.34,0.09,0.13,0.2,0.16,0.03
std,784000.0,0.28,0.0,0.0,0.0,0.5,0.46,5.28,0.0363,1300000000000.0,0.0275,2.61,0.0194,0.98,14737.58,1190000.0,484.44,152.48,466.29,0.0388,1.05,21121.84,7420000.0,656.55,409.35,189.71,18.72,0.0725,734.71,0.42,1070000000000.0,9.54,8.2,0.32,0.29,0.32,3.41,0.66,0.93,3.53,0.18,0.0531,0.44,1070000000000.0,9.01,7.49,0.26,0.27,0.3,0.116,0.5,0.34,0.29,0.15,0.0,0.0844,0.0787,0.0194,0.0,0.55,0.52,2.37,1.62,7.14,25.83,0.174,2.22,0.0,10.72,1.12,1.84,5.27,0.4,0.05,4.36,1.35,2.57,0.96,3.48,1.99,2.1,1.63,1.5,2.09,2.67,0.55,0.53,2.26,1.57,7.14,26.62,0.172,2.21,0.0,10.73,1.07,1.82,4.51,0.4,0.0317,4.32,1.18,2.63,0.89,3.48,1.69,2.09,1.62,1.33,1.95,2.47,0.19,0.26,0.2,0.1
min,44800000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-2330.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,46900000.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,48.0,3.0,0.0,0.0,0.0,0.0,242.0,6940.0,39.0,0.0,0.0,2.0,0.0,24.25,1.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.02,0.02,0.0
50%,47400000.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,984.0,0.0,0.0,0.0,0.0,225.0,1600.0,27.0,0.0,0.0,0.0,0.0,1747.0,131000.0,275.0,1.0,1.0,4.0,0.0,294.0,1.0,0.0,5.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.08,0.06,0.0
75%,47800000.0,0.0,0.0,1.0,0.0,1.0,1.0,5.0,0.0,12800.0,0.0,0.0,0.0,0.0,2627.0,52800.0,229.0,4.0,4.0,0.0,0.0,5912.0,827000.0,793.5,12.0,7.0,6.0,0.0,978.0,1.0,964.0,9.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,938.0,7.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.33,0.29,0.0
max,49100000.0,1.0,0.0,1.0,0.0,1.0,1.0,246.0,1.0,85300000000000.0,1.0,34.0,1.0,9.0,283525.0,40700000.0,3420.0,4437.0,2919.0,1.0,9.0,283525.0,185000000.0,3420.0,6362.0,2919.0,526.0,2.0,3868.0,4.0,56800000000000.0,252.0,241.0,1.0,1.0,1.0,100.0,9.0,21.0,100.0,5.0,2.0,4.0,56800000000000.0,249.0,238.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,28.56,12.15,21.22,31.9,89.37,81.02,7.81,80.24,0.0,32.55,19.2,20.64,29.39,12.25,2.82,30.12,21.52,27.12,64.71,50.83,46.21,25.22,12.99,25.51,28.35,73.2,28.56,12.15,21.22,31.9,89.37,81.02,7.81,80.24,0.0,32.55,19.2,20.64,29.39,12.25,1.63,30.12,21.52,27.12,64.71,50.83,46.21,25.22,12.99,25.51,28.35,73.2,0.89,0.89,0.89,0.45


In [12]:
training.groupby('changeset_harmful').size()

changeset_harmful
0    4861
1     441
dtype: int64

In [13]:
training.corr(method='pearson').head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
changeset_id,1.0,-0.17,,,,-0.05,0.1,0.07,0.0205,-0.00988,0.00485,0.04,-0.0158,0.065,-0.04,0.03,-0.02,-0.02,-0.52,0.00805,0.04,0.01,0.04,0.02,0.0195,-0.2,0.05,0.03,-0.0407,0.00857,-0.01,0.1,0.102,-0.07,0.17,0.16,0.07,0.0295,0.07,0.09,0.05,0.01,-0.02,-0.01,0.09,0.0805,-0.02,0.17,0.15,0.01,-0.05,-0.00704,-0.00383,-0.00175,,-0.03,-0.04,0.0105,,0.0296,-0.02,-0.07,-0.0139,-0.00296,-0.05,-0.01,0.00794,,-0.15,-0.00812,0.08,0.16,-0.03,0.00177,0.03,-0.00845,0.1,0.0235,-0.0569,-0.00351,-0.04,-0.03,0.02,-0.04,0.01,0.0296,-0.0159,-0.06,-0.0115,0.00339,-0.06,-0.00861,0.00799,,-0.15,-0.00418,0.05,0.08,-0.03,0.00787,0.05,0.015,0.1,0.0165,-0.0603,-0.00891,-0.04,-0.04,0.02,-0.04,0.00841,-0.06,-0.08,-0.08,-0.04
changeset_harmful,-0.17,1.0,,,,0.22,-0.18,-0.02,0.00786,-0.00709,0.0166,-0.05,-0.00585,-0.00694,-0.06,-0.06,-0.14,-0.05,0.54,0.00589,-0.04,-0.02,-0.02,-0.08,-0.000221,0.21,-0.01,0.06,-0.00447,0.0373,0.02,-0.05,0.00209,-0.05,-0.03,-0.04,-0.03,0.00093,0.04,-0.02,0.14,0.04,-0.01,0.02,-0.04,0.00055,-0.04,-0.01,-0.02,0.08,0.23,-0.0947,-0.042,-0.0246,,-0.03,-0.02,-0.00585,,-0.00599,-0.01,-0.04,-0.00583,0.00174,-0.03,-0.01,-0.00736,,0.08,-0.0299,0.09,0.13,-0.01,-0.00691,0.06,-0.00167,0.03,-0.00607,-0.00163,-0.0065,-0.03,-0.05,0.05,-0.04,0.03,-0.00599,0.00209,-0.03,-0.00389,0.00182,-0.03,-0.0112,-0.00701,,0.08,-0.0279,0.08,0.05,-0.01,-0.00585,0.05,-0.0085,0.02,-0.00489,-0.00163,0.00494,-0.03,-0.04,0.04,-0.03,0.0117,0.05,0.03,0.04,-0.01
changeset_features_created,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_features_modified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_features_deleted,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
skew = training.skew()
skew.sort(ascending=False)
skew.head()

power_old                        71.91
power                            61.60
changeset_bbox_area              57.92
military_old                     51.47
user_name_naughty_words_count    51.47
dtype: float64

#### 2b. Data visualizations

In [15]:
# Histograms.
# training.hist(figsize=(20, 20));

In [16]:
# Density plot.
# training.plot.density(figsize=(20, 50), subplots=True, sharex=False);

In [17]:
# training.plot.box(layout=(5, 4), figsize=(20, 25), subplots=True, sharex=False, sharey=False);

In [18]:
# sns.heatmap(training.corr(method='pearson'));

## 3. Prepare data

#### 3a. Data cleaning

#### 3b. Feature selection

In [19]:
non_training_attributes = ['changeset_id', 'changeset_harmful']
X = training.drop(non_training_attributes, axis=1)
y = training['changeset_harmful']

#### 3c. Data transforms

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [21]:
# Estimate importance of all features.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X_scaled, y)

importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head()

Unnamed: 0,feature,importance
16,user_changesets_with_discussions_count,0.21
23,old_user_changesets_with_discussions_count,0.04
12,user_changesets_count,0.03
14,user_mapping_days_count,0.03
5,changeset_comment_number_of_words,0.03


## 4. Evaluate algorithms

#### 4a. Split-out validation dataset
- We have a seperate validation dataset. Yay!

#### 4b. Spot Check Algorithms
- Running algorithms in a loop below.

#### 4c. Compare Algorithms

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

results = []
names = []
for (name, model) in models:
    kfold = KFold(n_splits=2, random_state=7)
    result = cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')
    results.append(result)
    names.append(name)
    
    print('{}: {} ({})'.format(name, round(result.mean(), 2), round(result.std(), 2)))

LR: 0.83 (0.02)
CART: 0.73 (0.03)
KNN: 0.77 (0.04)
SVM: 0.84 (0.04)
RFC: 0.85 (0.03)
GBC: 0.91 (0.02)


In [23]:
# NOTE: Temporarily turning off the graph.
# fig, ax = plt.subplots(1, 1)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# ax.set_ylabel('roc_auc')
# ax.set_ylim((0, 1));

## 5. Improve Accuracy

#### 5a. Algorithm Tuning

In [24]:
harmful_count = training[training['changeset_harmful'] == True].shape[0]
not_harmful_count = training[training['changeset_harmful'] == False].shape[0]

# Making this a float to prevent truncation due to integer division.
count = 1.0 * (harmful_count + not_harmful_count)

# Calculate sample weights based on number of occourances.
sample_weight = [1 if l else 1 for l in y]

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1,  0.5],
    'max_features': ['log2'],
    'random_state': [5],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, fit_params={'sample_weight': sample_weight})
grid.fit(X_scaled, y)

print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9481327800829875
Best params: {'max_depth': 5, 'n_estimators': 500, 'max_features': 'log2', 'learning_rate': 0.05, 'random_state': 5}


In [26]:
model = grid.best_estimator_
y_model = model.predict(X_scaled)

In [27]:
training['prediction'] = model.predict(X_scaled)
training.to_csv(path + 'training-review.csv', index=False)

#### 5b. Test options and evaluation metric

In [28]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(y, y_model)

matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,4861,0
Labelled harmful,43,398


In [29]:
total = matrix.sum().sum()
matrix / total

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,0.917,0.0
Labelled harmful,0.00811,0.08


In [30]:
from sklearn.metrics import classification_report

report = classification_report(y, y_model)
print(report)

             precision    recall  f1-score   support

          0       0.99      1.00      1.00      4861
          1       1.00      0.90      0.95       441

avg / total       0.99      0.99      0.99      5302



In [31]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_scaled, y, cv=kfold, scoring='roc_auc')

# An area of 0.5 represents a model that is as good as random.
# An area of 1.0 represents a model that made all predictions perfectly.
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.9 (0.02)


#### 5c. Ensembles
- Todo

## 6. Finalize model

#### 6a. Predictions on validation dataset

In [32]:
validation = pd.read_csv(path + 'validation.csv')
validation.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
0,46458207,0.0,0,1,0,0,0,25,0,0,0,5,0,0,6164,240741,110,83,33,0,0,44935,1616972,2171,17,1,3,0,604,1,0.0,5,1,1,1,1,1,1,0,2,0,0,1,0.0,4,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0952,0.0952,0.0952,0.0
1,48166724,0.0,0,1,0,1,0,7,0,223014,0,3,0,0,1,1,1,0,0,0,3,35614,12926795,1562,46,187,3,0,639,1,322570.0,2,1,0,0,0,0,1,0,1,0,0,1,322570.0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.206,0.206,0.206,0.0
2,47500384,0.0,0,1,0,0,0,10,0,0,0,1,0,3,1759,244245,627,45,2,0,1,1667,1827,159,20,7,3,0,2,1,0.0,4,1,1,0,0,1,0,0,1,0,0,0,0.0,3,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0046,0.0046,0.0046,0.0
3,48646312,0.0,0,1,0,1,0,1,0,1019522,0,0,0,0,10,183,1,0,1,0,2,541,180693,314,1,0,22,0,1215,1,339127.0,1,0,0,0,0,1,0,1,2,0,0,1,339127.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166,0.166,0.166,0.0
4,47489162,0.0,0,1,0,0,1,3,0,685,0,0,0,0,571,16006,97,0,0,0,0,571,16006,97,0,0,2,0,1022,1,0.0,7,1,0,0,0,1,0,0,1,0,0,1,0.0,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0181,0.0181,0.0181,0.0


In [33]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(validation.shape))
validation = validation.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(validation.shape))

Shape before dropping duplicates: (2583, 115)
Shape after dropping duplicates: (2439, 115)


In [34]:
validation.groupby('changeset_harmful').size()

changeset_harmful
0.0    2230
1.0     209
dtype: int64

In [35]:
# TODO: Replace with a pipeline
vX = validation.drop(non_training_attributes, axis=1)
vX_scaled = scaler.transform(vX)

vy = validation['changeset_harmful']

In [36]:
vy_model = model.predict(vX_scaled)

In [37]:
validation['prediction'] = model.predict(vX_scaled)
validation.to_csv(path + 'validation-review.csv', index=False)

In [38]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, vy_model)
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,2219,11
Labelled harmful,64,145


In [39]:
from sklearn.metrics import classification_report

report = classification_report(vy, vy_model)
print(report)

             precision    recall  f1-score   support

        0.0       0.97      1.00      0.98      2230
        1.0       0.93      0.69      0.79       209

avg / total       0.97      0.97      0.97      2439



In [40]:
scores = cross_val_score(model, vX_scaled, vy, cv=kfold, scoring='roc_auc')
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.87 (0.0)


#### 6b. Create standalone model on entire training dataset
- Todo

#### 6c. Predictions on testing dataset

In [41]:
testing = pd.read_csv(path + 'testing.csv')
testing.head()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,changeset_comment_special_characters_count,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,feature_similar_tags_count,feature_name_naughty_words_count_old,feature_primary_tags_old,feature_area_old,feature_property_tags_old,feature_name_translations_count_old,feature_has_website_old,feature_has_wikidata_old,feature_has_wikipedia_old,feature_similar_tags_count_old,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway,aerialway_old,aeroway_old,amenity_old,barrier_old,boundary_old,building_old,craft_old,emergency_old,geological_old,highway_old,historic_old,landuse_old,leisure_old,man_made_old,military_old,natural_old,office_old,place_old,power_old,public_transport_old,railway_old,route_old,shop_old,sport_old,tourism_old,waterway_old,tag_values_popularity_min,tag_values_popularity_max,tag_values_popularity_mean,tag_values_popularity_stddev
0,48312494,,0,1,0,0,1,2,0,0,0,2,0,0,1396,27158,216,2,0,0,0,2398,1172651,1167,0,1,6,0,979,1,0.0,19,2,0,1,0,1,0,0,1,0,0,1,0.0,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.25,0.0
1,48308033,,0,1,0,1,0,9,0,0,0,2,0,0,1902,2774,134,9,5,0,0,1902,2774,134,9,5,3,0,63,1,0.0,12,1,1,0,0,3,1,0,4,0,0,1,0.0,9,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.0
2,48443295,,0,1,0,1,0,5,0,476,0,1,0,4,438,16082,27,0,4,0,0,6890,1179843,891,55,15,2,0,1012,1,0.0,4,1,0,0,0,2,1,0,3,0,0,1,0.0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.04,0.04,0.0
3,48396287,,0,1,0,1,0,6,0,626193,0,0,0,0,7979,489804,897,175,26,0,2,138,1191,13,0,1,9,1,-21,1,0.0,6,1,0,0,0,0,1,0,1,0,0,1,0.0,6,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.03,0.03,0.0
4,48369421,,0,1,0,0,1,3,0,43,0,1,0,0,3539,7322752,493,9,4,0,0,3539,7322752,493,9,4,2,0,0,2,22.0,4,0,0,0,0,1,1,0,2,0,0,1,22.0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,81.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.81,0.41,0.4


In [42]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(testing.shape))
testing = testing.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(testing.shape))

Shape before dropping duplicates: (19999, 115)
Shape after dropping duplicates: (19793, 115)


In [43]:
training_attributes = list(set(testing.columns) - set(non_training_attributes))
testing = testing.dropna(subset=training_attributes)
print('After samples rows with null: {}'.format(testing.shape))

After samples rows with null: (19788, 115)


In [44]:
# TODO: Replace with a pipeline
tX = testing.drop(non_training_attributes, axis=1)
tX_scaled = scaler.transform(tX)
# ty = testing['changeset_harmful']

In [45]:
testing['prediction'] = model.predict(tX_scaled)

In [46]:
tharmful_count = testing[testing['prediction'] == True].shape[0]
tnot_harmful_count = testing[testing['prediction'] == False].shape[0]

print('Predicted good: {}'.format(tnot_harmful_count))
print('Predicted harmful: {}'.format(tharmful_count))

print('Percentage harmful: {}%'.format(round(100.0 * tharmful_count / testing.shape[0], 2)))

Predicted good: 19748
Predicted harmful: 40
Percentage harmful: 0.2%


In [47]:
testing.to_csv(path + 'testing-review.csv', index=False)

#### 6d. Save model for later use

In [48]:
from sklearn.externals import joblib

scaler_path = '../gabbar/trained/scaler.pkl'
joblib.dump(scaler, scaler_path)

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path)

['../gabbar/trained/model.pkl']