# Feature classifier - Gabbar

## Step 1. Prepare problem

#### 1a. Load libraries

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [3]:
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', None)

#### 1b. Load dataset

In [4]:
# Loading the training dataset.
attributes = pd.read_csv('../downloads/feature-classifier/training/attributes.csv')

In [5]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(attributes.shape))
attributes = attributes.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(attributes.shape))

Shape before dropping duplicates: (5470, 75)
Shape after dropping duplicates: (5470, 75)


In [6]:
# Creating a smaller sample to speed up workflow.
# attributes = attributes[:500]

In [7]:
attributes.head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_old_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway
0,48269805,0,0,1,0,0,0,0,0,10504,0,0,0,1,1,1,0,0,0,0,2145,9123,213,15,7,7,0,95,2,9350,9350,4,2,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,48269170,1,0,1,0,1,0,1,0,0,0,0,0,2405,125105,400,0,0,0,0,2405,125105,400,0,0,3,0,934,2,0,0,10,7,0,0,0,1,6,1,8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,48267592,1,0,1,0,1,0,4,0,628386,0,0,0,11,48,2,0,0,0,0,4364,1115802,1783,16,30,3,0,743,1,853363,853363,1,0,0,0,0,1,0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,48266008,0,0,1,0,1,0,2,0,2217,0,0,0,20,163,2,0,1,0,0,662,45987,90,0,0,2,0,2147,2,754,754,4,1,0,0,0,4,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,48265965,0,0,1,0,1,0,2,0,0,0,0,0,100,1599,5,0,1,0,0,4393,332332,517,1145,28,37,0,39,1,0,0,12,8,0,1,1,0,3,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,48264238,0,0,1,0,1,0,5,0,0,0,0,1,43032,2458846,992,8,20,0,0,5,18,3,0,0,8,0,149,1,0,0,8,5,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,48260457,1,0,1,0,1,0,3,0,8602,0,0,0,2,15,1,0,0,0,0,2,15,1,0,0,2,0,0,1,16918,16918,6,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,48256321,0,0,1,0,1,0,12,0,4281722,0,0,0,1174,320767,118,2,4,0,0,1174,320767,118,2,4,2,0,0,1,0,0,2,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,48255884,1,0,1,0,1,0,4,0,72287,0,0,0,59,1021,13,0,0,0,0,59,1021,13,0,0,2,0,0,2,0,0,4,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,48255854,1,0,1,0,1,0,4,0,173848,0,0,0,59,1021,13,0,0,0,0,59,1021,13,0,0,2,0,0,2,0,0,4,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 2. Summarize data

#### 2a. Descriptive statistics

In [8]:
print('Shape: {}'.format(attributes.shape))

Shape: (5470, 75)


In [9]:
attributes.dtypes.head(10)

changeset_id                             int64
changeset_harmful                        int64
changeset_features_created               int64
changeset_features_modified              int64
changeset_features_deleted               int64
changeset_has_imagery_used               int64
changeset_has_source                     int64
changeset_comment_number_of_words        int64
changeset_comment_naughty_words_count    int64
changeset_bbox_area                      int64
dtype: object

In [10]:
attributes.describe()

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_old_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway
count,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0,5470.0
mean,47300000.0,0.08,0.0,1.0,0.0,0.48,0.37,4.26,0.0011,13700000000.0,0.000914,0.000183,0.36,5060.42,261000.0,263.64,24.18,105.08,0.00165,0.43,6746.59,1460000.0,556.26,50.32,20.85,6.2,0.00366,637.13,1.11,15300000000.0,15100000000.0,6.47,1.52,0.11,0.06,0.08,1.31,0.35,0.2,1.86,0.47,0.13,0.09,0.02,0.0,0.00548,0.00256,0.000183,0.0,0.0,0.00146,0.15,0.00713,0.00969,0.17,0.00165,0.00165,0.0,0.39,0.01,0.03,0.08,0.00457,0.000731,0.03,0.00932,0.05,0.0011,0.00859,0.01,0.01,0.08,0.02,0.04,0.00384
std,623000.0,0.27,0.0,0.0,0.0,0.5,0.48,5.1,0.0331,551000000000.0,0.0302,0.0135,0.95,17547.42,1080000.0,493.55,173.56,531.48,0.0405,1.04,20283.55,8560000.0,657.03,368.0,201.71,16.59,0.0633,750.33,0.45,717000000000.0,717000000000.0,7.15,5.7,0.31,0.23,0.27,1.85,0.95,0.82,2.21,0.5,0.33,0.28,0.15,0.0,0.0739,0.0505,0.0135,0.0,0.0,0.0382,0.36,0.0841,0.098,0.38,0.0405,0.0405,0.0,0.49,0.11,0.17,0.27,0.0675,0.027,0.17,0.0961,0.21,0.0331,0.0923,0.1,0.11,0.26,0.14,0.19,0.0618
min,44800000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-1645.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47100000.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,15.0,37.0,3.0,0.0,0.0,0.0,0.0,289.5,10200.0,47.0,0.0,0.0,2.0,0.0,36.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,47400000.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1060.0,0.0,0.0,0.0,221.0,1450.0,26.0,0.0,0.0,0.0,0.0,1801.5,130000.0,286.0,1.0,1.0,3.0,0.0,361.0,1.0,0.0,0.0,5.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,47600000.0,0.0,0.0,1.0,0.0,1.0,1.0,5.0,0.0,11800.0,0.0,0.0,0.0,2666.0,46000.0,242.0,4.0,4.0,0.0,0.0,5899.0,826000.0,789.75,10.0,7.0,6.0,0.0,1066.0,1.0,598.0,591.0,8.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,48300000.0,1.0,0.0,1.0,0.0,1.0,1.0,246.0,1.0,35300000000000.0,1.0,1.0,8.0,282581.0,31600000.0,3406.0,4412.0,2919.0,1.0,6.0,282581.0,185000000.0,3406.0,6307.0,2919.0,526.0,2.0,3868.0,4.0,41200000000000.0,41200000000000.0,252.0,241.0,1.0,1.0,1.0,56.0,52.0,20.0,56.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
attributes.groupby('changeset_harmful').size()

changeset_harmful
0    5031
1     439
dtype: int64

In [12]:
attributes.corr(method='pearson').head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_old_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway
changeset_id,1.0,-0.436,,,,-0.171,0.16,0.104,0.0142,0.000892,-0.00406,-0.00939,0.0454,-0.0239,0.0218,-0.00183,-0.0207,-0.745,0.0187,0.0456,0.0296,0.0298,0.0543,-0.00379,-0.266,-0.0175,0.0321,-0.00763,0.0174,-0.00204,-0.00224,0.0532,0.000739,-0.00377,0.0456,0.02,0.0255,0.0231,0.0367,0.0449,-0.168,-0.0209,0.047,0.0078,,-0.0228,0.0029,-0.0107,,,0.00501,-0.00257,0.00608,-0.00868,0.05,-0.00217,0.00915,,-0.14,0.00281,0.0461,0.095,-0.0116,0.0032,-0.000902,0.00388,0.0854,0.000226,-0.0123,-0.00281,-0.0188,0.0296,0.00361,0.000636,0.000238
changeset_harmful,-0.436,1.0,,,,0.251,-0.22,-0.0248,0.0105,-0.00734,0.0133,-0.00399,-0.0489,-0.0545,-0.0612,-0.137,-0.036,0.637,0.00461,-0.0408,-0.0194,-0.0247,-0.101,-0.00358,0.231,-0.00492,0.0573,-0.0205,0.02,0.0324,0.0325,-0.0305,0.0459,-0.0365,-0.0303,-0.04,-0.0181,0.0278,0.0579,0.0182,0.254,-0.102,-0.029,-0.0335,,-0.0219,-0.015,-0.00399,,,0.0063,-0.0679,-0.00904,-0.00174,-0.02,-0.012,0.00461,,0.05,-0.0256,0.076,0.112,-0.02,-0.00799,0.0486,-0.00765,-0.00292,-0.00979,-0.00563,-0.023,-0.0336,-0.0485,-0.00771,-0.0229,0.00342
changeset_features_created,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_features_modified,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_features_deleted,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
changeset_has_imagery_used,-0.171,0.251,,,,1.0,-0.71,0.112,0.0125,0.000146,0.00739,-0.0129,0.123,-0.176,-0.136,-0.244,-0.1,0.194,0.00631,0.0255,-0.0183,0.00661,-0.14,-0.00483,0.0711,-0.0405,0.0257,-0.032,0.0995,0.0178,0.0181,0.0239,0.0142,0.0936,0.0251,0.03,0.0907,0.162,0.0599,0.168,0.99,-0.363,-0.295,-0.151,,0.0628,-0.0485,-0.0129,,,0.0208,0.0366,0.0364,-0.0349,0.11,-0.00272,-0.0208,,-0.27,0.0202,0.0997,0.23,0.0166,0.0283,0.0576,0.0519,0.0149,-0.00959,-0.00579,-0.00104,0.0376,-0.0158,0.116,0.027,0.0294
changeset_has_source,0.155,-0.217,,,,-0.715,1.0,-0.0564,-0.0139,-0.0147,0.00195,0.0177,-0.128,0.214,0.158,0.184,0.0527,-0.143,-0.0123,-0.0172,0.0428,0.0183,0.153,-0.0156,-0.0541,0.0387,-0.0322,0.0357,-0.144,-0.0131,-0.0135,-0.0476,-0.0247,-0.18,-0.0539,-0.1,-0.122,-0.187,-0.0554,-0.203,-0.724,0.402,-0.236,-0.121,,0.0663,-0.0387,-0.0103,,,-0.0293,-0.162,-0.0288,0.0056,-0.2,-0.0123,0.00635,,0.47,-0.0435,-0.0793,-0.19,-0.0462,-0.0207,-0.0473,-0.0584,-0.0482,0.0204,-0.0342,0.0103,-0.0466,-0.0598,-0.0971,-0.113,-0.0291
changeset_comment_number_of_words,0.104,-0.0248,,,,0.112,-0.06,1.0,0.0644,0.00627,0.0661,-0.00601,0.0161,0.0984,0.0197,-0.0266,0.0194,-0.118,-0.00121,-6.53e-05,-0.00113,-0.0168,-0.0387,0.00334,-0.0411,-0.0145,-0.00751,-0.0189,-0.0265,-0.011,-0.011,0.00341,-0.0172,0.00966,-0.00791,0.01,0.042,0.0623,0.0527,0.0814,0.115,0.1,-0.0786,-0.0231,,-0.0179,-0.0147,0.0285,,,-0.0104,-0.011,0.0165,0.0238,0.02,-0.0118,0.00145,,-0.06,-0.0251,0.0341,0.0221,0.00342,-0.00538,-0.013,0.00545,0.0227,0.0167,0.0131,-0.00592,0.0104,0.000674,-0.00107,-0.0227,0.0217
changeset_comment_naughty_words_count,0.0142,0.0105,,,,0.0125,-0.01,0.0644,1.0,-0.000824,-0.001,-0.000448,-0.00109,-0.00903,-0.00409,-0.00866,-0.00449,-0.00649,-0.00135,-0.00857,-0.00818,-0.0027,-0.0081,-0.00219,-0.00233,0.0113,-0.00191,0.011,-0.0083,-0.000709,-0.0007,0.0767,0.0812,0.00648,-0.00823,0.01,0.0214,0.00539,-0.00153,0.0197,0.0129,0.00409,0.00939,-0.00523,,-0.00246,-0.00168,-0.000448,,,-0.00127,0.0015,-0.00281,-0.00328,-0.02,-0.00135,-0.00135,,-0.02,-0.00358,-0.00577,0.0106,-0.00225,-0.000896,0.027,-0.00321,0.0193,-0.0011,-0.00308,-0.00334,-0.00377,0.0115,-0.00468,-0.00654,-0.00206
changeset_bbox_area,0.000892,-0.00734,,,,0.000146,-0.01,0.00627,-0.000824,1.0,-0.000752,-0.000336,0.0176,-6.95e-06,0.00763,0.0216,-9.04e-05,-0.00447,-0.00101,0.00244,-0.00597,-0.00382,-0.00781,-0.00276,-0.00235,0.101,-0.00144,-0.0205,-0.0116,0.55,0.55,0.407,0.5,-0.00486,0.0523,0.04,0.00595,-0.00498,0.0055,0.00489,0.000392,0.0295,-0.00767,-0.00392,,-0.00185,-0.00126,-0.000336,,,-0.000951,-0.0105,-0.00211,0.128,-0.01,-0.00101,0.154,,-0.02,-0.00268,-0.00432,-0.00718,-0.00168,-0.000599,-0.00311,-0.00241,0.00909,-0.00082,-0.00231,-0.00251,-0.0018,-0.00709,-0.00351,-0.00491,-0.00154


In [13]:
skew = attributes.skew()
skew.sort(ascending=False)
skew.head(10)

Merkaartor                               73.96
user_name_naughty_words_count            73.96
changeset_bbox_area                      55.53
feature_old_area                         52.82
feature_area                             52.78
military                                 36.95
changeset_non_open_data_source           33.04
feature_tags_modified_count              30.66
power                                    30.15
changeset_comment_naughty_words_count    30.15
dtype: float64

#### 2b. Data visualizations

In [14]:
# Histograms.
# attributes.hist(figsize=(20, 20));

In [15]:
# Density plot.
# attributes.plot.density(figsize=(20, 50), subplots=True, sharex=False);

In [16]:
# attributes.plot.box(layout=(5, 4), figsize=(20, 25), subplots=True, sharex=False, sharey=False);

In [17]:
# sns.heatmap(attributes.corr(method='pearson'));

## 3. Prepare data

#### 3a. Data cleaning

#### 3b. Feature selection

In [18]:
non_training_attributes = ['changeset_id', 'changeset_harmful']
X = attributes.drop(non_training_attributes, axis=1)
y = attributes['changeset_harmful']

#### 3c. Data transforms

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
Xscaled = scaler.transform(X)

In [20]:
from sklearn.cross_validation import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y, random_state=42, train_size=0.9)



In [21]:
# Estimate importance of all features.
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)

importances = pd.DataFrame(
    list(zip(X.columns, model.feature_importances_)),
    columns=['feature', 'importance']
)
importances.sort(columns='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
15,user_changesets_with_discussions_count,0.26
22,old_user_changesets_with_discussions_count,0.05
11,user_changesets_count,0.04
29,feature_property_tags,0.03
30,feature_name_translations_count,0.03
56,highway,0.03
14,user_discussions_count,0.03
5,changeset_comment_number_of_words,0.03
25,feature_days_since_last_edit,0.03
18,old_user_changesets_count,0.03


## 4. Evaluate algorithms

#### 4a. Split-out validation dataset
- We have a seperate validation dataset. Yay!

#### 4b. Spot Check Algorithms
- Running algorithms in a loop below.

#### 4c. Compare Algorithms

In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

models = []
models.append(('LR', LogisticRegression()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))

results = []
names = []
for (name, model) in models:
    kfold = KFold(n_splits=10, random_state=7)
    result = cross_val_score(model, Xscaled, y, cv=kfold, scoring='roc_auc')
    results.append(result)
    names.append(name)
    
    print('{}: {} ({})'.format(name, round(result.mean(), 2), round(result.std(), 2)))

LR: 0.81 (0.07)
CART: 0.58 (0.06)
KNN: 0.63 (0.1)
SVM: 0.74 (0.13)
RFC: 0.75 (0.1)
GBC: 0.86 (0.06)


In [23]:
# NOTE: Temporarily turning off the graph.
# fig, ax = plt.subplots(1, 1)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# ax.set_ylabel('roc_auc')
# ax.set_ylim((0, 1));

## 5. Improve Accuracy

#### 5a. Algorithm Tuning

In [24]:
harmful_count = attributes[attributes['changeset_harmful'] == True].shape[0]
not_harmful_count = attributes[attributes['changeset_harmful'] == False].shape[0]

# Making this a float to prevent truncation due to integer division.
count = 1.0 * (harmful_count + not_harmful_count)

# Calculate sample weights based on number of occourances.
sample_weight = [(not_harmful_count / count) if l else (harmful_count / count) for l in ytrain]

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [1, 10, 100],
    'max_depth': [1, 10, 100],
    'learning_rate': [0.1, 1, 10],
    'max_features': ['log2']
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, fit_params={'sample_weight': sample_weight})
grid.fit(Xtrain, ytrain)

print('Best score: {}'.format(grid.best_score_))
print('Best params: {}'.format(grid.best_params_))

Best score: 0.9571399553118017
Best params: {'learning_rate': 0.1, 'max_depth': 100, 'n_estimators': 100, 'max_features': 'log2'}


In [26]:
model = grid.best_estimator_
ymodel = model.predict(Xtest)

#### 5b. Test options and evaluation metric

In [27]:
from sklearn.metrics import confusion_matrix

matrix = confusion_matrix(ytest, ymodel)

matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,488,4
Labelled harmful,22,33


In [28]:
total = matrix.sum().sum()
matrix / total

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,0.89,0.00731
Labelled harmful,0.04,0.0603


In [29]:
from sklearn.metrics import classification_report

report = classification_report(ytest, ymodel)
print(report)

             precision    recall  f1-score   support

          0       0.96      0.99      0.97       492
          1       0.89      0.60      0.72        55

avg / total       0.95      0.95      0.95       547



In [30]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, Xscaled, y, cv=kfold, scoring='roc_auc')

# An area of 0.5 represents a model that is as good as random.
# An area of 1.0 represents a model that made all predictions perfectly.
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.86 (0.08)


#### 5c. Ensembles
- Todo

## 6. Finalize model

#### 6a. Predictions on validation dataset

In [31]:
vattributes = pd.read_csv('../downloads/feature-classifier/validation/attributes.csv')
vattributes.head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_old_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway
0,48917139,0,0,1,0,1,0,1,0,16,0,0,3,42,297,8,0,0,0,3,42,297,8,0,0,5,0,0,1,0,0,5,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,48901413,0,0,1,0,1,0,5,0,0,0,0,0,4,4,1,0,0,0,0,4,4,1,0,0,2,0,0,1,0,0,3,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,48900803,0,0,1,0,0,1,2,0,0,0,0,0,507,83653,109,2,6,0,0,507,83653,109,2,6,5,0,419,1,0,0,6,4,0,0,0,3,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,48899023,0,0,1,0,1,0,5,0,4974,0,0,4,3,12,1,0,0,0,0,45080,3570101,1248,8,11,4,0,494,1,4397,4397,4,1,0,0,0,2,0,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,48891092,0,0,1,0,0,1,4,0,25183777914,0,0,1,1089,1226,31,0,2,0,0,115,11681,50,6,0,26,0,81,1,22095166073,22095166073,61,57,0,1,1,46,0,0,46,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,48891008,1,0,1,0,1,0,1,0,0,0,0,0,40,492,3,0,2,0,0,40,492,3,0,2,3,0,0,2,0,0,5,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
6,48891000,0,0,1,0,0,1,2,0,0,0,0,5,93891,3402703,790,62,40,0,0,1,4,1,0,0,20,0,0,1,0,0,22,11,0,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,48890991,1,0,1,0,1,0,1,0,0,0,0,0,40,492,3,0,2,0,0,40,492,3,0,2,3,0,0,2,0,0,4,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
8,48890966,0,0,1,0,1,0,3,0,405037,0,0,0,5,26,1,0,0,0,0,2351,80975,500,0,0,7,0,3254,1,115628,115628,1,0,0,0,0,0,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9,48888182,1,0,1,0,1,0,2,0,0,0,0,2,734,730,18,5,9,0,2,734,730,18,5,9,8,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(vattributes.shape))
vattributes = vattributes.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(vattributes.shape))

Shape before dropping duplicates: (549, 75)
Shape after dropping duplicates: (549, 75)


In [33]:
vattributes.groupby('changeset_harmful').size()

changeset_harmful
0    455
1     94
dtype: int64

In [34]:
# TODO: Replace with a pipeline
vX = vattributes.drop(non_training_attributes, axis=1)
vXscaled = scaler.transform(vX)

vy = vattributes['changeset_harmful']

In [35]:
vymodel = model.predict(vXscaled)

In [36]:
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(vy, vymodel)
matrix = pd.DataFrame(matrix, index=['Labelled good', 'Labelled harmful'], columns=['Predicted good', 'Predicted harmful'])
matrix

Unnamed: 0,Predicted good,Predicted harmful
Labelled good,446,9
Labelled harmful,89,5


In [37]:
from sklearn.metrics import classification_report

report = classification_report(vy, vymodel)
print(report)

             precision    recall  f1-score   support

          0       0.83      0.98      0.90       455
          1       0.36      0.05      0.09        94

avg / total       0.75      0.82      0.76       549



In [38]:
scores = cross_val_score(model, vXscaled, vy, cv=kfold, scoring='roc_auc')
print('Scores: {} ({})'.format(round(scores.mean(), 2), round(scores.std(), 2)))

Scores: 0.76 (0.12)


#### 6b. Create standalone model on entire training dataset
- Todo

#### 6c. Predictions on testing dataset

In [39]:
tattributes = pd.read_csv('../downloads/feature-classifier/testing/attributes.csv')
tattributes.head(10)

Unnamed: 0,changeset_id,changeset_harmful,changeset_features_created,changeset_features_modified,changeset_features_deleted,changeset_has_imagery_used,changeset_has_source,changeset_comment_number_of_words,changeset_comment_naughty_words_count,changeset_bbox_area,changeset_non_open_data_source,user_name_naughty_words_count,user_name_special_characters_count,user_changesets_count,user_features_count,user_mapping_days_count,user_discussions_count,user_changesets_with_discussions_count,old_user_name_naughty_words_count,old_user_name_special_characters_count,old_user_changesets_count,old_user_features_count,old_user_mapping_days_count,old_user_discussions_count,old_user_changesets_with_discussions_count,feature_version,feature_name_naughty_words_count,feature_days_since_last_edit,feature_primary_tags,feature_area,feature_old_area,feature_property_tags,feature_name_translations_count,feature_has_website,feature_has_wikidata,feature_has_wikipedia,feature_tags_created_count,feature_tags_modified_count,feature_tags_deleted_count,feature_tags_distance,iD,JOSM,MAPS.ME,Potlatch,Redaction,Vespucci,OsmAnd,Merkaartor,GNOME,aerialway,aeroway,amenity,barrier,boundary,building,craft,emergency,geological,highway,historic,landuse,leisure,man_made,military,natural,office,place,power,public_transport,railway,route,shop,sport,tourism,waterway
0,48320582,0,0,1,0,1,0,5,0,421,0,0,2,2503,131397,239,0,1,0,0,2245,374811,512,0,0,5,0,412,2,628,628,10,1,0,0,0,0,2,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,48320566,0,0,1,0,1,0,2,0,0,0,0,0,1785,5983,272,0,2,0,0,1785,5983,272,0,2,4,0,13,1,0,0,3,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,48320564,0,0,1,0,0,0,4,0,53072,0,0,0,2465,44250,259,0,1,0,0,2465,44250,259,0,1,2,0,0,1,0,0,9,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,48320557,0,0,1,0,1,0,1,0,0,0,0,4,3161,14475,232,0,0,0,0,3359,111051,1019,13,1,2,0,513,1,0,0,4,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,48320539,0,0,1,0,1,0,1,0,29773,0,0,0,525,17829,27,0,5,0,0,12044,1242695,1534,1988,56,2,0,802,2,51529,51529,4,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,48320533,0,0,1,0,1,0,1,0,0,0,0,4,3161,14475,232,0,0,0,0,3359,111051,1019,13,1,3,0,698,1,0,0,3,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6,48320532,0,0,1,0,0,0,3,0,1169,0,0,0,6471,2528927,824,6,3,0,0,6471,2528927,824,6,3,6,0,985,2,531,531,14,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,48320524,0,0,1,0,0,1,3,0,910,0,0,0,6471,2528927,824,6,3,0,0,6471,2528927,824,6,3,4,0,1228,2,336,336,11,1,1,0,0,2,0,0,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,48320519,0,0,1,0,1,0,8,0,4327288,0,0,0,1633,23457,145,3,3,0,0,1633,23457,145,3,3,2,0,0,1,0,0,2,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,48320502,0,0,1,0,1,0,1,0,849,0,0,0,173,7183,32,0,1,0,0,173,7183,32,0,1,2,0,5,2,608,608,8,1,0,0,0,3,1,0,4,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
# Drop all duplicate attributes.
print('Shape before dropping duplicates: {}'.format(tattributes.shape))
tattributes = tattributes.drop_duplicates(subset='changeset_id')
print('Shape after dropping duplicates: {}'.format(tattributes.shape))

Shape before dropping duplicates: (3904, 75)
Shape after dropping duplicates: (3904, 75)


In [41]:
# TODO: Replace with a pipeline
tX = tattributes.drop(non_training_attributes, axis=1)
tXscaled = scaler.transform(tX)

ty = tattributes['changeset_harmful']

In [42]:
tattributes['prediction'] = model.predict(tXscaled)

In [43]:
print('Predicted good: {}'.format(tattributes[tattributes['prediction'] == False].shape[0]))
print('Predicted harmful: {}'.format(tattributes[tattributes['prediction'] == True].shape[0]))

Predicted good: 3901
Predicted harmful: 3


In [44]:
tattributes.to_csv('../downloads/feature-classifier/testing/review.csv', index=False)

#### 6d. Save model for later use

In [45]:
from sklearn.externals import joblib

model_path = '../gabbar/trained/model.pkl'
joblib.dump(model, model_path)

['../gabbar/trained/model.pkl']