# Workflow - Gabbar

In [1]:
# Notebook display setup: draw matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
# NOTE(review): this filter has no category or module restriction, so it hides
# every warning for the rest of the session (including library deprecation
# warnings) -- consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
import os

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [4]:
# joblib is distributed as a standalone package and the copy that used to be
# vendored under sklearn.externals is no longer shipped by newer scikit-learn
# releases. Prefer the standalone module; fall back to the vendored copy only
# on old installations where standalone joblib is not importable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

## 1. Download labelled features from S3

In [5]:
# Read the labelled changeset features straight from the public S3 bucket.
labelled_features_url = 'https://s3-us-west-2.amazonaws.com/mapbox-gabbar/public/labelled_features.csv'
changesets = pd.read_csv(labelled_features_url)

print('Shape of labelled changesets: {}'.format(changesets.shape))
changesets.head()

Shape of labelled changesets: (22336, 10)


Unnamed: 0,changeset_id,harmful,features_created,features_modified,features_deleted,user_id,user_name,user_first_edit,user_changesets,user_features
0,47898082,False,0,5,5,109705.0,gscholz,2009-03-24T20:52:47.000Z,22272.0,1623482.0
1,47898235,False,2,8,1,109705.0,gscholz,2009-03-24T20:52:47.000Z,22272.0,1623482.0
2,47889470,False,1,0,0,5623416.0,MRN1951,2017-04-18T02:02:26.000Z,15.0,17.0
3,47898150,False,0,1,1,109705.0,gscholz,2009-03-24T20:52:47.000Z,22272.0,1623482.0
4,47897677,True,6,14,6,4803528.0,Eva Blue,2016-11-03T12:16:03.000Z,16301.0,193329.0


In [6]:
# Columns that are excluded from the training features.
non_training_features = np.array(['changeset_id', 'harmful', 'user_first_edit', 'user_id', 'user_name'])

# Every remaining column is treated as a feature; np.setdiff1d returns the
# unique column names that are not in the exclusion list.
features = np.setdiff1d(changesets.columns.values, non_training_features)

print('Total features: {}\n'.format(len(features)))
feature_listing = '\n'.join(features)
print('*** Feature list *** \n{}'.format(feature_listing))

Total features: 5

*** Feature list *** 
features_created
features_deleted
features_modified
user_changesets
user_features


In [7]:
# Report the row count before filtering.
print('Changesets before dropping rows with null values: {}'.format(len(changesets[features])))

# Drop the entire row if any of the features is null.
changesets = changesets.dropna(subset=features, how='any')

# Report the row count after filtering, then preview the remaining rows.
print('Changesets after dropping rows with null values: {}'.format(len(changesets[features])))
changesets.head()

Changesets before dropping rows with null values: 22336
Changesets after dropping rows with null values: 21334


Unnamed: 0,changeset_id,harmful,features_created,features_modified,features_deleted,user_id,user_name,user_first_edit,user_changesets,user_features
0,47898082,False,0,5,5,109705.0,gscholz,2009-03-24T20:52:47.000Z,22272.0,1623482.0
1,47898235,False,2,8,1,109705.0,gscholz,2009-03-24T20:52:47.000Z,22272.0,1623482.0
2,47889470,False,1,0,0,5623416.0,MRN1951,2017-04-18T02:02:26.000Z,15.0,17.0
3,47898150,False,0,1,1,109705.0,gscholz,2009-03-24T20:52:47.000Z,22272.0,1623482.0
4,47897677,True,6,14,6,4803528.0,Eva Blue,2016-11-03T12:16:03.000Z,16301.0,193329.0


In [8]:
# Feature matrix: all rows, restricted to the selected feature columns.
X = changesets.loc[:, features]

# Target array: the 'harmful' label of each changeset.
y = changesets.loc[:, 'harmful']

## 2. Normalize features for machine learning

In [9]:
# Scale features using RobustScaler which is good for outliers.
# Note: the scaler is fitted on every row of X, i.e. before the train/test
# split that happens later in this notebook.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(X)
X = scaler.transform(X)
print(scaler)

RobustScaler(copy=True, with_centering=True, with_scaling=True)


In [10]:
# Dump the fitted scaler to a file so it can be reloaded later
# (it is read back with joblib.load(scalerpath) further down).
scalerpath = '../gabbar/trained/scaler.pkl'
# compress=3 is handed to joblib.dump as its compression setting.
joblib.dump(scaler, scalerpath, compress=3)

['../gabbar/trained/scaler.pkl']

## 3. Break up training and testing datasets

In [11]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the older sklearn.cross_validation module has been removed from newer
# releases, so it is only used as a fallback for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Keep 70% of the rows for training and the rest for testing; random_state
# fixes the shuffle so the split is reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42, train_size=0.7)
print('Training samples: {}'.format(Xtrain.shape[0]))
print('Testing samples: {}'.format(Xtest.shape[0]))

Training samples: 14933
Testing samples: 6401


## 4. Train model on training dataset

In [12]:
# Fit a support vector classifier with an RBF kernel on the training split.
# NOTE(review): the classification report recorded further down shows 0.00
# recall for the True (harmful) class, so with these settings the model almost
# never flags a harmful changeset -- consider class weighting or tuning.
from sklearn.svm import SVC
model = SVC(kernel='rbf')
# %time reports the CPU and wall time taken by the fit.
%time model.fit(Xtrain, ytrain);
print(model)

CPU times: user 4.76 s, sys: 282 ms, total: 5.04 s
Wall time: 6.06 s
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [13]:
# Dump the trained model to a file.
modelpath = '../gabbar/trained/model.pkl'
joblib.dump(model, modelpath, compress=3)

['../gabbar/trained/model.pkl']

## 5. Test model on testing dataset

In [14]:
# Predict labels for the held-out test split.
ymodel = model.predict(Xtest)

## 6. Print performance metrics

In [15]:
# Print per-class precision, recall and F1 of the test predictions against the true test labels.
from sklearn.metrics import classification_report
print(classification_report(ytest, ymodel))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96      5967
       True       0.00      0.00      0.00       434

avg / total       0.87      0.93      0.90      6401



In [16]:
# Evaluate a score by cross-validation.
# cross_val_score lives in sklearn.model_selection in current scikit-learn;
# the older sklearn.cross_validation module has been removed from newer
# releases, so it is only used as a fallback for old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 3-fold cross-validation over the full dataset. X was already scaled above
# with a scaler fitted on all rows. The mean of the per-fold scores is the
# cell's displayed output.
scores = cross_val_score(model, X, y, cv=3)
np.mean(scores)

0.93203337805246578

In [17]:
# Predict on every labelled changeset (training and testing rows alike),
# applying the fitted scaler to the raw feature columns first.
changesets['prediction'] = model.predict(scaler.transform(changesets[features]))

In [18]:
# Tabulate how many changesets fall into each (labelled, predicted) combination.
booleans = [True, False]
columns = ['Labelled', 'Predicted', 'Number of changesets']

# One row per combination, in the order (True, True), (True, False),
# (False, True), (False, False).
results = [
    [
        actual,
        prediction,
        int(((changesets['harmful'] == actual) & (changesets['prediction'] == prediction)).sum()),
    ]
    for actual in booleans
    for prediction in booleans
]
pd.DataFrame(results, columns=columns)

Unnamed: 0,Labelled,Predicted,Number of changesets
0,True,True,57
1,True,False,1392
2,False,True,0
3,False,False,19885


## 7. Download unlabelled features from S3

In [19]:
# Read the unlabelled changeset features from the public S3 bucket.
# Note: this rebinds `changesets`, replacing the labelled frame used above.
unlabelled_features_url = 'https://s3-us-west-2.amazonaws.com/mapbox-gabbar/public/unlabelled_features.csv'
changesets = pd.read_csv(unlabelled_features_url)

print('Shape of unlabelled changesets: {}'.format(changesets.shape))
changesets.head()

Shape of unlabelled changesets: (26191, 9)


Unnamed: 0,changeset_id,features_created,features_modified,features_deleted,user_id,user_name,user_first_edit,user_changesets,user_features
0,47370185,0,8,0,2285494.0,JJIglesias,2014-08-24T04:14:01.000Z,13386.0,657956.0
1,47370177,0,1,0,2324744.0,arman76,2014-09-10T12:41:11.000Z,102.0,154.0
2,47370159,67,0,0,3593219.0,musawo,2016-02-04T13:35:05.000Z,1544.0,128109.0
3,47370183,0,1,0,2324744.0,arman76,2014-09-10T12:41:11.000Z,102.0,154.0
4,47370184,316,165,89,74847.0,Marc Mongenet,2008-10-23T17:49:18.000Z,11914.0,2082052.0


In [20]:
# Report the row count before filtering.
print('Changesets before dropping rows with null values: {}'.format(len(changesets[features])))

# Drop the entire row if any of the features is null.
changesets = changesets.dropna(subset=features, how='any')

# Report the row count after filtering, then preview the remaining rows.
print('Changesets after dropping rows with null values: {}'.format(len(changesets[features])))
changesets.head()

Changesets before dropping rows with null values: 26191
Changesets after dropping rows with null values: 26174


Unnamed: 0,changeset_id,features_created,features_modified,features_deleted,user_id,user_name,user_first_edit,user_changesets,user_features
0,47370185,0,8,0,2285494.0,JJIglesias,2014-08-24T04:14:01.000Z,13386.0,657956.0
1,47370177,0,1,0,2324744.0,arman76,2014-09-10T12:41:11.000Z,102.0,154.0
2,47370159,67,0,0,3593219.0,musawo,2016-02-04T13:35:05.000Z,1544.0,128109.0
3,47370183,0,1,0,2324744.0,arman76,2014-09-10T12:41:11.000Z,102.0,154.0
4,47370184,316,165,89,74847.0,Marc Mongenet,2008-10-23T17:49:18.000Z,11914.0,2082052.0


## 8. Test model on unlabelled features dataset

In [21]:
# Reload the scaler and model that were dumped to disk earlier in this notebook.
loaded_scaler = joblib.load(scalerpath)
loaded_model = joblib.load(modelpath)

# Scale the unlabelled feature columns with the reloaded scaler, then predict.
scaled_features = loaded_scaler.transform(changesets[features])
changesets['prediction'] = loaded_model.predict(scaled_features)

## 9. Print prediction summary

In [22]:
# Count the changesets on each side of the prediction.
predicted_harmful = changesets['prediction'] == True
predicted_not_harmful = changesets['prediction'] == False
print('Predicted harmful     : {}'.format(int(predicted_harmful.sum())))
print('Predicted not harmful : {}'.format(int(predicted_not_harmful.sum())))

Predicted harmful     : 1
Predicted not harmful : 26173


In [23]:
# Preview the changesets the model flagged as harmful.
changesets.loc[changesets['prediction'] == True].head()

Unnamed: 0,changeset_id,features_created,features_modified,features_deleted,user_id,user_name,user_first_edit,user_changesets,user_features,prediction
10655,47359107,1,19,40,2716442.0,pete404,2015-03-01T01:27:16.000Z,1987.0,204390.0,True
