# Workflow - Gabbar

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

In [2]:
import os

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

In [4]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23.
# Prefer the standalone `joblib` package and fall back to the vendored copy only on
# old installations where the standalone package is not available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

## 1. Download labelled features from S3

In [5]:
# Fetch the labelled feature table from the public S3 bucket.
LABELLED_FEATURES_URL = 'https://s3-us-west-2.amazonaws.com/mapbox-gabbar/public/labelled_features.csv'
changesets = pd.read_csv(LABELLED_FEATURES_URL)

# Report the table size, then preview the first rows as the cell output.
print('Shape of labelled changesets: {}'.format(changesets.shape))
changesets.head()

Shape of labelled changesets: (22336, 5)


Unnamed: 0,changeset_id,harmful,features_created,features_modified,features_deleted
0,47898235,False,2,8,1
1,47898190,False,1,1,1
2,47898172,False,0,1,1
3,47898150,False,0,1,1
4,47898121,False,1,1,4


In [6]:
# Columns that must never be fed to the model: the identifier, the label, and the
# `prediction` column that a later cell attaches to `changesets`. Listing
# `prediction` keeps this cell idempotent: re-running it after predictions were
# attached would otherwise turn the model's own output into a training feature.
non_training_features = np.array(['changeset_id', 'harmful', 'prediction'])

# np.setdiff1d returns the remaining column names sorted and de-duplicated; names in
# `non_training_features` that are absent from the frame are simply ignored.
features = np.setdiff1d(changesets.columns.values, non_training_features)

print('Total features: {}\n'.format(len(features)))
print('*** Feature list *** \n{}'.format('\n'.join(features)))

Total features: 3

*** Feature list *** 
features_created
features_deleted
features_modified


In [7]:
# Split the labelled frame into model inputs and target in one step:
#   X -- the feature columns selected above
#   y -- the `harmful` label column
X, y = changesets[features], changesets['harmful']

## 2. Normalize features for machine learning

In [8]:
# Scale features using RobustScaler, which centres on the median and scales by the
# interquartile range, so outlier changesets do not dominate the scaling.
from sklearn.preprocessing import RobustScaler

# Always fit and transform from the raw feature columns rather than from `X` itself.
# `X` is overwritten with its scaled version below, so fitting on `X` would make a
# second run of this cell fit the scaler on already-scaled data, and the wrong scaler
# would then be persisted to disk.
X_raw = changesets[features]
scaler = RobustScaler().fit(X_raw)
X = scaler.transform(X_raw)
# NOTE(review): the scaler is fitted on all rows before the train/test split, so the
# test rows influence the scaling statistics -- confirm this is acceptable.
print(scaler)

RobustScaler(copy=True, with_centering=True, with_scaling=True)


In [9]:
# Persist the fitted scaler (compressed) so it can be reloaded later for prediction.
scalerpath = '../gabbar/trained/scaler.pkl'
joblib.dump(value=scaler, filename=scalerpath, compress=3)

['../gabbar/trained/scaler.pkl']

## 3. Break up training and testing datasets

In [10]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. Fall back to the old
# module only on installations that predate 0.18.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Keep 70% of the labelled rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42, train_size=0.7)
print('Training samples: {}'.format(Xtrain.shape[0]))
print('Testing samples: {}'.format(Xtest.shape[0]))

Training samples: 15635
Testing samples: 6701


## 4. Train model on training dataset

In [11]:
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel; every other hyperparameter is left at
# the library default (the fitted configuration is printed by this cell).
# NOTE(review): no class weighting is set although the labels look heavily imbalanced
# in the saved outputs -- confirm whether `class_weight` should be used.
model = SVC(kernel='rbf')
# Fit on the training split; %time reports the CPU and wall time of the fit.
%time model.fit(Xtrain, ytrain);
print(model)

CPU times: user 2.79 s, sys: 183 ms, total: 2.97 s
Wall time: 3.05 s
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [12]:
# Dump the trained model to a file.
modelpath = '../gabbar/trained/model.pkl'
joblib.dump(model, modelpath, compress=3)

['../gabbar/trained/model.pkl']

## 5. Test model on testing dataset

In [13]:
# Predict labels for the held-out test rows with the model trained above.
ymodel = model.predict(Xtest)

## 6. Print performance metrics

In [14]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the held-out test split.
report = classification_report(ytest, ymodel)
print(report)

             precision    recall  f1-score   support

      False       0.93      1.00      0.96      6226
       True       0.00      0.00      0.00       475

avg / total       0.86      0.93      0.89      6701



In [15]:
# Evaluate a score by cross-validation.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cross_val_score` now lives in `sklearn.model_selection`. Fall back to the old
# module only on installations that predate 0.18.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Three-fold cross-validation over the full labelled set using the estimator's
# default scorer (mean accuracy for a classifier).
# NOTE(review): accuracy can look high on imbalanced labels even when the minority
# class is never predicted -- compare with the classification report.
scores = cross_val_score(model, X, y, cv=3)
np.mean(scores)

0.92577005901335985

In [16]:
# Scale the raw feature columns of every labelled row with the fitted scaler, then
# store the model's prediction for each row in a new `prediction` column.
scaled_features = scaler.transform(changesets[features])
changesets['prediction'] = model.predict(scaled_features)

In [17]:
# Tabulate how many changesets fall into each (labelled, predicted) combination.
columns = ['Labelled', 'Predicted', 'Number of changesets']
booleans = [True, False]
results = [
    [
        actual,
        prediction,
        len(changesets[(changesets['harmful'] == actual) & (changesets['prediction'] == prediction)]),
    ]
    for actual in booleans
    for prediction in booleans
]
pd.DataFrame(results, columns=columns)

Unnamed: 0,Labelled,Predicted,Number of changesets
0,True,True,57
1,True,False,1597
2,False,True,0
3,False,False,20682


## 7. Download unlabelled features from S3

In [18]:
# Fetch the unlabelled feature table from the public S3 bucket.
# NOTE: this rebinds `changesets` -- from here on it holds the unlabelled rows.
UNLABELLED_FEATURES_URL = 'https://s3-us-west-2.amazonaws.com/mapbox-gabbar/public/unlabelled_features.csv'
changesets = pd.read_csv(UNLABELLED_FEATURES_URL)

# Report the table size, then preview the first rows as the cell output.
print('Shape of unlabelled changesets: {}'.format(changesets.shape))
changesets.head()

Shape of unlabelled changesets: (26191, 4)


Unnamed: 0,changeset_id,features_created,features_modified,features_deleted
0,47370186,111,14,0
1,47370185,0,8,0
2,47370184,316,165,89
3,47370183,0,1,0
4,47370182,0,1,0


## 8. Test model on unlabelled features dataset

In [19]:
# Reload the persisted scaler and model from disk, then score the unlabelled rows
# with the reloaded artefacts rather than the in-memory ones.
loaded_scaler, loaded_model = joblib.load(scalerpath), joblib.load(modelpath)

unlabelled_scaled = loaded_scaler.transform(changesets[features])
changesets['prediction'] = loaded_model.predict(unlabelled_scaled)

## 9. Print prediction summary

In [20]:
# Count how many unlabelled changesets fall on each side of the prediction.
predictions = changesets['prediction']
print('Predicted harmful     : {}'.format((predictions == True).sum()))
print('Predicted not harmful : {}'.format((predictions == False).sum()))

Predicted harmful     : 0
Predicted not harmful : 26191
