# Model Creation/Training

First, import libraries:

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline
# Feature Transformer is a custom class
from score_auto_gbm.FeatureTransformer import FeatureTransformer

Load the training data:

In [2]:
in_data = pd.read_json("training_data.jsons", orient = "records")
X = in_data.drop("risk", 1)
y = np.array(in_data["risk"])

Create an instance of our custom FeatureTransformer class:

In [3]:
preprocess = FeatureTransformer()

Create a GBM model, which we will then train:

In [4]:
gbm = GradientBoostingRegressor(learning_rate = 0.1, random_state = 1234)

We want to combine the scoring with the feature transformation, so we create a Pipeline:

In [5]:
pipe = Pipeline([("preprocess", preprocess), ("gbm", gbm)])

Now, we train the GBM using grid search cross-validation:

In [6]:
gbm_cv = GridSearchCV(pipe,
                     dict(gbm__n_estimators = [50, 100, 150, 200],
                         gbm__max_depth = [5, 6, 7, 8, 9, 10]),
                     cv = 5,
                     scoring = make_scorer(mean_squared_error),
                     verbose = 100)
gbm_cv.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=1.390660, total=   0.1s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=0.884076, total=   0.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=0.498074, total=   0.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=5, gbm__n_estimators=50, score=0.800251, total=   0.1s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[CV] gbm__max_depth=5, gbm__n_

[CV]  gbm__max_depth=6, gbm__n_estimators=200, score=0.665809, total=   0.1s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:    4.3s remaining:    0.0s
[CV] gbm__max_depth=6, gbm__n_estimators=200 .........................
[CV]  gbm__max_depth=6, gbm__n_estimators=200, score=1.080982, total=   0.1s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:    4.4s remaining:    0.0s
[CV] gbm__max_depth=6, gbm__n_estimators=200 .........................
[CV]  gbm__max_depth=6, gbm__n_estimators=200, score=1.591564, total=   0.1s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    4.5s remaining:    0.0s
[CV] gbm__max_depth=7, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=7, gbm__n_estimators=50, score=1.152452, total=   0.1s
[Parallel(n_jobs=1)]: Done  41 out of  41 | elapsed:    4.6s remaining:    0.0s
[CV] gbm__max_depth=7, gbm__n_estimators=50 ..........................
[CV]  gbm__max_depth=7, gbm__n_estimators=50, score=1.314702, total=   0.1s
[Parallel(n_j

[CV]  gbm__max_depth=8, gbm__n_estimators=150, score=1.189825, total=   0.1s
[Parallel(n_jobs=1)]: Done  74 out of  74 | elapsed:    7.9s remaining:    0.0s
[CV] gbm__max_depth=8, gbm__n_estimators=150 .........................
[CV]  gbm__max_depth=8, gbm__n_estimators=150, score=1.922418, total=   0.1s
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    8.0s remaining:    0.0s
[CV] gbm__max_depth=8, gbm__n_estimators=200 .........................
[CV]  gbm__max_depth=8, gbm__n_estimators=200, score=1.161536, total=   0.1s
[Parallel(n_jobs=1)]: Done  76 out of  76 | elapsed:    8.1s remaining:    0.0s
[CV] gbm__max_depth=8, gbm__n_estimators=200 .........................
[CV]  gbm__max_depth=8, gbm__n_estimators=200, score=1.218707, total=   0.1s
[Parallel(n_jobs=1)]: Done  77 out of  77 | elapsed:    8.2s remaining:    0.0s
[CV] gbm__max_depth=8, gbm__n_estimators=200 .........................
[CV]  gbm__max_depth=8, gbm__n_estimators=200, score=0.941231, total=   0.1s
[Parallel(n

[CV]  gbm__max_depth=10, gbm__n_estimators=200, score=1.218707, total=   0.1s
[CV] gbm__max_depth=10, gbm__n_estimators=200 ........................
[CV]  gbm__max_depth=10, gbm__n_estimators=200, score=0.903160, total=   0.1s
[CV] gbm__max_depth=10, gbm__n_estimators=200 ........................
[CV]  gbm__max_depth=10, gbm__n_estimators=200, score=1.386489, total=   0.1s
[CV] gbm__max_depth=10, gbm__n_estimators=200 ........................
[CV]  gbm__max_depth=10, gbm__n_estimators=200, score=1.922418, total=   0.1s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   12.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('preprocess', FeatureTransformer(transforms=[('impute', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scale', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('gbm', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
   ...       presort='auto', random_state=1234, subsample=1.0, verbose=0,
             warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gbm__max_depth': [5, 6, 7, 8, 9, 10], 'gbm__n_estimators': [50, 100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(mean_squared_error), verbose=100)

Now we have a trained model. Serialize it, so we can use it later for scoring.

In [8]:
import cPickle
with open("score_auto_gbm/gbmFit.pkl", "wb") as pickle_file:
    cPickle.dump(gbm_cv.best_estimator_, pickle_file)

# Model Preparation

Let's prepare the trained model for FastScore. Load the FastScore library:

In [9]:
from fastscoredeploy import ipmagic

Load the schemas:

In [10]:
%%schema gbm_input
{
  "type": "record",
  "name": "CarRecord",
  "fields": [
    {"name": "make", "type": "string"},
    {"name": "fuelType", "type": "string"},
    {"name": "aspiration", "type": "string"},
    {"name": "numDoors", "type": "string"},
    {"name": "bodyStyle", "type": "string"},
    {"name": "driveWheels", "type": "string"},
    {"name": "engineLocation", "type": "string"},
    {"name": "wheelBase", "type": "double"},
    {"name": "length", "type": "double"},
    {"name": "width", "type": "double"},
    {"name": "height", "type": "double"},
    {"name": "curbWeight", "type": "int"},
    {"name": "engineType", "type": "string"},
    {"name": "numCylinders", "type": "string"},
    {"name": "engineSize", "type": "int"},
    {"name": "fuelSystem", "type": "string"},
    {"name": "bore", "type": "double"},
    {"name": "stroke", "type": "double"},
    {"name": "compressionRatio", "type": "double"},
    {"name": "horsepower", "type": "int"},
    {"name": "peakRPM", "type": "int"},
    {"name": "cityMPG", "type": "int"},
    {"name": "highwayMPG", "type": "int"},
    {"name": "price", "type": "int"}
  ]
}


Schema loaded and bound to gbm_input variable


In [11]:
%%schema gbm_output
{"type": "double"}

Schema loaded and bound to gbm_output variable


Use the `%%py2model` cell magic to turn a cell into a FastScore model.

In [12]:
%%model auto_gbm

# fastscore.schema.0: gbm_input
# fastscore.schema.1: gbm_output
# fastscore.module-attached: score_auto_gbm

import cPickle
import numpy as np
import pandas as pd
from score_auto_gbm.FeatureTransformer import FeatureTransformer

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

def begin():
    global gbmFit
    with open("score_auto_gbm/gbmFit.pkl", "rb") as pickle_file:
        gbmFit = cPickle.load(pickle_file)

def action(datum):
    score = list(gbmFit.predict(pd.DataFrame([datum])))[0]
    yield score

Model loaded and bound to auto_gbm variable.


Let's score (offline) our test data, to make sure the model works.

In [13]:
import json
with open('input_data.jsons', 'rb') as f:
    inputs = [json.loads(line) for line in f]

auto_gbm.score(inputs)

[0.78488308928032324,
 0.99978623622995488,
 0.14520131926120264,
 0.00047602695754688152,
 0.99991458040281866,
 0.99991458040281866,
 1.9995876660931677,
 1.3224157902700489,
 1.0001180249460448,
 1.1177808900433042,
 0.0001946819176505984,
 3.0932028320970658e-05,
 0.00084216500537457012,
 1.9995378874905989,
 1.9995378874905989,
 1.1850050689999387,
 1.763893343895927,
 0.99983306050542853,
 1.0001046276155432,
 0.99973584636241164,
 0.00024021999504945344,
 7.3299588672116753e-05,
 2.9996514167995838,
 -0.52638308075100781,
 0.016724337306485742,
 0.016724337306485742,
 -2.8204923308308159e-05,
 -0.00010477766387946928,
 1.0002100842451624,
 1.1850050689999387,
 2.99962270583951,
 2.9996243078606204,
 1.9996073670387611,
 0.00029127440377646317,
 0.0001946819176505984,
 0.99999070863700634,
 0.00025611299138624243,
 0.00028333394568963352,
 0.00022805533266072081,
 1.9998561788680871,
 -0.99962152697975848,
 -0.99962152697975848,
 -0.99962152697975848,
 1.9996012026719876,
 0.0001

Looks like the model works. Package up the attachment. You can bundle multiple external files into a single attachment. FastScore expects an attachment to be either a gzipped tarball (.tar.gz) or just a .zip file.

In [14]:
import tarfile

with tarfile.open('attachment.tar.gz', 'w:gz') as tar:
    tar.add('score_auto_gbm')

# Model Deployment

If you're satisfied with how the model runs locally, it's time to put it in an engine.

In [22]:
from fastscoredeploy.suite import Connect

connect = Connect('https://devel:8000')
engine = connect.get('engine-2')
mm = connect.lookup('model-manage')

In [23]:
auto_gbm.update(model_manage = mm)

True

In [24]:
from fastscore.attachment import Attachment
att = Attachment('att_auto_gbm', datafile='attachment.tar.gz')
att.upload(auto_gbm)

In [25]:
auto_gbm.update(model_manage = mm)

True

In [26]:
auto_gbm.deploy(engine)

Once a model has been deployed to an engine, you can score data with it:

In [29]:
engine.score(inputs)

NameError: global name 'FastScoreError' is not defined

In [None]:
engine.reset()