# DataRobot Prediction Scoring Python Examples
### The examples below show how to get predictions (aka scoring) in various ways using python.  This includes using our datarobot python module, the requests module, the batch scoring command line interface, and requesting predictions from a deployed model on the prediction server (vs a model on the modelling server).

In [1]:
import pandas as pd
import datarobot as dr
import os
import time
import sys
import requests
import pprint
pd.options.display.max_columns = 200

# Set up
### Create train and test (prediction) data sets.  We'll primarily use the predictions in this notebook, but this can be handy when training projects.

In [2]:
#
# Train test split a dataset
#

SOURCE_FILE = 'data/DR_Demo_10K_Lending_Club_Loans.csv'
TRAIN_FILE = 'data/DR_Demo_10K_Lending_Club_Loans_train.csv'
PRED_FILE = 'data/DR_Demo_10K_Lending_Club_Loans_pred.csv'

df_full = pd.read_csv(SOURCE_FILE, encoding = 'ISO-8859-1')
target = 'is_bad'

# Shuffle the rows
df_full = df_full.sample(frac=1, random_state=0)

# Split 90% for training, 10% for predictions
split = int(df_full.shape[0] * .1)
df = df_full[split:]
df_pred = df_full[:split].copy()

# Drop the target from the prediction dataset
prediction_data = df_pred.drop(target, axis=1)

print('Full data shape:      ',df_full.shape)
print('Training data shape:  ',df.shape)
print('Prediction data shape:',prediction_data.shape)

df.to_csv(TRAIN_FILE, index=False)
prediction_data.to_csv(PRED_FILE, index=False)

Full data shape:       (10000, 34)
Training data shape:   (9000, 34)
Prediction data shape: (1000, 33)


In [3]:
# 
# Connect to DataRobot  
#

USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']

dr.Client(token=API_TOKEN, endpoint='https://app.datarobot.com/api/v2')
PROJECT_ID = '5bdcda3c38f00b610eff5d8f'
MODEL_ID = '5bdcdd9b16378b3d256a8d9c'

project = dr.Project.get(project_id=PROJECT_ID)
model = dr.Model.get(project=PROJECT_ID, model_id=MODEL_ID)
datasets = project.get_datasets()

print(project)
print(model)

Project(DR_Demo_Lending_Club_c_my_api_demo)
Model('Gradient Boosted Trees Classifier with Early Stopping')


# 1. Module API

In [4]:
# 
# New Scoring Predictions directly on project and model object 
#

print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset(PRED_FILE)

print('Request predictions')
predict_job = model.request_predictions(dataset_from_path.id)

print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

predictions.head()

Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.234347,0.0,0.5,0,0.765653,0.234347
1,0.18163,0.0,0.5,1,0.81837,0.18163
2,0.045061,0.0,0.5,2,0.954939,0.045061
3,0.08644,0.0,0.5,3,0.91356,0.08644
4,0.203154,0.0,0.5,4,0.796846,0.203154


# 2. Request.post to Deployment

In [5]:
#
# This is taken from the Deployement > Integrations tab, with add code for a pandas dataframe 
# converted to submit as type json
#
# Replace thew API toke and username with yours.  (I store mine in my environment variables in .bash_profile, fyi..)
#

import requests
import sys

API_TOKEN = os.environ['DATAROBOT_API_TOKEN']
USERNAME = os.environ['DATAROBOT_USERNAME']

DEPLOYMENT_ID = '5bda1b807c6f8b2c45427f7e'

# Set HTTP headers

# FOR A FILE:
# Note: The charset should match the contents of the file.
# headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
# data = open(sys.argv[1], 'rb').read()

# FOR A DATAFRAME 
# generate JSON version of the dataframe to pass to API:
headers = {'Content-Type': 'application/json; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
data = pd.read_csv(PRED_FILE)
data = data.to_json(orient='records')

# Make predictions on your data
# The URL has the following format:
#     https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/<DEPLOYMENT_ID>/predictions
# See docs for details:
#     app.datarobot.com/docs/users-guide/deploy/api/new-prediction-api.html
predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictions' % (DEPLOYMENT_ID),
                                     auth=(USERNAME, API_TOKEN), data=data, headers=headers)

predictions_response.raise_for_status()
print(predictions_response.json())

{'data': [{'predictionValues': [{'value': 0.2521431884, 'label': 1.0}, {'value': 0.7478568116, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 0}, {'predictionValues': [{'value': 0.2579760086, 'label': 1.0}, {'value': 0.7420239914, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 1}, {'predictionValues': [{'value': 0.0646559332, 'label': 1.0}, {'value': 0.9353440668, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 2}, {'predictionValues': [{'value': 0.1039504294, 'label': 1.0}, {'value': 0.8960495706, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 3}, {'predictionValues': [{'value': 0.1986127644, 'label': 1.0}, {'value': 0.8013872356, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 4}, {'predictionValues': [{'value': 0.0986546668, 'label': 1.0}, {'value': 0.9013453332, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 5}, {'predictionValues': [{'

# 3. Batch script

### This installs the batch prediction help code from:
### https://github.com/datarobot/batch-scoring
### pip install -U datarobot_batch_scoring

### (This cell doesn't run - it's just command line interface examples)

In [6]:
batch_scoring --host=https://mycorp.orm.datarobot.com/ --user="greg@mycorp.com" 
              --out=pred.csv 5545eb20b4912911244d4835 5545eb71b4912911244d4847 /home/greg/Downloads/diabetes_test.csv
batch_scoring_sse --host=https://mycorp.orm.datarobot.com/ --out=pred.csv 
              0ec5bcea7f0f45918fa88257bfe42c09 /home/greg/Downloads/diabetes_test.csv
batch_scoring_deployment_aware --host=https://mycorp.orm.datarobot.com/ 
              --user="greg@mycorp.com" --out=pred.csv 5545eb71b4912911244d4848 /home/greg/Downloads/diabetes_test.csv

# 4. Predictions with prediction explanations - inline

In [7]:
# 
# This creates mini batches of predictions requests and also captures prediction explanations.  
# The formatting output to csv should be tailored to your liking and needs.
# 

OUTPUT_FILE = 'reason_codes_long_format.csv'

# PROJECT_ID = '5ab1519e5feaa758b227f436'
# MODEL_ID = '5ab3c9ba5feaa7572313d02f'
MAX_CODES = 10
POSITIVE_CLASS = 1
BATCH_SIZE = 100 # how many records to send each time, typically less than 100 rows

PROJECT_ID = '5bcf5d3cb64ee905a66dbe83'
MODEL_ID = '5bcf66e52e7f0df1ffac8f33'
AUTH_KEY = '544ec55f-61bf-f6ee-0caf-15c7f919a45d'
USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']

SERVER_URL = 'https://cfds-ccm-prod.orm.datarobot.com/'

headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': AUTH_KEY}

url_request = "{server_url}/predApi/v1.0/{project_id}/{model_id}/reasonCodesPredictions".format(
        server_url=SERVER_URL, project_id=PROJECT_ID, model_id=MODEL_ID)

def mini_batch_file(filepath, batch_size, encoding='utf-8'):
    with open(filepath) as infile:
        header = infile.readline()
        output_lines = []
        for line in infile:
#             output_lines.append(line.decode(encoding))  # May be needed if python 2
            output_lines.append(line)
            if len(output_lines) == batch_size:
                output_lines.insert(0, header) # put header up front
                yield ''.join(output_lines) # output csv text with header
                output_lines = []
        else:
            output_lines.insert(0, header) # put header up front
            yield ''.join(output_lines) # output csv text with header

def get_probability_prediction(rc_json_row, positive_class):
    return [p for p in rc_json_row['predictionValues'] if p['label'] == positive_class][0]['value']

params = {'maxCodes': MAX_CODES}

# Let's time this:
t1 = time.time()

reason_code_lines = [] # long so will have MAX_CODES * num_record lines
for batch_num, batch in enumerate(mini_batch_file(PRED_FILE, BATCH_SIZE)):
    try:
        sys.stderr.write('\r--- Making request {:,} totalling {:,} rows requested'.format(
            batch_num+1, (batch_num+1)*BATCH_SIZE))

        row_id_offset = batch_num*BATCH_SIZE # needed so row_id reflected of prediction file
        data = batch.encode('utf-8') # encoding must match in headers
        predictions_response = requests.post(url_request,
                                            auth=(USERNAME, API_TOKEN),
                                            data=data,
                                            headers=headers,
                                            params=params,
                                            timeout=120)
        print('pred_resp:', predictions_response)
        if predictions_response.status_code != 200:
            try:
                message = predictions_response.json().get('message', predictions_response.text)
                status_code = predictions_response.status_code
                reason = predictions_response.reason

                print(u'Status: {status_code} {reason}. Message: {message}.'.format(message=message,
                                                                                    status_code=status_code,
                                                                                    reason=reason))
            except ValueError:
                print('Prediction failed: {}'.format(predictions_response.reason))
                predictions_response.raise_for_status()
        else:
            reason_code_prediction_rows = predictions_response.json()['data']
#             print('- rc: reason_code_prediction_rows:', reason_code_prediction_rows)
            for rc_json_row in reason_code_prediction_rows:
                prediction = get_probability_prediction(rc_json_row, POSITIVE_CLASS)
                row_id = rc_json_row['rowId'] + row_id_offset
                for rc in rc_json_row['reasonCodes']:
                    rc['prediction'] = prediction
                    rc['row_id'] = row_id
                    reason_code_lines.append(rc)
            print('- status code:', predictions_response.status_code)
            print('- reason_code_lines shape', len(reason_code_lines))
    except KeyboardInterrupt:
        break

pd.DataFrame(reason_code_lines).to_csv(OUTPUT_FILE, index=False)

# Output the time:
print('\nDone:  %0.3f seconds' % (time.time()-t1))

--- Making request 1 totalling 100 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 1000


--- Making request 2 totalling 200 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 2000


--- Making request 3 totalling 300 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 3000


--- Making request 4 totalling 400 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 4000


--- Making request 5 totalling 500 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 5000


--- Making request 6 totalling 600 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 6000


--- Making request 7 totalling 700 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 7000


--- Making request 8 totalling 800 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 8000


--- Making request 9 totalling 900 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 9000


--- Making request 10 totalling 1,000 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 10000


--- Making request 11 totalling 1,100 rows requested

pred_resp: <Response [422]>
Status: 422 UNPROCESSABLE ENTITY. Message: No data to predict on.

Done:  74.202 seconds


# 5. Deployment predictions - 
### Check for any deployed model that is failing, retrain it and point the deployment to the new model  

In [8]:
# 
# This check the health of a depoloyed model, and if it is failing (iow, has drifted) then retrain it
#

DEPLOYMENT_ID = '5bda1b807c6f8b2c45427f7e'

headers2 = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}

print('Getting model deployments')
health_response = requests.get('https://app.datarobot.com/api/v2/modelDeployments/%s/' % (DEPLOYMENT_ID), 
                               headers=headers2)

if health_response.json()['modelHealth'] == 'failing':
    print('Getting a failing model')
    model = dr.Model.get(model_id=health_response.json()['model']['id'], 
                                project=health_response.json()['project']['id'])
    
    print('Starting a project to retrain the model in manual mode')
    retrainProject = dr.Project.start(sourcedata=TRAIN_FILE, 
                                      project_name='Lending Club Retrain', target='is_bad', 
                                      autopilot_on=False,
                                      worker_count=4)
    
    print('Retraining the model')
    modelJobId = retrainProject.train(model.blueprint)
    newModel = dr.models.modeljob.wait_for_async_model_creation(project_id=retrainProject.id, 
                                                                       model_job_id=modelJobId)

    # # **Now unnecessary
    print('Calling feature impact prior to deploying the new model')
    fi = newModel.get_or_request_feature_impact(600)
    
    print('Updating the deployment to point to the new model')
    model_Update = requests.patch('https://app.datarobot.com/api/v2/modelDeployments/%s/model' % (DEPLOYMENT_ID), 
                                  headers=headers2, data="{'modelId':'%s'}" % newModel.id)
    
    pprint.pprint(model_Update)

pprint.pprint(health_response.json())

Getting model deployments
{'accuracyHealth': 'unavailable',
 'createdAt': '2018-10-31 21:15:49.362000',
 'dataSourceTypes': ['scoring', 'training'],
 'deployed': False,
 'description': '',
 'externalDataInfo': None,
 'id': '5bda1b807c6f8b2c45427f7e',
 'instance': {'datarobotKey': '544ec55f-61bf-f6ee-0caf-15c7f919a45d',
              'hostName': 'cfds-ccm-prod.orm.datarobot.com',
              'id': '5a61d7a0fbd723001a2f70d9',
              'sslEnabled': True},
 'label': 'is_bad Predictions',
 'lastPredictionRequest': '2018-11-20 16:55:19+00:00',
 'model': {'hasAnomalyInsight': None,
           'id': '5bf45bd2e6448d6292bcc8eb',
           'modelType': 'Gradient Boosted Trees Classifier with Early Stopping',
           'predictionThreshold': 0.5,
           'userId': '5a8a6402b11ba422e62b7c7a'},
 'modelHealth': 'unknown',
 'modelHistory': [{'endDate': None,
                   'model': {'id': '5bf45bd2e6448d6292bcc8eb',
                             'modelType': 'Gradient Boosted Trees Cla