# DataRobot Prediction Scoring Python Examples
### The examples below show how to get predictions (aka scoring) and prediction explanations in various ways using Python.  This includes using the datarobot Python module, the requests module, and the batch scoring command line tool, as well as requesting predictions from a deployed model on the prediction server (as opposed to a model on the modeling server).

In [1]:
import pandas as pd
import datarobot as dr
import os
import time
import sys
import requests
import pprint
from pandas.io.json import json_normalize
pd.options.display.max_colwidth = 200

# Set up
### Create train and test (prediction) data sets.  We'll primarily use the prediction data set in this notebook, but the training data set can be handy when training projects.

In [2]:
#
# Build the training and prediction (scoring) CSV files from one source dataset
#

SOURCE_FILE = 'data/DR_Demo_10K_Lending_Club_Loans.csv'
TRAIN_FILE = 'data/DR_Demo_10K_Lending_Club_Loans_train.csv'
PRED_FILE = 'data/DR_Demo_10K_Lending_Club_Loans_pred.csv'

target = 'is_bad'

# Load the source data and shuffle every row; the fixed seed keeps the shuffle repeatable
df_full = pd.read_csv(SOURCE_FILE, encoding='ISO-8859-1').sample(frac=1, random_state=0)

# The first 10% of the shuffled rows are held out for predictions, the remaining 90% are for training
split = int(len(df_full) * .1)
df_pred = df_full.iloc[:split].copy()
df = df_full.iloc[split:]

# The prediction dataset must not carry the target column
prediction_data = df_pred.drop(columns=[target])

for label, frame in (('Full data shape:      ', df_full),
                     ('Training data shape:  ', df),
                     ('Prediction data shape:', prediction_data)):
    print(label, frame.shape)

# Write both splits out without the (shuffled) index column
for path, frame in ((TRAIN_FILE, df), (PRED_FILE, prediction_data)):
    frame.to_csv(path, index=False)

Full data shape:       (10000, 34)
Training data shape:   (9000, 34)
Prediction data shape: (1000, 33)


In [3]:
#
# Connect to DataRobot
#
# Set the DATAROBOT_USERNAME and DATAROBOT_API_TOKEN environment variables to
# your own credentials, and replace the deployment id and/or project and model
# ids below with the ones you are using.
#

USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']
ENDPOINT = 'https://app.datarobot.com/api/v2'

dr.Client(token=API_TOKEN, endpoint=ENDPOINT)

# If there is an existing deployment, we'll use the deployment id to get the
# model and project currently deployed to it.  If not, enter them manually below.
DEPLOYMENT_ID = '5c19273c06eeed008a2ac7f9'

print('Getting model deployments')
headers = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}
health_response = requests.get('%s/modelDeployments/%s/' % (ENDPOINT, DEPLOYMENT_ID),
                               headers=headers)
if health_response.status_code == 200:
    deployment_data = health_response.json()
    PROJECT_ID = deployment_data['project']['id']
    MODEL_ID = deployment_data['model']['id']
else:
    # The deployment lookup failed, so fall back to ids typed in by hand.
    print('Deployment lookup returned HTTP %s; falling back to manually entered ids'
          % health_response.status_code)
    PROJECT_ID = ''  # your project id
    MODEL_ID = ''  # your model id

# Stop here with a clear message instead of sending empty ids to the API.
if not PROJECT_ID or not MODEL_ID:
    raise ValueError('No project/model id available: the deployment lookup failed and '
                     'PROJECT_ID / MODEL_ID were not entered manually.')

project = dr.Project.get(project_id=PROJECT_ID)
model = dr.Model.get(project=PROJECT_ID, model_id=MODEL_ID)
datasets = project.get_datasets()

print('project:', project.id)
print('model:', model.id)

Getting model deployments
project: 5dcf56a344849b1adf9583e0
model: 5dcf56f1bc200d1b7e63900d


# 1. Using the datarobot module API to request predictions from the app modeling server

In [4]:
#
# Score the prediction file directly through the project and model objects
#

# Step 1: push the prediction CSV to the project
print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset(PRED_FILE)

# Step 2: start a prediction job for the uploaded dataset
print('Request predictions')
pred_dataset_id = dataset_from_path.id
predict_job = model.request_predictions(pred_dataset_id)

# Step 3: block until the job has finished, then collect its result
print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

# Show the first few rows of the result
predictions.head()

Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.252143,0.0,0.5,0,0.747857,0.252143
1,0.257976,0.0,0.5,1,0.742024,0.257976
2,0.064656,0.0,0.5,2,0.935344,0.064656
3,0.10395,0.0,0.5,3,0.89605,0.10395
4,0.198613,0.0,0.5,4,0.801387,0.198613


## ...and to request prediction explanations

In [5]:
#
# Per https://datarobot-public-api-client.readthedocs-hosted.com/en/v2.13.1/api/prediction_explanations.html
# you may need to do this:
#
# "In order to create PredictionExplanations for a particular model and dataset, you must first:
#
#   1. Compute feature impact for the model via
#      dr.Model.get_feature_impact()
#
#   2. Compute a PredictionExplanationsInitialization for the model
#
#   3. Compute predictions for the model and dataset via
#      dr.Model.request_predictions(dataset_from_path.id)"
#
# This cell performs only step 2 before requesting the explanations.
#

# Start the clock: the reported time covers initialization plus the explanations job
t1 = time.time()

print('Initializing prediction explanations')
pei = dr.PredictionExplanationsInitialization.create(project.id, model.id)
print(pei)

print('Waiting for initialization')
pei_result = pei.get_result_when_complete()
print(pei_result)

print('Creating prediction explanations')
pe_job = dr.PredictionExplanations.create(project.id, model.id, dataset_from_path.id)

print('Waiting for job to complete')
pe = pe_job.get_result_when_complete()

elapsed = time.time() - t1
print('- Time: %0.3f' % elapsed)

# Flatten the explanations into a dataframe and preview it
df_pe = pe.get_all_as_dataframe()
df_pe.head()

Initializing prediction explanations
Job(predictionExplanationsInitialization, status=inprogress)
Waiting for initialization
PredictionExplanationsInitialization(project_id=5ce71c9019b7f94c2d72d81f, model_id=5ce71cd119b7f94c1572d883)
Creating prediction explanations
Waiting for job to complete
- Time: 57.558


Unnamed: 0,row_id,prediction,class_0_label,class_0_probability,class_1_label,class_1_probability,explanation_0_feature,explanation_0_feature_value,explanation_0_label,explanation_0_qualitative_strength,...,explanation_1_feature,explanation_1_feature_value,explanation_1_label,explanation_1_qualitative_strength,explanation_1_strength,explanation_2_feature,explanation_2_feature_value,explanation_2_label,explanation_2_qualitative_strength,explanation_2_strength
0,0,0.0,0.0,0.747857,1.0,0.252143,int_rate,17.56,1.0,+++,...,term,60 months,1.0,++,0.187849,grade,E,1.0,++,0.134023
1,1,0.0,0.0,0.742024,1.0,0.257976,term,60 months,1.0,++,...,sub_grade,E4,1.0,++,0.323003,int_rate,19.29,1.0,++,0.217547
2,2,0.0,0.0,0.935344,1.0,0.064656,int_rate,7.49,1.0,---,...,title,Debt Consolidation Loan,1.0,---,-0.207765,grade,A,1.0,--,-0.198126
3,3,0.0,0.0,0.89605,1.0,0.10395,annual_inc,38000,1.0,+++,...,int_rate,7.51,1.0,---,-0.269522,title,Credit Card Debt Consolidation,1.0,--,-0.180576
4,4,0.0,0.0,0.801387,1.0,0.198613,int_rate,16.77,1.0,+++,...,annual_inc,37000,1.0,+++,0.276217,inq_last_6mths,1,1.0,++,0.078378


# 2. Using requests.post to get predictions or prediction explanations from a deployment on the dedicated prediction server

In [6]:
#
# This is example code taken from the Deployment > Integrations tab, with added code for a pandas dataframe
# converted to submit as type json.
#
# The requests.post url route is different depending on requesting predictions or prediction explanations
#
# Replace the API token, username and deployment id with yours.
#
# (requests and sys are already imported in the first cell of the notebook.)
#

# API_TOKEN = ''  # Declared above
# USERNAME = ''  # Declared above

# DEPLOYMENT_ID = ''  # Declared above

# Set HTTP headers

# For data from a file:
# Note: The charset should match the contents of the file.
# headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
# data = open(sys.argv[1], 'rb').read()

# For data from a dataframe:
# generate JSON version of the dataframe to pass to API:
headers = {'Content-Type': 'application/json; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
pred_df = pd.read_csv(PRED_FILE)
data = pred_df.to_json(orient='records')  # one JSON object per row

# Make predictions on your data
# The URL has the following format:
#     https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/<DEPLOYMENT_ID>/predictions
# See docs for details:
#     app.datarobot.com/docs/users-guide/deploy/api/new-prediction-api.html

# Let's time this (the timer covers the initialization below as well as the request):
t1 = time.time()

# URL for requesting predictions
# print('Request predictions')
# predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictions' % (DEPLOYMENT_ID),
#                                      auth=(USERNAME, API_TOKEN), data=data, headers=headers)

# URL for requesting prediction explanations.  Be sure to initialize first.
print('Initializing prediction explanations')
pei = dr.PredictionExplanationsInitialization.create(project.id, model.id)
print(pei)
print('Waiting for initialization')
print(pei.get_result_when_complete())
print('Request prediction explanations')
predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictionExplanations' % (DEPLOYMENT_ID),
                                     auth=(USERNAME, API_TOKEN), data=data, headers=headers)

# Surface HTTP errors (bad credentials, wrong deployment id, ...) here, rather than
# as a JSON decoding error or a KeyError on 'data' further down.
predictions_response.raise_for_status()

response_json = predictions_response.json()
print('- Time: %0.3f' % (time.time()-t1))

print()
json_normalize(response_json['data']).head()

Initializing prediction explanations
Job(predictionExplanationsInitialization, status=inprogress)
Waiting for initialization
PredictionExplanationsInitialization(project_id=5ce71c9019b7f94c2d72d81f, model_id=5ce71cd119b7f94c1572d883)
Request prediction explanations
- Time: 107.447



Unnamed: 0,prediction,predictionExplanations,predictionThreshold,predictionValues,rowId
0,0.0,"[{'featureValue': 17.56, 'strength': 0.2347682566, 'feature': 'int_rate', 'qualitativeStrength': '+++', 'label': 1.0}, {'featureValue': ' 60 months', 'strength': 0.18784916, 'feature': 'term', 'qu...",0.5,"[{'value': 0.2521431884, 'label': 1.0}, {'value': 0.7478568116, 'label': 0.0}]",0
1,0.0,"[{'featureValue': ' 60 months', 'strength': 0.3416690774, 'feature': 'term', 'qualitativeStrength': '++', 'label': 1.0}, {'featureValue': 'E4', 'strength': 0.3230031775, 'feature': 'sub_grade', 'q...",0.5,"[{'value': 0.2579760086, 'label': 1.0}, {'value': 0.7420239914, 'label': 0.0}]",1
2,0.0,"[{'featureValue': 7.49, 'strength': -0.3844490573, 'feature': 'int_rate', 'qualitativeStrength': '---', 'label': 1.0}, {'featureValue': 'Debt Consolidation Loan', 'strength': -0.207765149, 'featur...",0.5,"[{'value': 0.0646559332, 'label': 1.0}, {'value': 0.9353440668, 'label': 0.0}]",2
3,0.0,"[{'featureValue': 38000.0, 'strength': 0.3726952939, 'feature': 'annual_inc', 'qualitativeStrength': '+++', 'label': 1.0}, {'featureValue': 7.51, 'strength': -0.2695220159, 'feature': 'int_rate', ...",0.5,"[{'value': 0.1039504294, 'label': 1.0}, {'value': 0.8960495706, 'label': 0.0}]",3
4,0.0,"[{'featureValue': 16.77, 'strength': 0.2952863213, 'feature': 'int_rate', 'qualitativeStrength': '+++', 'label': 1.0}, {'featureValue': 37000.0, 'strength': 0.2762166789, 'feature': 'annual_inc', ...",0.5,"[{'value': 0.1986127644, 'label': 1.0}, {'value': 0.8013872356, 'label': 0.0}]",4


# 3. Using the batch script tool

### The batch scoring helper scripts come from:
### https://github.com/datarobot/batch-scoring
### Install them with: pip install -U datarobot_batch_scoring

For example:
- batch_scoring --host=https://mycorp.orm.datarobot.com/ --user="greg@mycorp.com" --out=pred.csv 5545eb20b4912911244d4835 5545eb71b4912911244d4847 /home/greg/Downloads/diabetes_test.csv
- batch_scoring_sse --host=https://mycorp.orm.datarobot.com/ --out=pred.csv 0ec5bcea7f0f45918fa88257bfe42c09 /home/greg/Downloads/diabetes_test.csv
- batch_scoring_deployment_aware --host=https://mycorp.orm.datarobot.com/ --user="greg@mycorp.com" --out=pred.csv 5545eb71b4912911244d4848 /home/greg/Downloads/diabetes_test.csv


In [7]:
#
# Execute the batch_scoring script via a bash shell command
#
# Each `!` line below runs as a shell command from the notebook; the $NAME
# references in it are filled in from the notebook's Python variables.
# The cell times the scoring run, then loads the resulting CSV for inspection.
#

DATAROBOT_KEY = '544ec55f-61bf-f6ee-0caf-15c7f919a45d'  # used for deployments on the cloud app, not on-prem instances
SERVER_URL = 'https://cfds-ccm-prod.orm.datarobot.com/'

t1 = time.time()

# Using a project and model ID to score on the modeling server
# !/anaconda3/bin/batch_scoring --host=$SERVER_URL --user=$USERNAME --api_token=$API_TOKEN --datarobot_key=$DATAROBOT_KEY $PROJECT_ID $MODEL_ID $PRED_FILE --max_prediction_explanations=3 --out=out.csv

# Using a deployment to score on the prediction server
# (the command is shown one option per line for readability only; the line actually run is the single-line form below)
# !/anaconda3/bin/batch_scoring_deployment_aware --host=$SERVER_URL
#                                                --user=$USERNAME 
#                                                --api_token=$API_TOKEN 
#                                                --datarobot_key=$DATAROBOT_KEY
#                                                $DEPLOYMENT_ID 
#                                                $PRED_FILE 
#                                                --max_prediction_explanations=3 
#                                                --out=out.csv
!/anaconda3/bin/batch_scoring_deployment_aware --host=$SERVER_URL --user=$USERNAME --api_token=$API_TOKEN --datarobot_key=$DATAROBOT_KEY $DEPLOYMENT_ID $PRED_FILE --max_prediction_explanations=3 --out=out.csv

print('- Time: %0.3f' % (time.time()-t1))

# copy the batch_scoring output to another file and remove the original,
# so you can run this cell again without a 'file exists' error
!cp out.csv out_keep.csv
!rm out.csv

# read in the out_keep.csv file to inspect it
df_batch_out = pd.read_csv('out_keep.csv')
df_batch_out.head()

MainProcess [INFO] version: 1.15.0
MainProcess [INFO] platform: darwin 3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
MainProcess [INFO] Will be using API endpoint: https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/
MainProcess [INFO] auto_sampler: total time seconds - 0.010028839111328125
MainProcess [INFO] auto_sample: will use batches of 4837 rows
MainProcess [INFO] Reader go...
MainProcess [INFO] Writer go...
Shovel_Proc [INFO] Shovel process started
MainProcess [INFO] Network go...
Shovel_Proc [INFO] chunking 1000 rows took 0.012948989868164062
Shovel_Proc [INFO] shoveling complete | total time elapsed 0.021791934967041016s
Netwrk_Proc [INFO] 1 responses sent | time elapsed 0.03885841369628906s
MainProcess [INFO] shovel proc finished, exit code: 0
Writer_Proc [INFO] batch 0-1000 checkpointed
MainProcess [INFO] Writer progress: Results: 1 Written: 1 Rows done: 1000 User time: 0.042 System time: 0.018

Unnamed: 0,row_id,0.0,1.0,explanation_1_feature,explanation_1_strength,explanation_2_feature,explanation_2_strength,explanation_3_feature,explanation_3_strength
0,0,0.747857,0.252143,int_rate,0.234768,term,0.187849,grade,0.134023
1,1,0.742024,0.257976,term,0.341669,sub_grade,0.323003,int_rate,0.217547
2,2,0.935344,0.064656,int_rate,-0.384449,title,-0.207765,grade,-0.198126
3,3,0.89605,0.10395,annual_inc,0.372695,int_rate,-0.269522,title,-0.180576
4,4,0.801387,0.198613,int_rate,0.295286,annual_inc,0.276217,inq_last_6mths,0.078378


# 4. Check health status of a model, retrain a new model, and redeploy it

In [9]:
#
# This checks the health of a deployed model, and if it is failing (iow, has drifted) then retrains it
# in a new project and points the deployment at the retrained model.
#

# Let's time this:
t1 = time.time()

ENDPOINT = 'https://app.datarobot.com/api/v2'
# DEPLOYMENT_ID = ''  # declared above

headers = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}

print('Getting model deployments')
health_response = requests.get('%s/modelDeployments/%s/' % (ENDPOINT, DEPLOYMENT_ID),
                               headers=headers)

# Fail with the real HTTP error instead of a KeyError on 'modelHealth' below.
health_response.raise_for_status()

# Parse the response body once and reuse it everywhere in this cell.
deployment_info = health_response.json()

if deployment_info['modelHealth'] == 'failing':

    print('Getting a failing model')
    model = dr.Model.get(model_id=deployment_info['model']['id'],
                         project=deployment_info['project']['id'])

    print('Starting a project to retrain the model in manual mode')
    retrainProject = dr.Project.start(sourcedata=TRAIN_FILE,
                                      project_name='Lending Club Retrain', target='is_bad',
                                      autopilot_on=False,
                                      worker_count=4)

    print('Retraining the model')
    modelJobId = retrainProject.train(model.blueprint)
    newModel = dr.models.modeljob.wait_for_async_model_creation(project_id=retrainProject.id,
                                                                model_job_id=modelJobId)

    print('Updating the deployment to point to the new model')
    # json= lets requests serialize the body as valid JSON; the hand-built string
    # used single quotes, which is not valid JSON.
    model_Update = requests.patch('%s/modelDeployments/%s/model' % (ENDPOINT, DEPLOYMENT_ID),
                                  headers=headers, json={'modelId': newModel.id})

    pprint.pprint(model_Update)
    # Report a rejected update instead of silently continuing.
    model_Update.raise_for_status()

print('- Time: %0.3f' % (time.time()-t1))

pprint.pprint(deployment_info)

Getting model deployments
Getting a failing model
Starting a project to retrain the model in manual mode
Retraining the model




Updating the deployment to point to the new model
<Response [202]>
- Time: 150.366
{'accuracyHealth': 'unavailable',
 'accuracyHealthEnd': None,
 'accuracyHealthStart': None,
 'associationIdSettings': {'allowMissingValues': False, 'columnName': None},
 'createdAt': '2018-12-18 16:58:41.885000',
 'dataSourceTypes': ['scoring', 'training'],
 'deployed': False,
 'description': '',
 'externalDataInfo': None,
 'id': '5c19273c06eeed008a2ac7f9',
 'instance': {'datarobotKey': '544ec55f-61bf-f6ee-0caf-15c7f919a45d',
              'hostName': 'cfds-ccm-prod.orm.datarobot.com',
              'id': '5a61d7a0fbd723001a2f70d9',
              'sslEnabled': True},
 'label': 'is_bad Predictions',
 'lastPredictionRequest': '2019-05-23 22:29:30+00:00',
 'model': {'hasAnomalyInsight': None,
           'id': '5ce71cd119b7f94c1572d883',
           'isAnomalyDetection': False,
           'modelType': 'Gradient Boosted Trees Classifier with Early Stopping',
           'predictionThreshold': 0.5,
           'u