# DataRobot Prediction Scoring Python Examples
### The examples below show how to get predictions (aka scoring) and prediction explanations in various ways using python.  This includes using our datarobot python module, the requests module, the batch scoring command line tool, and requesting predictions from a deployed model on the prediction server (vs a model on the modelling server).

In [1]:
import pandas as pd
import datarobot as dr
import os
import time
import sys
import requests
import pprint
pd.options.display.max_columns = 200

# Set up
### Create train and test (prediction) data sets.  We'll primarily use the predictions in this notebook, but this can be handy when training projects.

In [2]:
#
# Train test split a dataset
#

SOURCE_FILE = 'data/DR_Demo_10K_Lending_Club_Loans.csv'
TRAIN_FILE = 'data/DR_Demo_10K_Lending_Club_Loans_train.csv'
PRED_FILE = 'data/DR_Demo_10K_Lending_Club_Loans_pred.csv'

df_full = pd.read_csv(SOURCE_FILE, encoding = 'ISO-8859-1')
target = 'is_bad'

# Shuffle the rows
df_full = df_full.sample(frac=1, random_state=0)

# Split 90% for training, 10% for predictions
split = int(df_full.shape[0] * .1)
df = df_full[split:]
df_pred = df_full[:split].copy()

# Drop the target from the prediction dataset
prediction_data = df_pred.drop(target, axis=1)

print('Full data shape:      ',df_full.shape)
print('Training data shape:  ',df.shape)
print('Prediction data shape:',prediction_data.shape)

df.to_csv(TRAIN_FILE, index=False)
prediction_data.to_csv(PRED_FILE, index=False)

Full data shape:       (10000, 34)
Training data shape:   (9000, 34)
Prediction data shape: (1000, 33)


In [3]:
# 
# Connect to DataRobot  
#
# Replace the API token and username with yours.  (I store mine in my environment variables in .bash_profile, fyi..)
#

USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']

dr.Client(token=API_TOKEN, endpoint='https://app.datarobot.com/api/v2')  # /predApi/v1.0
# dr.Client(token=API_TOKEN, endpoint='https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0')  # 

PROJECT_ID = '5bff27caf57686467e65e893'
MODEL_ID = '5bff2815f5768646a265e62f'

DEPLOYMENT_ID = '5bda1b807c6f8b2c45427f7e'

project = dr.Project.get(project_id=PROJECT_ID)
model = dr.Model.get(project=PROJECT_ID, model_id=MODEL_ID)
datasets = project.get_datasets()

print(project)
print(model)

Project(Lending Club Retrain)
Model('Gradient Boosted Trees Classifier with Early Stopping')


# 1. Using the datarobot module API to request predictions

In [4]:
# 
# New Scoring Predictions directly on project and model object 
#

print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset(PRED_FILE)

print('Request predictions')
predict_job = model.request_predictions(dataset_from_path.id)

print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

predictions.head()

Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.252143,0.0,0.5,0,0.747857,0.252143
1,0.257976,0.0,0.5,1,0.742024,0.257976
2,0.064656,0.0,0.5,2,0.935344,0.064656
3,0.10395,0.0,0.5,3,0.89605,0.10395
4,0.198613,0.0,0.5,4,0.801387,0.198613


## ...and to request prediction explanations

In [5]:
# Per https://datarobot-public-api-client.readthedocs-hosted.com/en/v2.13.1/api/prediction_explanations.html
# you may need to do this:
#
# In order to create PredictionExplanations for a particular model and dataset, you must first:
# 
# Compute feature impact for the model via 
# dr.Model.get_feature_impact()
# 
# Compute a PredictionExplanationsInitialization for the model via 
print('Initializing prediction explanations')
pei = dr.PredictionExplanationsInitialization.create(project.id, model.id)
print(pei)
print('Waiting for initialization')
print(pei.get_result_when_complete())
# 
# Compute predictions for the model and dataset via 
# dr.Model.request_predictions(dataset_from_path.id)

print('Creating prediction explanations')
pe_job = dr.PredictionExplanations.create(project.id, model.id,  dataset_from_path.id)

print('Waiting for job to complete')
pe = pe_job.get_result_when_complete()

print('As dataframe')
df_pe = pe.get_all_as_dataframe()
df_pe.head()

Initializing prediction explanations
Job(predictionExplanationsInitialization, status=inprogress)
Waiting for initialization
PredictionExplanationsInitialization(project_id=5bff27caf57686467e65e893, model_id=5bff2815f5768646a265e62f)
Creating prediction explanations
Waiting for job to complete
As dataframe


Unnamed: 0,row_id,prediction,class_0_label,class_0_probability,class_1_label,class_1_probability,explanation_0_feature,explanation_0_feature_value,explanation_0_label,explanation_0_qualitative_strength,explanation_0_strength,explanation_1_feature,explanation_1_feature_value,explanation_1_label,explanation_1_qualitative_strength,explanation_1_strength,explanation_2_feature,explanation_2_feature_value,explanation_2_label,explanation_2_qualitative_strength,explanation_2_strength
0,0,0.0,1.0,0.252143,0.0,0.747857,int_rate,17.56,1.0,+++,0.234768,term,60 months,1.0,++,0.187849,grade,E,1.0,++,0.134023
1,1,0.0,1.0,0.257976,0.0,0.742024,term,60 months,1.0,++,0.341669,sub_grade,E4,1.0,++,0.323003,int_rate,19.29,1.0,++,0.217547
2,2,0.0,1.0,0.064656,0.0,0.935344,int_rate,7.49,1.0,---,-0.384449,title,Debt Consolidation Loan,1.0,---,-0.207765,grade,A,1.0,--,-0.198126
3,3,0.0,1.0,0.10395,0.0,0.89605,annual_inc,38000,1.0,+++,0.372695,int_rate,7.51,1.0,---,-0.269522,title,Credit Card Debt Consolidation,1.0,--,-0.180576
4,4,0.0,1.0,0.198613,0.0,0.801387,int_rate,16.77,1.0,+++,0.295286,annual_inc,37000,1.0,+++,0.276217,inq_last_6mths,1,1.0,++,0.078378


# 2. Using request.post to a deployment to get predictions or prediction explanations

In [6]:
#
# This is taken from the Deployement > Integrations tab, with add code for a pandas dataframe 
# converted to submit as type json
#
# Replace the API token and username with yours.  (I store mine in my environment variables in .bash_profile, fyi..)
#

import requests
import sys

API_TOKEN = os.environ['DATAROBOT_API_TOKEN']
USERNAME = os.environ['DATAROBOT_USERNAME']

# DEPLOYMENT_ID = '5bda1b807c6f8b2c45427f7e'  # Declared above

# Set HTTP headers

# FOR A FILE:
# Note: The charset should match the contents of the file.
# headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
# data = open(sys.argv[1], 'rb').read()

# FOR A DATAFRAME 
# generate JSON version of the dataframe to pass to API:
headers = {'Content-Type': 'application/json; charset=UTF-8', 'datarobot-key': '544ec55f-61bf-f6ee-0caf-15c7f919a45d'}
data = pd.read_csv(PRED_FILE)
data = data.to_json(orient='records')

# Make predictions on your data
# The URL has the following format:
#     https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/<DEPLOYMENT_ID>/predictions
# See docs for details:
#     app.datarobot.com/docs/users-guide/deploy/api/new-prediction-api.html

t1 = time.time()

# URL for requesting predictions
# print('Making request')
# predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictions' % (DEPLOYMENT_ID),
#                                      auth=(USERNAME, API_TOKEN), data=data, headers=headers)

# URL for requesting prediction explanations.  Be sure to initiallize first.
print('Initializing prediction explanations')
pei = dr.PredictionExplanationsInitialization.create(project.id, model.id)
print(pei)
print('Waiting for initialization')
print(pei.get_result_when_complete())
print('Making request')
predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictionExplanations' % (DEPLOYMENT_ID),
                                     auth=(USERNAME, API_TOKEN), data=data, headers=headers)

response_json = predictions_response.json()
print('- Time: %0.3f' % (time.time()-t1))

# predictions_response.raise_for_status()
print()
print(response_json)
# print(json.dumps(response_json, indent=4, sort_keys=True))

Initializing prediction explanations
Job(predictionExplanationsInitialization, status=inprogress)
Waiting for initialization
PredictionExplanationsInitialization(project_id=5bff27caf57686467e65e893, model_id=5bff2815f5768646a265e62f)
Making request
- Time: 107.597

{'data': [{'predictionValues': [{'value': 0.2521431884, 'label': 1.0}, {'value': 0.7478568116, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 0, 'predictionExplanations': [{'featureValue': 17.56, 'strength': 0.2347682566, 'feature': 'int_rate', 'qualitativeStrength': '+++', 'label': 1.0}, {'featureValue': ' 60 months', 'strength': 0.18784916, 'feature': 'term', 'qualitativeStrength': '++', 'label': 1.0}, {'featureValue': 'E', 'strength': 0.1340228025, 'feature': 'grade', 'qualitativeStrength': '++', 'label': 1.0}]}, {'predictionValues': [{'value': 0.2579760086, 'label': 1.0}, {'value': 0.7420239914, 'label': 0.0}], 'predictionThreshold': 0.5, 'prediction': 0.0, 'rowId': 1, 'predictionExplanations': [

# 3. Using the batch script tool

### This installs the batch prediction help code from:
### https://github.com/datarobot/batch-scoring
### pip install -U datarobot_batch_scoring

For example:
- batch_scoring --host=https://mycorp.orm.datarobot.com/ --user="greg@mycorp.com" --out=pred.csv 5545eb20b4912911244d4835 5545eb71b4912911244d4847 /home/greg/Downloads/diabetes_test.csv
- batch_scoring_sse --host=https://mycorp.orm.datarobot.com/ --out=pred.csv 0ec5bcea7f0f45918fa88257bfe42c09 /home/greg/Downloads/diabetes_test.csv
- batch_scoring_deployment_aware --host=https://mycorp.orm.datarobot.com/ --user="greg@mycorp.com" --out=pred.csv 5545eb71b4912911244d4848 /home/greg/Downloads/diabetes_test.csv


In [7]:
#
# Execute the batch_scoring script via a bash shell conmand
#

t1 = time.time()

# Using a project and model ID to score on the modeling server
# !/anaconda3/bin/batch_scoring --host=https://cfds-ccm-prod.orm.datarobot.com --user=$USERNAME --api_token=$API_TOKEN --datarobot_key=544ec55f-61bf-f6ee-0caf-15c7f919a45d $PROJECT_ID $MODEL_ID $PRED_FILE --max_prediction_explanations=3 --out=out.csv

# Using a deployment to score on the prediction server
# !/anaconda3/bin/batch_scoring_deployment_aware --host=https://cfds-ccm-prod.orm.datarobot.com 
#                                                --user=$USERNAME 
#                                                --api_token=$API_TOKEN 
#                                                --datarobot_key=544ec55f-61bf-f6ee-0caf-15c7f919a45d 
#                                                $DEPLOYMENT_ID 
#                                                $PRED_FILE 
#                                                --max_prediction_explanations=3 
#                                                --out=out.csv
!/anaconda3/bin/batch_scoring_deployment_aware --host=https://cfds-ccm-prod.orm.datarobot.com --user=$USERNAME --api_token=$API_TOKEN --datarobot_key=544ec55f-61bf-f6ee-0caf-15c7f919a45d $DEPLOYMENT_ID $PRED_FILE --max_prediction_explanations=3 --out=out.csv

print('- Time: %0.3f' % (time.time()-t1))

# copy over the batch_scorinng output to another file so you can run this cell again without a 'file exists' error
!cp out.csv out_keep.csv
!rm out.csv

# read in the csv_keep file to inspect it
df_batch_out = pd.read_csv('out_keep.csv')
df_batch_out.head()

MainProcess [INFO] version: 1.15.0
MainProcess [INFO] platform: darwin 3.6.4 |Anaconda custom (64-bit)| (default, Jan 16 2018, 12:04:33) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
MainProcess [INFO] Will be using API endpoint: https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/
MainProcess [INFO] auto_sampler: total time seconds - 0.010833978652954102
MainProcess [INFO] auto_sample: will use batches of 4837 rows
MainProcess [INFO] Reader go...
MainProcess [INFO] Writer go...
Shovel_Proc [INFO] Shovel process started
MainProcess [INFO] Network go...
Shovel_Proc [INFO] chunking 1000 rows took 0.013713836669921875
Shovel_Proc [INFO] shoveling complete | total time elapsed 0.022362232208251953s
Netwrk_Proc [INFO] 1 responses sent | time elapsed 0.03846383094787598s
MainProcess [INFO] shovel proc finished, exit code: 0
Writer_Proc [INFO] batch 0-1000 checkpointed
MainProcess [INFO] Writer progress: Results: 1 Written: 1 Rows done: 1000 User time: 0.042 System time: 0.017

Unnamed: 0,row_id,0.0,1.0,explanation_1_feature,explanation_1_strength,explanation_2_feature,explanation_2_strength,explanation_3_feature,explanation_3_strength
0,0,0.747857,0.252143,int_rate,0.234768,term,0.187849,grade,0.134023
1,1,0.742024,0.257976,term,0.341669,sub_grade,0.323003,int_rate,0.217547
2,2,0.935344,0.064656,int_rate,-0.384449,title,-0.207765,grade,-0.198126
3,3,0.89605,0.10395,annual_inc,0.372695,int_rate,-0.269522,title,-0.180576
4,4,0.801387,0.198613,int_rate,0.295286,annual_inc,0.276217,inq_last_6mths,0.078378


# 4. Requesting prediction explanations manually with formatting

### (This is code suited to DateRobot <= 4.3)

##### (Thanks Shinn)

In [8]:
# 
# This creates mini batches of prediction requests and also captures prediction explanations.  
# The formatting output to csv should be tailored to your liking and needs.
# 

OUTPUT_FILE = 'reason_codes_long_format.csv'

MAX_CODES = 10
POSITIVE_CLASS = 1
BATCH_SIZE = 100 # how many records to send each time, typically less than 100 rows

# PROJECT_ID = '5bcf5d3cb64ee905a66dbe83'
# MODEL_ID = '5bcf66e52e7f0df1ffac8f33'
AUTH_KEY = '544ec55f-61bf-f6ee-0caf-15c7f919a45d'
USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']

SERVER_URL = 'https://cfds-ccm-prod.orm.datarobot.com/'

headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': AUTH_KEY}

url_request = "{server_url}/predApi/v1.0/{project_id}/{model_id}/reasonCodesPredictions".format(
        server_url=SERVER_URL, project_id=PROJECT_ID, model_id=MODEL_ID)

def mini_batch_file(filepath, batch_size, encoding='utf-8'):
    with open(filepath) as infile:
        header = infile.readline()
        output_lines = []
        for line in infile:
#             output_lines.append(line.decode(encoding))  # May be needed if python 2
            output_lines.append(line)
            if len(output_lines) == batch_size:
                output_lines.insert(0, header) # put header up front
                yield ''.join(output_lines) # output csv text with header
                output_lines = []
        else:
            output_lines.insert(0, header) # put header up front
            yield ''.join(output_lines) # output csv text with header

def get_probability_prediction(rc_json_row, positive_class):
    return [p for p in rc_json_row['predictionValues'] if p['label'] == positive_class][0]['value']

params = {'maxCodes': MAX_CODES}

# Let's time this:
t1 = time.time()

reason_code_lines = [] # long so will have MAX_CODES * num_record lines
for batch_num, batch in enumerate(mini_batch_file(PRED_FILE, BATCH_SIZE)):
    try:
        sys.stderr.write('\r--- Making request {:,} totalling {:,} rows requested'.format(
            batch_num+1, (batch_num+1)*BATCH_SIZE))

        row_id_offset = batch_num*BATCH_SIZE # needed so row_id reflected of prediction file
        data = batch.encode('utf-8') # encoding must match in headers
        predictions_response = requests.post(url_request,
                                            auth=(USERNAME, API_TOKEN),
                                            data=data,
                                            headers=headers,
                                            params=params,
                                            timeout=120)
        print('pred_resp:', predictions_response)
        if predictions_response.status_code != 200:
            try:
                message = predictions_response.json().get('message', predictions_response.text)
                status_code = predictions_response.status_code
                reason = predictions_response.reason

                print(u'Status: {status_code} {reason}. Message: {message}.'.format(message=message,
                                                                                    status_code=status_code,
                                                                                    reason=reason))
            except ValueError:
                print('Prediction failed: {}'.format(predictions_response.reason))
                predictions_response.raise_for_status()
        else:
            reason_code_prediction_rows = predictions_response.json()['data']
#             print('- rc: reason_code_prediction_rows:', reason_code_prediction_rows)
            for rc_json_row in reason_code_prediction_rows:
                prediction = get_probability_prediction(rc_json_row, POSITIVE_CLASS)
                row_id = rc_json_row['rowId'] + row_id_offset
                for rc in rc_json_row['reasonCodes']:
                    rc['prediction'] = prediction
                    rc['row_id'] = row_id
                    reason_code_lines.append(rc)
            print('- status code:', predictions_response.status_code)
            print('- reason_code_lines shape', len(reason_code_lines))
    except KeyboardInterrupt:
        break

rcl = pd.DataFrame(reason_code_lines)
rcl.to_csv(OUTPUT_FILE, index=False)

# Output the time:
print('\nDone:  %0.3f seconds' % (time.time()-t1))
rcl.head(50)

--- Making request 1 totalling 100 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 1000


--- Making request 2 totalling 200 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 2000


--- Making request 3 totalling 300 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 3000


--- Making request 4 totalling 400 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 4000


--- Making request 5 totalling 500 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 5000


--- Making request 6 totalling 600 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 6000


--- Making request 7 totalling 700 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 7000


--- Making request 8 totalling 800 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 8000


--- Making request 9 totalling 900 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 9000


--- Making request 10 totalling 1,000 rows requested

pred_resp: <Response [200]>
- status code: 200
- reason_code_lines shape 10000


--- Making request 11 totalling 1,100 rows requested

pred_resp: <Response [422]>
Status: 422 UNPROCESSABLE ENTITY. Message: No data to predict on.

Done:  98.650 seconds


Unnamed: 0,feature,featureValue,label,prediction,qualitativeStrength,row_id,strength
0,int_rate,17.56,1.0,0.252143,+++,0,0.234768
1,term,60 months,1.0,0.252143,++,0,0.187849
2,grade,E,1.0,0.252143,++,0,0.134023
3,annual_inc,42600,1.0,0.252143,++,0,0.103569
4,inq_last_6mths,3,1.0,0.252143,++,0,0.100954
5,sub_grade,E4,1.0,0.252143,++,0,0.096393
6,emp_title,Walgreen Costumer Care,1.0,0.252143,++,0,0.080559
7,title,Student loan,1.0,0.252143,-,0,-0.051017
8,desc,,1.0,0.252143,-,0,-0.031441
9,verification_status,VERIFIED - income source,1.0,0.252143,+,0,0.025562


# 5. Check health status of a model, retrain a new model, and redeploy it

In [13]:
# 
# This checks the health of a depoloyed model, and if it is failing (iow, has drifted) then retrain it
#

DEPLOYMENT_ID = '5bda1b807c6f8b2c45427f7e'

headers2 = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}

print('Getting model deployments')
health_response = requests.get('https://app.datarobot.com/api/v2/modelDeployments/%s/' % (DEPLOYMENT_ID), 
                               headers=headers2)

if health_response.json()['modelHealth'] == 'failing':
    print('Getting a failing model')
    model = dr.Model.get(model_id=health_response.json()['model']['id'], 
                                project=health_response.json()['project']['id'])
    
    print('Starting a project to retrain the model in manual mode')
    retrainProject = dr.Project.start(sourcedata=TRAIN_FILE, 
                                      project_name='Lending Club Retrain', target='is_bad', 
                                      autopilot_on=False,
                                      worker_count=4)
    
    print('Retraining the model')
    modelJobId = retrainProject.train(model.blueprint)
    newModel = dr.models.modeljob.wait_for_async_model_creation(project_id=retrainProject.id, 
                                                                       model_job_id=modelJobId)

    # # **Now unnecessary
    print('Calling feature impact prior to deploying the new model')
    fi = newModel.get_or_request_feature_impact(600)
    
    print('Updating the deployment to point to the new model')
    model_Update = requests.patch('https://app.datarobot.com/api/v2/modelDeployments/%s/model' % (DEPLOYMENT_ID), 
                                  headers=headers2, data="{'modelId':'%s'}" % newModel.id)
    
    pprint.pprint(model_Update)

pprint.pprint(health_response.json())

Getting model deployments
Getting a failing model
Starting a project to retrain the model in manual mode
Retraining the model




Calling feature impact prior to deploying the new model
Updating the deployment to point to the new model
<Response [202]>
{'accuracyHealth': 'unavailable',
 'createdAt': '2018-10-31 21:15:49.362000',
 'dataSourceTypes': ['scoring', 'training'],
 'deployed': False,
 'description': '',
 'externalDataInfo': None,
 'id': '5bda1b807c6f8b2c45427f7e',
 'instance': {'datarobotKey': '544ec55f-61bf-f6ee-0caf-15c7f919a45d',
              'hostName': 'cfds-ccm-prod.orm.datarobot.com',
              'id': '5a61d7a0fbd723001a2f70d9',
              'sslEnabled': True},
 'label': 'is_bad Predictions',
 'lastPredictionRequest': '2018-11-28 23:36:38+00:00',
 'model': {'hasAnomalyInsight': None,
           'id': '5bff1068f5768644fc65e5ae',
           'modelType': 'Gradient Boosted Trees Classifier with Early Stopping',
           'predictionThreshold': 0.5,
           'userId': '5a8a6402b11ba422e62b7c7a'},
 'modelHealth': 'failing',
 'modelHistory': [{'endDate': None,
                   'model': {'id': 