## Deployment - Batch prediction examples

In [7]:
import pandas as pd
import datarobot as dr
from datetime import datetime
import time
import os
import io
import requests
import json
from pprint import pprint as pp

# Credentials are read from environment variables so that no secret is stored
# in the notebook; a missing variable raises KeyError here.
USERNAME = os.environ['DATAROBOT_USERNAME']
API_KEY = os.environ['DATAROBOT_API_TOKEN']
DATAROBOT_KEY = os.environ['DATAROBOT_KEY']

# Base URL of the DataRobot application, and the REST API root beneath it.
HOSTNAME = 'https://app.datarobot.com/'
ENDPOINT = '{}api/v2/'.format(HOSTNAME)

# Configure the DataRobot client with the API token and endpoint.
# Kept as the last expression so the cell displays the client object.
dr.Client(endpoint=ENDPOINT, token=API_KEY)

<datarobot.rest.RESTClientObject at 0x7fbcd4569700>

In [8]:
#
# Get the dataset for predictions
#
# Only the first 150 rows are kept: there may be a limit to the number of rows
# that can be passed in to BatchPredictionJob.score.
pred_file = './data/DR_Demo_10K_Lending_Club_Loans_pred.csv'
df_pred_subset = pd.read_csv(pred_file)[:150]

# Downstream cells use `df_pred`, so point it at the subset.
df_pred = df_pred_subset

print('df_pred.shape:', df_pred.shape)
df_pred.head()

df_pred.shape: (150, 33)


Unnamed: 0,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,mths_since_last_major_derog,policy_code
0,8000,8000,60 months,17.56%,201.24,E,E4,Walgreen Costumer Care,5 years,RENT,...,,,3.0,0.0,7469,56.6,9.0,f,,1
1,28000,23650,60 months,19.29%,617.28,E,E4,Dow Chemical,9 years,MORTGAGE,...,,104.0,14.0,1.0,30690,64.2,35.0,f,,1
2,8000,8000,36 months,7.49%,248.82,A,A4,Sandia Corp.,4 years,MORTGAGE,...,,,15.0,0.0,3147,33.1,27.0,f,,1
3,8875,8875,36 months,7.51%,276.11,A,A3,Ashbrook Village Senior Community,1 year,MORTGAGE,...,,,13.0,0.0,19056,62.1,27.0,f,,1
4,7400,7400,36 months,16.77%,262.99,D,D2,Jpmorgan Chase,2 years,RENT,...,,,14.0,0.0,12235,63.2,15.0,f,,1


In [9]:
#
# Get the lending club deployment to score against
#
# Deployments are looked up by search string and the last entry of the returned
# list is used.
# NOTE(review): treating the last entry as the "most recent" deployment assumes
# the list comes back oldest-first — confirm against the API ordering.
deployment_search = 'is_bad Predictions'
deployments_lc = dr.Deployment.list(search=deployment_search)

# Fail with an explicit message instead of an opaque "list index out of range"
# when the search matches nothing (same exception type as before).
if not deployments_lc:
    raise IndexError('No deployments found matching search string %r' % deployment_search)

deployment = deployments_lc[-1]
print(deployment)

# Last expression of the cell: display the deployed model's metadata.
deployment.model

Deployment(is_bad Predictions MC Snowflake)


{'id': '626cb284046b292106773546',
 'type': 'Light Gradient Boosted Trees Classifier with Early Stopping',
 'target_name': 'is_bad',
 'project_id': '626cadcc48904c351de17d87',
 'target_type': 'Binary',
 'project_name': '_10K Lending Club Loans.csv',
 'unsupervised_mode': False,
 'unstructured_model_kind': False,
 'build_environment_type': 'DataRobot',
 'deployed_at': '2022-05-02T20:30:01.351000Z'}

In [10]:
# Pull the project and model identifiers out of the deployment's model metadata.
model_info = deployment.model
project_id = model_info.get('project_id')
model_id = model_info.get('id')

print('project_id:', project_id)
print('model_id:', model_id)

project_id: 626cadcc48904c351de17d87
model_id: 626cb284046b292106773546


In [11]:
#
# FYI, the BATCH_PREDICTIONS_URL = 'https://app.datarobot.com/api/v2'
#

In [14]:
#
# Batch scoring synchronously (program flow waits for the call to return).
# - Because output_settings includes a 'path' element, the call waits for
#   completion before returning program flow.
#
# With prediction explanations if uncommented as shown.
#
# One name for the results file, so the path written by the job and the path
# read back below cannot drift apart.
scored_file = 'outputfile.csv'

t0 = time.time()
dr.BatchPredictionJob.score(
    deployment.id,
    intake_settings={
        'type': 'localFile',
        'file': pred_file  # Path, pandas DataFrame (such as df_pred) or file-like object.
    },
    output_settings={
        'type': 'localFile',
        'path': scored_file
    },
    download_timeout=6000,
    # If explanations are required, uncomment the line below
    # max_explanations=3,
    # Uncomment this for Prediction Warnings, if enabled for your deployment.
    # prediction_warning_enabled=True
)
print('- Execution time: %.3f min' % ((time.time() - t0)/60))

# Load the scored rows that the job wrote to disk and display them.
df_results = pd.read_csv(scored_file)
df_results

ClientError: 422 client error: {'message': 'No prediction server available for deployment'}

In [13]:
#
# Helper for asynchronous batch scoring: poll a submitted job until it finishes.
#

# Helper function to check the status of the prediction job
def check_job_status(job, sleep=10):
    """
    Poll a batch prediction job and print its progress until it finishes.

    After a batch prediction request is posted asynchronously, the call returns a
    job object and program flow continues. This helper fetches the job status once
    per poll, prints one progress line for it, and returns as soon as the job
    reports a terminal state.

    Parameters:
    - job: job object exposing ``get_status()`` (a dict with 'status',
      'elapsed_time_sec' and 'status_details') and a ``_safe_data`` dict
    - sleep: the time in seconds to wait between status checks

    Returns:
    - nothing
    """
    # FAILED is terminal as well: without it a failed job would be polled forever.
    terminal_states = ('COMPLETED', 'ABORTED', 'FAILED')

    while True:
        # Fetch the status exactly once per poll so the printed line and the
        # branch taken below are based on the same snapshot.
        job_status = job.get_status()
        elapsed_time = job_status['elapsed_time_sec']
        status = job_status.get('status')
        status_details = job_status['status_details']

        out_str = "Wait time {:.3f} minutes - Status {}. "  \
                  "Queue posiition: {} - {}".format(elapsed_time / 60,
                                                    status,
                                                    job._safe_data.get('queue_position'),
                                                    status_details)

        if status == 'INITIALIZING':
            # Carriage return keeps rewriting the same line while the job is queued.
            print(out_str, end='\r', flush=True)
        else:
            # Start a fresh line so the queued-progress line is not overwritten.
            print()
            print(out_str)

            if status in terminal_states:
                break

        time.sleep(sleep)

In [8]:
#
# Batch scoring asynchronously (the call returns a job object; poll the job for completion).
#
# Input is the in-memory DataFrame; a CSV path such as
# './data/DR_Demo_10K_Lending_Club_Loans_pred.csv' can be passed as 'file' instead.
async_intake = {
    'type': 'localFile',
    'file': df_pred,
}
# No 'path' entry here, unlike the synchronous cell that writes 'outputfile.csv'.
async_output = {
    'type': 'localFile',
}

job = dr.BatchPredictionJob.score(
    deployment.id,
    # prediction_instance={'hostName': host.url},  # left disabled, as in the original cell
    intake_settings=async_intake,
    output_settings=async_output,
)

# Dump the attributes of the returned job object for inspection.
print('job attributes:')
pp(vars(job))

job attributes:
{'_completed_resource_url': None,
 '_safe_data': {'created': '2022-04-20T04:41:42.496000Z',
                'created_by': {'full_name': 'Matthew Cohen',
                               'user_id': '5a8a6402b11ba422e62b7c7a',
                               'username': 'matthew.cohen@datarobot.com'},
                'elapsed_time_sec': 1,
                'failed_rows': 0,
                'id': '625f8f069278221f3fad4ad9',
                'job_spec': {'abort_on_error': True,
                             'chunk_size': 'auto',
                             'csv_settings': {'delimiter': ',',
                                              'encoding': 'utf-8',
                                              'quotechar': '"'},
                             'deployment_id': '5ff5f495966808346aa9b039',
                             'disable_row_level_error_handling': False,
                             'include_prediction_status': False,
                             'include_probabilities'

In [9]:
# Poll every 3 seconds until check_job_status returns.
check_job_status(job, sleep=3)

# Blank line, then the heading for the results table.
print('\nResults:')

# The scored rows come back as raw CSV bytes; decode them and parse into a DataFrame.
result_bytes = job.get_result_when_complete()
result_str = str(result_bytes, "utf-8")
result_io = io.StringIO(result_str)
df = pd.read_csv(result_io, sep=",")
df

Wait time 0.350 minutes - Status INITIALIZING. Queue posiition: 1 - Submitted job to queue. At the time of submission 1 job(s) was waiting for processing before this job. at 2022-04-20 04:41:43.086000
Wait time 0.417 minutes - Status RUNNING. Queue posiition: 1 - Job started processing at 2022-04-20 04:42:07.247000

Wait time 0.467 minutes - Status RUNNING. Queue posiition: 1 - Job started processing at 2022-04-20 04:42:07.247000

Wait time 0.483 minutes - Status COMPLETED. Queue posiition: 1 - Job done processing at 2022-04-20 04:42:11.891000

Results:


Unnamed: 0,is_bad_1_PREDICTION,is_bad_0_PREDICTION,is_bad_PREDICTION,THRESHOLD,POSITIVE_CLASS,DEPLOYMENT_APPROVAL_STATUS
0,0.276771,0.723229,0,0.3,1,APPROVED
1,0.262525,0.737475,0,0.3,1,APPROVED
2,0.039195,0.960805,0,0.3,1,APPROVED
3,0.058474,0.941526,0,0.3,1,APPROVED
4,0.235325,0.764675,0,0.3,1,APPROVED
...,...,...,...,...,...,...
145,0.214297,0.785703,0,0.3,1,APPROVED
146,0.206577,0.793423,0,0.3,1,APPROVED
147,0.149705,0.850295,0,0.3,1,APPROVED
148,0.091984,0.908016,0,0.3,1,APPROVED
