In [1]:
import os
import pprint

import datarobot as dr
import pandas as pd
pd.options.display.max_columns = 200

In [2]:
# -------- Train test split the dataset --------
# Reads the full loans CSV, shuffles it with a fixed seed, and writes a 90%
# training file plus a 10% prediction file (with the target column removed).

df_full = pd.read_csv('data/DR_Demo_10K_Lending_Club_Loans.csv', encoding='ISO-8859-1')
target = 'is_bad'

# Shuffle the rows (random_state is fixed so the split is reproducible)
df_full = df_full.sample(frac=1, random_state=0)

# Split 90% for training, 10% for predictions.
# .iloc makes it explicit that the split is by row position, not by index label
# (the index labels are no longer in order after the shuffle).
split = int(df_full.shape[0] * .1)
df = df_full.iloc[split:]
df_pred = df_full.iloc[:split].copy()

# Drop the target from the prediction dataset
prediction_data = df_pred.drop(target, axis=1)

print('Full data shape:      ',df_full.shape)
print('Training data shape:  ',df.shape)
print('Prediction data shape:',prediction_data.shape)

# index=False: without it the shuffled row index is written as an extra,
# unnamed first column that is not part of the original data.
df.to_csv('data/DR_Demo_10K_Lending_Club_Loans_train.csv', index=False)
prediction_data.to_csv('data/DR_Demo_10K_Lending_Club_Loans_pred.csv', index=False)

Full data shape:       (10000, 34)
Training data shape:   (9000, 34)
Prediction data shape: (1000, 33)


In [4]:
# Configure the DataRobot client; the API token comes from the environment
# so that it never has to be written into the notebook.
dr.Client(
    endpoint='https://app.datarobot.com/api/v2',
    token=os.environ['DATAROBOT_API_TOKEN'],
)

# Identifiers of the existing project and of the model used for scoring.
project_id = '5bdcda3c38f00b610eff5d8f'
model_id = '5bdcdd9b16378b3d256a8d9c'

# Look up the project, the model, and the datasets already attached to the project.
project = dr.Project.get(project_id=project_id)
model = dr.Model.get(model_id=model_id, project=project_id)
datasets = project.get_datasets()

# One object per line, same as two consecutive print() calls.
print(project, model, sep='\n')

Project(DR_Demo_10K_Lending_Club_Loans.csv)
Model('Gradient Boosted Trees Classifier with Early Stopping')


# 1. Module API

In [6]:
# -------- New Scoring Predictions directly on project and model object ---------
# Scores the prediction CSV written by the split cell, using the `project` and
# `model` objects fetched above. Three steps: upload, request, wait for result.

# Step 1: attach the local prediction CSV to the project.
print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset('data/DR_Demo_10K_Lending_Club_Loans_pred.csv')

# Step 2: ask the model to score the uploaded dataset (referenced by its id);
# this returns a job object rather than the predictions themselves.
print('Request predictions')
predict_job = model.request_predictions(dataset_from_path.id)

# Step 3: wait for the job and collect its result.
print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

# Show the first rows of the result (last expression of the cell is displayed).
predictions.head()

Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.234347,0.0,0.5,0,0.765653,0.234347
1,0.18163,0.0,0.5,1,0.81837,0.18163
2,0.045061,0.0,0.5,2,0.954939,0.045061
3,0.08644,0.0,0.5,3,0.91356,0.08644
4,0.203154,0.0,0.5,4,0.796846,0.203154


In [None]:
# This installs the batch prediction helper code from:
# https://github.com/datarobot/batch-scoring
# pip install -U datarobot_batch_scoring

# 2. Standard post

In [None]:
import requests
import sys
import pandas

# json_normalize lives at the top level of pandas since 1.0; the old
# pandas.io.json location was deprecated there and later removed.
# Fall back to the old location only for older pandas installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Placeholders: fill in real values before running (do not commit real secrets).
API_TOKEN = 'xxxx'
USERNAME = 'xxxx@example.com'

DEPLOYMENT_ID = 'xxxx'

# set content-type header to JSON
headers = {'Content-Type': 'application/json; charset=UTF-8', 'datarobot-key': 'xxxx'}

# create dataframe from some external source - local csv file in this case
data = pandas.read_csv('10k_diabetes_dos.csv')

# generate JSON version of the dataframe to pass to API
data_json = data.to_json(orient='records')

# deliver request to API with JSON payload
# (timeout so a stalled connection cannot hang the notebook forever)
predictions_response = requests.post('https://datarobot-support.orm.datarobot.com/predApi/v1.0/deployments/%s/predictions' % (DEPLOYMENT_ID),
                                    auth=(USERNAME, API_TOKEN), data=data_json, headers=headers,
                                    timeout=120)

# fail with the HTTP error itself instead of a confusing KeyError on 'data' below
predictions_response.raise_for_status()

# store the JSON response from the prediction API
response_json = predictions_response.json()

# normalize the JSON, which will flatten the data structure into a dataframe
# details here: https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html
results_df = json_normalize(data=response_json['data'])

# 3. Batch script

## https://github.com/datarobot/batch-scoring

In [None]:
%pip install -U datarobot_batch_scoring

batch_scoring --host=https://mycorp.orm.datarobot.com/ --user="greg@mycorp.com" 
              --out=pred.csv 5545eb20b4912911244d4835 5545eb71b4912911244d4847 /home/greg/Downloads/diabetes_test.csv
batch_scoring_sse --host=https://mycorp.orm.datarobot.com/ --out=pred.csv 
              0ec5bcea7f0f45918fa88257bfe42c09 /home/greg/Downloads/diabetes_test.csv
batch_scoring_deployment_aware --host=https://mycorp.orm.datarobot.com/ 
              --user="greg@mycorp.com" --out=pred.csv 5545eb71b4912911244d4848 /home/greg/Downloads/diabetes_test.csv

# 4. Batch script manual

In [None]:
import sys

import pandas as pd
import requests

# --- Files: CSV to score, and where the long-format reason codes are written ---
PREDICTION_FILE = 'churn_dn5_date_fixed_sumgroup_recdaysle31_nodatesorextras_validationholdout.csv'
OUTPUT_FILE = 'reason_codes_long_format.csv'

# --- Which model to call and how to call it ---
PROJECT_ID = '5ab1519e5feaa758b227f436'
MODEL_ID = '5ab3c9ba5feaa7572313d02f'
MAX_CODES = 10       # sent to the API as the maxCodes query parameter
POSITIVE_CLASS = 1   # label whose probability is recorded as the prediction
BATCH_SIZE = 100     # how many records to send each time, typically less than 100 rows

# --- Credentials and server details: left blank here, fill in before running ---
USERNAME = ''
API_TOKEN = ''
SERVER_URL = ''
SERVER_KEY = ''

# Request headers: the body is posted as plain CSV text.
headers = {
    'Content-Type': 'text/plain; charset=UTF-8',
    'datarobot-key': SERVER_KEY,
}

# Full URL of the reason-codes prediction route for this project/model pair.
url_request = "{server_url}/predApi/v1.0/{project_id}/{model_id}/reasonCodesPredictions".format(
    model_id=MODEL_ID,
    project_id=PROJECT_ID,
    server_url=SERVER_URL,
)

def mini_batch_file(filepath, batch_size, encoding='utf-8'):
    """Yield a CSV file as text chunks of at most ``batch_size`` data rows.

    Each yielded chunk is a complete small CSV: the file's header line
    followed by up to ``batch_size`` data lines.

    Args:
        filepath: Path of the CSV file to read.
        batch_size: Maximum number of data rows per chunk.
        encoding: Text encoding used to read the file.

    Yields:
        str: Header line plus the data lines of one batch.
    """
    # Open in text mode with the requested encoding: lines are already str,
    # so there is nothing to decode (str has no .decode in Python 3).
    with open(filepath, encoding=encoding) as infile:
        header = infile.readline()
        output_lines = []
        for line in infile:
            output_lines.append(line)
            if len(output_lines) == batch_size:
                yield header + ''.join(output_lines)  # csv text with header up front
                output_lines = []
        # Emit the leftover rows, if any. Skipping an empty remainder avoids
        # a header-only batch when the row count is a multiple of
        # batch_size (or the file has no data rows at all).
        if output_lines:
            yield header + ''.join(output_lines)

def get_probability_prediction(rc_json_row, positive_class):
    """Return the 'value' of the first prediction entry labelled ``positive_class``.

    ``rc_json_row['predictionValues']`` is scanned in full; every entry must
    have a 'label' key. Raises IndexError when no entry carries the label.
    """
    matching = []
    for entry in rc_json_row['predictionValues']:
        if entry['label'] == positive_class:
            matching.append(entry)
    first_match = matching[0]
    return first_match['value']

# Query-string parameters sent with every request.
params = {'maxCodes': MAX_CODES}

# Sends the prediction file to the reason-codes endpoint one mini-batch at a
# time and collects one output record per (row, reason code) pair.
reason_code_lines = [] # long format: one entry per reason code, so up to MAX_CODES entries per input row
for batch_num, batch in enumerate(mini_batch_file(PREDICTION_FILE, BATCH_SIZE)):
    try:
        # Progress goes to stderr; the leading '\r' rewrites the same line each time.
        # The row count shown is batch_num+1 times BATCH_SIZE, i.e. an upper bound
        # for a final batch that holds fewer rows.
        sys.stderr.write('\r--- Making request {:,} totalling {:,} rows requested'.format(
            batch_num+1, (batch_num+1)*BATCH_SIZE))

        row_id_offset = batch_num*BATCH_SIZE # added to each rowId so row_id refers to the position in the prediction file
        # NOTE(review): this presumes rowId restarts at 0 in every response - confirm against the API
        data = batch.encode('utf-8') # encoding must match the charset declared in headers
        predictions_response = requests.post(url_request,
                                            auth=(USERNAME, API_TOKEN),
                                            data=data,
                                            headers=headers,
                                            params=params,
                                            timeout=120)
        if predictions_response.status_code != 200:
            # Failed request: report it. When the error body is JSON the message is
            # printed and the loop simply moves on to the next batch (this batch's
            # rows are missing from the output).
            try:
                message = predictions_response.json().get('message', predictions_response.text)
                status_code = predictions_response.status_code
                reason = predictions_response.reason

                print(u'Status: {status_code} {reason}. Message: {message}.'.format(message=message,
                                                                                    status_code=status_code,
                                                                                    reason=reason))
            except ValueError:
                # Body was not valid JSON: print the reason and raise the HTTP error.
                # NOTE(review): raise_for_status() is only reached on this path - confirm that is intended.
                print('Prediction failed: {}'.format(predictions_response.reason))
                predictions_response.raise_for_status()
        else:
            # Successful request: flatten each row's reason codes into separate
            # records, tagging each with the row's prediction and file-level row id.
            reason_code_prediction_rows = predictions_response.json()['data']
            for rc_json_row in reason_code_prediction_rows:
                prediction = get_probability_prediction(rc_json_row, POSITIVE_CLASS)
                row_id = rc_json_row['rowId'] + row_id_offset
                for rc in rc_json_row['reasonCodes']:
                    rc['prediction'] = prediction
                    rc['row_id'] = row_id
                    reason_code_lines.append(rc)
    except KeyboardInterrupt:
        # Interrupting stops sending batches; whatever was collected so far is still written below.
        break

# Write everything collected (possibly partial) to the output CSV, without the index column.
pd.DataFrame(reason_code_lines).to_csv(OUTPUT_FILE, index=False)

# 5. Deployment predictions

In [None]:
# Checks the health of a deployment and, when it is reported as failing,
# trains a replacement model in a new project and swaps it into the deployment.
# Relies on API_TOKEN and DEPLOYMENT_ID defined in an earlier cell.
ENDPOINT = 'https://app.datarobot.com/api/v2'
headers2 = {'Content-Type': 'application/json', 'Authorization': 'token %s' % API_TOKEN}

# Fetch the deployment record once and reuse the parsed body below.
health_response = requests.get('%s/modelDeployments/%s/' % (ENDPOINT, DEPLOYMENT_ID), headers=headers2)
health = health_response.json()

# Configure the client through the `dr` alias the notebook imports.
# The return value is deliberately not assigned to `dr`: doing so would replace
# the module alias and break every later `dr.<something>` call.
dr.Client(endpoint=ENDPOINT, token=API_TOKEN)

if health['modelHealth'] == 'failing':
    # Separate name so the `model` used by the other cells is not overwritten.
    deployed_model = dr.Model.get(model_id=health['model']['id'], project=health['project']['id'])
    # NOTE(review): absolute, user-specific path - point this at your own copy of the data.
    retrainProject = dr.Project.start(sourcedata='/Users/james.johnston/Python/10K_Lending_Club_Loans.csv',
                                      project_name='Lending Club Retrain',
                                      target='is_bad',
                                      autopilot_on=False)
    modelJobId = retrainProject.train(deployed_model.blueprint)
    newModel = dr.models.modeljob.wait_for_async_model_creation(project_id=retrainProject.id, model_job_id=modelJobId)
    fi = newModel.get_or_request_feature_impact(600)
    # json= lets requests serialise the body; the previous hand-built string used
    # single quotes and therefore was not valid JSON.
    model_Update = requests.patch('%s/modelDeployments/%s/model' % (ENDPOINT, DEPLOYMENT_ID),
                                  headers=headers2,
                                  json={'modelId': newModel.id})
    pprint.pprint(model_Update)


pprint.pprint(health)

## Dev

In [7]:
# Scratch stuff...

# 
# # Upload the scoring dataset if not already uploaded, else retrieve it
# if (len(datasets) == 0):
#     print('Uploading prediction dataset')
#     pred_ds = project.upload_dataset(prediction_data)
# else:
#     print('Retrieving the prediction dataset id')
#     for s in datasets:
#         if s.name == 'predict.csv':
#             pred_ds = s

# # Request predictions for the scoring dataset, if it hasn't been already requested
# try:                      
#     predict_job
# except NameError:
#     print('Requesting predictions')
#     predict_job = model.request_predictions(pred_ds.id)
    
# # Get the predictions results when complete
# print('Waiting for prediction results')
# predictions = predict_job.get_result_when_complete()
# print('- done.  Predictions dataframe:')
# print(predictions.head())

# # Add a column for the target labels (y_true)
# predictions['y_true'] = df_pred[target].astype(int).tolist()
# print('\n',predictions.head())

# y_true = predictions['y_true'].values
# y_pred = predictions['class_1.0'].values

3 igor's

# API 1

In [None]:

# -------- Training Predictions ---------
# Requests training predictions for `model`; if the request fails, falls back
# to the training predictions already stored for the project.

df_dr_train_preds = None  # stays None if neither path yields a frame
try:
    # Calculate new training predictions for DATA_SUBSET.ALL
    training_predictions_job = model.request_training_predictions(dr.enums.DATA_SUBSET.ALL)
    print('Request training predictions, waiting for results')
    training_predictions = training_predictions_job.get_result_when_complete()

    # Fetch training predictions as data frame
    df_dr_train_preds = training_predictions.get_all_as_dataframe()
    print('- Done')
except Exception as e:
    # NOTE(review): the broad catch is kept so the fallback still runs as before;
    # it assumes the failure means the predictions already exist. The actual
    # error is printed so that a different cause is not hidden.
    print('Already requested training predictions')
    print('- Request error was: {!r}'.format(e))
    # Fetch all training predictions for a project
    all_training_predictions = dr.TrainingPredictions.list(project_id)

    # Inspect all calculated training predictions
    for existing_predictions in all_training_predictions:
        print(
            'Prediction {} is made for data subset "{}" of model {}'.format(
                existing_predictions.prediction_id,
                existing_predictions.data_subset,
                existing_predictions.model_id
            )
        )
        if existing_predictions.model_id == model_id:
            print('Getting training preds for model', existing_predictions.model_id)
            prediction_id = existing_predictions.prediction_id
            df_dr_train_preds = existing_predictions.get_all_as_dataframe()
            print('- Done')

# Fail with a clear message instead of a NameError when nothing matched model_id.
if df_dr_train_preds is None:
    raise LookupError('No training predictions found for model {}'.format(model_id))

df_dr_train_preds.head()

In [None]:
# Sanity check for a changed prediction threshold: show the first ten rows whose
# positive-class probability is above roughly that threshold, to confirm that
# rows are in fact being labelled by it.
df_dr_train_preds.loc[df_dr_train_preds['class_1.0'] > .28].head(10)