In [2]:
import pandas as pd
import datarobot as dr
import os
# Show up to 200 columns when rendering wide DataFrames in the notebook.
pd.set_option('display.max_columns', 200)

In [4]:

# -------- Train test split the dataset --------
# Reads the full Lending Club CSV, shuffles it with a fixed seed, holds out
# 10% of the rows (target column removed) for scoring, and writes both parts
# back to the data/ directory.

df_full = pd.read_csv('data/DR_Demo_10K_Lending_Club_Loans.csv', encoding = 'ISO-8859-1')
target = 'is_bad'

# Shuffle the rows (fixed random_state keeps the split reproducible)
df_full = df_full.sample(frac=1, random_state=0)

# Split 90% for training, 10% for predictions.
# .iloc makes it explicit that the slice is positional: after shuffling, the
# integer row labels no longer match row positions.
split = int(df_full.shape[0] * .1)
df = df_full.iloc[split:]
df_pred = df_full.iloc[:split].copy()

# Drop the target from the prediction dataset
prediction_data = df_pred.drop(columns=[target])

# Sanity checks: the two parts cover every row and the target is really gone
assert len(df) + len(prediction_data) == len(df_full)
assert target not in prediction_data.columns

print('Full data shape:      ',df_full.shape)
print('Training data shape:  ',df.shape)
print('Prediction data shape:',prediction_data.shape)

# index=False: the shuffled row labels are not a feature. Writing them would
# add an unnamed first column to both files.
df.to_csv('data/DR_Demo_10K_Lending_Club_Loans_train.csv', index=False)
prediction_data.to_csv('data/DR_Demo_10K_Lending_Club_Loans_pred.csv', index=False)

Full data shape:       (10000, 34)
Training data shape:   (9000, 34)
Prediction data shape: (1000, 33)


In [3]:
# Identifiers of the existing DataRobot project and of the model used below.
project_id = '5bc60d00b64ee921826dbe2f'
model_id = '5bc60d79b64ee922d96dbd7c'

# Configure the DataRobot client; the API token is read from the environment
# so it never appears in the notebook.
dr.Client(
    endpoint='https://app.datarobot.com/api/v2',
    token=os.environ['DATAROBOT_API_TOKEN'],
)

# Fetch the project, the model, and the datasets attached to the project.
project = dr.Project.get(project_id=project_id)
model = dr.Model.get(project=project_id, model_id=model_id)
datasets = project.get_datasets()

for handle in (project, model):
    print(handle)

Project(DR_Demo_10K_Lending_Club_Loans.csv)
Model('Logistic Regression')


In [4]:

# -------- Training Predictions ---------
# Produces df_dr_train_preds: the model's predictions on its own training data.
# If they have not been computed yet they are requested; otherwise the stored
# result for this model is looked up among the project's training predictions.

df_dr_train_preds = None

try:
    # Request training predictions for the "all" data subset
    training_predictions_job = model.request_training_predictions(dr.enums.DATA_SUBSET.ALL)
    print('Request training predictions, waiting for results')
    training_predictions = training_predictions_job.get_result_when_complete()

    # Fetch training predictions as data frame
    df_dr_train_preds = training_predictions.get_all_as_dataframe()
    print('- Done')
except dr.errors.ClientError:
    # Only API client errors are treated as "already requested" (assumption:
    # the API rejects a repeated request with a client error). Anything else
    # (network failure, typo, ...) propagates instead of being silently
    # mislabelled.
    print('Already requested training predictions')
    # Fetch all training predictions for a project
    all_training_predictions = dr.TrainingPredictions.list(project_id)

    # Inspect all calculated training predictions
    for training_predictions in all_training_predictions:
        print(
            'Prediction {} is made for data subset "{}" of model {}'.format(
                training_predictions.prediction_id,
                training_predictions.data_subset,
                training_predictions.model_id
            )
        )
        # Match on the data subset as well as the model, so the result is the
        # same kind of prediction set the try-branch would have produced.
        if (training_predictions.model_id == model_id
                and training_predictions.data_subset == dr.enums.DATA_SUBSET.ALL):
            print('Getting training preds for model', training_predictions.model_id)
            prediction_id = training_predictions.prediction_id
            df_dr_train_preds = training_predictions.get_all_as_dataframe()
            print('- Done')

# Fail with a clear message rather than a NameError on the next line
if df_dr_train_preds is None:
    raise RuntimeError(
        'No training predictions found for model {} in project {}'.format(model_id, project_id)
    )

df_dr_train_preds.head()

Already requested training predictions
Prediction 5bc6266a492eff5adae173d7 is made for data subset "all" of model 5bc60d79b64ee922d96dbd7d
Prediction 5bc62b58492eff5c23e170e7 is made for data subset "all" of model 5bc60d79b64ee922d96dbd7c
Getting training preds for model 5bc60d79b64ee922d96dbd7c
- Done


Unnamed: 0,row_id,partition_id,prediction,class_1.0,class_0.0
0,0,1.0,0.0,0.0841,0.9159
1,1,Holdout,0.0,0.088309,0.911691
2,2,Holdout,0.0,0.036402,0.963598
3,3,4.0,0.0,0.127688,0.872312
4,4,2.0,0.0,0.068998,0.931002


In [5]:
# If you changed the prediction threshold, look at the rows whose positive-class
# probability is just above it to confirm that the labels really follow it.
df_dr_train_preds.loc[df_dr_train_preds['class_1.0'].gt(.28)].head(10)

Unnamed: 0,row_id,partition_id,prediction,class_1.0,class_0.0
25,25,2.0,1.0,0.346161,0.653839
26,26,2.0,1.0,0.43973,0.56027
32,32,2.0,1.0,0.441475,0.558525
55,55,0.0,0.0,0.29061,0.70939
66,66,2.0,0.0,0.284935,0.715065
73,73,3.0,1.0,0.340956,0.659044
97,97,1.0,1.0,0.38632,0.61368
100,100,2.0,0.0,0.285536,0.714464
121,121,1.0,1.0,0.348771,0.651229
129,129,4.0,0.0,0.282596,0.717404


In [6]:
# -------- New Scoring Predictions ---------
# Uploads the held-out prediction file to the project, asks the model to score
# it, and waits for the resulting predictions.

# The path must match the location the train/test split cell writes to
# (data/...), otherwise a fresh Restart & Run All cannot find the file.
print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset('data/DR_Demo_10K_Lending_Club_Loans_pred.csv')

print('Request predictions')
predict_job = model.request_predictions(dataset_from_path.id)

# Blocks until the prediction job has finished
print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

predictions.head()


Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.229833,1.0,0.17,0,0.770167,0.229833
1,0.095122,0.0,0.17,1,0.904878,0.095122
2,0.087769,0.0,0.17,2,0.912231,0.087769
3,0.125421,0.0,0.17,3,0.874579,0.125421
4,0.389285,1.0,0.17,4,0.610715,0.389285


In [None]:
# This installs the batch prediction helper code from:
# https://github.com/datarobot/batch-scoring
# pip install -U datarobot_batch_scoring

## Dev

In [7]:
# Scratch stuff...

# 
# # Upload the scoring dataset if not already uploaded, else retrieve it
# if (len(datasets) == 0):
#     print('Uploading prediction dataset')
#     pred_ds = project.upload_dataset(prediction_data)
# else:
#     print('Retrieving the prediction dataset id')
#     for s in datasets:
#         if s.name == 'predict.csv':
#             pred_ds = s

# # Request predictions for the scoring dataset, if it hasn't been already requested
# try:                      
#     predict_job
# except NameError:
#     print('Requesting predictions')
#     predict_job = model.request_predictions(pred_ds.id)
    
# # Get the predictions results when complete
# print('Waiting for prediction results')
# predictions = predict_job.get_result_when_complete()
# print('- done.  Predictions dataframe:')
# print(predictions.head())

# # Add a column for the target labels (y_true)
# predictions['y_true'] = df_pred[target].astype(int).tolist()
# print('\n',predictions.head())

# y_true = predictions['y_true'].values
# y_pred = predictions['class_1.0'].values