# <u>DataRobot API examples</u>
#### Documentation:  https://datarobot-public-api-client.readthedocs-hosted.com/
## 1. Read a dataset from a csv file on the file system, split it into training and prediction sets.

In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss, roc_curve, auc
from pprint import pprint
import os
import time
import requests
import datarobot as dr
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# Where the raw flight data lives and which column we want to predict
source_data_file = 'data/flight_delay_data_50k.csv'
target = 'DELAY_INDICATOR'

# Load the .csv and shuffle every row; the fixed seed keeps the shuffle repeatable
df_full = pd.read_csv(source_data_file).sample(frac=1, random_state=0)

# The first 10% of the shuffled rows are held back for predictions,
# the remaining 90% become the training set
split = int(len(df_full) * 0.1)
df_pred_y = df_full.iloc[:split].copy()  # has target column
df_train = df_full.iloc[split:]

# The scoring set must not carry the target column
df_pred = df_pred_y.drop(columns=[target])

# Report the shape of each frame
for caption, frame in (('Full data shape:      ', df_full),
                       ('Training data shape:  ', df_train),
                       ('Prediction data shape:', df_pred)):
    print(caption, frame.shape)

Full data shape:       (50000, 16)
Training data shape:   (45000, 16)
Prediction data shape: (5000, 15)


## 2. Create a DataRobot project

In [3]:
# Establish client connection.
# The API token must be present in the environment (a missing token raises KeyError).
# The endpoint honours DATAROBOT_ENDPOINT, the same variable this notebook reads
# further down, and falls back to the managed-cloud URL when it is unset or empty.
dr.Client(token=os.environ['DATAROBOT_API_TOKEN'],
          endpoint=os.environ.get('DATAROBOT_ENDPOINT') or 'https://app.datarobot.com/api/v2')

<datarobot.rest.RESTClientObject at 0x10a611210>

## 3. The 'Out of the Box Autopilot'
### If we wanted to run autopilot on a dataset and its features as-is, then we would simply run project.start, which only needs the source data (e.g., a csv), the target name, and a project name as parameters.  Project creation, file upload and target selection are all combined in the Project.start method.  DataRobot will do the rest in terms of data typing and using informative features.
```
dr.Project.start(sourcedata=df,
                 target='DELAY_INDICATOR',
                 project_name='Flight delay classification')
```
### Instead, we'll use a few steps to create a project, upload the data, and set the target.  After feature engineering, we can run full autopilot or manually train selected models.

#### Autopilot mode means that the modeling process will proceed completely automatically, including running recommended models, running at different sample sizes, and blending.

#### Manual mode means that DataRobot will populate a list of recommended models, but will not insert any of them into the queue. Manual mode lets you select which models to execute before starting the modeling process.



In [4]:
import time

# Step 1 - create the project: upload the training data and give the project a name.
# The target is NOT chosen here; that happens in step 2.
t1 = time.time()
project = dr.Project.create(
    sourcedata=df_train,
    project_name='Flight Delay API example',
)

print('project id:  ', project.id)
print('project name:', project.project_name)
print('project.get_status():\n- ', project.get_status())
print('- create done: %0.3f' % (time.time() - t1))

# Step 2 - set the target in manual mode, so that no models are queued until we
# explicitly start autopilot or train selected blueprints in a later step.
# NOTE(review): worker_count=-1 presumably requests every available worker - confirm
# against the client documentation.
t1 = time.time()
project.set_target(
    target=target,
    mode='manual',
    worker_count=-1,
)
print('- set_target done: %0.3f' % (time.time() - t1))

project id:   5f0502a1c0a77c09cf379d53
project name: Flight Delay API example
project.get_status():
-  {'autopilot_done': False, 'stage_description': 'Ready to set target', 'stage': 'aim'}
- create done: 45.354
- set_target done: 49.719


## 4. Working with features
###  We'll generate a new feature list to do some feature engineering given some dirty data and our domain knowledge of the business.  Start by first retrieving the features from the project object.

In [5]:
# Fetch the features known to the project and display their names in sorted order
features = project.get_features()
sorted(feature.name for feature in features)

['ACT_AC_TYPE',
 'ARRV_A15_IND',
 'AVOIDABLE_DELAY_IND',
 'CARRIER',
 'DAY_OF_WEEK',
 'DELAY_CODE',
 'DELAY_CODE_DESCRIPTION',
 'DELAY_INDICATOR',
 'DESTINATION',
 'D_O_W_NUM',
 'FLT_DATE',
 'FLT_DATE (Day of Week)',
 'FLT_NBR',
 'HOUR_OF_DPTR',
 'ORIGIN',
 'UNAVOIDABLE_DELAY_IND',
 'Unnamed: 0']

### We'll transform the flight number feature from numeric to categorical, and create a new feature list omitting features with target leakage, such as other flight delay flags.

In [6]:
# Derive FLT_NBR_CAT from the FLT_NBR feature, requesting the 'categoricalInt'
# variable type so the flight number is treated as a category
new_feature = project.create_type_transform_feature(
    name='FLT_NBR_CAT',
    parent_name='FLT_NBR',
    variable_type='categoricalInt',
)
new_feature

Feature(FLT_NBR_CAT)

In [7]:
# Hand-picked feature subset: the target, the features without target leakage,
# and the categorical flight number created in the previous step
new_list = [
    'DELAY_INDICATOR',
    'ORIGIN',
    'DESTINATION',
    'CARRIER',
    'DAY_OF_WEEK',
    'HOUR_OF_DPTR',
    'ACT_AC_TYPE',
    new_feature.name,
]

# Register the subset with the project under a readable name
featurelist = project.create_featurelist(
    name='my feature list',
    features=new_list,
)
featurelist

Featurelist(my feature list)

## 5. Autopilot Modes

### Option 1 - Full Autopilot mode

In [8]:
# Start autopilot, passing the id of the custom feature list created above
project.start_autopilot(featurelist.id)

# Optional: uncomment the two lines below to block until autopilot is complete
# print('Waiting to complete')
# project.wait_for_autopilot()

### Option 2 - Manual mode
#### Select specific models to run

In [9]:
# Get the blueprints DataRobot provides in the repository.
# The available blueprints are those appropriate given our dataset and target type.
# The result is kept in `blueprints` so the next cell can pick a few of them by model type.
blueprints = project.get_blueprints()
# Uncomment the next line to display the full blueprint list:
# blueprints

In [10]:
# Pick the first blueprint whose model type mentions each keyword below.
# A keyword is retired as soon as it has matched once, and the scan stops
# once three blueprints have been collected.
selected_blueprints = []
pending = ['eXtreme', 'Elastic-Net', 'Regularized']
for bp in blueprints:
    # Iterate over a copy because matched keywords are removed while scanning
    for keyword in list(pending):
        if keyword in bp.model_type:
            selected_blueprints.append(bp)
            pending.remove(keyword)
    if len(selected_blueprints) > 2:
        break
selected_blueprints

[Blueprint(eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)),
 Blueprint(Elastic-Net Classifier (L2 / Binomial Deviance)),
 Blueprint(Regularized Logistic Regression (L2))]

In [11]:
# Queue one cross-validated training job per selected blueprint on our feature list.
# A blueprint that fails to queue is reported (model type and exception type) and skipped.
model_jobs = []
for bp in selected_blueprints:
    try:
        model_job_id = project.train(bp, featurelist_id=featurelist.id,
                                     scoring_type='crossValidation')
    except Exception as e:
        print(bp.model_type, '\n', type(e))
    else:
        model_jobs.append(model_job_id)
print('model jobs:', model_jobs)

# Poll the project's model-job queue every `wait_secs` seconds until it is empty,
# printing the outstanding jobs on each pass
wait_secs = 15
jobs = project.get_model_jobs()
print('Jobs in queue: %s' % len(jobs))
while jobs:
    pprint(jobs)
    time.sleep(wait_secs)
    jobs = project.get_model_jobs()
    print('Jobs in queue: %s' % len(jobs))
print('- Done.')

model jobs: ['47', '53', '59']
Jobs in queue: 49
[ModelJob(Naive Bayes combiner classifier, status=inprogress),
 ModelJob(RandomForest Classifier (Gini), status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier, status=inprogress),
 ModelJob(Decision Tree Classifier (Gini), status=inprogress),
 ModelJob(Logistic Regression, status=inprogress),
 ModelJob(Regularized Logistic Regression (L2), status=inprogress),
 ModelJob(Majority Class Classifier, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier, status=inprogress),
 ModelJob(Breiman and Cutler Random Forest Classifier, status=inprogress),
 ModelJob(RuleFit Classifier, status=inprogress),
 ModelJob(Regularized Logistic Regression (L2), status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Elastic-Net Classifier (L2 / Binomial Deviance), status=inprogress),
 ModelJob(Elastic-Net Classifier (mixing alpha=0.5 / Binomial Deviance), status=inprogress),

Jobs in queue: 37
[ModelJob(RandomForest Classifier (Gini), status=inprogress),
 ModelJob(Breiman and Cutler Random Forest Classifier, status=inprogress),
 ModelJob(Regularized Logistic Regression (L2), status=inprogress),
 ModelJob(Elastic-Net Classifier (L2 / Binomial Deviance), status=inprogress),
 ModelJob(RandomForest Classifier (Gini), status=inprogress),
 ModelJob(Generalized Additive2 Model, status=inprogress),
 ModelJob(Light Gradient Boosting on ElasticNet Predictions , status=inprogress),
 ModelJob(Regularized Logistic Regression (L2), status=queue),
 ModelJob(Elastic-Net Classifier (mixing alpha=0.5 / Binomial Deviance), status=queue),
 ModelJob(Elastic-Net Classifier (L2 / Binomial Deviance) with Binned numeric features, status=queue),
 ModelJob(Stochastic Gradient Descent Classifier, status=queue),
 ModelJob(RandomForest Classifier (Entropy), status=queue),
 ModelJob(ExtraTrees Classifier (Gini), status=queue),
 ModelJob(Nystroem Kernel SVM Classifier, status=queue),
 Mod

Jobs in queue: 19
[ModelJob(Regularized Logistic Regression (L2), status=inprogress),
 ModelJob(Elastic-Net Classifier (mixing alpha=0.5 / Binomial Deviance), status=inprogress),
 ModelJob(Elastic-Net Classifier (L2 / Binomial Deviance) with Binned numeric features, status=inprogress),
 ModelJob(Stochastic Gradient Descent Classifier, status=inprogress),
 ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(Nystroem Kernel SVM Classifier, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Gradient Boosted Greedy Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping and Unsupervised Learning Features, status=inprogress),
 ModelJob(Elastic-Net Classifier (mixing alpha=0.5 / Binom

Jobs in queue: 15
[ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(Gradient Boosted Greedy Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Eureqa Generalized Additive Model Classifier (1777 Generations), status=inprogress),
 ModelJob(Light Gradient Boosting on ElasticNet Predictions , status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(Nystroem Kernel SVM Classifier, status=inprogress),
 ModelJob(Elastic-Net Classifier (L2 / Binomial Deviance) with Binned numeric features, status=inprogress),
 ModelJob(Regularized Logistic Regression (L2), status=inprogress),
 ModelJob(RandomForest Classifier (Gini), status=inprogress),
 ModelJob(Generalized Additive2 Model, status=inprogress),
 ModelJob(Light Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Stochastic Gradient Descent Classifier, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier with Early Stopping, sta

Jobs in queue: 3
[ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress)]
Jobs in queue: 3
[ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress)]
Jobs in queue: 3
[ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress)]
Jobs in queue: 3
[ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress)]
Jobs in queue: 2
[ModelJob(RandomForest Classifier (Entropy)

Jobs in queue: 15
[ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(ExtraTrees Classifier (Gini), status=inprogress),
 ModelJob(Vowpal Wabbit Classifier, status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(RandomForest Classifier (Entropy), status=inprogress),
 ModelJob(RandomForest Classifier (Entropy), status=inpro

## 6. Get models and blueprints from the leaderboard

### To get the best performing model from the leaderboard, pop it from the top of the models list.  Or if we run in full autopilot mode, the DataRobot "Recommended" model will be generated and available

In [25]:
# Get the models, which are already ordered by rank from the leaderboard
models = project.get_models()

print('Number of models on the leaderboard: %s\n' % len(models))

# Get the best performing model (excluding the blenders, which are typically the top 4 models).
# Using next() with a default means an empty or blender-only leaderboard is reported
# explicitly, instead of failing later with a NameError or silently reusing a stale
# `best_model` left over from an earlier run of this cell.
best_model = next((model for model in models if 'Blender' not in model.model_type), None)
if best_model is None:
    raise RuntimeError('No non-blender model found on the leaderboard; '
                       'train at least one model before running this cell.')
print('Best model from the leaderboard:  \'%s\'' % best_model.model_type)

# Or get the recommended model, if available.  This lookup is deliberately best-effort:
# any failure is treated as "no recommendation yet" and the notebook carries on.
try:
    recommendation = dr.ModelRecommendation.get(project.id)
    recommended_model = recommendation.get_model()
    print('Recommended model:\'%s\'' % recommended_model.model_type)
except Exception:
    print('No recommended model yet.  Either autopilot is still running or models were trained manually.')

# Get the blueprint behind the best model and show its processing steps
blueprint_id = best_model.blueprint_id
blueprint = dr.models.Blueprint.get(project.id, blueprint_id)
print('\nBest model blueprint preprocessing steps:')
pprint(blueprint.processes)

# Get the model scoring metrics
print('\nBest model metrics:')
pprint(best_model.metrics)

Number of models on the leaderboard: 65

Best model from the leaderboard:  'eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)'
Recommended model:'eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)'

Best model blueprint preprocessing steps:
['Ordinal encoding of categorical variables',
 'Missing Values Imputed',
 'eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)']

Best model metrics:
{'AUC': {'backtesting': None,
         'backtestingScores': None,
         'crossValidation': 0.6883520000000001,
         'holdout': 0.68311,
         'validation': 0.69104},
 'FVE Binomial': {'backtesting': None,
                  'backtestingScores': None,
                  'crossValidation': 0.07947199999999999,
                  'holdout': 0.07351,
                  'validation': 0.08241},
 'Gini Norm': {'backtesting': None,
               'backtestingScores': None,
               'crossValidation': 0.37670400000000004,


### With some data aggregation we can compare how each model performs.

In [24]:
# Summarise the LogLoss performance of every leaderboard model in a DataFrame
models = project.get_models()


def _logloss_row(model):
    """Return one summary row: identifying fields plus validation and cross-validation LogLoss."""
    logloss = model.metrics['LogLoss']
    return {'model_type': model.model_type,
            'blueprint info': model.blueprint,
            'model_id': model.id,
            'sample_pct': model.sample_pct,
            'featurelist': model.featurelist_name,
            'val_logloss': logloss['validation'],
            'cross_val_logloss': logloss['crossValidation']}


# Only models that actually carry a LogLoss entry contribute a row
val_scores = pd.DataFrame([_logloss_row(model) for model in models
                           if model.metrics['LogLoss'] is not None])

# Order the rows by cross-validation LogLoss, ascending
modelframe = val_scores.sort_values(by='cross_val_logloss')
modelframe

Unnamed: 0,blueprint info,cross_val_logloss,featurelist,model_id,model_type,sample_pct,val_logloss
0,Blueprint(eXtreme Gradient Boosted Trees Class...,0.604044,my feature list,5f050a37367e0d6f5950a302,eXtreme Gradient Boosted Trees Classifier with...,100.0,0.60216
2,Blueprint(ENET Blender),0.606100,my feature list,5f050b0718bd1d69b1242904,ENET Blender,64.0,0.60252
1,Blueprint(AVG Blender),0.606178,my feature list,5f050b0418bd1d69b12428fe,AVG Blender,64.0,0.60231
3,Blueprint(ENET Blender),0.606734,my feature list,5f050b0718bd1d69b1242902,ENET Blender,64.0,0.60328
4,Blueprint(Advanced AVG Blender),0.607074,my feature list,5f050b0618bd1d69b1242900,Advanced AVG Blender,64.0,0.60409
6,Blueprint(eXtreme Gradient Boosted Trees Class...,0.607124,my feature list,5f05098c18bd1d627a2428c5,eXtreme Gradient Boosted Trees Classifier with...,80.0,0.60510
8,Blueprint(eXtreme Gradient Boosted Trees Class...,0.607494,my feature list,5f0505f9d292912a8898b906,eXtreme Gradient Boosted Trees Classifier with...,64.0,0.60528
5,Blueprint(RandomForest Classifier (Entropy)),0.607684,my feature list,5f0507f818bd1d5e3c242883,RandomForest Classifier (Entropy),64.0,0.60466
7,Blueprint(Light Gradient Boosting on ElasticNe...,0.609848,my feature list,5f0507f818bd1d5e3c242885,Light Gradient Boosting on ElasticNet Predicti...,64.0,0.60523
9,Blueprint(Light Gradient Boosted Trees Classif...,0.610270,my feature list,5f0507f818bd1d5e3c242888,Light Gradient Boosted Trees Classifier with E...,64.0,0.60611


## 7. Score new data
### Option 1 - direct on a project and model 
#### To make predictions on new data, simply upload a dataset to the project and request the predictions on the model you choose.  The scoring data should include all features except the target.

In [28]:
# Score the held-out rows directly against the chosen leaderboard model
print("Scoring data on model '%s'\n" % best_model.model_type)

# 1) Upload the scoring rows (no target column) to the project
print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset(df_pred)

# 2) Ask the model to score the uploaded dataset
print('Request predictions')
predict_job = best_model.request_predictions(dataset_from_path.id)

# 3) Wait for the prediction job and collect its result
print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

predictions.head()

Scoring data on model 'eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)'

Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.509952,1.0,0.5,0,0.490048,0.509952
1,0.308132,0.0,0.5,1,0.691868,0.308132
2,0.699986,1.0,0.5,2,0.300014,0.699986
3,0.254196,0.0,0.5,3,0.745804,0.254196
4,0.564685,1.0,0.5,4,0.435315,0.564685


### Option 2 - Reference the Deployment ID
#### DataRobot's MLOps and model management dashboard provides for monitoring model data including performance and data drift.  
Prediction requests are routed to a deployment that resides on a dedicated prediction server, which handles the prediction requests for a specific model.  This allows for swapping models in and out without requiring a code change to the deployment that contains it.

#### To do this manually, from the DataRobot application: 
- 1) Go to the model you want to deploy
- 2) Click 'Predict'
- 3) Click 'Deploy Model API'
- 4) Click the orange 'Add New Deployment' button.  Click to go to the deployment when it's been deployed.
- 5) On the Deployments page, for a given deployment, click 'Integrations', then the 'Scoring Code' icon

If you want you can copy the DEPLOYMENT_ID and pass that to DataRobot API via the REST call below.  Or, you can do this via the API here:

In [32]:
# List the prediction servers returned by the API and use the first one;
# the next cell passes its id as the default prediction server of the new deployment.
# NOTE(review): index 0 raises IndexError when the list is empty - confirm at least
# one prediction server is configured.
prediction_server_list = dr.PredictionServer.list()
prediction_server = prediction_server_list[0]
prediction_server

PredictionServer(https://datarobot-cfds.dynamic.orm.datarobot.com)

In [33]:
# Create a deployment labelled 'New Deployment' for best_model, using the prediction
# server selected in the previous cell as its default prediction server
deployment = dr.Deployment.create_from_learning_model(best_model.id, 'New Deployment', 
                                                      default_prediction_server_id=prediction_server.id)
deployment

Deployment(New Deployment)

In [34]:
# List deployments
deployments = dr.Deployment.list()

# Take the last entry as the deployment to score against, whether it was created
# manually in the application or through the API cell above.
# NOTE(review): this assumes the most recently created deployment is last in the
# list - confirm the ordering of Deployment.list() or select by id/label instead.
current_deployed_model = deployments[-1]
print(current_deployed_model)
print()

# Show the model information attached to the deployment
print(current_deployed_model.model)

# The deployment id is what the REST prediction request below needs
current_deployed_model.id

Deployment(New Deployment)

{'id': '5f050a37367e0d6f5950a302', 'type': 'eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)', 'target_name': 'DELAY_INDICATOR', 'project_id': '5f0502a1c0a77c09cf379d53', 'target_type': 'Binary', 'project_name': 'Flight Delay API example', 'build_environment_type': 'DataRobot', 'deployed_at': '2020-07-08T00:12:23.932000Z', 'unsupervised_mode': False}


'5f050f67d292910698e23b13'

In [35]:
# Credentials are read from the environment so that no secret is stored in the notebook
API_TOKEN = os.getenv('DATAROBOT_API_TOKEN')
ENDPOINT = os.getenv('DATAROBOT_ENDPOINT')
USERNAME = os.getenv('DATAROBOT_USERNAME')

DEPLOYMENT_ID = current_deployed_model.id  # deployment selected in the previous cell

# Serialise the scoring rows to CSV bytes in memory.  This yields the same payload as
# writing a temporary file and reading it back, but leaves no file handle open and no
# stray file behind if a step fails.
data = df_pred.to_csv().encode('utf-8')

# Address the prediction server that was chosen when the deployment was created, and
# take the 'datarobot-key' header from that server object instead of hard-coding a
# secret.  The header is only sent when the server actually defines a key.
headers = {'Content-Type': 'text/plain; charset=UTF-8'}
if prediction_server.datarobot_key:
    headers['datarobot-key'] = prediction_server.datarobot_key
predictions_url = '%s/predApi/v1.0/deployments/%s/predictions' % (
    prediction_server.url.rstrip('/'), DEPLOYMENT_ID)
predictions_response = requests.post(predictions_url,
                                     auth=(USERNAME, API_TOKEN), data=data, headers=headers)

# Fail loudly on any HTTP error, then load the 'data' records of the JSON reply
predictions_response.raise_for_status()
df_preds = pd.DataFrame(predictions_response.json().get('data'))

# Flatten the nested predictions dict of label/value data in 'predictionValues':
# start by adding empty placeholder columns for the two label/probability pairs
for placeholder in ('label1', 'proba1', 'label2', 'proba2'):
    df_preds[placeholder] = None
def func(row):
    """Spread the label/value pairs of row['predictionValues'] into numbered fields.

    The n-th pair (counting from 1) is stored as row['label<n>'] and
    row['proba<n>'].  A pair lacking a 'label' or 'value' key contributes None.
    The row is updated in place and also returned.
    """
    for position, entry in enumerate(row['predictionValues'], start=1):
        suffix = str(position)
        row['label' + suffix] = entry.get('label')
        row['proba' + suffix] = entry.get('value')
    return row
# Run the row-wise flattening helper over every prediction row
df_preds_flat = df_preds.apply(func, axis=1)

# Name the probability columns after the classes.
# NOTE(review): this relies on the first entry of 'predictionValues' being class 1.0
# and the second being class 0.0 - confirm against the label columns before dropping them.
df_preds_flat = df_preds_flat.rename(columns={'proba1': 'class_1.0',
                                              'proba2': 'class_0.0'})

# The nested source column and the label helper columns are no longer needed
df_preds_flat = df_preds_flat.drop(columns=['predictionValues', 'label1', 'label2'])

# Store the predicted class as an integer
df_preds_flat['prediction'] = df_preds_flat['prediction'].astype(int)

df_preds_flat.head()

Unnamed: 0,prediction,predictionThreshold,rowId,class_1.0,class_0.0
0,1,0.5,0,0.509952,0.490048
1,0,0.5,1,0.308132,0.691868
2,1,0.5,2,0.699986,0.300014
3,0,0.5,3,0.254196,0.745804
4,1,0.5,4,0.564685,0.435315
