# DataRobot API example
### Read the csv dataset from the file system, splitting off a small percentage from the training set to use for scoring predictions.

In [2]:
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss, roc_curve, auc
from pprint import pprint
import os
import time
import requests
import datarobot as dr
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# Location of the raw flight-delay data and the name of the column to predict
source_data_file = 'data/flight_delay_data_50k.csv'
target = 'DELAY_INDICATOR'

# Load the CSV and shuffle every row; the fixed seed keeps the shuffle
# (and therefore the split below) reproducible between runs.
df_full = pd.read_csv(source_data_file).sample(frac=1, random_state=0)

# The first 10% of the shuffled rows are held back for predictions,
# the remaining 90% are used for training.
split = int(len(df_full) * .1)
df_pred_y = df_full.iloc[:split].copy()  # hold-out rows, target column still attached
df_train = df_full.iloc[split:]

# The prediction dataset is the hold-out set without the target column
df_pred = df_pred_y.drop(columns=[target])

# Report the size of each frame
for label, frame in (('Full data shape:      ', df_full),
                     ('Training data shape:  ', df_train),
                     ('Prediction data shape:', df_pred)):
    print(label, frame.shape)

Full data shape:       (50000, 16)
Training data shape:   (45000, 16)
Prediction data shape: (5000, 15)


## Create a DataRobot project

In [3]:
# Connect the DataRobot client; the API token is read from the environment
# so it never has to appear in the notebook.
dr.Client(
    token=os.environ['DATAROBOT_API_TOKEN'],
    endpoint='https://app.datarobot.com/api/v2',
)

<datarobot.rest.RESTClientObject at 0x105562588>

## The 'Out of the Box Autopilot'
### If we wanted to run autopilot on a dataset and its features as-is, we would simply call `Project.start`, which only needs the source data (e.g., a CSV), the target name, and a project name as parameters.  Project creation, file upload, and target selection are all combined in the `Project.start` method.  DataRobot will do the rest in terms of data typing and using informative features.
```
project = dr.Project.start(sourcedata=df,
                           target='DELAY_INDICATOR',
                           project_name='Flight delay classification')
```
### Instead, we'll use a few steps to create a project, upload the data, and set the target.  After feature engineering, we can run full autopilot or manually train selected models.

#### Autopilot mode means that the modeling process will proceed completely automatically, including running recommended models, running at different sample sizes, and blending.

#### Manual mode means that DataRobot will populate a list of recommended models, but will not insert any of them into the queue. Manual mode lets you select which models to execute before starting the modeling process.



In [3]:
# Create the project: upload the training data and give the project a name.
# The target is NOT set by this call; that happens in the set_target step below.
# Elapsed time is measured with time.perf_counter(), a monotonic clock, so the
# reported duration cannot be skewed by system clock adjustments.
t1 = time.perf_counter()
project = dr.Project.create(sourcedata=df_train,
                            project_name='Flight Delay API example')

print('project id:  ', project.id)
print('project name:', project.project_name)
print('project.get_status():\n- ', project.get_status())
print('- create done: %0.3f' % (time.perf_counter() - t1))

# Setting the target normally starts autopilot; manual mode is requested here so
# that modeling is kicked off explicitly in a later step instead.
t1 = time.perf_counter()
project.set_target(target=target,
                   mode='manual',
                   worker_count=4)
print('- set_target done: %0.3f' % (time.perf_counter() - t1))

project id:   5be4b98c7947717fcacee328
project name: Flight Delay API example
project.get_status():
-  {'autopilot_done': False, 'stage_description': 'Ready to set target', 'stage': 'aim'}
- create done: 35.364
- set_target done: 42.893


## Working with features
###  We'll generate a new feature list to do some feature engineering given some dirty data and our domain knowledge of the business.  Start by first retrieving the features from the project object.

In [4]:
# Show, in alphabetical order, the name of every feature found in the project
features = project.get_features()
sorted(feature.name for feature in features)

['ACT_AC_TYPE',
 'ARRV_A15_IND',
 'AVOIDABLE_DELAY_IND',
 'CARRIER',
 'DAY_OF_WEEK',
 'DELAY_CODE',
 'DELAY_CODE_DESCRIPTION',
 'DELAY_INDICATOR',
 'DESTINATION',
 'D_O_W_NUM',
 'FLT_DATE',
 'FLT_DATE (Day of Week)',
 'FLT_NBR',
 'HOUR_OF_DPTR',
 'ORIGIN',
 'UNAVOIDABLE_DELAY_IND',
 'Unnamed: 0']

### We'll transform the flight number feature from numeric to categorical, and create a new feature list omitting features with target leakage, such as other flight delay flags.

In [5]:
# Derive a categorical version of the numeric flight number feature
new_feature = project.create_type_transform_feature(
    name='FLT_NBR_CAT',
    parent_name='FLT_NBR',
    variable_type='categoricalInt',
)
new_feature

Feature(FLT_NBR_CAT)

In [6]:
# Build a feature list from a hand-picked subset of features: the leaky delay
# flags are left out and the categorical flight number replaces the numeric one.
new_list = [
    'DELAY_INDICATOR',
    'ORIGIN',
    'DESTINATION',
    'CARRIER',
    'DAY_OF_WEEK',
    'HOUR_OF_DPTR',
    'ACT_AC_TYPE',
    new_feature.name,
]
featurelist = project.create_featurelist(name='my feature list',
                                         features=new_list)
featurelist

Featurelist(my feature list)

## Start Autopilot

### Option 1 - Full Autopilot mode

In [7]:
# Run Autopilot on the custom feature list
project.start_autopilot(featurelist_id=featurelist.id)

# To block until Autopilot has finished, uncomment the two lines below:
# print('Waiting to complete')
# project.wait_for_autopilot()

### Option 2 - Manual mode
#### Select specific models to run

In [7]:
# Fetch the blueprints DataRobot provides in this project's repository.
# These are the blueprints appropriate for our dataset and target type.
blueprints = project.get_blueprints()
# Uncomment the next line to display the full list:
# blueprints

In [8]:
# Select a few blueprints to train: the first blueprint found for each of the
# model families below, kept in the order they appear in `blueprints`.
# Each blueprint is selected at most once, even when its model type matches
# more than one keyword, so the same blueprint is never queued for training twice.
pending_keywords = ['eXtreme', 'Elastic-Net', 'Regularized']
selected_blueprints = []
for bp in blueprints:
    if not pending_keywords:
        # One blueprint has been found for every family; stop scanning.
        break
    matched = next((kw for kw in pending_keywords if kw in bp.model_type), None)
    if matched is not None:
        selected_blueprints.append(bp)
        pending_keywords.remove(matched)
selected_blueprints

[Blueprint(eXtreme Gradient Boosted Trees Classifier with Early Stopping),
 Blueprint(Elastic-Net Classifier (L1 / Binomial Deviance)),
 Blueprint(Regularized Logistic Regression (L2))]

In [9]:
# Queue one training job per selected blueprint.  A blueprint that cannot be
# queued is reported (exception class AND message, so the cause is visible)
# and skipped, so the remaining blueprints are still submitted.
model_jobs = []
for bp in selected_blueprints:
    try:
        model_job_id = project.train(bp, featurelist_id=featurelist.id,
                                     scoring_type='crossValidation')
    except Exception as e:
        print(bp.model_type, '\n', '%s: %s' % (type(e).__name__, e))
    else:
        model_jobs.append(model_job_id)
print('model jobs:', model_jobs)

# Poll the project's model job queue until it is empty, showing the
# outstanding jobs on every pass.
wait_secs = 15
while True:
    jobs = project.get_model_jobs()
    print('Jobs in queue: %s' % len(jobs))
    if not jobs:
        print('- Done.')
        break
    pprint(jobs)
    time.sleep(wait_secs)

model jobs: ['11', '17', '23']
Jobs in queue: 15
[ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=queue),
 ModelJob(Elastic-Net Classifier (L1 / Binomial Deviance), status=queue),
 ModelJob(Elastic-Net Classifier (L1 / Binomial Deviance), status=queue),
 ModelJob(Elastic-Net Classifier (L1 / Binomial Deviance), status=queue),
 ModelJob(Elastic-Net Classifier (L1 / Binomial Deviance), status=queue),
 ModelJob(Elastic-Net Classifier (L1 / Binomial Deviance), status=queue),
 ModelJob(Regularized Logistic Regression (L2), status=queue),
 ModelJob(Regularized Logistic Regression (L2), status=queue),
 ModelJob(Regularized Logistic Regression (L2), status=queue),
 ModelJob(Regularized Logistic Regression (L2), status=queue),
 ModelJob(Regularized Logistic Regression (L2), status=queue),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inprogress),
 ModelJob(eXtreme Gradient Boosted Trees Classifier with Early Stopping, status=inpro

## Get models and blueprints from the leaderboard

### To get the best performing model from the leaderboard, pop it from the top of the models list.  Or, if the project was run via full autopilot, the recommended model is available.

In [4]:
# Get the models, which are already ordered by rank from the leaderboard
models = project.get_models()

print('Number of models on the leaderboard: %s\n' % len(models))

# Get the best performing model (excluding the blenders, which are typically the top 4 models).
# If the leaderboard has no non-blender model, fail with a clear message instead of
# hitting a NameError or silently reusing a `best_model` left over from an earlier run.
best_model = next((m for m in models if 'Blender' not in m.model_type), None)
if best_model is None:
    raise RuntimeError('No non-blender model found on the leaderboard; '
                       'train models before running this cell.')
print('Best model from the leaderboard:  \'%s\'' % best_model.model_type)

# Or get the recommended model, if available.  Any failure here is treated as
# "no recommendation" because this lookup is optional for the rest of the notebook.
try:
    recommendation = dr.ModelRecommendation.get(project.id)
    recommended_model = recommendation.get_model()
    print('Recommended model:\'%s\'' % recommended_model.model_type)
except Exception:
    print('No recommended model yet.  Either autopilot is still running or models were trained manually.')
# **Note for a deployment, you have to manually add the deployment and get the deployment ID from the UI.
#   Can't do this via the api at this time.

# Get the blueprint behind the best model and show its preprocessing steps
blueprint_id = best_model.blueprint_id
blueprint = dr.models.Blueprint.get(project.id, blueprint_id)
print('\nBest model blueprint preprocessing steps:')
pprint(blueprint.processes)

# Get the model scoring metrics
print('\nBest model metrics:')
pprint(best_model.metrics)

NameError: name 'project' is not defined

### Alternatively we can compare how each model performs.

In [11]:
def _logloss_summary(model):
    """Return one summary record (a dict) describing a model and its LogLoss scores."""
    logloss = model.metrics['LogLoss']
    return {'model_type': model.model_type,
            'blueprint info': model.blueprint,
            'model_id': model.id,
            'sample_pct': model.sample_pct,
            'featurelist': model.featurelist_name,
            'val_logloss': logloss['validation'],
            'cross_val_logloss': logloss['crossValidation']}


# Summarize every model that has a LogLoss metric into a DataFrame
models = project.get_models()
val_scores = pd.DataFrame([_logloss_summary(m)
                           for m in models
                           if m.metrics['LogLoss'] is not None])

# Sort by cross-validation LogLoss, ascending
modelframe = val_scores.sort_values(by='cross_val_logloss')
modelframe

Unnamed: 0,blueprint info,cross_val_logloss,featurelist,model_id,model_type,sample_pct,val_logloss
0,Blueprint(eXtreme Gradient Boosted Trees Class...,0.60883,my feature list,5be4b9f67947717f7dcee362,eXtreme Gradient Boosted Trees Classifier with...,64.0,0.60602
2,Blueprint(Regularized Logistic Regression (L2)),0.626974,my feature list,5be4b9f97947717f7dcee370,Regularized Logistic Regression (L2),64.0,0.62459
1,Blueprint(Elastic-Net Classifier (L1 / Binomia...,0.627006,my feature list,5be4b9f87947717fd7cee5b0,Elastic-Net Classifier (L1 / Binomial Deviance),64.0,0.62455


## Score new data
### Option 1 - direct on a project and model 
#### To make predictions on new data, simply upload a dataset to the project and request the predictions on the model you choose.  The scoring data should include all features except the target.

In [12]:
# Score the hold-out rows directly against the chosen leaderboard model
print("Scoring data on model '%s'\n" % best_model.model_type)

# Step 1: upload the prediction dataset (no target column) to the project
print('Uploading prediction dataset')
dataset_from_path = project.upload_dataset(df_pred)

# Step 2: request predictions from the model for the uploaded dataset
print('Request predictions')
predict_job = best_model.request_predictions(dataset_from_path.id)

# Step 3: wait for the prediction job and collect its result
print('Waiting for prediction calculations')
predictions = predict_job.get_result_when_complete()

predictions.head()

Scoring data on model 'eXtreme Gradient Boosted Trees Classifier with Early Stopping'

Uploading prediction dataset
Request predictions
Waiting for prediction calculations


Unnamed: 0,positive_probability,prediction,prediction_threshold,row_id,class_0.0,class_1.0
0,0.490129,0.0,0.5,0,0.509871,0.490129
1,0.302685,0.0,0.5,1,0.697315,0.302685
2,0.695894,1.0,0.5,2,0.304106,0.695894
3,0.206731,0.0,0.5,3,0.793269,0.206731
4,0.590266,1.0,0.5,4,0.409734,0.590266


### Option 2 - Reference the Deployment ID
#### DataRobot's Model Management dashboard provides for monitoring model data, including performance and data drift.  Prediction requests are routed to a deployment and not a specific model.  This allows for swapping models in and out without requiring a code change.

#### First, from the DataRobot application: 
- 1) Go to the model
- 2) Click 'Predict'
- 3) Click 'Deploy Model API'
- 4) Click the orange 'Add New Deployment' button
- 5) On the Deployments page, for a given deployment, click 'Integrations'
- 6) Copy the DEPLOYMENT_ID and pass that to DataRobot API via the REST call

In [1]:
# Credentials are read from the environment so that no secret is stored in the notebook.
API_TOKEN = os.getenv('DATAROBOT_API_TOKEN')
ENDPOINT = os.getenv('DATAROBOT_ENDPOINT')
USERNAME = os.getenv('DATAROBOT_USERNAME')
# The 'datarobot-key' header value is a credential too: export it as DATAROBOT_KEY
# before running this cell.  A missing variable raises KeyError right here rather
# than sending a request without the key.
DATAROBOT_KEY = os.environ['DATAROBOT_KEY']

DEPLOYMENT_ID = '5be0e7a1fd2b980c56eecbc7'  # My project's recommended model: XGBoost @ 80%

# Serialize the scoring rows to CSV bytes in memory.  This avoids writing a
# temporary file into the working directory and leaving a file handle open.
# The DataFrame index is still written as the first column, as before.
data = df_pred.to_csv().encode('utf-8')

headers = {'Content-Type': 'text/plain; charset=UTF-8', 'datarobot-key': DATAROBOT_KEY}
predictions_response = requests.post('https://cfds-ccm-prod.orm.datarobot.com/predApi/v1.0/deployments/%s/predictions' % (DEPLOYMENT_ID),
                                     auth=(USERNAME, API_TOKEN), data=data, headers=headers)

# Fail loudly on any HTTP error status before trying to parse the body
predictions_response.raise_for_status()
df_preds = pd.DataFrame(predictions_response.json().get('data'))

# Flatten the nested predictions dict of label/value data in 'predictionValues'.
# Start by creating the (empty) destination columns up front.
for placeholder in ('label1', 'proba1', 'label2', 'proba2'):
    df_preds[placeholder] = None
def func(row):
    """Copy each entry of ``row['predictionValues']`` into flat label/proba fields.

    The entry at 0-based position ``i`` writes its ``'label'`` to
    ``row['label<i+1>']`` and its ``'value'`` to ``row['proba<i+1>']``; a key
    missing from an entry is stored as ``None``.  ``row`` is modified in place
    and also returned.
    """
    for position, entry in enumerate(row['predictionValues'], start=1):
        suffix = str(position)
        row['label' + suffix] = entry.get('label')
        row['proba' + suffix] = entry.get('value')
    return row
# Flatten every row, give the probability columns their class names, and drop
# the nested source column together with the helper label columns.
# NOTE(review): the rename assumes the first 'predictionValues' entry belongs to
# class 1.0 and the second to class 0.0 -- confirm against the API response.
df_preds_flat = (
    df_preds
    .apply(func, axis=1)
    .rename(columns={'proba1': 'class_1.0', 'proba2': 'class_0.0'})
    .drop(columns=['predictionValues', 'label1', 'label2'])
)
df_preds_flat['prediction'] = df_preds_flat['prediction'].astype(int)

df_preds_flat.head()

NameError: name 'os' is not defined