### For a given project:
- 1. Upload an external dataset
- 2. Permute one feature
- 3. Calculate feature impact

In [1]:
import pandas as pd
import numpy as np
import os
import datarobot as dr
import time
from pprint import pprint

USERNAME = os.environ['DATAROBOT_USERNAME']
API_TOKEN = os.environ['DATAROBOT_API_TOKEN']
ENDPOINT = 'https://app.datarobot.com/api/v2'

In [2]:
def establish_dr_client():
    dr.Client(token=API_TOKEN, endpoint=ENDPOINT) 
    
establish_dr_client()

t0 = time.time()

prj = dr.Project.get('602d64004d7845cfbb178a8a') # 10k_diabetes_80
print('project: ', prj)

models = prj.get_models()  # [:12]  # get only the top n to narrow it down
print('%s models' % len(models))

# Load the external dataset
df = pd.read_excel('/Users/matthew.cohen/Documents/DR/Demo/_Diabetes/10k_diabetes_20.xlsx')

# Limit to the top n models
max_models = 3

# Limit columns, or make a subset list
selected_cols = df.columns[:10]

for i, col in enumerate(selected_cols):
    try:
        print('\n**** permuting column \'%s\' (%s of %s) ****\n' % (col, i+1, len(selected_cols)))
        df[col] = np.random.permutation(df[col])

        # Upload the external dataset
        ext_test = prj.upload_dataset(df)
        print('external dataset: %s, id: %s' % (ext_test, ext_test.id))

        # Request n models request/run the external dataset
        print('Request external dataset test for %s models.' % max_models)
        for i, model in enumerate(models):
            if i+1 > max_models:
                break
            model = models[i]
            # print('request model %s %s for external test set' % (i, model))
            model.request_external_test(ext_test.id)

        # Wait for each model to complete scoring the external test set scores before getting the scores
        while len(prj.get_all_jobs()) > 0:
            time.sleep(1)
        print('- Externals test requests complete.')

        # Get the scores for the external dataset for each model
        for i, score in enumerate(dr.ExternalScores.list(prj.id, dataset_id=ext_test.id)):
            cur_model = dr.models.Model.get(project=prj.id, model_id=score.model_id)
            print('model %s: %s, id: %s' % (i, 
                                            cur_model.model_type,
                                            score.model_id)
                 )
            pprint(score.scores)

            while len(prj.get_all_jobs()) > 0:
                time.sleep(1)

        # Delete the external dataset
        print('\ndeleting dataset:', ext_test.id)
        ext_test.delete()
    except ConnectionError as ce:
        print('\n*** Connection error, will re-establish ***')
        print(ce)
    except Exception as e:
        print('\n^^^ Other error ^^^')
        print(e)
establish_dr_client()
        
print('\nDone.  Time: %d.03 minutes' % ((time.time() - t0)/60))

project:  Project(10k_diabetes_80.xlsx)
35 models

**** permuting column 'race' (1 of 10) ****

external dataset: PredictionDataset('predict.csv'), id: 60a81565aa2741f1d5169684
Request external dataset test for 3 models.
- Externals test requests complete.
model 0: Light Gradient Boosted Trees Classifier with Early Stopping, id: 602d6718bc10c707f16bc0c1
[{'label': 'AUC', 'value': 0.71483},
 {'label': 'Area Under PR Curve', 'value': 0.61898},
 {'label': 'FVE Binomial', 'value': 0.1103},
 {'label': 'Gini Norm', 'value': 0.42966},
 {'label': 'Kolmogorov-Smirnov', 'value': 0.31103},
 {'label': 'LogLoss', 'value': 0.59749},
 {'label': 'Max MCC', 'value': 0.30817},
 {'label': 'RMSE', 'value': 0.45448},
 {'label': 'Rate@Top10%', 'value': 0.725},
 {'label': 'Rate@Top5%', 'value': 0.8},
 {'label': 'Rate@TopTenth%', 'value': 1.0}]
model 1: Light Gradient Boosted Trees Classifier with Early Stopping, id: 602d64db1be244c02d7bd243
[{'label': 'AUC', 'value': 0.7118},
 {'label': 'Area Under PR Curve'

model 1: Light Gradient Boosted Trees Classifier with Early Stopping, id: 602d6671406e5972767bd275
[{'label': 'AUC', 'value': 0.6913},
 {'label': 'Area Under PR Curve', 'value': 0.58402},
 {'label': 'FVE Binomial', 'value': 0.08647},
 {'label': 'Gini Norm', 'value': 0.3826},
 {'label': 'Kolmogorov-Smirnov', 'value': 0.28536},
 {'label': 'LogLoss', 'value': 0.61349},
 {'label': 'Max MCC', 'value': 0.27938},
 {'label': 'RMSE', 'value': 0.46188},
 {'label': 'Rate@Top10%', 'value': 0.71},
 {'label': 'Rate@Top5%', 'value': 0.75},
 {'label': 'Rate@TopTenth%', 'value': 0.5}]
model 2: Light Gradient Boosted Trees Classifier with Early Stopping, id: 602d64db1be244c02d7bd243
[{'label': 'AUC', 'value': 0.69359},
 {'label': 'Area Under PR Curve', 'value': 0.58714},
 {'label': 'FVE Binomial', 'value': 0.08539},
 {'label': 'Gini Norm', 'value': 0.38718},
 {'label': 'Kolmogorov-Smirnov', 'value': 0.28609},
 {'label': 'LogLoss', 'value': 0.61422},
 {'label': 'Max MCC', 'value': 0.28348},
 {'label': 'R

model 2: Light Gradient Boosted Trees Classifier with Early Stopping, id: 602d64db1be244c02d7bd243
[{'label': 'AUC', 'value': 0.64009},
 {'label': 'Area Under PR Curve', 'value': 0.53724},
 {'label': 'FVE Binomial', 'value': 0.04273},
 {'label': 'Gini Norm', 'value': 0.28018},
 {'label': 'Kolmogorov-Smirnov', 'value': 0.21565},
 {'label': 'LogLoss', 'value': 0.64287},
 {'label': 'Max MCC', 'value': 0.2118},
 {'label': 'RMSE', 'value': 0.47468},
 {'label': 'Rate@Top10%', 'value': 0.65},
 {'label': 'Rate@Top5%', 'value': 0.67},
 {'label': 'Rate@TopTenth%', 'value': 1.0}]

deleting dataset: 60a81895ecfd802cfcf97183

**** permuting column 'medical_specialty' (10 of 10) ****

external dataset: PredictionDataset('predict.csv'), id: 60a81908c0d6777501f9701c
Request external dataset test for 3 models.
- Externals test requests complete.
model 0: Light Gradient Boosted Trees Classifier with Early Stopping, id: 602d6718bc10c707f16bc0c1
[{'label': 'AUC', 'value': 0.62788},
 {'label': 'Area Under 