#### More information about the API can be found at: https://datarobot-public-api-client.readthedocs-hosted.com/en/v2.17.0/#

In [1]:
import requests
import os
import datarobot as dr
import pandas as pd
import yaml
import re
import numpy as np

In [2]:
# Location of the DataRobot connection settings. Set the DRCONFIG_PATH
# environment variable to point somewhere else; the default is resolved inside
# the current user's home directory instead of one machine's absolute path.
drconfig_path = os.environ.get(
    'DRCONFIG_PATH',
    os.path.join(os.path.expanduser('~'), 'Documents', 'config', 'drconfig.yaml'))

with open(drconfig_path, 'r') as f:
    drconfig = yaml.safe_load(f)

# The YAML file contains the connection information:
# DATAROBOT_API_TOKEN: <look into your profile>
# DATAROBOT_ENDPOINT: <your datarobot url>/api/v2/
# DATAROBOT_USERNAME: john.doe@cimb.com


#### Connect to DataRobot via Python modelling API

In [3]:
# Create the DataRobot API client from the token and endpoint held in drconfig.
response = dr.Client(
    token=drconfig['DATAROBOT_API_TOKEN'],
    endpoint=drconfig['DATAROBOT_ENDPOINT'],
)

### List available projects

In [4]:
# Search text used to filter the projects returned by dr.Project.list below.
project_name = 'Lending_Club'

In [5]:
# Ask DataRobot for the projects matching project_name, then display them.
project_list = dr.Project.list(
    search_params={'project_name': project_name},
)
project_list

[Project(10K_Lending_Club_Loans.xlsx),
 Project(10K_Lending_Club_Loans.xlsx),
 Project(10K_Lending_Club_Loans.xlsx),
 Project(10K_Lending_Club_Loans.xlsx),
 Project(10K_Lending_Club_Loans.xlsx),
 Project(10K_Lending_Club_Loans.xlsx),
 Project(10K_Lending_Club_Loans.xlsx),
 Project(AI_Exp_10K_Lending_Club_Loans.xlsx),
 Project(DemoReady_10K_Lending_Club_Loans),
 Project(Playground_10K_Lending_Club_Loans.xlsx)]

In [6]:
# Collect one record per project and build the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration. The explicit column list keeps the
# 'project_name' / 'id' layout even when project_list is empty.
df_projects = pd.DataFrame(
    [{'project_name': project.project_name, 'id': project.id}
     for project in project_list],
    columns=['project_name', 'id'],
)

df_projects

Unnamed: 0,project_name,id
0,10K_Lending_Club_Loans.xlsx,5dcbc7c97be8693b0818924b
1,10K_Lending_Club_Loans.xlsx,5dcbbe807be8693a9c1891f8
2,10K_Lending_Club_Loans.xlsx,5dcbaccb7be869395218923b
3,10K_Lending_Club_Loans.xlsx,5dcba7be80155f38141cb703
4,10K_Lending_Club_Loans.xlsx,5dcb9f1a80155f37601cb715
5,10K_Lending_Club_Loans.xlsx,5dcb992280155f36ba1cb78a
6,10K_Lending_Club_Loans.xlsx,5dcb9421baa39437d5ce0cb0
7,AI_Exp_10K_Lending_Club_Loans.xlsx,5dc0eb21fddc7a6bd857eb68
8,DemoReady_10K_Lending_Club_Loans,5d8afb8c78132c62f49dda1d
9,Playground_10K_Lending_Club_Loans.xlsx,5d7616e478132c4893cd2650


### Connection to the project
#### Input the project id to connect to

In [7]:
# Id of the project to work with (one of the ids listed in df_projects).
selected_project = '5d8afb8c78132c62f49dda1d'

In [8]:
# Fetch the project object for the id stored in selected_project.
project = dr.Project.get(project_id=selected_project)

### Get project models information

In [9]:
# Models of the selected project; models[0] is read further down to list
# the available optimization metrics.
models = project.get_models()

def get_results(proj):
    """Build a LogLoss leaderboard for one DataRobot project.

    Args:
        proj: Id of the DataRobot project; it is passed to
            ``dr.Project.get(project_id=...)``.

    Returns:
        pandas.DataFrame with one row per model that was trained on the
        'Informative Features' feature list and reports a LogLoss metric,
        sorted by ascending cross-validation LogLoss. Columns: model_type,
        blueprint info, model_id, sample_pct, featurelist, val_logloss,
        cross_val_logloss. The frame is empty (same columns) when no model
        qualifies.

    Raises:
        IndexError: if the project has no 'Informative Features' feature list.
    """
    project = dr.Project.get(project_id=proj)

    # 'Informative Features' is the default list for autopilot;
    # change the name here to rank models built on another feature list.
    informative = [lst for lst in project.get_featurelists()
                   if lst.name == 'Informative Features']
    if not informative:
        raise IndexError(
            "project %s has no 'Informative Features' feature list" % (proj,))
    f_list = informative[0]

    columns = ['model_type', 'blueprint info', 'model_id', 'sample_pct',
               'featurelist', 'val_logloss', 'cross_val_logloss']
    rows = []
    for model in project.get_models():
        if model.featurelist_id != f_list.id:
            continue
        # .get() lets models without a LogLoss entry be skipped instead of
        # raising KeyError before the "is None" check can run.
        logloss = (model.metrics or {}).get('LogLoss')
        if logloss is None:
            continue
        rows.append({'model_type': model.model_type,
                     'blueprint info': model.blueprint,
                     'model_id': model.id,
                     'sample_pct': model.sample_pct,
                     'featurelist': model.featurelist_name,
                     'val_logloss': logloss['validation'],
                     'cross_val_logloss': logloss['crossValidation']})

    # Passing the column list keeps sort_values working on an empty result.
    val_scores = pd.DataFrame(rows, columns=columns)
    return val_scores.sort_values(by='cross_val_logloss')

# Build the leaderboard for the selected project (get_results takes the
# project id) and display it.
modelframe = get_results(project.id)
modelframe



Unnamed: 0,model_type,blueprint info,model_id,sample_pct,featurelist,val_logloss,cross_val_logloss
0,eXtreme Gradient Boosted Trees Classifier with...,Blueprint(eXtreme Gradient Boosted Trees Class...,5d8afffa95d0302e0d79d1a5,80.0,Informative Features,0.36088,0.354428
1,eXtreme Gradient Boosted Trees Classifier with...,Blueprint(eXtreme Gradient Boosted Trees Class...,5d8afdba95d0301e0179d192,64.0,Informative Features,0.36616,0.357712
2,eXtreme Gradient Boosted Trees Classifier with...,Blueprint(eXtreme Gradient Boosted Trees Class...,5d8afdba95d0301e0179d188,64.0,Informative Features,0.36761,0.358614
4,eXtreme Gradient Boosted Trees Classifier with...,Blueprint(eXtreme Gradient Boosted Trees Class...,5d8afdba95d0301e0179d183,64.0,Informative Features,0.36911,0.358808
3,Generalized Additive2 Model,Blueprint(Generalized Additive2 Model),5d8afdba95d0301e0179d182,64.0,Informative Features,0.36777,0.359508
...,...,...,...,...,...,...,...
75,RuleFit Classifier,Blueprint(RuleFit Classifier),5d8afc1a8978f3025493620d,16.0,Informative Features,0.41483,
76,Naive Bayes combiner classifier,Blueprint(Naive Bayes combiner classifier),5d8afc1a8978f302549361ff,16.0,Informative Features,0.43670,
77,Logistic Regression,Blueprint(Logistic Regression),5d8afc1a8978f30254936203,16.0,Informative Features,0.46486,
78,Decision Tree Classifier (Gini),Blueprint(Decision Tree Classifier (Gini)),5d8afc1a8978f30254936202,16.0,Informative Features,0.48316,


In [10]:
# Show the metric names reported for the first model returned by the project.
print("Other optimization metrics are available")
for metric in models[0].metrics:
    print('- "{}"'.format(metric))

Other optimization metrics are available
- "AUC"
- "Rate@Top5%"
- "Max MCC"
- "RMSE"
- "Kolmogorov-Smirnov"
- "Rate@TopTenth%"
- "LogLoss"
- "FVE Binomial"
- "Gini Norm"
- "Rate@Top10%"


### Get model Advanced Tuning setup
#### Input the model to target

In [11]:
# Id of the model to tune.
# NOTE(review): the leaderboard output shown in this notebook lists this id as an
# "eXtreme Gradient Boosted Trees Classifier", not the label below; confirm
# which model is intended.
selected_model='5d8afffa95d0302e0d79d1a5' # originally labelled: Regularized Logistic Regression (L2)

In [12]:
# Load the selected model and open an advanced tuning session on it.
model = dr.Model.get(
    project=selected_project,
    model_id=selected_model,
)
tune = model.start_advanced_tuning_session()

In [13]:
# Helper to format the advanced-tuning hyperparameters as a dataframe.
# Constraint types are looked up in this order; the first one found on a
# parameter keeps the parameter's own row, any further one gets an extra row.
param_types = ['int','float','intlist','floatlist','select','ascii','unicode']


def _parameter_row(source, ptype, constr, multiple):
    """Return the derived column values for one (parameter, constraint type) pair."""
    is_range = ptype in ('int', 'float')
    is_choice = ptype in ('intlist', 'floatlist', 'select')
    return {
        'source': source,  # position of the parameter in the input list
        'param_type': ptype,
        'supports_grid_search': constr.get('supports_grid_search', False),
        'min_val': constr['min'] if is_range else np.nan,
        'max_val': constr['max'] if is_range else np.nan,
        'values_select': constr['values'] if is_choice else None,
        'multiple_types': multiple,
    }


def parameters_to_df(params):
    """Flatten advanced-tuning parameters into one row per (parameter, type).

    Args:
        params: dict with a 'tuning_parameters' list. Each entry is a dict
            carrying at least 'parameter_name' and 'constraints', where
            'constraints' maps a type name from ``param_types`` to its
            constraint dict ('min'/'max' for int and float, 'values' for
            intlist, floatlist and select, optional 'supports_grid_search').

    Returns:
        pandas.DataFrame holding the original parameter columns plus
        param_type, supports_grid_search, min_val, max_val, values_select,
        multiple_types and parameter_name_type
        ('<parameter_name>_<param_type>'). Rows for the first matching type
        of each parameter come first, in input order; rows for additional
        types of the same parameter follow at the end. A parameter with no
        recognised constraint type keeps one row with an empty param_type.
        An empty parameter list yields an empty frame with the same columns.
    """
    tuning_parameters = list(params['tuning_parameters'])
    if tuning_parameters:
        base = pd.DataFrame(tuning_parameters)
    else:
        # Keep the expected columns available so callers can still select them.
        base = pd.DataFrame(columns=['parameter_name', 'parameter_id',
                                     'default_value', 'current_value',
                                     'task_name', 'constraints'])

    # Rows are gathered as plain dicts and the frame is assembled once:
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied the whole frame for every extra type.
    primary_rows = []
    extra_rows = []
    for position, parameter in enumerate(tuning_parameters):
        constraints = parameter['constraints']
        matched = [ptype for ptype in param_types if ptype in constraints]
        if not matched:
            primary_rows.append(_parameter_row(position, '', {}, False))
            continue
        has_multiple = len(matched) > 1
        for rank, ptype in enumerate(matched):
            row = _parameter_row(position, ptype, constraints[ptype], has_multiple)
            (primary_rows if rank == 0 else extra_rows).append(row)

    rows = primary_rows + extra_rows
    df = base.iloc[[row['source'] for row in rows]].reset_index(drop=True)
    df['param_type'] = pd.Series([row['param_type'] for row in rows], dtype=object)
    df['supports_grid_search'] = [row['supports_grid_search'] for row in rows]
    # min/max stay float columns so missing bounds are NaN, as before.
    df['min_val'] = pd.Series([row['min_val'] for row in rows], dtype=float)
    df['max_val'] = pd.Series([row['max_val'] for row in rows], dtype=float)
    df['values_select'] = pd.Series([row['values_select'] for row in rows], dtype=object)
    df['multiple_types'] = [row['multiple_types'] for row in rows]
    df['parameter_name_type'] = df['parameter_name'] + '_' + df['param_type']
    return df

In [16]:
# NOTE(review): prms is assigned in the "Retrieve parameters in a dataframe" cell
# that appears further down; run top to bottom on a fresh kernel, this line is
# reached before that assignment.
prms

{'tuning_description': None,
 'tuning_parameters': [{'parameter_name': 'arbimp',
   'parameter_id': 'eyJhcmciOiJhcmJpbXAiLCJ2aWQiOiIxMSJ9',
   'default_value': -9999,
   'current_value': -9999,
   'task_name': 'Missing Values Imputed',
   'constraints': {'int': {'min': -99999,
     'max': 99999,
     'supports_grid_search': False}}},
  {'parameter_name': 'min_count_na',
   'parameter_id': 'eyJhcmciOiJtaW5fY291bnRfbmEiLCJ2aWQiOiIxMSJ9',
   'default_value': 5,
   'current_value': 5,
   'task_name': 'Missing Values Imputed',
   'constraints': {'int': {'min': 0,
     'max': 99999,
     'supports_grid_search': False}}},
  {'parameter_name': 'card_max',
   'parameter_id': 'eyJhcmciOiJjYXJkX21heCIsInZpZCI6IjEifQ',
   'default_value': 'None',
   'current_value': 'None',
   'task_name': 'Ordinal encoding of categorical variables',
   'constraints': {'select': {'values': ['None']},
    'int': {'min': 1, 'max': 9999999, 'supports_grid_search': False}}},
  {'parameter_name': 'method',
   'paramete

In [14]:
# Retrieve the advanced-tuning parameters in a dataframe.
# get_advanced_tuning_parameters() returns the dict with the 'tuning_parameters'
# list that parameters_to_df() indexes. get_parameters() is used elsewhere in
# this notebook for its .derived_features attribute, so it is not that dict.
prms = model.get_advanced_tuning_parameters()

prms_df = parameters_to_df(prms)
prms_df[['task_name','parameter_name','parameter_name_type','current_value', 'default_value','param_type',
         'supports_grid_search','min_val', 'max_val','values_select',
         'parameter_id']].sort_values(['task_name','parameter_name_type'])

Unnamed: 0,task_name,parameter_name,parameter_name_type,current_value,default_value,param_type,supports_grid_search,min_val,max_val,values_select,parameter_id
0,Missing Values Imputed,arbimp,arbimp_int,-9999,-9999,int,False,-99999.0,99999.0,,eyJhcmciOiJhcmJpbXAiLCJ2aWQiOiIxMSJ9
1,Missing Values Imputed,min_count_na,min_count_na_int,5,5,int,False,0.0,99999.0,,eyJhcmciOiJtaW5fY291bnRfbmEiLCJ2aWQiOiIxMSJ9
2,Ordinal encoding of categorical variables,card_max,card_max_int,,,int,False,1.0,9999999.0,,eyJhcmciOiJjYXJkX21heCIsInZpZCI6IjEifQ
25,Ordinal encoding of categorical variables,card_max,card_max_select,,,select,False,,,[None],eyJhcmciOiJjYXJkX21heCIsInZpZCI6IjEifQ
3,Ordinal encoding of categorical variables,method,method_select,freq,freq,select,False,,,"[None, random, lex, freq, resp]",eyJhcmciOiJtZXRob2QiLCJ2aWQiOiIxIn0
4,Ordinal encoding of categorical variables,min_support,min_support_int,5,5,int,False,1.0,99999.0,,eyJhcmciOiJtaW5fc3VwcG9ydCIsInZpZCI6IjEifQ
5,eXtreme Gradient Boosted Trees Classifier with...,base_margin_initialize,base_margin_initialize_select,False,False,select,False,,,"[False, True]",eyJhcmciOiJiYXNlX21hcmdpbl9pbml0aWFsaXplIiwidm...
6,eXtreme Gradient Boosted Trees Classifier with...,colsample_bylevel,colsample_bylevel_float,1,1,float,True,0.1,1.0,,eyJhcmciOiJjb2xzYW1wbGVfYnlsZXZlbCIsInZpZCI6Ij...
7,eXtreme Gradient Boosted Trees Classifier with...,colsample_bytree,colsample_bytree_float,0.3,0.3,float,True,0.04,1.0,,eyJhcmciOiJjb2xzYW1wbGVfYnl0cmVlIiwidmlkIjoiMT...
8,eXtreme Gradient Boosted Trees Classifier with...,interval,interval_int,10,10,int,False,2.0,500.0,,eyJhcmciOiJpbnRlcnZhbCIsInZpZCI6IjEyIn0


In [18]:
# Opens an advanced tuning session on model and rebinds tune; the cell that
# loads model already makes this same call.
tune = model.start_advanced_tuning_session()


In [27]:
# Display the raw advanced-tuning description: a dict with 'tuning_description'
# and a 'tuning_parameters' list.
model.get_advanced_tuning_parameters()

{'tuning_description': None,
 'tuning_parameters': [{'parameter_name': 'arbimp',
   'parameter_id': 'eyJhcmciOiJhcmJpbXAiLCJ2aWQiOiIxMSJ9',
   'default_value': -9999,
   'current_value': -9999,
   'task_name': 'Missing Values Imputed',
   'constraints': {'int': {'min': -99999,
     'max': 99999,
     'supports_grid_search': False}}},
  {'parameter_name': 'min_count_na',
   'parameter_id': 'eyJhcmciOiJtaW5fY291bnRfbmEiLCJ2aWQiOiIxMSJ9',
   'default_value': 5,
   'current_value': 5,
   'task_name': 'Missing Values Imputed',
   'constraints': {'int': {'min': 0,
     'max': 99999,
     'supports_grid_search': False}}},
  {'parameter_name': 'card_max',
   'parameter_id': 'eyJhcmciOiJjYXJkX21heCIsInZpZCI6IjEifQ',
   'default_value': 'None',
   'current_value': 'None',
   'task_name': 'Ordinal encoding of categorical variables',
   'constraints': {'select': {'values': ['None']},
    'int': {'min': 1, 'max': 9999999, 'supports_grid_search': False}}},
  {'parameter_name': 'method',
   'paramete

In [14]:
# Set parameter values for a new run.
# NOTE(review): 'threshold' is not among the parameters visible in the
# (truncated) listings shown in this notebook; confirm it exists for the
# selected model.
tune.set_parameter(parameter_name='threshold', value=70)

In [15]:
# Run the tuning session with the values set on tune; the returned object is
# kept in job.
job = tune.run()

#### Retrieve model coefficients 


In [16]:
# Re-fetch the selected model and read the derived features from its parameters.
dict_derived_features = (
    dr.Model.get(project=selected_project, model_id=selected_model)
    .get_parameters()
    .derived_features
)

# Keep only the feature names and their coefficients, then display the table.
df_derived_features = pd.DataFrame(dict_derived_features).loc[
    :, ['original_feature', 'derived_feature', 'coefficient']
]
df_derived_features

Unnamed: 0,original_feature,derived_feature,coefficient
0,addr_state,addr_state-AZ,0.078547
1,addr_state,addr_state-CA,0.030598
2,addr_state,addr_state-CO,-0.031749
3,addr_state,addr_state-CT,0.012494
4,addr_state,addr_state-FL,-0.012211
5,addr_state,addr_state-GA,-0.020870
6,addr_state,addr_state-IL,-0.118720
7,addr_state,addr_state-MA,0.014594
8,addr_state,addr_state-MD,-0.052028
9,addr_state,addr_state-MI,-0.038519
