# Better Blenders Bureau
Script v1
Alex Wilson, Sept 2019

Take the stacked predictions from one DataRobot project and load them into a new project as features, so that we can use feature associations (and similar tools) to determine which models will combine to make the best blenders.

### Step 0: Specify configuration
- project id, data file
- configuration: which models to use

### Step 1: Download stacked predictions from DR 
(from models selected using specified heuristics)

### Step 2: Merge these predictions together, along with the target and partitioning

### Step 3: Load merged predictions into new DR project

POSSIBLE FUTURE Step 4: run feature association 
Also possible: download prediction explanations for clustering

# Requirements
- libraries: datarobot, pandas
- credentials.py in the same directory, defining dr_token and dr_endpoint with your credentials
- project id
- original data file

In [47]:
import datarobot as dr
import pandas as pd
dr.__version__

'2.17.0'

In [76]:
# --- Source project -------------------------------------------------------
project_id = '5d6edb1278132c4e2fcd27fa'

# The python client cannot give us the target values back out of DataRobot,
#   so the original training file is needed to recover them.
original_data_file = 'easypay_training_data_9_3_2019.csv'

# --- Model selection heuristics -------------------------------------------
training_pct_threshold = 0.60  # keep only models trained on at least 60% of the rows
limit_to_top_models = True  # when True, keep only the best `num_top_models` models
num_top_models = 30  # how many models "top" means
limit_to_already_cv_only = False  # when True, keep only models that already ran CV

In [75]:
# Read the original training file (needed later for the target column),
# then display five random rows as a quick visual check.
training_df = pd.read_csv(
    original_data_file,
    low_memory=False,
)
training_df.sample(n=5)

Unnamed: 0,ESS_Account_Number,AppId,Industry,Merchant_CID,Applicant_State,Applicant_Zip,UW_Decision,APR,Bank_Validation,Giact_Account_Code,...,CVPL_Score,Max_Funding_amount,date_created,contract_amount_financed,Lexis_Nexis_alt_score,Lexis_Nexis_Results,App_Type,Payment_Frequency,Infile_date_from_credit_report,loan_is_bad
16696,1173888,1173888,Pet,9664,IL,60585.0,4,151.99,Pass,_1111,...,532,2500.0,10/15/2018,2458.83,0,0,4,BiWeekly,10/15/2018,1
71489,902603,902603,Automotive,2201,FL,33414.0,5,188.96,Pass,_1111,...,505,1500.0,1/12/2018,800.0,571,0,6,BiWeekly,1/12/2018,0
90471,2236A199,804127,Automotive,2236,OH,43609.0,6,198.97,Pass,_1111,...,480,500.0,9/6/2017,500.0,605,0,4,Monthly,9/6/2017,1
13298,1191028,1191028,Furniture,5271,CA,92260.0,6,198.97,Pass,_1111,...,489,500.0,10/30/2018,445.45,576,0,4,Monthly,10/30/2018,0
66351,928154,928154,Pet,6208,CA,92595.0,3,58.97,Pass,_1111,...,611,4000.0,2/10/2018,1385.41,0,0,5,BiWeekly,2/10/2018,0


In [50]:
# Run the local credentials.py so that dr_token and dr_endpoint exist in the notebook
# namespace, configure the DataRobot client with them, then fetch the source project.
%run credentials.py
dr.Client(token=dr_token, endpoint=dr_endpoint)
project = dr.Project.get(project_id)
project

Project(easypay_training_data_9_3_2019.csv (Alex))

In [74]:
# Report the project's metric and target, then the size of the original training file.
for label, value in (("Project Optimization Metric: ", project.metric),
                     ("Project Target: ", project.target)):
    print(label + value)

# total_rows is reused later to express each model's training size as a fraction.
total_rows = len(training_df.index)
print(f'Total Training Rows: {total_rows}')

Project Optimization Metric: LogLoss
Project Target: loan_is_bad
Total Training Rows: 100219


In [71]:
# Build a leaderboard table: one row per model holding its cross-validation score,
# its validation score, the model object itself, the fraction of the original rows
# it was trained on, and its category.
metric_name = project.metric
score_records = []
for candidate in project.get_models(with_metric=metric_name):
    candidate_scores = candidate.metrics[metric_name]
    score_records.append({
        'cv': candidate_scores['crossValidation'],
        'v': candidate_scores['validation'],
        'model': candidate,
        'pct': candidate.training_row_count / total_rows,
        'category': candidate.model_category,
    })

# Best (lowest) CV score first, validation score as tie-breaker; models without a
# score sink to the bottom.
model_scores = pd.DataFrame(score_records,
                            columns=['cv', 'v', 'model', 'pct', 'category'])
model_scores = model_scores.sort_values(by=['cv', 'v'], na_position='last')
# TODO: add ability to detect need for reverse sorting for metrics like AUC
model_scores

Unnamed: 0,cv,v,model,pct,category
0,0.511878,0.51205,Model('AVG Blender'),0.639998,blend
1,0.511878,0.51206,Model('ENET Blender'),0.639998,blend
3,0.512202,0.51251,Model('AVG Blender'),0.639998,blend
2,0.512340,0.51220,Model('AVG Blender'),0.639998,blend
4,0.512532,0.51278,Model('AVG Blender'),0.639998,blend
5,0.513020,0.51300,Model('AVG Blender'),0.639998,blend
6,0.513050,0.51325,Model('eXtreme Gradient Boosted Trees Classifi...,0.639998,model
7,0.513150,0.51345,Model('eXtreme Gradient Boosted Trees Classifi...,0.639998,model
13,0.513230,0.51385,Model('eXtreme Gradient Boosted Trees Classifi...,0.639998,model
12,0.513378,0.51375,Model('eXtreme Gradient Boosted Trees Classifi...,0.639998,model


In [60]:
# Selection rules, applied in order:
#   1. keep only plain models (drops blenders, prime models, and scaleout models);
#   2. keep only models at or above the training percentage threshold;
#   3. if we are not training new CV models, drop the ones that haven't run CV yet;
#   4. if we are limiting to top models, keep the first X of what remains
#      (model_scores is already sorted best-first), otherwise keep them all.

is_eligible = (
    (model_scores['category'] == 'model')
    & (model_scores['pct'] >= training_pct_threshold)
)
if limit_to_already_cv_only:
    is_eligible = is_eligible & model_scores['cv'].notna()

models_to_use = model_scores.loc[is_eligible, 'model'].tolist()
if limit_to_top_models:
    models_to_use = models_to_use[:num_top_models]
models_to_use

[Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.02)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.02)'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping and Unsupervised Learning Features'),
 Model('eXtreme Gradient Boosted Trees Classifier with Early Stopping and Unsupervised Learning Features'),
 Model('eXtreme Gradient Boosted Trees Classifier wit

In [62]:
# Kick off training-prediction jobs for every selected model (and cross validation
# first, for models that have no CV score yet). Jobs submitted by this cell are
# collected in train_pred_jobs so that they can be waited on afterwards.
# Failures are reported and skipped on purpose: one bad model should not stop the rest.
train_pred_jobs = []
for model in models_to_use:
    # Compare against None explicitly: a plain truthiness test would also treat a
    # legitimate score of 0.0 as "CV has not been run".
    # NOTE(review): presumably the client reports a missing CV score as None -- confirm.
    if model.metrics[project.metric]['crossValidation'] is None:
        try:
            model.cross_validate()
            print('Cross Validation begun for model ' + model.id)
        except Exception as e:
            print('CV error: ', str(e))
    try:
        train_pred_jobs.append(model.request_training_predictions(dr.enums.DATA_SUBSET.ALL))
        print('Training Predictions begun for model ' + model.id)
    except Exception as e:
        # Match on the meaningful part of the message only; comparing the whole
        # exception text for equality breaks as soon as its wrapper changes.
        if 'Training predictions request already submitted' in str(e):
            print('Predictions already kicked off for model ' + model.id)
        else:
            print(str(e))
print('\nAll training prediction jobs kicked off')

Predictions already kicked off for model 5d6edd3878132c4e72cd2817
Predictions already kicked off for model 5d6edd3778132c4e72cd27fb
Predictions already kicked off for model 5d6edd3578132c4e72cd27df
Predictions already kicked off for model 5d6edd3578132c4e72cd27dc
Predictions already kicked off for model 5d6edd3578132c4e72cd27e3
Predictions already kicked off for model 5d6edd3878132c4e72cd280c
Predictions already kicked off for model 5d6edd3878132c4e72cd280a
Predictions already kicked off for model 5d6edd3878132c4e72cd2813
Predictions already kicked off for model 5d6edd3778132c4e72cd2801
Predictions already kicked off for model 5d6edd3578132c4e72cd27de
Predictions already kicked off for model 5d6edd3678132c4e72cd27ec
Predictions already kicked off for model 5d6fc22378132c6e59cd286b
Predictions already kicked off for model 5d6edd3778132c4e72cd27ff
Predictions already kicked off for model 5d6fc21978132c700dcd29aa
Cross Validation begun for model 5d6edd3678132c4e72cd27f5
Training Predictio

In [64]:
# Collect the ids of the selected models, plus a mapping from each id to a display
# name that starts with the model id and ends with the model type, like
#   "5d6edd3578132c4e72cd27df eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)"
# The id comes first so that it can be copied (tooltips cannot be). Something like
#   "M127 eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x)"
# would be nicer, but the M### numbers are not available.
model_ids = []
model_ids_to_names = {}
for selected_model in models_to_use:
    model_ids.append(selected_model.id)
    model_ids_to_names[selected_model.id] = ' '.join(
        (selected_model.id, selected_model.model_type))

In [63]:
# Block until every job collected in train_pred_jobs has produced its result.
for pending_job in train_pred_jobs:
    pending_job.get_result_when_complete()
print('All training prediction jobs complete!')

All training prediction jobs complete!


In [65]:
# Download the training predictions of the selected models as
# (display name, DataFrame) pairs, keeping only the 'all' data subset.
tp_list = dr.TrainingPredictions.list(project_id=project_id)
predictions = []
for tp in tp_list:
    if tp.model_id not in model_ids or tp.data_subset != 'all':
        continue  # belongs to an unselected model, or covers a different subset
    display_name = model_ids_to_names[tp.model_id]
    predictions.append((display_name, tp.get_all_as_dataframe()))
predictions

[('5d6edd3778132c4e72cd27ff Light Gradient Boosting on ElasticNet Predictions ',
          row_id partition_id  prediction  class_1.0  class_0.0
  0            0          0.0         1.0   0.544143   0.455857
  1            1          2.0         0.0   0.271642   0.728358
  2            2          3.0         0.0   0.101378   0.898622
  3            3          1.0         0.0   0.175952   0.824048
  4            4          4.0         0.0   0.307233   0.692767
  5            5      Holdout         0.0   0.280588   0.719412
  6            6          2.0         0.0   0.136862   0.863138
  7            7          3.0         0.0   0.096983   0.903017
  8            8          2.0         0.0   0.274321   0.725679
  9            9          3.0         0.0   0.043561   0.956439
  10          10          4.0         1.0   0.511695   0.488305
  11          11          0.0         0.0   0.221415   0.778585
  12          12          1.0         1.0   0.558006   0.441994
  13          13       

In [66]:
# Stack every model's training predictions into one long frame, tagging each row
# with the display name of the model that produced it.
labelled_frames = []
for model, tp_df in predictions:
    tp_df['Model'] = model  # tag in place, so the frames in `predictions` carry it too
    labelled_frames.append(tp_df)

# Concatenate once instead of calling DataFrame.append per model: append copies the
# growing frame on every call and no longer exists in pandas 2.0+.
# Columns are put in alphabetical order, and no predictions at all yields an empty frame.
if labelled_frames:
    pred_df = pd.concat(labelled_frames, sort=False).sort_index(axis=1)
else:
    pred_df = pd.DataFrame()

In [67]:
# Quick sanity check to make sure partitions are consistent: every
# (partition_id, row_id) pair must occur exactly once per model. If any model put a
# row in a different partition, that pair shows up fewer times and the check is False.
# The expected count is taken from the data instead of being hard-coded, so the
# check stays valid whatever number of models was selected.
expected_model_count = pred_df['Model'].nunique()
models_per_row = pred_df.groupby(['partition_id', 'row_id'])['Model'].count()
bool((models_per_row == expected_model_count).all())

False

In [68]:
# Merge all the predictions into one wide frame: the true target, the partition id,
# and one column of positive-class probabilities ('class_1.0') per model.
merged_df = pd.DataFrame()
merged_df['target'] = training_df[project.target]
for position, (model, tp_df) in enumerate(predictions):
    # Align on row_id instead of relying on each download arriving in file order:
    # column assignment matches on index labels, so reordered or missing rows no
    # longer end up silently attached to the wrong target value.
    # NOTE(review): assumes row_id is the 0-based row position in the original file,
    #   which is what the downloaded frames displayed above suggest -- confirm.
    by_row_id = tp_df.set_index('row_id')
    if position == 0:
        # The partitioning is taken from the first model only.
        merged_df['partition_id'] = by_row_id['partition_id']
    merged_df[model] = by_row_id['class_1.0']
    print(model + ' predictions downloaded')
merged_df

5d6edd3778132c4e72cd27ff Light Gradient Boosting on ElasticNet Predictions  predictions downloaded
5d6edd3878132c4e72cd2813 eXtreme Gradient Boosted Trees Classifier with Early Stopping predictions downloaded
5d6edd3878132c4e72cd280a eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.02) predictions downloaded
5d6edd3878132c4e72cd280c eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.02) predictions downloaded
5d6edd3578132c4e72cd27df eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x) predictions downloaded
5d6edd3778132c4e72cd27fb eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01) predictions downloaded
5d6edd3878132c4e72cd2817 eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01) predictions downloaded
5d6edd3678132c4e72cd27ec eXtreme Gradient Boosted Trees Classifier with Early Stopping predictions downloaded
5d6edd3778132c4e72cd2801 eXt

Unnamed: 0,target,partition_id,5d6edd3778132c4e72cd27ff Light Gradient Boosting on ElasticNet Predictions,5d6edd3878132c4e72cd2813 eXtreme Gradient Boosted Trees Classifier with Early Stopping,5d6edd3878132c4e72cd280a eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.02),5d6edd3878132c4e72cd280c eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.02),5d6edd3578132c4e72cd27df eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x),5d6edd3778132c4e72cd27fb eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01),5d6edd3878132c4e72cd2817 eXtreme Gradient Boosted Trees Classifier with Early Stopping (learning rate =0.01),5d6edd3678132c4e72cd27ec eXtreme Gradient Boosted Trees Classifier with Early Stopping,...,5d6fc22278132c703dcd2a52 eXtreme Gradient Boosted Trees Classifier with Early Stopping - Forest (10x),5d6edd3678132c4e72cd27ea Generalized Additive Model,5d6edd3878132c4e72cd2816 Nystroem Kernel SVM Classifier,5d6edd3778132c4e72cd27fa Elastic-Net Classifier (L2 / Binomial Deviance),5d6edd3578132c4e72cd27e8 Elastic-Net Classifier (L2 / Binomial Deviance) with Binned numeric features,5d6edd3678132c4e72cd27f3 RandomForest Classifier (Gini),5d6edd3878132c4e72cd2810 Gradient Boosted Trees Classifier,5d6edd3578132c4e72cd27e2 Nystroem Kernel SVM Classifier,5d6edd3778132c4e72cd27fc Regularized Logistic Regression (L2),5d6edd3878132c4e72cd2811 Regularized Logistic Regression (L2)
0,0,0.0,0.544143,0.586868,0.537351,0.499777,0.568448,0.529626,0.540316,0.564793,...,0.566832,0.536226,0.545676,0.542792,0.563134,0.533528,0.555347,0.532217,0.554277,0.520564
1,1,2.0,0.271642,0.332193,0.360062,0.350863,0.357057,0.331964,0.366810,0.331746,...,0.363360,0.278535,0.297597,0.324074,0.235563,0.353728,0.364583,0.281321,0.319992,0.389124
2,0,3.0,0.101378,0.079288,0.091337,0.112245,0.096829,0.107494,0.099587,0.078203,...,0.085788,0.127622,0.168584,0.166101,0.132128,0.155594,0.145078,0.175114,0.180299,0.194728
3,0,1.0,0.175952,0.202529,0.205419,0.191582,0.212787,0.160086,0.175652,0.195324,...,0.200247,0.183593,0.169446,0.170868,0.190508,0.243162,0.214279,0.152887,0.159614,0.177635
4,0,4.0,0.307233,0.349622,0.300034,0.284898,0.318136,0.259094,0.238039,0.321386,...,0.263493,0.256685,0.281336,0.297164,0.275562,0.230987,0.233982,0.264893,0.290395,0.288367
5,0,Holdout,0.280588,0.292676,0.315655,0.354636,0.319390,0.303826,0.304577,0.333205,...,0.312893,0.236722,0.261305,0.273931,0.306458,0.269448,0.299965,0.213228,0.267259,0.287377
6,1,2.0,0.136862,0.107709,0.101433,0.081368,0.119521,0.117207,0.111070,0.126953,...,0.116001,0.102079,0.119590,0.124101,0.104559,0.138886,0.134923,0.101065,0.087093,0.124729
7,0,3.0,0.096983,0.066524,0.074482,0.102713,0.078172,0.092397,0.071177,0.083405,...,0.079705,0.104182,0.096663,0.111806,0.128761,0.115586,0.098191,0.098151,0.139595,0.111774
8,1,2.0,0.274321,0.176701,0.190316,0.209268,0.283586,0.198473,0.206540,0.222674,...,0.295797,0.301979,0.262140,0.269282,0.309604,0.272390,0.263051,0.260792,0.240889,0.268370
9,0,3.0,0.043561,0.077878,0.059960,0.065673,0.061615,0.066163,0.064693,0.053376,...,0.062765,0.091527,0.044630,0.060623,0.093877,0.067049,0.090818,0.057344,0.088888,0.061840


In [69]:
# Custom partitioning isn't working with strings, so the partition column is turned
# into integers: the 'Holdout' label becomes -1, and the fold labels go through float
# first before being converted to int.
holdout_rows = merged_df['partition_id'].eq('Holdout')
merged_df.loc[holdout_rows, 'partition_id'] = -1
partition_as_float = merged_df['partition_id'].astype(float)
merged_df['partition_id'] = partition_as_float.astype(int)

In [70]:
# Create the new project from the merged predictions. It reuses the fold assignment
# stored in 'partition_id' (with -1 marking the holdout rows) and starts with
# autopilot switched off.
custom_partition = dr.UserCV(user_partition_col='partition_id', cv_holdout_level=-1)
new_project_settings = dict(
    sourcedata=merged_df,
    project_name='bbb_' + project.project_name,
    target='target',
    autopilot_on=False,
    partitioning_method=custom_partition,
)
project2 = dr.Project.start(**new_project_settings)
project2

Project(bbb_easypay_training_data_9_3_2019.csv (Alex))

In [72]:
# Finally, open the new project's leaderboard in your browser.
project2.open_leaderboard_browser()

True