In [1]:
import sys
# sys.path.insert(0, '/Users/matthew.cohen/Dev/DataRobot-my\ github\ repo/David\ Shinn')

import find_optimal_thresholds_from_roc_curves as fo
import pandas as pd
import datarobot as dr

from pprint import pprint as pprint

In [2]:
#
# Imperative (non-functional-programming style) version of David Shinn's code
# 
# Id of the DataRobot project used by every cell below; edit it here to point
# the notebook at a different project.
PROJECT_ID = '5c706f7f114a6d305a75e34c'
project = dr.Project.get(PROJECT_ID)

def find_optimal_threshold(tn, fn, tp, fp, model, source, objective_label='objective', maximize=True):
    """Pick the ROC point of ``model`` with the best confusion-matrix payoff.

    The payoff of a ROC point is the weighted sum::

        tn * true_negative_score + fn * false_negative_score
        + tp * true_positive_score + fp * false_positive_score

    Parameters
    ----------
    tn, fn, tp, fp : number
        Weight applied to the true-negative, false-negative, true-positive
        and false-positive count of each ROC point.
    model : object
        Must expose ``project_id``, ``id``, ``model_type`` and
        ``get_all_roc_curves()``. Each returned curve must expose ``source``
        and ``roc_points``, a sequence of dicts holding the four ``*_score``
        keys used above.
    source : str
        Value matched against each curve's ``source`` attribute:
        'validation', 'crossValidation', or 'holdout'.
    objective_label : str, optional
        Key under which the best payoff is stored in the returned dict.
    maximize : bool, optional
        If True the largest payoff wins, otherwise the smallest.

    Returns
    -------
    dict
        Always contains 'project_id', 'model_id', 'model_name' and
        ``objective_label``. When a best point is found, every key of that
        ROC point is merged in and ``objective_label`` holds its payoff.
        When no curve matches ``source`` or the matching curve has no ROC
        points, a message is printed and ``objective_label`` stays ``None``.
    """

    def payoff_function(roc_point):
        return (tn * roc_point['true_negative_score']
                + fn * roc_point['false_negative_score']
                + tp * roc_point['true_positive_score']
                + fp * roc_point['false_positive_score'])

    # dict to return, holds all values we want to see/output
    output_values = {'project_id': model.project_id,
                     'model_id': model.id,
                     'model_name': model.model_type,
                     objective_label: None}

    # Get the roc curve for the source partition we want:
    # - 'validation', 'crossValidation', or 'holdout'
    roc_curve = next((curve for curve in model.get_all_roc_curves()
                      if curve.source == source), None)
    if roc_curve is None:  # source not found
        print('source not found:', output_values)
        return output_values

    roc_points = roc_curve.roc_points
    if len(roc_points) == 0:
        # Nothing to optimize over; report it the same way as a missing
        # source instead of failing with an IndexError.
        print('no roc points found:', output_values)
        return output_values

    # Score every point once, then take the first index holding the best
    # payoff (max/min return the earliest winner, so ties keep the first point).
    payoffs = [payoff_function(roc_point) for roc_point in roc_points]
    select = max if maximize else min
    best_index = select(range(len(payoffs)), key=payoffs.__getitem__)

    output_values.update(roc_points[best_index])
    output_values[objective_label] = payoffs[best_index]
    return output_values

# Demo inputs: the first model returned for the project, the partition whose
# ROC curve is searched, and the key the best payoff is reported under.
model = project.get_models()[0]
objective_label = '_best_value'
source = 'crossValidation'

# Lending Club example: each true negative earns 10 in interest and each
# false negative loses 100 of principal. True and false positives are
# weighted 0 because opportunity costs are not being captured.
find_optimal_threshold(
    tn=10, fn=-100, tp=0, fp=0,
    model=model,
    source=source,
    objective_label=objective_label,
    maximize=True,
)

{'_best_value': 12820,
 'accuracy': 0.45625,
 'f1_score': 0.2899118511263467,
 'false_negative_score': 148,
 'false_positive_rate': 0.6033888569787479,
 'false_positive_score': 4202,
 'fraction_predicted_as_negative': 0.36375,
 'fraction_predicted_as_positive': 0.63625,
 'lift_negative': 1.0903399120859165,
 'lift_positive': 1.34717934325007,
 'matthews_correlation_coefficient': 0.1770992697535978,
 'model_id': '5c70713ecaf9721bbf6ebe9c',
 'model_name': 'Gradient Boosted Trees Classifier with Early Stopping',
 'negative_predictive_value': 0.9491408934707903,
 'positive_predictive_value': 0.17445972495088408,
 'project_id': '5c706f7f114a6d305a75e34c',
 'threshold': 0.08811994579510264,
 'true_negative_rate': 0.39661114302125217,
 'true_negative_score': 2762,
 'true_positive_rate': 0.8571428571428571,
 'true_positive_score': 888}

In [3]:
# 
# David's version: cost-matrix optimization for a single model
# 
# Same payoff weights as the local implementation (tn=10, fn=-100, tp=0, fp=0),
# this time wrapped by fo.create_payoff_function and handed to the library
# routine for the first model returned for the project.
threshold = fo.find_optimal_threshold(
    model=project.get_models()[0],
    objective_function=fo.create_payoff_function(tn=10, fn=-100, tp=0, fp=0),
    objective_label='payoff',
    source='crossValidation',
    maximize=True,
)
threshold

{'accuracy': 0.45625,
 'f1_score': 0.2899118511263467,
 'false_negative_score': 148,
 'false_positive_rate': 0.6033888569787479,
 'false_positive_score': 4202,
 'fraction_predicted_as_negative': 0.36375,
 'fraction_predicted_as_positive': 0.63625,
 'lift_negative': 1.0903399120859165,
 'lift_positive': 1.34717934325007,
 'matthews_correlation_coefficient': 0.1770992697535978,
 'model_id': '5c70713ecaf9721bbf6ebe9c',
 'negative_predictive_value': 0.9491408934707903,
 'payoff': 12820,
 'positive_predictive_value': 0.17445972495088408,
 'project_id': '5c706f7f114a6d305a75e34c',
 'threshold': 0.08811994579510264,
 'true_negative_rate': 0.39661114302125217,
 'true_negative_score': 2762,
 'true_positive_rate': 0.8571428571428571,
 'true_positive_score': 888}

In [4]:
# 
# David's version: cost-matrix optimization across all models in the project
# 
# Project-wide variant: the whole project is passed instead of one model,
# with the same payoff weights (tn=10, fn=-100, tp=0, fp=0).
thresholds = fo.find_optimal_thresholds(
    project=project,
    objective_function=fo.create_payoff_function(tn=10, fn=-100, tp=0, fp=0),
    objective_label='payoff',
    source='crossValidation',
    maximize=True,
)
thresholds

[{'accuracy': 0.45625,
  'f1_score': 0.2899118511263467,
  'false_negative_score': 148,
  'false_positive_rate': 0.6033888569787479,
  'false_positive_score': 4202,
  'fraction_predicted_as_negative': 0.36375,
  'fraction_predicted_as_positive': 0.63625,
  'lift_negative': 1.0903399120859165,
  'lift_positive': 1.34717934325007,
  'matthews_correlation_coefficient': 0.1770992697535978,
  'model_id': '5c70713ecaf9721bbf6ebe9c',
  'negative_predictive_value': 0.9491408934707903,
  'payoff': 12820,
  'positive_predictive_value': 0.17445972495088408,
  'project_id': '5c706f7f114a6d305a75e34c',
  'threshold': 0.08811994579510264,
  'true_negative_rate': 0.39661114302125217,
  'true_negative_score': 2762,
  'true_positive_rate': 0.8571428571428571,
  'true_positive_score': 888},
 {'accuracy': 0.496625,
  'f1_score': 0.2961020800559343,
  'false_negative_score': 189,
  'false_positive_rate': 0.5511200459506032,
  'false_positive_score': 3838,
  'fraction_predicted_as_negative': 0.414375000000