In [1]:
# Load the H2O library and start up the H2O cluter locally on your machine
import h2o

# Number of threads, nthreads = -1, means use all cores on your machine
# max_mem_size is the maximum memory (in GB) to allocate to H2O
h2o.init(nthreads = -1, max_mem_size = 8)

  def _ipython_display_formatter_default(self):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


0,1
H2O cluster uptime:,2 hours 39 minutes 28 seconds 982 milliseconds
H2O cluster version:,3.8.2.6
H2O cluster name:,H2O_started_from_python_laurend_rbc272
H2O cluster total nodes:,1
H2O cluster total free memory:,3.27 GB
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster healthy:,True
H2O Connection ip:,127.0.0.1
H2O Connection port:,54321


In [2]:
#h2o.shutdown(prompt=False)
#h2o.init(nthreads = -1, max_mem_size = 8)

## Train a binary classification GBM model

In [6]:
# A small clean telecommunications sample dataset (https://www.ibm.com/communities/analytics/watson-analytics-blog/predictive-insights-in-the-telco-customer-churn-data-set/
telco_dataset = h2o.import_file("/Users/laurend/Desktop/projects/WA_Fn-UseC_-Telco-Customer-Churn.csv")
# select all columns as predictors except the customerID (which is like an index) and the response column
features_list = list(telco_dataset.columns[1:-1])
response_name = 'Churn'
# specify the response column
response_col = telco_dataset['Churn']


Parse Progress: [##################################################] 100%


In [7]:
# Partition data into 70%, 15%, 15% chunks
# Setting a seed will guarantee reproducibility
splits = telco_dataset.split_frame(ratios=[0.75,0.15], seed=1234)

train = splits[0]
valid = splits[1]
test = splits[2]

In [8]:
# Import H2O GBM:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [9]:
# Initialize and train the GBM estimator:

gbm_fit1 = H2OGradientBoostingEstimator(model_id='gbm_fit1', seed=1234)
gbm_fit1.train(x=features_list, y=response_name, training_frame=train)


gbm Model Build Progress: [##################################################] 100%


In [11]:
pred = gbm_fit1.predict(valid)


gbm prediction Progress: [##################################################] 100%


In [70]:
performance_train = gbm_fit1.model_performance(train)
print performance_train.auc()
performance_valid = gbm_fit1.model_performance(valid)
print performance_valid.auc()

0.900074549019
0.849726612388


In [18]:
models_predictions = gbm_fit1.predict(valid)
print models_predictions 


gbm prediction Progress: [##################################################] 100%
predict          No        Yes
---------  --------  ---------
Yes        0.45102   0.54898
No         0.82172   0.17828
Yes        0.114567  0.885433
No         0.953581  0.0464187
No         0.76541   0.23459
Yes        0.324923  0.675077
No         0.928357  0.0716425
Yes        0.273315  0.726685
No         0.936856  0.063144
Yes        0.385016  0.614984

[1029 rows x 3 columns]


In [12]:
intervention_cost = 3.0  # Cost of classification 
effectiveness = 0.1      # 10% of users will be influenced by this particular intervention

In [25]:
# this is temporary for debubgging only
# we can set a threshold to use for now (this will be a variable in the future)
newdata = valid
model = gbm_fit1
threshold = 0.6
pred = model.predict(newdata)
pred['predict'] = pred['Yes']>threshold


gbm prediction Progress: [##################################################] 100%


In [26]:
pred

predict,No,Yes
0,0.45102,0.54898
0,0.82172,0.17828
1,0.114567,0.885433
0,0.953581,0.0464187
0,0.76541,0.23459
1,0.324923,0.675077
0,0.928357,0.0716425
1,0.273315,0.726685
0,0.936856,0.063144
1,0.385016,0.614984




Now that we have a trained model and predictions (0/1) on a validation set, we can compute the confusion matrix to get the numbers of TP, FP, TN, FN.

In [89]:
conf = model.confusion_matrix()
print conf
print type(conf)


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.331672324216: 


0,1,2,3,4
,No,Yes,Error,Rate
No,3221.0,661.0,0.1703,(661.0/3882.0)
Yes,267.0,1148.0,0.1887,(267.0/1415.0)
Total,3488.0,1809.0,0.1752,(928.0/5297.0)



<class 'h2o.model.confusion_matrix.ConfusionMatrix'>


In [32]:
conf_df = conf.table.as_data_frame()
print conf_df

              No     Yes   Error             Rate
0     No  3221.0   661.0  0.1703   (661.0/3882.0)
1    Yes   267.0  1148.0  0.1887   (267.0/1415.0)
2  Total  3488.0  1809.0  0.1752   (928.0/5297.0)


In [47]:
TN = conf_df.ix[0,1]  #True Negative
FN = conf_df.ix[0,2]  #False Negative
FP = conf_df.ix[1,1]  #False Positive
TP = conf_df.ix[1,2]  #True Positive

## Define cost parameters

- `unit_full_price`: Unit cost in USD, e.g. "cost of a Starbucks coffee"
- `unit_discount`: The discount of the coupon, e.g. 0.4 = 40% off
- `discount_effectiveness`: The effectiveness of the coupon to prevent churn, e.g. 0.3 = 30% of people who would have churned, but received the coupon, will decide to buy the unit at a discounted price instead of not buying (churning).

In [90]:
unit_full_price = 1.00
unit_discount = 0.4
discount_effectiveness = 0.3
discounted_unit_price = (1 - unit_discount) * unit_full_price
print discounted_unit_price

0.6


### Send coupon to people who are predicted to churn

For all the people who we predict as "Will Churn", we send them a coupon to try to get them not to churn.  There are two groups in this population:

1. TP: Apply the coupon and 30% of these folks will decide to buy the discounted item instead of not buying.  Company makes money.
2. FP: These people were not going to churn -- so now they have 40% off coupon for something they were going to pay full price for.  Company loses money.

In [56]:
# Total reward of TP group: TP * 0.6 * 0.3  #Discounted price is $0.60, which is 40% off of $1.00
TP_value = TP * discounted_unit_price * discount_effectiveness 
print TP_value

# Total cost of FP group: FP * 0.4 * 1.00  #Coupon is $0.40, which is 40% of $1.00, lost for each FP
FP_value = FP * unit_full_price * unit_discount * -1
print FP_value

206.64
-106.8


### Don't send coupon to people who are not predicted to churn

For all the people who we predict as "Will Not Churn", we don't bother to send them a coupon.  There are two groups in this population:

1. TN: These people were not going to churn, and they didn't get a coupon, so there's no loss or gain here.
2. FN: These people do churn, and since they didn't get an inticing coupon in the mail, there is nothing to prevent them from churning.  These are lost customers.  Company loses money.

In [82]:
# Total cost of FN group: 30% of these churns could have been saved at a 40% discount
FN_value = FN * discounted_unit_price * discount_effectiveness * -1
print FN_value

# Total cost/reward of TN group: Nothing, the intervention has no effect on the outcome for this population
TN_value = 0.00
print TN_value 

-118.98
0.0


In [83]:
intervention_net_value = TP_value + FP_value + TN_value + FN_value
print intervention_net_value

-19.14


### Intervention Value Function

Ok, lets wrap all this up in a function and try different interventions.

In [86]:
def intervention_value(intervention_params, newdata, model, threshold = None):
    ''' 
    intervention_params is a dict specifying intervention parameters
    model must be a binomial H2O model
    threshold is a number between 0 and 1
    newdata is an H2OFrame of test data
    '''
    
    # Parse parameters
    unit_full_price = intervention_params['unit_full_price']
    unit_discount = intervention_params['unit_discount']
    discount_effectiveness = intervention_params['discount_effectiveness']
    discounted_unit_price = (1 - unit_discount) * unit_full_price #CHANGED THIS FROM unit_cost
    
#     assert(unit_discount > 0.0)
    
    if threshold is not None:
        # Update the predictions using specified threshold
        pred = model.predict(newdata)
        pred['predict'] = pred['Yes']>threshold
    
    # Confusion matrix
    conf = model.confusion_matrix()
    print conf
    conf_df = conf.table.as_data_frame()
    TN = conf_df.ix[0,1]  #True Negative
    FN = conf_df.ix[0,2]  #False Negative
    FP = conf_df.ix[1,1]  #False Positive
    TP = conf_df.ix[1,2]  #True Positive
    
    # Total reward of TP group: TP * 0.6 * 0.3  #Discounted price is $0.60, which is 40% off of $1.00
    TP_value = TP * discounted_unit_price * discount_effectiveness 
    print TP_value

    # Total cost of FP group: FP * 0.4 * 1.00  #Coupon is $0.40, which is 40% of $1.00, lost for each FP
    FP_value = FP * unit_full_price * unit_discount * -1
    print FP_value
    
    # Total cost of FN group: 30% of these churns could have been saved at a 40% discount
    FN_value = TN * discounted_unit_price * discount_effectiveness * -1
    print FN_value

    # Total cost/reward of TN group: Nothing, the intervention has no effect on the outcome for this population
    TN_value = 0.00
    print TN_value 
    
    intervention_net_value = TP_value + FP_value + TN_value + FN_value
    print 'Value of intervention is %.2f'%intervention_net_value
    return intervention_net_value
    

In [85]:
# Let's try some interventions:

intervention1 = {'unit_full_price': 1.00,
                 'unit_discount': 0.4,
                 'discount_effectiveness': 0.3}

ival1 = intervention_value(intervention_params = intervention1, newdata = test, model = model, threshold = 0.2)


gbm prediction Progress: [##################################################] 100%

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.331672324216: 


0,1,2,3,4
,No,Yes,Error,Rate
No,3221.0,661.0,0.1703,(661.0/3882.0)
Yes,267.0,1148.0,0.1887,(267.0/1415.0)
Total,3488.0,1809.0,0.1752,(928.0/5297.0)



206.64
-106.8
-579.78
0.0
Value of intervention is -479.94


In [76]:
# A unit discount of $0.00 should produce a value of intervention of 0.00

intervention2 = {'unit_full_price': 1.00,
                 'unit_discount': 0.1,
                 'discount_effectiveness': 0.3}

ival2 = intervention_value(intervention_params = intervention2, newdata = test, model = model, threshold = 0.2)


gbm prediction Progress: [##################################################] 100%

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.331672324216: 


0,1,2,3,4
,No,Yes,Error,Rate
No,3221.0,661.0,0.1703,(661.0/3882.0)
Yes,267.0,1148.0,0.1887,(267.0/1415.0)
Total,3488.0,1809.0,0.1752,(928.0/5297.0)



68.88
-213.6
-193.26
0.0
Value of intervention is -337.98


In [92]:
# A unit discount of $0.00 should produce a value of intervention of 0.00... hmm maybe something is wrong above?

intervention2 = {'unit_full_price': 1.00,
                 'unit_discount': 0.0,
                 'discount_effectiveness': 0.3}

ival = intervention_value(intervention_params = intervention2, newdata = test, model = model, threshold = None)


Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.331672324216: 


0,1,2,3,4
,No,Yes,Error,Rate
No,3221.0,661.0,0.1703,(661.0/3882.0)
Yes,267.0,1148.0,0.1887,(267.0/1415.0)
Total,3488.0,1809.0,0.1752,(928.0/5297.0)



344.4
-0.0
-966.3
0.0
Value of intervention is -621.90
