In [1]:
# Third-party scientific stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# H2O client, estimators and grid search
import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator as H2ORFE
from h2o.estimators.gbm import H2OGradientBoostingEstimator as H2OGBE
from h2o.grid.grid_search import H2OGridSearch as H2OGS

# Project-local preprocessing helpers
import processing_functions

# Connect to the H2O cluster only after every import has succeeded.
h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,7 mins 10 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,1 month and 5 days
H2O cluster name:,H2O_from_python_tshih_0ff18d
H2O cluster total nodes:,1
H2O cluster free memory:,3.476 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


In [2]:
# Upload the local training CSV to the connected H2O cluster; `df` is the resulting H2O frame.
df = h2o.upload_file('data/train.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
# Run the project-local preprocessing helper on the uploaded frame (its implementation is not shown here).
# NOTE(review): this rebinds `df` to its own processed output, so re-running this cell
# without re-running the upload cell feeds an already-processed frame back into
# process_df — confirm the helper tolerates that.
df = processing_functions.process_df(df)
# Preview the first rows of the processed frame.
df.head()

C1,unix_pickup,unix_dropoff,trip_duration,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tolls_amount,improvement_surcharge,total_amount,trip_type,congestion_surcharge,tip_amount,did_tip
0,7672880.0,7673140.0,256,1,75,236,1,0.87,5.0,0,0.5,0,0.3,9.55,1,2.75,1.0,1
1,7671450.0,7671850.0,400,1,65,54,1,1.28,7.0,0,0.5,0,0.3,7.8,1,0.0,0.0,0
2,6889110.0,6889830.0,720,1,260,157,1,1.27,9.0,1,0.5,0,0.3,12.96,1,0.0,2.16,1
3,7584930.0,7585260.0,335,1,75,74,1,1.36,6.5,1,0.5,0,0.3,9.3,1,0.0,1.0,1
4,6093050.0,6095080.0,2031,1,65,246,1,7.27,28.0,0,0.5,0,0.3,37.86,1,2.75,6.31,1
5,5820870.0,5821270.0,403,1,25,195,1,1.6,7.0,0,0.5,0,0.3,9.8,1,0.0,2.0,1
6,6282570.0,6287040.0,4470,1,65,132,1,28.32,79.5,1,0.5,0,0.3,97.56,1,0.0,16.26,1
7,6115000.0,6116460.0,1459,1,65,162,1,6.52,22.5,1,0.5,0,0.3,32.46,1,2.75,5.41,1
8,7497990.0,7498720.0,729,1,7,138,1,3.06,12.5,1,0.5,0,0.3,17.16,1,0.0,2.86,1
9,7746290.0,7748800.0,2504,1,181,164,1,8.92,32.5,0,0.5,0,0.3,33.3,1,0.0,0.0,0




In [None]:
# Seeded three-way split: two ratios are given, the remainder becomes `test`.
split_ratios = [.8, .1]
train, valid, test = df.split_frame(ratios=split_ratios, seed=2020)

# Row-filter the train/validation partitions with their own 'did_tip' column;
# these subsets are the ones used for the tip-amount models fitted without zero tips.
train_1, valid_1 = (part[part['did_tip']] for part in (train, valid))

In [None]:
#Getting the names of the columns to use for models
# Columns that must never be fed to the models as predictors:
#   - 'C1'          : leading identifier column of the frame, not a ride attribute
#   - 'tip_amount'  : regression target
#   - 'did_tip'     : classification target
#   - 'total_amount': in the previewed rows it equals
#                     fare_amount + extra + mta_tax + tolls_amount +
#                     improvement_surcharge + congestion_surcharge + tip_amount,
#                     i.e. it already contains the tip. Keeping it as a feature
#                     leaks the target into every model.
# Selecting by name (rather than slicing off the last two columns) also keeps the
# feature list correct if process_df ever reorders or appends columns.
NON_FEATURE_COLUMNS = {'C1', 'total_amount', 'tip_amount', 'did_tip'}
col_name_X = [name for name in df.names if name not in NON_FEATURE_COLUMNS]

In [None]:
#generating the RF models

# Hyper-parameters common to all three random forests.
rf_shared = dict(
    ntrees=200,
    stopping_rounds=4,
    score_each_iteration=True,
    seed=2020,
)

# did_tip model: shallower trees, early stopping driven by AUC.
rf_bool = H2ORFE(model_id="rf_did_tip", max_depth=40, stopping_metric='auc', **rf_shared)

# tip_amount model fitted on the did_tip-filtered frames only.
rf_value = H2ORFE(model_id='rf_tip_amount_no_0', max_depth=50, **rf_shared)

# tip_amount model fitted on every row (zero tips included).
rf_with_0 = H2ORFE(model_id='rf_tip_amount_0', max_depth=50, **rf_shared)

# Fit each forest against its matching training / validation pair.
rf_bool.train(x=col_name_X, y='did_tip', training_frame=train, validation_frame=valid)
rf_value.train(x=col_name_X, y='tip_amount', training_frame=train_1, validation_frame=valid_1)
rf_with_0.train(x=col_name_X, y='tip_amount', training_frame=train, validation_frame=valid)

In [None]:
#generating the Gradient Boost Machines

# Search space for the (currently disabled) hyper-parameter grid search below.
grid_params = dict(
    ntrees=[10, 20, 30, 40],
    max_depth=[5, 10, 20, 30],
    sample_rate=[.7, 1.0],
    col_sample_rate=[.5, 1.0],
)

# Hyper-parameters common to all three GBMs.
gbm_shared = dict(
    max_depth=20,
    score_each_iteration=True,
    seed=2020,
)

# did_tip model.
gbm_bool = H2OGBE(model_id='gbm_did_tip', ntrees=30, **gbm_shared)

# tip_amount model fitted on the did_tip-filtered frames only (row-subsampled).
gbm_value = H2OGBE(model_id='gbm_tip_amount_no_0', ntrees=40, sample_rate=.7, **gbm_shared)

# tip_amount model fitted on every row (zero tips included).
gbm_with_0 = H2OGBE(model_id='gbm_tip_amount_0', ntrees=30, **gbm_shared)


#Hyperparameter Grid Search (kept for reference, not executed)
# gbm_grid = H2OGS(model = H2OGBE,grid_id = 'gbm_grid_0s',hyper_params = grid_params, parallelism = 0)
# gbm_grid_2 = H2OGS(model = H2OGBE,grid_id = 'gbm_grid_bool',hyper_params = grid_params, parallelism = 0)
# gbm_grid_3 = H2OGS(model = H2OGBE,grid_id = 'gbm_grid_values',hyper_params = grid_params, parallelism = 0)

# gbm_grid_3.train(col_name_X,'tip_amount',training_frame = train_1, validation_frame = valid_1)
# gbm_grid_2.train(col_name_X,'did_tip',training_frame = train, validation_frame = valid)
# gbm_grid.train(col_name_X,'tip_amount',training_frame = train, validation_frame = valid)


# Fit each GBM against its matching training / validation pair.
gbm_with_0.train(x=col_name_X, y='tip_amount', training_frame=train, validation_frame=valid)
gbm_bool.train(x=col_name_X, y='did_tip', training_frame=train, validation_frame=valid)
gbm_value.train(x=col_name_X, y='tip_amount', training_frame=train_1, validation_frame=valid_1)

In [None]:
#test_results is a dataframe that is identical to test, with 6 extra columns
#first four extra columns are permutation of using gbm/rf for classifier/regressor combo where we only regress when classifier predicts 1
#next 2 extra columns are just regressors using rf/gbm
#
# Column bookkeeping used throughout this cell: every appended column is called
# 'predict', and the code below addresses the later duplicates as 'predict0',
# 'predict1' and 'predict10'.
# NOTE(review): those suffixed names come from how cbind de-duplicates column
# names — confirm they still match on the H2O version in use.

#generating did_tip predictions and making a copy of each
# 'predict'   <- RF classifier output, 'predict0'  <- copy of it
# 'predict1'  <- GBM classifier output, 'predict10' <- copy of it
# The four columns are placeholders that are overwritten with tip amounts at the end of the cell.
test_results = test.cbind(rf_bool.predict(test)['predict'])
test_results = test_results.cbind(test_results[:,'predict'])
test_results = test_results.cbind(gbm_bool.predict(test)['predict'])
test_results = test_results.cbind(test_results[:,'predict1'])

#pure regressor
# Predictions of the regressors trained on all rows, rounded to 2 decimals.
test_results = test_results.cbind(rf_with_0.predict(test).round(2))
test_results = test_results.cbind(gbm_with_0.predict(test).round(2))

#Running the tips regressor on what the classifier thinks were tipped
# Row-filter test_results using each classifier's prediction column as the mask.
rf_tipped_1 = test_results[test_results['predict']]
gbm_tipped_1 = test_results[test_results['predict1']]
# For every classifier/regressor pairing keep only the 'C1' key plus the rounded
# regressor prediction for the rows that passed the classifier's filter.
rf_rf_tipped_1 = rf_tipped_1['C1'].cbind(rf_value.predict(rf_tipped_1).round(2))
rf_gbm_tipped_1 = rf_tipped_1['C1'].cbind(gbm_value.predict(rf_tipped_1).round(2))
gbm_rf_tipped_1 = gbm_tipped_1['C1'].cbind(rf_value.predict(gbm_tipped_1).round(2))
gbm_gbm_tipped_1 = gbm_tipped_1['C1'].cbind(gbm_value.predict(gbm_tipped_1).round(2))

#Combining the 0s of the classifier with the predictions of the regressor
# Join each filtered prediction set back onto the full list of 'C1' keys,
# keeping every test row (all_x = True); rows the classifier filtered out have
# no match and therefore a missing 'predict'.
merge_slice = test_results[:,'C1']
rf_rf_slice = merge_slice.merge(rf_rf_tipped_1,all_x = True)
rf_gbm_slice = merge_slice.merge(rf_gbm_tipped_1,all_x = True)
gbm_rf_slice = merge_slice.merge(gbm_rf_tipped_1,all_x = True)
gbm_gbm_slice = merge_slice.merge(gbm_gbm_tipped_1,all_x = True)

# Missing predictions (rows the classifier filtered out) become a predicted tip of 0.
rf_rf_slice[rf_rf_slice['predict'].isna(),'predict'] = 0
rf_gbm_slice[rf_gbm_slice['predict'].isna(),'predict'] = 0
gbm_rf_slice[gbm_rf_slice['predict'].isna(),'predict'] = 0
gbm_gbm_slice[gbm_gbm_slice['predict'].isna(),'predict'] = 0

#putting in the predictions into the original data frame
# Overwrites the four classifier placeholder columns with the combined tip predictions.
# NOTE(review): these assignments are positional, so they assume each merged
# slice has its rows in the same order as test_results — confirm that merge
# preserves (or reproduces) the 'C1' order of the test frame.
test_results['predict'] = rf_rf_slice['predict']
test_results['predict0'] = rf_gbm_slice['predict']
test_results['predict1'] = gbm_rf_slice['predict']
test_results['predict10'] = gbm_gbm_slice['predict']




In [None]:
#Generating the table of predictions
# Take the last eight columns of test_results, pull them into pandas and
# discard 'did_tip' so only 'tip_amount' and the prediction columns remain.
trailing_columns = test_results.col_names[-8:]
pandas_df = test_results[:, trailing_columns].as_data_frame()
pandas_df = pandas_df.drop(columns='did_tip')
pandas_df.head(20)

In [None]:
#Generating the Metric Matrix for the different models
# Give the six prediction columns readable names (classifier_regressor, or the lone regressor).
pandas_df = pandas_df.rename(columns = {'predict':'RF_RF','predict0' : 'RF_GBM', 'predict1' :'GBM_RF', 'predict10' : 'GBM_GBM', 'predict2' : 'RF' , 'predict3' : 'GBM'})

# Every column except the ground truth is a model's prediction. Dropping
# 'tip_amount' by name avoids silently depending on it being the first column.
column_names = pandas_df.columns.drop('tip_amount')
model_names = ['RF_RF ClassReg', 'RF_GBM ClassReg','GBM_RF ClassReg','GBM_GBM ClassReg','RF Regress','GBM Regress']

# One row per metric, one column per model.
metric_labels = ['R2', 'MSE', 'MAE']
metric_functions = [r2_score, mean_squared_error, mean_absolute_error]
metric_matrix = np.zeros((len(metric_functions), len(column_names)))

#for each model, calculate R2, MSE, and MAE
actual_tips = pandas_df['tip_amount']
for col_idx, column in enumerate(column_names):
    predicted_tips = pandas_df[column]
    for row_idx, metric in enumerate(metric_functions):
        # Built-in round() works whether the metric returns a NumPy scalar or a
        # plain Python float; calling .round(5) on a plain float raises AttributeError.
        metric_matrix[row_idx, col_idx] = round(metric(actual_tips, predicted_tips), 5)

#Displaying the matrix
metric_df = pd.DataFrame(metric_matrix,columns = model_names)
metric_df.insert(0,'Metric',metric_labels)
metric_df = metric_df.set_index('Metric')
metric_df.head()

In [None]:
#Plotting actual vs predicted tip amount
# Everything is drawn on one explicit Axes. The previous bare plt.axes() call
# adds a *new* Axes on current Matplotlib releases, which left the limits, title
# and axis labels on a different Axes than the plotted data.
fig, ax = plt.subplots(figsize = (7,7))
ax.set_xlim(-.5,20)
ax.set_ylim(-.5,20)

ax.set_title('Actual vs Predicted Tip')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')

ax.set_aspect('equal')

# Blue dots: the 'RF' column of pandas_df against the true tip amount.
ax.plot('tip_amount','RF','b.',markersize=3,data = pandas_df)

# Red reference diagonal where predicted == actual (densely sampled from 0 to 80).
line = [.01*i for i in range(int(80/.01))]
lin, = ax.plot(line,line,'r*',markersize=1)
lin.set_label('Actual = Predicted')
ax.legend()

plt.show()

In [None]:
# Persist every trained model under models/, overwriting any existing file.
trained_models = (rf_bool, rf_value, rf_with_0, gbm_bool, gbm_value, gbm_with_0)
saved_paths = [
    h2o.save_model(model=trained, path='models/', force=True)
    for trained in trained_models
]
# Echo the path of the last saved model, as the cell's final expression did before.
saved_paths[-1]