In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import h2o
# Connect to an H2O cluster; every H2OFrame and model created in the cells
# below lives in that cluster rather than in the Python process.
h2o.init()

In [None]:
def add_time_features(df):
    """Parse the ``date`` column and derive calendar features from it.

    The frame is modified in place and also returned, so both
    ``add_time_features(df)`` and ``df = add_time_features(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column holding ``YYYYMMDD`` values (int or
        str), or an already parsed datetime column.

    Returns
    -------
    pandas.DataFrame
        The same object, with ``date`` converted to datetime and the integer
        columns ``year``, ``month``, ``day`` and ``weekday`` (Monday == 0)
        added.

    Raises
    ------
    ValueError
        If a value in ``date`` does not match ``%Y%m%d``. Failing loudly is
        intentional: a silently unparsed column would only break later when
        the calendar fields are extracted.
    """
    # Re-running the cell must not try to re-parse an already converted column.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        # Go through str so integer dates such as 20160902 are read as YYYYMMDD.
        df['date'] = pd.to_datetime(df['date'].astype(str), format='%Y%m%d')

    # Vectorised accessors instead of one Python lambda call per row.
    dates = df['date'].dt
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['weekday'] = dates.weekday

    return df
def load_df(csv_path='../input/train.csv', nrows=None):
    """Read a sessions CSV and flatten its JSON-encoded columns.

    Each of the columns ``device``, ``geoNetwork``, ``totals`` and
    ``trafficSource`` holds one JSON object per row. Every such column is
    removed and replaced by one column per top-level JSON key (un-prefixed).
    Nested objects are kept as Python dicts in a single column.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. ``fullVisitorId`` is read as ``str`` so leading
        zeros are preserved.
    """
    # Standard-library parser; the ``pd.io.json.loads`` alias used previously
    # is a private name that newer pandas releases no longer expose.
    import json

    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path, dtype={'fullVisitorId': 'str'}, nrows=nrows)

    for column in JSON_COLUMNS:
        records = df.pop(column).apply(json.loads).tolist()
        # Reuse df's index so the join lines the rows up one-to-one.
        df = df.join(pd.DataFrame(records, index=df.index))

    return df
# Load each split and attach the calendar features right away.
train = add_time_features(load_df("../input/train.csv"))
test = add_time_features(load_df("../input/test.csv"))

# Convert target feature to 'float' type.
train["transactionRevenue"] = train["transactionRevenue"].astype('float')

# Revenue aggregates per calendar unit (exploratory; later cells do not read them).
time_agg = train.groupby('date')['transactionRevenue'].agg(['count', 'sum'])
year_agg, month_agg, day_agg = (
    train.groupby(period)['transactionRevenue'].agg(['sum'])
    for period in ('year', 'month', 'day')
)
weekday_agg = train.groupby('weekday')['transactionRevenue'].agg(['count', 'sum'])

# Drop the strange column that still holds nested dicts after JSON flattening.
train = train.drop(columns=['adwordsClickInfo'])
test = test.drop(columns=['adwordsClickInfo'])
# Drop the column that exists only in the train data.
train = train.drop(columns=['campaignCode'])
# Impute missing transactionRevenue values with 0.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that form is chained assignment and does not reliably
# update `train` (it is a no-op under pandas copy-on-write). The target would
# then stay mostly null and be caught by the ">50% null" filter further down.
train["transactionRevenue"] = train["transactionRevenue"].fillna(0)

# Keep the visitor ids of the test rows for the submission file.
test_ids = test["fullVisitorId"].values

# Columns removed up front as unwanted.
unwanted_columns = [
    'visitId', 'visitStartTime',
    'flashVersion',
    'mobileDeviceInfo', 'mobileDeviceMarketingName', 'mobileDeviceModel',
    'mobileInputSelector', 'screenColors',
    'metro', 'networkDomain', 'networkLocation', 'adContent', 'campaign',
    'referralPath',
    'day', 'year',
]
train = train.drop(columns=unwanted_columns)
test = test.drop(columns=unwanted_columns)

# Columns holding at most one distinct value in train.
constant_columns = [name for name in train.columns if train[name].nunique() <= 1]
print('Columns with constant values: ', constant_columns)
train = train.drop(columns=constant_columns)
test = test.drop(columns=constant_columns)

# Columns whose non-null count is at most half of the train rows.
null_threshold = len(train) * 0.5
high_null_columns = [name for name in train.columns if train[name].count() <= null_threshold]
print('Columns more than 50% null values: ', high_null_columns)
train = train.drop(columns=high_null_columns)
test = test.drop(columns=high_null_columns)

# Shape report for both splits, separated by one blank line.
for position, (title, frame) in enumerate((('TRAIN SET', train), ('TEST SET', test))):
    if position:
        print()
    print(title)
    print('Rows: %s' % frame.shape[0])
    print('Columns: %s' % frame.shape[1])
    print('Features: %s' % frame.columns.values)



In [None]:
# Progress marker: printed once every cell above has finished.
print("DONE")

In [None]:
# Strip the underscore from sessionId in both splits.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained assignment and does not reliably modify the
# parent frame (it is a no-op under pandas copy-on-write).
train['sessionId'] = train['sessionId'].replace(to_replace=r'_', value=r'', regex=True)
test['sessionId'] = test['sessionId'].replace(to_replace=r'_', value=r'', regex=True)
# Show a small sample rather than the whole column.
test['sessionId'].head()

In [None]:

# Upload both pandas frames to the H2O cluster.
htrain=h2o.H2OFrame(train)
htest=h2o.H2OFrame(test)
htrain.summary()
# Drop the pandas names. After this cell `train` and `test` no longer exist
# until a later cell rebinds `train` to an H2O split.
del train
del test

In [None]:
# Predictor names: every htrain column except the target and the raw date.
# list.remove raises ValueError if either name is missing from the frame.
# Note: the modelling cells further down pass `xx`, not this `x`.
x = htrain.columns
y = "transactionRevenue"
x.remove(y)
x.remove("date")
print("done")

In [None]:
# Inspect the training frame before any column is converted to a factor.
htrain.describe()

In [None]:

# Treat the calendar fields as categorical (factor) columns in both frames;
# they are predictors here, not the response.
for frame in (htrain, htest):
    for column in ("month", "weekday"):
        frame[column] = frame[column].asfactor()

htrain.describe()

In [None]:
# Row count of the training frame (not read again in the visible cells).
lentrain=len(htrain)
# Keep the target aside as its own one-column frame; a later cell log-transforms it.
transactionRevenue=htrain["transactionRevenue"]
# Remove the target (y == "transactionRevenue" at this point) and the raw date
# so htrain and htest end up with the same columns.
htrain=htrain.drop(y)
htrain=htrain.drop('date')
htest=htest.drop('date')


htrain.describe()
htest.describe()

# Stack train and test rows; the autoencoders below are fit on this combined frame.
htotals=htrain.rbind(htest)
htotals.describe()



In [None]:
#fullVisitorId+sessionId in autoencode 50
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator


def runmodelae(data,numouts):
    """Train a single-hidden-layer H2O autoencoder on every column of ``data``.

    Parameters
    ----------
    data : h2o.H2OFrame
        Frame whose columns are all used as autoencoder inputs.
    numouts : int
        Requested width of the hidden (bottleneck) layer. Values below 3 are
        replaced by 5 and values above 25 are capped at 25, so every caller
        passing 150 gets a 25-unit layer.

    Returns
    -------
    H2OAutoEncoderEstimator
        The trained model; its hidden layer is layer 0 for ``deepfeatures``.
    """
    # Clamp the bottleneck width (rule kept exactly as originally written).
    if numouts<3:
        numouts=5
    if numouts>25:
        numouts=25
    labels = data.columns
    print(labels)
    ae_model = H2OAutoEncoderEstimator(activation="Tanh",
                                       hidden=[numouts],
                                       # model_id must be a single string key, not the
                                       # list of column names; derive one from the names.
                                       model_id="ae_" + "_".join(labels),
                                       epochs=3,
                                       ignore_const_cols=True,
                                       categorical_encoding='one_hot_internal',
                                       seed=1)
    ae_model.train(labels, training_frame=data)

    return ae_model

# One autoencoder per group of related columns, fit on train+test rows.
# The order of this list defines the positions in `allmodels`.
column_groups = [
    # traffic channel + visitor id
    ["channelGrouping", "medium", "source", "fullVisitorId"],
    # device + session id + channel
    ["browser", "deviceCategory", "isMobile", "operatingSystem", "sessionId", "channelGrouping"],
    # activity counters + ids + channel
    ["pageviews", "hits", "visitNumber", "fullVisitorId", "sessionId", "channelGrouping"],
    # geography + session id
    ["city", "continent", "region", "country", "sessionId", "subContinent"],
]

allmodels=[]
for group in column_groups:
    fin = htotals[group]
    allmodels.append(runmodelae(fin,150))



In [None]:
# Encode the training rows with each autoencoder. Every group lists the same
# columns, in the same order, that the matching model in `allmodels` was fit on.
column_groups = [
    ["channelGrouping", "medium", "source", "fullVisitorId"],
    ["browser", "deviceCategory", "isMobile", "operatingSystem", "sessionId", "channelGrouping"],
    ["pageviews", "hits", "visitNumber", "fullVisitorId", "sessionId", "channelGrouping"],
    ["city", "continent", "region", "country", "sessionId", "subContinent"],
]

train_codes = []
for autoencoder, group in zip(allmodels, column_groups):
    fin = htrain[group]
    # Layer 0 is the only hidden layer of these autoencoders.
    train_codes.append(autoencoder.deepfeatures(fin, 0))

f1, f2, f3, f4 = train_codes

In [None]:
# Display the deep features produced by the first autoencoder.
f1

In [None]:
# Pull the four deep-feature frames into pandas and place them side by side.
allframes = [frame.as_data_frame() for frame in (f1, f2, f3, f4)]

t3 = pd.concat(allframes, axis=1, sort=False)
# Rename the columns to their position ("0", "1", ...).
t3.columns = [str(position) for position in range(len(t3.columns))]
print("done")
print(t3)

In [None]:
# Column labels of the combined feature table (positional strings).
t3.columns

In [None]:
import gc

# Release the per-autoencoder pandas copies; t3 already holds the combined data.
allframes.clear()

gc.collect()

In [None]:

# Upload the combined deep-feature table to H2O, then drop the pandas copy.
f=h2o.H2OFrame(t3)
f.summary()
del t3

In [None]:

# Predictors: every deep-feature column.
xx = f.columns
# Log-transform the target into a NEW frame instead of rebinding
# `transactionRevenue` to its own transform, so re-running this cell does not
# apply log1p twice.
log_revenue = transactionRevenue.log1p()
# Name the response column explicitly: the modelling cells refer to it as
# "log1p(transactionRevenue)".
log_revenue.names = ["log1p(transactionRevenue)"]
train_supervised_features=f.cbind(log_revenue)
train_supervised_features.describe()

In [None]:
# Name of the response column in train_supervised_features: the
# log1p-transformed revenue. This replaces the earlier y == "transactionRevenue".
y = "log1p(transactionRevenue)"

In [None]:
# 90/10 split of the supervised frame with a fixed seed.
# From here on `train` and `valid` are H2O frames.
train, valid = train_supervised_features.split_frame(
    ratios=[0.9],
    seed=1234, 
    destination_frames=['train.hex','valid.hex']
)
print("done")
print(len(train))

In [None]:
# Build model
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Feed-forward regressor on the autoencoder features: four hidden layers with
# 50% dropout each, early stopping on RMSE.
# `ignored_columns=["date"]` was removed: the training frame only holds the
# positional feature columns plus the response, so "date" is not a column of
# it, and the predictors are already given explicitly through `x` (h2o does not
# accept `ignored_columns` together with `x`).
model = H2ODeepLearningEstimator(epochs=5, activation="RectifierWithDropout", hidden=[200,100,50,25], ignore_const_cols=True, input_dropout_ratio=0.0,
                                hidden_dropout_ratios=[0.5,0.5,0.5,0.5],nfolds=0,
                                stopping_rounds=5,  stopping_tolerance=0.0001,stopping_metric="rmse",
                                seed=1,standardize=True
                                )
model.train(x=xx, y=y, training_frame=train,validation_frame = valid)
model.show()

In [None]:
from h2o.estimators import H2OXGBoostEstimator

# XGBoost settings: DART booster, histogram tree method, loss-guided growth,
# tweedie distribution.
param = dict(
    ntrees=200,
    distribution="tweedie",
    tree_method="hist",
    grow_policy="lossguide",
    learn_rate=0.01,
    sample_rate=0.7,
    col_sample_rate_per_tree=0.9,
    min_rows=5,
    score_tree_interval=100,
    seed=1,
    stopping_rounds=10,
    stopping_metric='rmse',
    booster="dart",
)

xgmodel = H2OXGBoostEstimator(**param)
xgmodel.train(x = xx, y = y, training_frame = train, validation_frame = valid)
xgmodel.show()

In [None]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

## Depth 10 is usually plenty of depth for most datasets, but you never know.
## First pass: a Cartesian grid over max_depth only, to find the useful range.
hyper_params = {'max_depth' : [6,8,12,16,20,24]} ##faster for larger datasets

# Base GBM shared by every grid member (only max_depth varies).
gbm_grid = H2OGradientBoostingEstimator(
        distribution='Gaussian',
        ## more trees is better if the learning rate is small enough 
        ## here, use "more than enough" trees - we have early stopping
        ntrees=3000,
        ## smaller learning rate is better
        ## since we have learning_rate_annealing, we can afford to start with a 
        ## bigger learning rate
        learn_rate=0.01,
        ## learning rate annealing: learning_rate shrinks by 1% after every tree 
        ## (use 1.00 to disable, but then lower the learning_rate)
        learn_rate_annealing = 0.99,
        ## sample 80% of rows per tree
        sample_rate = 0.8,
        ## sample 80% of columns per split
        col_sample_rate = 0.8,
        ## fix a random number generator seed for reproducibility
        seed = 1234,
        ## score every 10 trees to make early stopping reproducible 
        ## (it depends on the scoring interval)
        score_tree_interval = 10, 
        ## early stopping once the validation RMSE doesn't improve by at least
        ## 0.01% (stopping_tolerance) for 3 consecutive scoring events
        stopping_rounds = 3,
        stopping_metric = "RMSE",
        stopping_tolerance = 1e-4)

#Build grid search with previously made GBM and hyper parameters
grid = H2OGridSearch(model=gbm_grid,
                     grid_id='depth_grid',
                     hyper_params=hyper_params,
                    search_criteria = {'strategy': "Cartesian"})

#Train grid search
grid.train(x=xx, 
           y=y,
           training_frame = train,
           validation_frame = valid)


print (grid)

In [None]:
## Rank the grid models by increasing RMSE (best model first).
sorted_grid = grid.get_grid(sort_by='RMSE',decreasing=False)
print(sorted_grid)

# Depth range spanned by the three best models; it bounds the next search.
max_depths = sorted_grid.sorted_metric_table()['max_depth'][0:3]
top_depths = [int(depth) for depth in max_depths]
new_max = max(top_depths)
new_min = min(top_depths)

print ("MaxDepth", new_max)
print ("MinDepth", new_min)

In [None]:
import math
# Create the hyperparameter and search-criteria dictionaries for a random
# search. Python ranges are inclusive..exclusive, hence the "+1" upper bounds.
# max_depth is restricted to the range found by the depth grid above.
hyper_params_tune = {'max_depth' : list(range(new_min,new_max+1,1)),
                'sample_rate': [x/100. for x in range(20,101)],
                'col_sample_rate' : [x/100. for x in range(20,101)],
                'col_sample_rate_per_tree': [x/100. for x in range(20,101)],
                'col_sample_rate_change_per_level': [x/100. for x in range(90,111)],
                'min_rows': [2**x for x in range(0,int(math.log(train.nrow,2)-1)+1)],
                'nbins': [2**x for x in range(4,11)],
                'nbins_cats': [2**x for x in range(4,13)],
                'min_split_improvement': [0,1e-8,1e-6,1e-4],
                'histogram_type': ["UniformAdaptive","QuantilesGlobal","RoundRobin"]}

search_criteria_tune = {'strategy': "RandomDiscrete",
                   'max_runtime_secs': 3600,  ## limit the runtime to 60 minutes
                   'max_models': 100,  ## build no more than 100 models
                   'seed' : 1234,
                   'stopping_rounds' : 3,
                   'stopping_metric' : "RMSE",
                   'stopping_tolerance': 1e-3
                   }

# Base GBM for the random search; the sampled hyperparameters override its defaults.
gbm_final_grid = H2OGradientBoostingEstimator(
                    distribution='Gaussian',
                    ## more trees is better if the learning rate is small enough 
                    ## here, use "more than enough" trees - we have early stopping
                    ntrees=3000,
                    ## smaller learning rate is better
                    ## since we have learning_rate_annealing, we can afford to start with a 
                    ## bigger learning rate
                    learn_rate=0.01,
                    ## learning rate annealing: learning_rate shrinks by 1% after every tree 
                    ## (use 1.00 to disable, but then lower the learning_rate)
                    learn_rate_annealing = 0.99,
                    ## fix a random number generator seed for reproducibility
                    seed=1234,
                    ## score every 10 trees to make early stopping reproducible 
                    ## (it depends on the scoring interval)
                    score_tree_interval = 10,
                    ## early stopping once the validation RMSE doesn't improve by at least
                    ## 0.1% (stopping_tolerance) for 3 consecutive scoring events
                    stopping_rounds = 3,
                    stopping_metric = "RMSE",
                    stopping_tolerance = 1e-3)
            
#Build grid search with previously made GBM and hyper parameters
final_grid = H2OGridSearch(gbm_final_grid, hyper_params = hyper_params_tune,
                                    grid_id = 'final_grid',
                                    search_criteria = search_criteria_tune)
#Train grid search
final_grid.train(x=xx, 
           y=y,
           ## early stopping based on timeout (no model should take more than 1 hour - modify as needed)
           max_runtime_secs = 3600, 
           training_frame = train,
           validation_frame = valid)

print (final_grid)

In [None]:
# Rank the random-search models by increasing RMSE (best model first).
sorted_final_grid = final_grid.get_grid(sort_by='RMSE',decreasing=False)
print (sorted_final_grid)

#Get the best model from the list (the model name listed at the top of the table)
best_model = h2o.get_model(sorted_final_grid.sorted_metric_table()['model_ids'][0])
# Score on the hold-out split. The name `test` no longer exists at this point
# (the pandas frame was deleted after its conversion to H2O) and the test data
# carries no target, so it cannot be used to compute an RMSE.
performance_best_model = best_model.model_performance(valid)
print (performance_best_model.rmse())



In [None]:
# Show the raw parameter dictionary, then a compact "name = actual value" list.
print(best_model.params)
params_list = [
    str(name)+" = "+str(setting['actual'])
    for name, setting in best_model.params.items()
]
print(params_list)

In [None]:


# Fetch the best random-search model again and retrain it with 5-fold
# cross-validation, keeping the fold predictions.
gbm = h2o.get_model(sorted_final_grid.sorted_metric_table()['model_ids'][0])
# Earlier attempt, kept for reference: copy the tuned parameters onto a fresh
# estimator instead of mutating the fetched one.
#get the parameters from the Random grid search model and modify them slightly
#params = gbm.params
#new_params = {"nfolds":5, "model_id":None}
#for key in new_params.keys():
#    params[key]['actual'] = new_params[key] 
    
#gbm_best = H2OGradientBoostingEstimator()
#for key in params.keys():
#    if key in dir(gbm_best) and getattr(gbm_best,key) != params[key]['actual']:
#        if(key=='training_frame' or key=='validation_frame'  ):
#            1+1
#        else:
#            #print(params[key]+"  "+params[key]['actual'])
#            setattr(gbm_best,key,params[key]['actual'])
            
        
# NOTE(review): these attributes are set on an already trained model object
# returned by h2o.get_model, and train() is then called on it again. Whether
# that retrains under the same model id or is rejected depends on the h2o
# version - confirm before relying on the cross-validation output.
setattr(gbm,'nfolds',5)
setattr(gbm,'stopping_rounds',50)
setattr(gbm,'fold_assignment',"Modulo")
setattr(gbm,'keep_cross_validation_predictions',True)
setattr(gbm,'seed',1)
gbm.train(x=xx, y=y, training_frame=train,validation_frame=valid)

#gbm = h2o.get_model(gbm_best)        
#gbm=gbm_best
print (gbm.cross_validation_metrics_summary())

In [None]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# Train a stacked ensemble on the deep learning, GBM and XGBoost models above.
# The base models were not cross-validated consistently (the deep learning
# model uses nfolds=0 and the XGBoost model sets no folds at all), so there are
# no matching cross-validation predictions to stack on. Use blending mode
# instead: the metalearner is fit on the base models' predictions for the
# hold-out frame passed as `blending_frame`.
# NOTE(review): `valid` is consumed by the metalearner here, so ensemble
# metrics reported on `valid` afterwards are optimistic.
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial3",
                                       base_models=[model,gbm,xgmodel])
ensemble.train(x=xx, y=y, training_frame=train, blending_frame=valid)
ensemble.show()


In [None]:
# Report validation-frame metrics for the ensemble first, then each base model.
for fitted in (ensemble, model, gbm, xgmodel):
    perf_stack_test = fitted.model_performance(valid)
    print(perf_stack_test)

In [None]:
# Build the test features exactly like the training features: encode each
# column group with its autoencoder (hidden layer 0, the only hidden layer),
# concatenate the codes and rename the columns to their position.
# (`ae_model` only exists inside runmodelae; the trained models are in `allmodels`.)
test_column_groups = [
    ["channelGrouping", "medium", "source", "fullVisitorId"],
    ["browser", "deviceCategory", "isMobile", "operatingSystem", "sessionId", "channelGrouping"],
    ["pageviews", "hits", "visitNumber", "fullVisitorId", "sessionId", "channelGrouping"],
    ["city", "continent", "region", "country", "sessionId", "subContinent"],
]
test_codes = [
    autoencoder.deepfeatures(htest[group], 0).as_data_frame()
    for autoencoder, group in zip(allmodels, test_column_groups)
]
test_features = pd.concat(test_codes, axis=1, sort=False)
test_features.columns = [str(position) for position in range(len(test_features.columns))]

# Separate name so the training frame is not overwritten.
test_supervised_features = h2o.H2OFrame(test_features)
test_supervised_features.summary()

# Bring the predictions into pandas before mixing them with test_ids; an
# H2OFrame cannot be assigned into a pandas column.
predictions = ensemble.predict(test_supervised_features).as_data_frame()["predict"].values
# Revenue cannot be negative.
predictions = np.clip(predictions, 0, None)

submission = pd.DataFrame({"fullVisitorId": test_ids,
                           "PredictedLogRevenue": predictions})
# One row per visitor.
# NOTE(review): this sums per-session log-scale predictions, as before. If the
# target metric is the log of the summed revenue per visitor, aggregate with
# expm1 -> sum -> log1p instead - confirm against the competition rules.
submission = submission.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
submission.columns = ["fullVisitorId", "PredictedLogRevenue"]
submission.to_csv("submission.csv", index=False)

submission.head()