In [None]:
# Import the H2O client and its AutoML interface, plus pandas for plain CSV handling.
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
# Initialise the H2O connection used by every later cell (H2O itself runs on Java).
h2o.init()

In [None]:
# Column types handed to h2o.import_file: categorical columns are parsed as
# 'enum', every numeric column (including the target) as 'int'.
col_types_train = {
    # Identifier and categorical descriptors of the store
    'store_id': 'enum',
    'chain_name': 'enum',
    'mall_name': 'enum',
    'lv1': 'enum',
    'lv2': 'enum',
    'lv3': 'enum',
    # Regression target (present in the training file only)
    'revenue': 'int',
    # Population and store counts
    'total_nbr_people': 'int',
    'nbr_people_per_store_in_grunnkrets': 'int',
    'counts_gr_lv2': 'int',
    'counts_municipality_lv2': 'int',
    # Bus stops within a given radius
    'busstops_within_50m': 'int',
    'busstops_within_100m': 'int',
    'busstops_within_400m': 'int',
    'busstops_within_800m': 'int',
    'busstops_within_1500m': 'int',
    # Stores of the same level-2 category within a given radius
    'num_stores_within_100m_and_same_lvl2': 'int',
    'num_stores_within_500m_and_same_lvl2': 'int',
    'num_stores_within_1km_and_same_lvl2': 'int',
    'num_stores_within_5km_and_same_lvl2': 'int',
    'num_stores_within_10km_and_same_lvl2': 'int',
    'num_stores_within_20km_and_same_lvl2': 'int',
    # Municipality size group and its revenue statistics
    'municipality_size_group': 'enum',
    'mean_revenue_for_municipality_size_group': 'int',
    'median_revenue_for_municipality_size_group': 'int',
    'st_dev_of_revenue_for_municipality_size_group': 'int',
    # Mean revenue per category level and per chain
    'mean_revenue_for_level1': 'int',
    'mean_revenue_for_level2': 'int',
    'mean_revenue_for_level3': 'int',
    'mean_revenue_chain': 'int',
}

# The test file uses the same columns and types, just without the target.
col_types_test = {
    column: h2o_type
    for column, h2o_type in col_types_train.items()
    if column != 'revenue'
}

 
# Parse both feature CSVs into H2OFrames with the column types declared for each file.
train_data = h2o.import_file(
    'feature_data/training_set_final_w_mean_rev.csv',
    col_types=col_types_train,
)
test_data = h2o.import_file(
    'feature_data/testing_set_final_w_mean_rev.csv',
    col_types=col_types_test,
)

# Keep the test-set store ids in a one-column pandas frame ('id') for the submission file.
# NOTE(review): the ids are read from a different CSV than test_data; this presumably
# relies on both files listing the same stores in the same order -- confirm.
test = pd.read_csv('feature_data/testing_set_dropped.csv')
store_ids = test['store_id'].to_numpy()
ids = pd.DataFrame({'id': store_ids})

# Show the parsed column types of the training frame as the cell output.
train_data.types



In [None]:
# Split the training columns into the regression target and the predictors.
# store_id is an identifier (presumably unique per store), not a feature: leaving it in
# as an 'enum' predictor lets models memorise training rows on levels that never occur
# in the test set, so it is kept out of the predictor list together with the target.
y = "revenue"
x = [col for col in train_data.columns if col not in (y, "store_id")]

In [None]:
# Train up to 10 AutoML models on the training frame, seeded for repeatability and
# using RMSE as the early-stopping metric.
# nfolds is passed explicitly so the 5-fold cross-validation does not depend on the
# default of whichever H2O version is installed.
aml = H2OAutoML(max_models=10, seed=1, stopping_metric="RMSE", nfolds=5)
aml.train(x=x, y=y, training_frame=train_data)

In [None]:
# Leaderboard of the models produced by the AutoML run above.
lb = aml.leaderboard

In [None]:
# Show the best-performing models (the first rows of the leaderboard)
lb.head()

In [None]:
# Score the test frame with the trained AutoML object; the result is used for the submission.
preds = aml.predict(test_data)

In [None]:
# Display variable importance, returned as a pandas object (cell output, not printed)
aml.varimp(use_pandas = True)

In [None]:
# Preview the first rows of the prediction frame as the cell's rich output.
preds.head()

In [None]:
# Convert result to desired format

import pandas as pd
import numpy as np


# Pull the H2O prediction frame into pandas and give the prediction column its
# submission name.
pandas_preds = preds.as_data_frame().rename(columns={'predict': 'predicted'})
# Map the predictions back with expm1.
# NOTE(review): this assumes the training target was log1p-transformed -- confirm.
pandas_preds = pandas_preds.assign(predicted=np.expm1(pandas_preds['predicted']))

def save_submission(pred, id_frame=None, path='h2o.csv'):
    """Write the submission CSV pairing store ids with predicted values.

    Args:
        pred: Predictions containing a 'predicted' column (for example a
            pandas DataFrame), one row per id.
        id_frame: One-column DataFrame of ids placed to the left of the
            predictions. Defaults to the notebook-level ``ids`` frame.
        path: Destination CSV file. Defaults to 'h2o.csv'.

    Raises:
        ValueError: If the number of predictions differs from the number of
            ids; a column-wise concat would otherwise pad the shorter side
            with NaN and write a corrupt submission without any error.
    """
    if id_frame is None:
        # Fall back to the id frame built when the test data was loaded.
        id_frame = ids

    # Drop any existing index so ids and predictions are paired by position.
    predicted = pd.DataFrame(pred, columns=['predicted']).reset_index(drop=True)
    id_frame = id_frame.reset_index(drop=True)

    if len(predicted) != len(id_frame):
        raise ValueError(
            'number of predictions ({}) does not match number of ids ({})'.format(
                len(predicted), len(id_frame)
            )
        )

    output = pd.concat([id_frame, predicted], axis=1)
    output.to_csv(path, index=False)
    
# Write the converted predictions next to the stored ids as the submission CSV.
save_submission(pandas_preds)

