In [319]:
import sys
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.cluster import KMeans

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.bus_stop_features import *
from feature_engineering.utils import *
import h2o
from h2o.automl import H2OAutoML

stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
geo_groups = ['grunnkrets_id','t_district','municipality_name']

In [320]:
stores_train_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_train.csv"))
stores_test_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_test.csv"))
stores_extra_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_extra.csv"))

income_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_income_households.csv"))
age_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_age_distribution.csv"))
household_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_households_num_persons.csv"))
grunnkrets_df = combine_keys(set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_norway_stripped.csv")))
plaace_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/plaace_hierarchy.csv")

bus_stops_df = bus_stops_lat_lon(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/busstops_norway.csv"))


  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [321]:
stores_train_merged = encode_levels(stores_train_df.merge(plaace_df, on="plaace_hierarchy_id", how="left"))
stores_test_merged = encode_levels(stores_test_df.merge(plaace_df, on="plaace_hierarchy_id", how="left"))
stores_extra_merged = encode_levels(stores_extra_df.merge(plaace_df, on="plaace_hierarchy_id", how="left"))

In [322]:
compare_train_df = pd.concat([stores_train_merged, stores_extra_merged], ignore_index=True)
compare_test_df = pd.concat([stores_test_merged, stores_extra_merged], ignore_index=True)

In [323]:
def stores_in_radius_new(stores_merged, compare_df, radius=0.05):
    lv_1 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv1_desc')
    lv_2 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv2_desc')
    lv_3 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv3_desc')
    lv_4 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv4_desc')
    all_count = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group=None)
    
    return lv_1.merge(lv_2, on="store_id", how="inner").merge(lv_3, on="store_id", how="inner").merge(lv_4, on="store_id", how="inner").merge(all_count, on="store_id", how="inner")

In [324]:
def new_pop_density(stores_df, age_dist, grunnkrets_df, geo_groups):
    population_density = population_density_grouped_by_geo_group(stores_df, age_dist, grunnkrets_df, geo_groups)
    return population_density.fillna(population_density.mean())

In [325]:

def new_age_dist(stores_df, age_df, grunnkrets_df, geo_groups):
    _age_dist = age_dist_by_geo_group(stores_train_df, age_dist, grunnkrets_df, geo_groups)
    return _age_dist.fillna(_age_dist.mean())

In [326]:
cluster_simil = ClusterSimilarity(n_clusters=100, gamma=1., random_state=42, sample_weight=stores_train_df[['revenue']])
similarities = cluster_simil.fit(stores_train_df[["lat", "lon"]])

def new_clustering(cluster_simil, stores_df):
    return pd.DataFrame(cluster_simil.transform(stores_df[['lat', 'lon']]), columns=cluster_simil.get_feature_names_out(), index=stores_df.store_id).reset_index()

In [327]:
kmeans = KMeans(n_clusters=1200, init='k-means++')
kmeans.fit(stores_train_df[['lat', 'lon']], sample_weight=stores_train_df['revenue'])

def new_kmeans_weighted(dataframe):
    dataframe['cluster_label'] = kmeans.predict(dataframe[['lat', 'lon']])
    dataframe['cluster_label_str'] = dataframe['cluster_label'].astype(str) + '_cluster'
    dataframe.drop('cluster_label', axis=1, inplace=True)


In [328]:
# new_kmeans_weighted(stores_train_df)
# new_kmeans_weighted(stores_test_df)

In [329]:
def distance_to_closest_group(stores_df, compare_df, group):
    """
        Mall or chain
    """
    mat = cdist(
        stores_df[['lat', 'lon']],
        compare_df[compare_df[group].notna()][['lat', 'lon']], metric="euclidean"
    )
    
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=compare_df[compare_df[group].notna()]['store_id']
    )
    
    new_df = new_df[new_df > 0]
    
    stores = new_df.index
    # closest = new_df.idxmin(axis=1)
    distance = new_df.min(axis=1)
    
    return pd.DataFrame({'store_id': stores.values, f'distance_closest_{group}': distance.values})

In [330]:
merged_stores_train = stores_train_merged \
    .merge(grunnkrets_df, on="grunnkrets_id", how="left") \
    .merge(income_dist, on="grunnkrets_id", how="left") \
    .merge(store_closest_by_store_groups(stores_train_merged, compare_train_df, store_types), on="store_id", how="left") \
    .merge(stores_in_radius_new(stores_train_merged, compare_train_df), on="store_id", how="left") \
    .merge(new_clustering(cluster_simil, stores_train_df), on="store_id", how="left") \
    .merge(new_pop_density(stores_train_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    .merge(average_household_income_by_geo_groups(stores_train_df, geo_groups, income_dist, household_dist, grunnkrets_df)) \
    .merge(bus_stops_distance_by_importance(stores_train_df, bus_stops_df, stop_importance_levels).reset_index(level=0), on="store_id", how="left") \
    .merge(new_age_dist(stores_train_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_train_merged, compare_train_df, "mall_name"), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_train_merged, compare_train_df, "chain_name"), on="store_id", how="left") 
        
merged_stores_test = stores_test_merged \
    .merge(grunnkrets_df, on="grunnkrets_id", how="left") \
    .merge(income_dist, on="grunnkrets_id", how="left") \
    .merge(store_closest_by_store_groups(stores_test_merged, compare_test_df, store_types), on="store_id", how="left") \
    .merge(stores_in_radius_new(stores_test_merged, compare_test_df), on="store_id", how="left") \
    .merge(new_clustering(cluster_simil, stores_test_df), on="store_id", how="left") \
    .merge(new_pop_density(stores_test_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    .merge(average_household_income_by_geo_groups(stores_test_df, geo_groups, income_dist, household_dist, grunnkrets_df)) \
    .merge(bus_stops_distance_by_importance(stores_test_df, bus_stops_df, stop_importance_levels).reset_index(level=0), on="store_id", how="left") \
    .merge(new_age_dist(stores_test_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_test_merged, compare_test_df, "mall_name"), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_test_merged, compare_test_df, "chain_name"), on="store_id", how="left") 

  return population_density.fillna(population_density.mean())
  return _age_dist.fillna(_age_dist.mean())
  return population_density.fillna(population_density.mean())
  return _age_dist.fillna(_age_dist.mean())


In [331]:
target_labels = merged_stores_train[['store_id', 'revenue']].copy()
merged_stores_train = merged_stores_train.copy().drop('revenue', axis=1)

In [332]:
pd.DataFrame(merged_stores_train.columns).to_csv("features.csv")

In [370]:
merged_stores_train['store_name_first'] = merged_stores_train['store_name'].str.split(' ').str[0]
merged_stores_train['store_name_last'] = merged_stores_train['store_name'].str.split(' ').str[-1]
merged_stores_train['address_first'] = merged_stores_train['address'].str.extract('(\D+)\s+(\d+)\s?(.*)')[0]

merged_stores_test['store_name_first'] = merged_stores_test['store_name'].str.split(' ').str[0]
merged_stores_test['store_name_last'] = merged_stores_test['store_name'].str.split(' ').str[-1]
merged_stores_test['address_first'] = merged_stores_test['address'].str.extract('(\D+)\s+(\d+)\s?(.*)')[0]

In [371]:
merged_stores_train['is_chain'] = merged_stores_train['chain_name'].isna()
merged_stores_train['is_mall'] = merged_stores_train['mall_name'].isna()

merged_stores_test['is_chain'] = merged_stores_test['chain_name'].isna()
merged_stores_test['is_mall'] = merged_stores_test['mall_name'].isna()

In [372]:
merged_stores_train['store_type_chain'] = merged_stores_train['lv3_desc'] + merged_stores_train['is_chain'].astype(str)
merged_stores_train['store_type_mall'] = merged_stores_train['lv3_desc'] + merged_stores_train['is_mall'].astype(str)

merged_stores_test['store_type_chain'] = merged_stores_test['lv3_desc'] + merged_stores_test['is_chain'].astype(str)
merged_stores_test['store_type_mall'] = merged_stores_test['lv3_desc'] + merged_stores_test['is_mall'].astype(str)

In [373]:
merged_stores_train['store_type_municipality'] = merged_stores_train['lv3_desc'] + merged_stores_train['municipality_name']
merged_stores_test['store_type_municipality'] = merged_stores_test['lv3_desc'] + merged_stores_test['municipality_name'].astype(str)

In [374]:
bus_col_names = [
    'distance_to_mangler_viktighetsnivå',
    'distance_to_standard_holdeplass',
    'distance_to_lokalt_knutepunkt',
    'distance_to_nasjonalt_knutepunkt',
    'distance_to_regionalt_knutepunkt',
    'distance_to_annen_viktig_holdeplass'
]
merged_stores_train["bus_distance_to_store"] = merged_stores_train[bus_col_names].min(axis = 1)
merged_stores_test["bus_distance_to_store"] = merged_stores_train[bus_col_names].min(axis = 1)

In [380]:
inc_cols = [
    'store_id',
    # 'store_name_first',
    # 'store_name_first',
    # 'store_name_last',
    'address_first',
    
    # 'plaace_hierarchy_id',
    'chain_name',
    # 'is_chain',
    # 'mall_name',
    # 'is_mall',
    # 'store_type_chain',
    # 'store_type_mall',
    'sales_channel_name_x',
    # 'store_type_municipality',
    
    'grunnkrets_id',
    'municipality_name',
    't_district',
    # 'address',
    
    # 'lv1_desc',
    'lv2_desc',
    'lv3_desc',
    # 'lv4',
]

yeo_cols = [
    # 'all_households',
    'avg_household_income_t_district',
    'avg_household_income_municipality_name',
    't_district_density',
    # 'municipality_name_density',
    
    'all_stores_in_radius',
    # 'lv1_desc_in_radius',
    'lv2_desc_in_radius',
    'lv3_desc_in_radius',
    # 'lv4_desc_in_radius',
]

box_cols = [
    # 'area_km2',
    # 'distance_to_lv1_desc',
    'distance_to_lv2_desc',
    'distance_to_lv3_desc',
    # 'distance_to_lv4_desc',
    # 'grunnkrets_id_density',
    # 't_district_density',
    'municipality_name_density',
    'distance_to_lokalt_knutepunkt',
    'distance_to_regionalt_knutepunkt',
    'distance_to_annen_viktig_holdeplass',
    'distance_to_nasjonalt_knutepunkt',
    'distance_closest_mall_name',
    'distance_closest_chain_name',
    'bus_distance_to_store',
]

_merged_stores_train = merged_stores_train.filter(inc_cols+yeo_cols+box_cols)
_merged_stores_test = merged_stores_test.filter(inc_cols+yeo_cols+box_cols)

In [381]:
# _merged_stores_train['grunnkrets_id'] = _merged_stores_train['grunnkrets_id'].astype('str')

In [382]:
# _merged_stores_test['grunnkrets_id'] = _merged_stores_test['grunnkrets_id'].astype('str')

In [383]:
yeo_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    PowerTransformer()
)
box_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    PowerTransformer(method="box-cox")
)

preprocessing = make_column_transformer(
        (yeo_pipeline, yeo_cols),
        (box_pipeline, box_cols),
        remainder="passthrough"
    )

def new_transformer(merged_stores_df, preprocessing):
    return pd.DataFrame(preprocessing.transform(merged_stores_df), columns=preprocessing.get_feature_names_out(), index=merged_stores_df.index)

In [384]:
preprocessing.fit(_merged_stores_train)

_merged_stores_train = new_transformer(_merged_stores_train, preprocessing)
_merged_stores_test = new_transformer(_merged_stores_test, preprocessing)

In [385]:
pt = PowerTransformer()
rev_transformed = pt.fit_transform(target_labels[["revenue"]])
_merged_stores_train["revenue"] = rev_transformed

In [386]:
# _merged_stores_train["z_score"] = (_merged_stores_train.revenue - _merged_stores_train.revenue.mean()) / _merged_stores_train.revenue.std()
# train_no_outliers = (_merged_stores_train[(_merged_stores_train.z_score > -2) & (_merged_stores_train.z_score < 2)])

# _merged_stores_train.drop(['z_score'], axis=1, inplace=True)

In [387]:
# Start the H2O cluster (locally)
h2o.init()

# Import a sample binary outcome train/test set into H2O
train = h2o.H2OFrame(_merged_stores_train)
test = h2o.H2OFrame(_merged_stores_test)

cat_vars = inc_cols

cat_vars = [f'remainder__{i}' for i in cat_vars]

for cat in cat_vars:
    train[cat] = train[cat].asfactor()
    test[cat] = test[cat].asfactor()


#test = h2o.H2OFrame(pd.concat([test_set, stores_test_enriched[['store_id']]], axis=1).drop(drop_cols, axis=1))

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,6 hours 52 mins
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_nwong_l2zh49
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,506.6 Mb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [388]:
# Identify predictors and response
x = train.columns
y = "revenue"
x.remove(y)

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1, exclude_algos=['deeplearning'])
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.5516057,0.0052678,0.5443357,0.5535008,0.5507921,0.5588678,0.550532
mean_residual_deviance,0.5315828,0.0133089,0.5153766,0.5398697,0.5369474,0.545997,0.5197235
mse,0.5315828,0.0133089,0.5153766,0.5398697,0.5369474,0.545997,0.5197235
null_deviance,2572.4175,88.38749,2661.035,2536.0876,2643.515,2579.3167,2442.133
r2,0.4679823,0.0177329,0.492458,0.4495317,0.4673867,0.4529541,0.4775809
residual_deviance,1367.3533,57.22548,1350.2866,1395.0233,1407.876,1409.2183,1274.362
rmse,0.7290514,0.0091411,0.7178974,0.7347583,0.7327669,0.7389161,0.7209185
rmsle,,0.0,,,,,


In [393]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_16_20221112_185249,0.729161,0.531675,0.55142,,0.531675
StackedEnsemble_BestOfFamily_1_AutoML_16_20221112_185249,0.731381,0.534918,0.554243,,0.534918
GBM_grid_1_AutoML_16_20221112_185249_model_2,0.743748,0.553161,0.563469,,0.553161
GBM_grid_1_AutoML_16_20221112_185249_model_3,0.744651,0.554505,0.565793,,0.554505
GBM_5_AutoML_16_20221112_185249,0.74704,0.558069,0.567244,,0.558069
DRF_1_AutoML_16_20221112_185249,0.749248,0.561372,0.566137,,0.561372
GBM_1_AutoML_16_20221112_185249,0.753661,0.568005,0.571113,,0.568005
GBM_2_AutoML_16_20221112_185249,0.753891,0.568351,0.570992,,0.568351
GBM_4_AutoML_16_20221112_185249,0.75402,0.568546,0.572275,,0.568546
GBM_3_AutoML_16_20221112_185249,0.756033,0.571586,0.576042,,0.571586


In [394]:
# The leader model is stored here
aml.leader

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.5516057,0.0052678,0.5443357,0.5535008,0.5507921,0.5588678,0.550532
mean_residual_deviance,0.5315828,0.0133089,0.5153766,0.5398697,0.5369474,0.545997,0.5197235
mse,0.5315828,0.0133089,0.5153766,0.5398697,0.5369474,0.545997,0.5197235
null_deviance,2572.4175,88.38749,2661.035,2536.0876,2643.515,2579.3167,2442.133
r2,0.4679823,0.0177329,0.492458,0.4495317,0.4673867,0.4529541,0.4775809
residual_deviance,1367.3533,57.22548,1350.2866,1395.0233,1407.876,1409.2183,1274.362
rmse,0.7290514,0.0091411,0.7178974,0.7347583,0.7327669,0.7389161,0.7209185
rmsle,,0.0,,,,,


In [395]:
m = h2o.get_model(lb[3,"model_id"])
m.varimp(use_pandas=True)

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,remainder__chain_name,13385.482422,1.0,0.362508
1,remainder__municipality_name,7647.637695,0.571338,0.207114
2,remainder__sales_channel_name_x,5541.031738,0.413958,0.150063
3,remainder__lv3_desc,4737.943848,0.353961,0.128314
4,remainder__store_id,1492.585693,0.111508,0.040422
5,remainder__lv2_desc,1241.927246,0.092782,0.033634
6,pipeline-2__distance_closest_mall_name,441.410889,0.032977,0.011954
7,pipeline-2__distance_closest_chain_name,398.204803,0.029749,0.010784
8,pipeline-1__all_stores_in_radius,202.360138,0.015118,0.00548
9,remainder__grunnkrets_id,189.566483,0.014162,0.005134


In [392]:
preds_avg = aml.predict(test)
preds_best = aml.leader.predict(test)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |



███████████████████████████████████████████| (done) 100%


In [369]:
df = test.cbind(preds_best)
df = df.as_data_frame(use_pandas=True)
result = df.loc[:,("remainder__store_id", 'predict')]
submission = result.rename(columns = {"remainder__store_id": "id",  "predict" : "predicted"})
submission['predicted'] = pt.inverse_transform(submission[['predicted']])
submission.to_csv("StackedEnsembleBestOfFamily13.csv", index = False)

submission

Feature names unseen at fit time:
- predicted
Feature names seen at fit time, yet now missing:
- revenue



Unnamed: 0,id,predicted
0,914206820-914239427-717245,6.238341
1,916789157-916823770-824309,10.529318
2,913341082-977479363-2948,8.670094
3,889682582-889697172-28720,11.532712
4,997991699-998006945-417222,11.832539
...,...,...
8572,917323003-917383529-844309,4.058961
8573,917353379-917411824-845904,4.214480
8574,917072302-917089248-833647,6.424311
8575,916960557-916993161-829908,3.847106
