In [1]:

import sys

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.utils import *

In [2]:
# Import datasets
# Every raw CSV lives in the same directory, so the location is stated once
# here instead of being repeated in each read_csv call. Edit RAW_DATA_DIR to
# run the notebook on another machine. The trailing slash is required because
# file names are appended to it directly.
RAW_DATA_DIR = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

stores_train_df = pd.read_csv(RAW_DATA_DIR + "stores_train.csv")
stores_test_df = pd.read_csv(RAW_DATA_DIR + "stores_test.csv")
stores_extra_df = pd.read_csv(RAW_DATA_DIR + "stores_extra.csv")
bus_stops_df = pd.read_csv(RAW_DATA_DIR + "busstops_norway.csv")

# Grouping mechanism
# geo_groups and stop_importance_levels are forwarded to data_enricher;
# store_types is forwarded to ClosestStore as its store_type_groups.
geo_groups = ['grunnkrets_id','t_district','municipality_name']
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']

In [3]:
# Folder with the raw data files; handed to both enrichment helpers.
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

# Keyword arguments shared by the two data_enricher calls below.
_enrich_kwargs = dict(
    raw_path=raw_path,
    geo_groups=geo_groups,
    importance_levels=stop_importance_levels,
)

stores_extra_merged = enrich_keys(stores_extra_df, raw_path=raw_path)
stores_train_enriched = data_enricher(stores_train_df, **_enrich_kwargs)
stores_test_enriched = data_enricher(stores_test_df, **_enrich_kwargs)

# Split the enriched training frame: everything except the target goes to the
# feature frame, while the label frame keeps store_id next to revenue.
stores_train_data = stores_train_enriched.drop(columns="revenue")
stores_train_labels = stores_train_enriched.loc[:, ['store_id', 'revenue']]

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [6]:
def _store_count_by(area_col, level_col):
    """Return an AggTransformer keyed on [area_col, level_col].

    Every transformer built here shares the same settings: the output is named
    "store_count", the calculation is the project's store_count function, and
    stores_extra_merged is passed as the extra stores table.
    """
    return AggTransformer(
        agg_cols=[area_col, level_col],
        agg_name="store_count",
        calculations=store_count,
        stores_extra=stores_extra_merged,
    )


# District-level groupings, one per store-type hierarchy level.
num_stores_district_lv1 = _store_count_by("t_district", "lv1_desc")
num_stores_district_lv2 = _store_count_by("t_district", "lv2_desc")
num_stores_district_lv3 = _store_count_by("t_district", "lv3_desc")

# Municipality-level groupings, one per store-type hierarchy level.
num_stores_municipality_lv1 = _store_count_by("municipality_name", "lv1_desc")
num_stores_municipality_lv2 = _store_count_by("municipality_name", "lv2_desc")
num_stores_municipality_lv3 = _store_count_by("municipality_name", "lv3_desc")

In [7]:
# Average-revenue aggregations.
# Each transformer groups on `agg_cols`, runs the project's `average_revenue`
# calculation and receives the training label frame (store_id + revenue) as
# `sample_revenue`.
# NOTE(review): `agg_name` appears to become the output feature name (see the
# "aggtransformer-N__<agg_name>" columns of the transformed frame) — confirm
# against AggTransformer.

# Grouped by mall.
avg_rev_mall = AggTransformer(
    agg_cols=["mall_name"], 
    agg_name="avg_revenue", 
    calculations=average_revenue, 
    sample_revenue=stores_train_labels
)

# Grouped by mall and level-2 store type.
avg_rev_mall_lv2 = AggTransformer(
    agg_cols=["mall_name", "lv2_desc"], 
    agg_name="avg_revenue", 
    calculations=average_revenue, 
    sample_revenue=stores_train_labels
)

# Grouped by chain.
avg_rev_chain = AggTransformer(
    agg_cols=["chain_name"], 
    agg_name="avg_revenue", 
    calculations=average_revenue, 
    sample_revenue=stores_train_labels
)

# Grouped by district.
avg_rev_district = AggTransformer(
    agg_cols=['t_district'],
    agg_name='avg_rev_district',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# Grouped by municipality.
# The name used to be the copy-pasted 'avg_rev_district', which mislabelled a
# municipality-level aggregate; it now matches its grouping column.
avg_rev_municipality = AggTransformer(
    agg_cols=['municipality_name'],
    agg_name='avg_rev_municipality',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# Grouped by district and level-3 store type.
# NOTE(review): shares the name 'avg_rev_district' with the plain district
# aggregate above; left unchanged because it feeds the fitted feature names.
avg_rev_district_lv3 = AggTransformer(
    agg_cols=['t_district', 'lv3_desc'],
    agg_name='avg_rev_district',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# Grouped by municipality and level-3 store type.
avg_rev_municipality_lv3 = AggTransformer(
    agg_cols=['municipality_name', 'lv3_desc'],
    agg_name='avg_rev_municipality',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# Grouped by level-3 store type only.
avg_rev_lv3 = AggTransformer(
    agg_cols=["lv3_desc"], 
    agg_name="avg_rev_lv3", 
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# Grouped by level-4 store type only.
avg_rev_lv4 = AggTransformer(
    agg_cols=["lv4_desc"], 
    agg_name="avg_rev_lv4", 
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

In [8]:
# Stores-in-radius transformers, irrespective of store type.
# Both variants are built with the same radius value.
_radius = 0.1

# Variant constructed without an extra stores table.
num_stores_radius = StoresInRadiusTransformer(radius=_radius)

# Variant that additionally receives stores_extra_merged.
num_stores_radius_extra = StoresInRadiusTransformer(
    radius=_radius,
    stores_extra=stores_extra_merged,
)

In [9]:
# Settings for the cluster-similarity transformer applied to lat/lon.
# The training label frame is handed over as sample_weight.
_cluster_params = {
    "n_clusters": 20,
    "gamma": 1.0,
    "random_state": 42,
    "sample_weight": stores_train_labels,
}
cluster_simil = ClusterSimilarity(**_cluster_params)

In [10]:
# Closest-store transformer: receives the four store-type level columns as
# its groups and stores_extra_merged as the extra stores table.
closest_stores = ClosestStore(store_type_groups=store_types, stores_extra=stores_extra_merged)

In [11]:
# Numeric columns: fill gaps with the column median, then standardise.
_numeric_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
]
num_pipeline = Pipeline(steps=_numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
_cat_imputer = SimpleImputer(strategy="most_frequent")
_cat_encoder = OneHotEncoder(handle_unknown="ignore")
cat_pipeline = make_pipeline(_cat_imputer, _cat_encoder)

# Standardise first, then apply ClusterSimilarity with its default settings.
_kmeans_scaler = StandardScaler()
kmeans_pipeline = make_pipeline(_kmeans_scaler, ClusterSimilarity())

In [12]:

# (transformer, input columns) pairs. The list order is the order in which
# make_column_transformer numbers them and emits their outputs, so it must
# not be rearranged.
_feature_specs = [
    # Missing-value indicator pipelines for mall and chain.
    (is_null_pipeline(), ['mall_name']),
    (is_null_pipeline(), ['chain_name']),
    # Store counts per district / municipality and store-type level.
    (num_stores_district_lv1, ['store_id', 't_district', 'lv1_desc']),
    (num_stores_district_lv3, ['store_id', 't_district', 'lv3_desc']),
    (num_stores_district_lv2, ['store_id', 't_district', 'lv2_desc']),
    (num_stores_municipality_lv1, ['store_id', "municipality_name", "lv1_desc"]),
    (num_stores_municipality_lv2, ['store_id', "municipality_name", "lv2_desc"]),
    (num_stores_municipality_lv3, ['store_id', "municipality_name", "lv3_desc"]),
    # Average-revenue aggregations.
    (avg_rev_mall, ['store_id', 'mall_name']),
    (avg_rev_mall_lv2, ['store_id', 'mall_name', 'lv2_desc']),
    (avg_rev_chain, ['store_id', 'chain_name']),
    (avg_rev_district, ['store_id', 't_district']),
    (avg_rev_district_lv3, ['store_id', 't_district', 'lv3_desc']),
    (avg_rev_municipality_lv3, ['store_id', 'municipality_name', 'lv3_desc']),
    (avg_rev_lv3, ['store_id', 'lv3_desc']),
    (avg_rev_lv4, ['store_id', 'lv4_desc']),
    # Location-based transformers.
    (num_stores_radius_extra, ['store_id', 'lat', 'lon']),
    (cluster_simil, ['lat', 'lon']),
    (closest_stores, ['store_id', 'lat', 'lon'] + store_types),
]

# Columns not claimed by any transformer above are passed through unchanged.
preprocessing = make_column_transformer(*_feature_specs, remainder="passthrough")

preprocessing.fit(stores_train_data)

In [13]:
# Apply the fitted column transformer to the training features.
train_ = preprocessing.transform(X=stores_train_data)

In [14]:
# Apply the same fitted column transformer to the enriched test stores.
test_ = preprocessing.transform(X=stores_test_enriched)

In [15]:
# Wrap the transformed training data in a DataFrame, labelling columns with
# the transformer's feature names and reusing the input frame's index.
train_set = pd.DataFrame(
    data=train_,
    index=stores_train_data.index,
    columns=preprocessing.get_feature_names_out(),
)

In [16]:
# Wrap the transformed test data in a DataFrame, labelling columns with the
# transformer's feature names and reusing the enriched test frame's index.
test_set = pd.DataFrame(
    data=test_,
    index=stores_test_enriched.index,
    columns=preprocessing.get_feature_names_out(),
)

In [17]:
# Display the engineered training feature frame built from the transformer output.
train_set

Unnamed: 0,pipeline-1__is,pipeline-2__is,aggtransformer-1__store_count,aggtransformer-2__store_count,aggtransformer-3__store_count,aggtransformer-4__store_count,aggtransformer-5__store_count,aggtransformer-6__store_count,aggtransformer-7__avg_revenue,aggtransformer-8__avg_revenue,...,remainder__number_of_lokalt_knutepunkt,remainder__number_of_nasjonalt_knutepunkt,remainder__number_of_regionalt_knutepunkt,remainder__number_of_annen_viktig_holdeplass,remainder__distance_to_mangler_viktighetsnivå,remainder__distance_to_standard_holdeplass,remainder__distance_to_lokalt_knutepunkt,remainder__distance_to_nasjonalt_knutepunkt,remainder__distance_to_regionalt_knutepunkt,remainder__distance_to_annen_viktig_holdeplass
0,True,True,57.0,1.0,38.0,140.0,103.0,3.0,3.5277,8.999,...,25,0,1,0,0.005315,0.0126,0.005467,0.294614,0.016952,0.155313
1,False,True,203.0,3.0,146.0,1808.0,1317.0,28.0,,,...,74,10,13,0,0.001579,0.045599,0.001257,0.020375,0.001559,0.116319
2,True,True,39.0,1.0,29.0,50.0,37.0,1.0,8.0577,16.099,...,12,0,5,7,0.000371,0.011161,0.025783,0.326362,0.01733,0.054763
3,True,True,60.0,1.0,45.0,103.0,76.0,2.0,6.640667,8.918,...,21,2,9,0,0.005329,0.010858,0.016854,0.012352,0.000522,5.257526
4,True,True,25.0,2.0,14.0,385.0,267.0,5.0,3.831,4.528,...,25,0,0,6,0.003096,0.002093,0.00337,0.319223,0.644539,0.006737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854,False,False,83.0,2.0,15.0,125.0,30.0,4.0,,,...,6,0,4,3,0.001349,0.008576,0.006203,0.176835,0.005978,0.009062
12855,True,True,108.0,2.0,11.0,190.0,32.0,2.0,8.471471,1.816,...,18,0,2,0,0.002951,0.118968,0.022918,0.333546,0.051408,0.168095
12856,True,True,193.0,2.0,21.0,402.0,55.0,4.0,10.366205,25.9074,...,13,0,15,24,0.001587,0.102015,0.025107,0.249433,0.032679,0.037453
12857,True,False,191.0,2.0,31.0,437.0,71.0,3.0,3.04225,3.642,...,4,0,0,0,0.001097,0.005468,0.080072,0.116255,0.144443,0.101411


In [18]:
# Display the engineered test feature frame built from the transformer output.
test_set

Unnamed: 0,pipeline-1__is,pipeline-2__is,aggtransformer-1__store_count,aggtransformer-2__store_count,aggtransformer-3__store_count,aggtransformer-4__store_count,aggtransformer-5__store_count,aggtransformer-6__store_count,aggtransformer-7__avg_revenue,aggtransformer-8__avg_revenue,...,remainder__number_of_lokalt_knutepunkt,remainder__number_of_nasjonalt_knutepunkt,remainder__number_of_regionalt_knutepunkt,remainder__number_of_annen_viktig_holdeplass,remainder__distance_to_mangler_viktighetsnivå,remainder__distance_to_standard_holdeplass,remainder__distance_to_lokalt_knutepunkt,remainder__distance_to_nasjonalt_knutepunkt,remainder__distance_to_regionalt_knutepunkt,remainder__distance_to_annen_viktig_holdeplass
0,False,False,16.0,1.0,10.0,2184.0,1591.0,35.0,,,...,64,10,13,0,0.000231,0.030165,0.010282,0.026472,0.013886,0.159301
1,False,True,5.0,1.0,3.0,76.0,54.0,2.0,,,...,3,0,7,0,0.022422,0.001014,0.019396,0.489463,0.019258,0.135651
2,True,True,23.0,1.0,14.0,2184.0,1591.0,35.0,3.358375,1.307667,...,20,0,2,9,0.00369,0.017462,0.012602,0.171475,0.078346,0.024432
3,False,True,23.0,1.0,15.0,471.0,341.0,7.0,,,...,15,0,0,4,0.002887,0.003348,0.019146,0.409014,0.746042,0.033637
4,False,True,31.0,1.0,26.0,2184.0,1591.0,35.0,,,...,64,10,13,0,0.000919,0.022926,0.007982,0.026247,0.017349,0.155829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8572,False,True,36.0,2.0,7.0,234.0,49.0,3.0,,,...,3,0,0,0,0.003276,0.100738,0.028465,0.161441,0.121447,0.471161
8573,False,False,13.0,1.0,5.0,705.0,134.0,6.0,,,...,20,8,2,8,0.004102,0.002065,0.011969,0.014388,0.031321,0.013542
8574,False,True,14.0,1.0,6.0,24.0,11.0,1.0,,,...,3,0,2,0,0.002042,0.023005,0.073009,0.271889,0.001239,0.103923
8575,False,False,10.0,1.0,4.0,101.0,18.0,2.0,,,...,2,0,6,0,0.001162,0.060774,0.075504,0.160333,0.091335,0.125114


In [None]:
# Passed-through columns that are dropped before the frames are handed to H2O.
# Names are listed without the "remainder__" prefix that the column
# transformer puts on passed-through columns; the prefix is added below.
_passthrough_names_to_drop = (
    "store_name",
    "plaace_hierarchy_id",
    "sales_channel_name_x",
    "grunnkrets_id",
    "address",
    "grunnkrets_name",
    "geometry",
    "sales_channel_name_y",
    "lv1",
    "lv2",
    "lv3",
    "lv4",
)
drop_cols = [f"remainder__{name}" for name in _passthrough_names_to_drop]

# 'remainder__grunnkrets_name_num_kids',
#  'remainder__grunnkrets_name_num_kids+',
#  'remainder__grunnkrets_name_num_youths',
#  'remainder__grunnkrets_name_num_youthAdult',
#  'remainder__grunnkrets_name_num_adult',
#  'remainder__grunnkrets_name_num_adults+',
#  'remainder__grunnkrets_name_num_pensinors',
#  'remainder__grunnkrets_name_kids_%',
#  'remainder__grunnkrets_name_kids+_%',
#  'remainder__grunnkrets_name_youths_%',
#  'remainder__grunnkrets_name_youthAdult_%',
#  'remainder__grunnkrets_name_adult_%',
#  'remainder__grunnkrets_name_adults+_%',
#  'remainder__grunnkrets_name_pensinors_%',
#  'remainder__district_name_num_kids',
#  'remainder__district_name_num_kids+',
#  'remainder__district_name_num_youths',
#  'remainder__district_name_num_youthAdult',
#  'remainder__district_name_num_adult',
#  'remainder__district_name_num_adults+',
#  'remainder__district_name_num_pensinors',
#  'remainder__district_name_kids_%',
#  'remainder__district_name_kids+_%',
#  'remainder__district_name_youths_%',
#  'remainder__district_name_youthAdult_%',
#  'remainder__district_name_adult_%',
#  'remainder__district_name_adults+_%',
#  'remainder__district_name_pensinors_%',
#  'remainder__municipality_name_num_kids',
#  'remainder__municipality_name_num_kids+',
#  'remainder__municipality_name_num_youths',
#  'remainder__municipality_name_num_youthAdult',
#  'remainder__municipality_name_num_adult',
#  'remainder__municipality_name_num_adults+',
#  'remainder__municipality_name_num_pensinors',
#  'remainder__municipality_name_kids_%',
#  'remainder__municipality_name_kids+_%',
#  'remainder__municipality_name_youths_%',
#  'remainder__municipality_name_youthAdult_%',
#  'remainder__municipality_name_adult_%',
#  'remainder__municipality_name_adults+_%',
#  'remainder__municipality_name_pensinors_%',
#  'remainder__grunnkrets_name_couple_children_0_to_5_years',
#  'remainder__grunnkrets_name_couple_children_18_or_above',
#  'remainder__grunnkrets_name_couple_children_6_to_17_years',
#  'remainder__grunnkrets_name_couple_without_children',
#  'remainder__grunnkrets_name_single_parent_children_0_to_5_years',
#  'remainder__grunnkrets_name_single_parent_children_18_or_above',
#  'remainder__grunnkrets_name_single_parent_children_6_to_17_years',
#  'remainder__grunnkrets_name_singles',
#  'remainder__grunnkrets_name_%_dist_of_couple_children_0_to_5_years',
#  'remainder__grunnkrets_name_%_dist_of_couple_children_18_or_above',
#  'remainder__grunnkrets_name_%_dist_of_couple_children_6_to_17_years',
#  'remainder__grunnkrets_name_%_dist_of_couple_without_children',
#  'remainder__grunnkrets_name_%_dist_of_single_parent_children_0_to_5_years',
#  'remainder__grunnkrets_name_%_dist_of_single_parent_children_18_or_above',
#  'remainder__grunnkrets_name_%_dist_of_single_parent_children_6_to_17_years',
#  'remainder__grunnkrets_name_%_dist_of_singles',
#  'remainder__district_name_couple_children_0_to_5_years',
#  'remainder__district_name_couple_children_18_or_above',
#  'remainder__district_name_couple_children_6_to_17_years',
#  'remainder__district_name_couple_without_children',
#  'remainder__district_name_single_parent_children_0_to_5_years',
#  'remainder__district_name_single_parent_children_18_or_above',
#  'remainder__district_name_single_parent_children_6_to_17_years',
#  'remainder__district_name_singles',
#  'remainder__district_name_%_dist_of_couple_children_0_to_5_years',
#  'remainder__district_name_%_dist_of_couple_children_18_or_above',
#  'remainder__district_name_%_dist_of_couple_children_6_to_17_years',
#  'remainder__district_name_%_dist_of_couple_without_children',
#  'remainder__district_name_%_dist_of_single_parent_children_0_to_5_years',
#  'remainder__district_name_%_dist_of_single_parent_children_18_or_above',
#  'remainder__district_name_%_dist_of_single_parent_children_6_to_17_years',
#  'remainder__district_name_%_dist_of_singles',
#  'remainder__municipality_name_couple_children_0_to_5_years',
#  'remainder__municipality_name_couple_children_18_or_above',
#  'remainder__municipality_name_couple_children_6_to_17_years',
#  'remainder__municipality_name_couple_without_children',
#  'remainder__municipality_name_single_parent_children_0_to_5_years',
#  'remainder__municipality_name_single_parent_children_18_or_above',
#  'remainder__municipality_name_single_parent_children_6_to_17_years',
#  'remainder__municipality_name_singles',
#  'remainder__municipality_name_%_dist_of_couple_children_0_to_5_years',
#  'remainder__municipality_name_%_dist_of_couple_children_18_or_above',
#  'remainder__municipality_name_%_dist_of_couple_children_6_to_17_years',
#  'remainder__municipality_name_%_dist_of_couple_without_children',
#  'remainder__municipality_name_%_dist_of_single_parent_children_0_to_5_years',
#  'remainder__municipality_name_%_dist_of_single_parent_children_18_or_above',
#  'remainder__municipality_name_%_dist_of_single_parent_children_6_to_17_years',
#  'remainder__municipality_name_%_dist_of_singles',

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

# Hold-out split of the labelled data: the first HOLDOUT_ROWS rows are kept
# back for evaluation and everything after them is used for training.
HOLDOUT_ROWS = 1000

# Training frame: engineered features plus the label frame (store_id and
# revenue), minus the passed-through columns listed in drop_cols.
train = h2o.H2OFrame(
    pd.concat([train_set[HOLDOUT_ROWS:], stores_train_labels[HOLDOUT_ROWS:]], axis=1).drop(drop_cols, axis=1)
)
# Hold-out frame: features only, with the labels kept aside in test_labels.
test = h2o.H2OFrame(train_set[0:HOLDOUT_ROWS].drop(drop_cols, axis=1))
test_labels = stores_train_labels[0:HOLDOUT_ROWS]
#test = h2o.H2OFrame(pd.concat([test_set, stores_test_enriched[['store_id']]], axis=1).drop(drop_cols, axis=1))

# Identify predictors and response.
# store_id arrives in `train` through the label frame. It is an identifier,
# and the hold-out `test` frame above does not contain it, so it is excluded
# from the predictors along with the response itself.
y = "revenue"
id_col = "store_id"
x = [col for col in train.columns if col not in (y, id_col)]

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Request all rows instead of the default (10 rows)

# The leader model is stored here
aml.leader

In [None]:
# Score the `test` frame through the AutoML object itself.
# NOTE(review): H2OAutoML.predict is documented as predicting with the leader
# model, so this presumably duplicates preds_best despite the "avg" name —
# confirm before relying on it as a separate result.
preds_avg = aml.predict(test)

# Score the same frame explicitly with the leader model.
preds_best = aml.leader.predict(test)

In [None]:
# Put the leader's predictions next to the scored rows and move to pandas.
df = test.cbind(preds_best)
df = df.as_data_frame(use_pandas=True)

# The hold-out `test` frame is built from train_set alone, so it has no
# store_id column and the selection below would raise KeyError. In that case
# the ids come from test_labels, which was sliced over the same rows and is
# therefore aligned by position. A frame that already carries store_id is
# left untouched.
if "store_id" not in df.columns:
    df["store_id"] = test_labels["store_id"].to_numpy()

# Keep only the id and the prediction, renamed to the submission headers.
result = df.loc[:, ["store_id", "predict"]]
submission = result.rename(columns={"store_id": "id", "predict": "predicted"})
submission.to_csv("StackedEnsembleBestOfFamily2.csv", index=False)

submission

In [None]:
# Look up the model whose id sits at row index 1 of the leaderboard and show
# its variable importances as a pandas DataFrame.
# NOTE(review): row index 1 is presumably the second-ranked model rather than
# the leader — confirm this is intentional.
m = h2o.get_model(lb[1,"model_id"])
m.varimp(use_pandas=True)

In [None]:
# Persist the engineered training features together with the label frame.
_train_export = pd.concat([train_set, stores_train_labels], axis=1)
_train_export.to_csv("train_set.csv", index=False)

# Persist the engineered test features together with the store ids.
_test_ids = stores_test_enriched[['store_id']]
_test_export = pd.concat([test_set, _test_ids], axis=1)
_test_export.to_csv("test_set.csv", index=False)

In [None]:
# Standardise the numeric columns of the engineered training set.
# The original cell scaled an undefined `housing_num` (NameError) and never
# used the numeric selection it had just computed; the selection is now what
# gets scaled. StandardScaler is imported in the notebook's top import cell.
num_cols = train_set.select_dtypes(include=[np.number])

std_scaler = StandardScaler()
train_num_std_scaled = std_scaler.fit_transform(num_cols)

# Legacy name from the original cell, kept so later references still resolve.
housing_num_std_scaled = train_num_std_scaled