In [6]:

import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.utils import *

In [7]:
# Import datasets
# The raw-data folder is spelled out once instead of being repeated in every
# read_csv call; it is the same folder that is handed to the enrichment
# helpers as `raw_path`.
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"  # Folder with the raw data files

stores_train_df = pd.read_csv(raw_path + "stores_train.csv")
stores_test_df = pd.read_csv(raw_path + "stores_test.csv")
stores_extra_df = pd.read_csv(raw_path + "stores_extra.csv")
bus_stops_df = pd.read_csv(raw_path + "busstops_norway.csv")

# Grouping mechanism
# Geographic columns, passed to data_enricher as `geo_groups`.
geo_groups = ['t_grunnkrets', 't_district', 'municipality_name']
# Store-type description columns, one per hierarchy level.
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
# Bus-stop importance labels, passed to data_enricher as `importance_levels`.
stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']

In [8]:
# Folder with the raw data files
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

# Enrich the extra-stores table, then the training and test store tables.
stores_extra_merged = enrich_keys(stores_extra_df, raw_path=raw_path)
stores_train_enriched = data_enricher(
    stores_train_df,
    raw_path=raw_path,
    geo_groups=geo_groups,
    importance_levels=stop_importance_levels,
)
stores_test_enriched = data_enricher(
    stores_test_df,
    raw_path=raw_path,
    geo_groups=geo_groups,
    importance_levels=stop_importance_levels,
)

# Split the enriched training frame: (store_id, revenue) becomes the label
# table, everything except revenue becomes the feature table.
stores_train_labels = stores_train_enriched[["store_id", "revenue"]]
stores_train_data = stores_train_enriched.drop("revenue", axis=1)

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [4]:
# Number of distinct `grunnkrets_id` values in the enriched training frame.
stores_train_enriched.grunnkrets_id.nunique()

3817

In [11]:
# Number of distinct `t_grunnkrets` values in the enriched training frame.
stores_train_enriched.t_grunnkrets.nunique()

3789

In [None]:
def _store_count_by(area_col, level_col):
    """Build the `store_count` AggTransformer for one (area, store-type level) pair.

    Parameters
    ----------
    area_col : str
        Geographic grouping column, e.g. "district_name".
    level_col : str
        Store-type description column, e.g. "lv1_desc".

    Returns
    -------
    AggTransformer
        Configured with `agg_cols=[area_col, level_col]`, the `store_count`
        calculation, and the enriched extra-stores table.
    """
    return AggTransformer(
        agg_cols=[area_col, level_col],
        agg_name="store_count",
        calculations=store_count,
        stores_extra=stores_extra_merged,
    )


# Per-district transformers, one per store-type level.
num_stores_district_lv1 = _store_count_by("district_name", "lv1_desc")
num_stores_district_lv2 = _store_count_by("district_name", "lv2_desc")
num_stores_district_lv3 = _store_count_by("district_name", "lv3_desc")

# Per-municipality transformers, one per store-type level.
num_stores_municipality_lv1 = _store_count_by("municipality_name", "lv1_desc")
num_stores_municipality_lv2 = _store_count_by("municipality_name", "lv2_desc")
num_stores_municipality_lv3 = _store_count_by("municipality_name", "lv3_desc")

In [None]:
def _avg_revenue_by(agg_cols, agg_name):
    """Build an `average_revenue` AggTransformer over the given grouping columns.

    Parameters
    ----------
    agg_cols : list of str
        Columns that define the groups.
    agg_name : str
        Name forwarded to AggTransformer as `agg_name`.

    Returns
    -------
    AggTransformer
        Configured with the `average_revenue` calculation and the
        (store_id, revenue) training label table as `sample_revenue`.
    """
    return AggTransformer(
        agg_cols=agg_cols,
        agg_name=agg_name,
        calculations=average_revenue,
        sample_revenue=stores_train_labels,
    )


# Mall / chain groupings.
avg_rev_mall = _avg_revenue_by(["mall_name"], "avg_revenue")
avg_rev_mall_lv2 = _avg_revenue_by(["mall_name", "lv2_desc"], "avg_revenue")
avg_rev_chain = _avg_revenue_by(["chain_name"], "avg_revenue")

# Geographic groupings.
avg_rev_district = _avg_revenue_by(["district_name"], "avg_rev_district")
# Fixed: this transformer groups by municipality but was labelled
# 'avg_rev_district' (copy-paste from the district definition above).
# The name now matches avg_rev_municipality_lv3 below.
avg_rev_municipality = _avg_revenue_by(["municipality_name"], "avg_rev_municipality")
avg_rev_district_lv3 = _avg_revenue_by(["district_name", "lv3_desc"], "avg_rev_district")
avg_rev_municipality_lv3 = _avg_revenue_by(["municipality_name", "lv3_desc"], "avg_rev_municipality")

# Store-type groupings.
avg_rev_lv3 = _avg_revenue_by(["lv3_desc"], "avg_rev_lv3")
avg_rev_lv4 = _avg_revenue_by(["lv4_desc"], "avg_rev_lv4")

In [None]:
# Stores in radius irrespective of store type
num_stores_radius = StoresInRadiusTransformer(radius=0.1)

# Same radius, additionally given the enriched extra-stores table.
num_stores_radius_extra = StoresInRadiusTransformer(
    stores_extra=stores_extra_merged,
    radius=0.1,
)

In [None]:
# Cluster-similarity transformer; the (store_id, revenue) label table is
# handed in as `sample_weight`.
cluster_simil = ClusterSimilarity(
    sample_weight=stores_train_labels,
    n_clusters=20,
    gamma=1.0,
    random_state=42,
)

In [None]:
# Closest-store transformer, given the enriched extra-stores table and the
# four store-type description columns as grouping levels.
closest_stores = ClosestStore(store_type_groups=store_types, stores_extra=stores_extra_merged)

In [None]:
# Numeric pipeline: median imputation, then standardisation.
num_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler()),
    ]
)

# Categorical pipeline: most-frequent imputation, then one-hot encoding that
# ignores categories unseen during fit.
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))

# Standardise, then apply ClusterSimilarity with its default arguments.
kmeans_pipeline = make_pipeline(StandardScaler(), ClusterSimilarity())

In [None]:

# (transformer, input columns) pairs, listed in their original order.
_column_steps = [
    # Null-indicator pipelines for the two name columns.
    (is_null_pipeline(), ["mall_name"]),
    (is_null_pipeline(), ["chain_name"]),
    # Store-count aggregates.
    (num_stores_district_lv1, ["store_id", "district_name", "lv1_desc"]),
    (num_stores_district_lv3, ["store_id", "district_name", "lv3_desc"]),
    (num_stores_district_lv2, ["store_id", "district_name", "lv2_desc"]),
    (num_stores_municipality_lv1, ["store_id", "municipality_name", "lv1_desc"]),
    (num_stores_municipality_lv2, ["store_id", "municipality_name", "lv2_desc"]),
    (num_stores_municipality_lv3, ["store_id", "municipality_name", "lv3_desc"]),
    # Average-revenue aggregates.
    (avg_rev_mall, ["store_id", "mall_name"]),
    (avg_rev_mall_lv2, ["store_id", "mall_name", "lv2_desc"]),
    (avg_rev_chain, ["store_id", "chain_name"]),
    (avg_rev_district, ["store_id", "district_name"]),
    (avg_rev_district_lv3, ["store_id", "district_name", "lv3_desc"]),
    (avg_rev_municipality_lv3, ["store_id", "municipality_name", "lv3_desc"]),
    (avg_rev_lv3, ["store_id", "lv3_desc"]),
    (avg_rev_lv4, ["store_id", "lv4_desc"]),
    # Coordinate-based transformers.
    (num_stores_radius_extra, ["store_id", "lat", "lon"]),
    (cluster_simil, ["lat", "lon"]),
    (closest_stores, ["store_id", "lat", "lon"] + store_types),
]

# Columns not named above are passed through unchanged.
preprocessing = make_column_transformer(*_column_steps, remainder="passthrough")

preprocessing.fit(stores_train_data)

In [None]:
# Apply the fitted column transformer to the training features.
train_ = preprocessing.transform(stores_train_data)

In [None]:
# Apply the same fitted column transformer to the enriched test frame.
test_ = preprocessing.transform(stores_test_enriched)

In [None]:
# Wrap the transformed training output in a DataFrame, labelled with the
# transformer's output feature names and the original row index.
train_set = pd.DataFrame(
    train_,
    index=stores_train_data.index,
    columns=preprocessing.get_feature_names_out(),
)

In [None]:
# Wrap the transformed test output in a DataFrame, labelled with the
# transformer's output feature names and the original row index.
test_set = pd.DataFrame(
    test_,
    index=stores_test_enriched.index,
    columns=preprocessing.get_feature_names_out(),
)

In [None]:
# Display the assembled training feature frame.
train_set

In [None]:
# Display the assembled test feature frame.
test_set

In [None]:
# Passed-through columns that are dropped before the frames are given to H2O.
drop_cols = [
    "remainder__store_name",
    "remainder__plaace_hierarchy_id",
    "remainder__sales_channel_name_x",
    "remainder__grunnkrets_id",
    "remainder__address",
    "remainder__grunnkrets_name",
    "remainder__geometry",
    "remainder__sales_channel_name_y",
    "remainder__lv1",
    "remainder__lv2",
    "remainder__lv3",
    "remainder__lv4",
]

# 'remainder__grunnkrets_name_num_kids',
#  'remainder__grunnkrets_name_num_kids+',
#  'remainder__grunnkrets_name_num_youths',
#  'remainder__grunnkrets_name_num_youthAdult',
#  'remainder__grunnkrets_name_num_adult',
#  'remainder__grunnkrets_name_num_adults+',
#  'remainder__grunnkrets_name_num_pensinors',
#  'remainder__grunnkrets_name_kids_%',
#  'remainder__grunnkrets_name_kids+_%',
#  'remainder__grunnkrets_name_youths_%',
#  'remainder__grunnkrets_name_youthAdult_%',
#  'remainder__grunnkrets_name_adult_%',
#  'remainder__grunnkrets_name_adults+_%',
#  'remainder__grunnkrets_name_pensinors_%',
#  'remainder__district_name_num_kids',
#  'remainder__district_name_num_kids+',
#  'remainder__district_name_num_youths',
#  'remainder__district_name_num_youthAdult',
#  'remainder__district_name_num_adult',
#  'remainder__district_name_num_adults+',
#  'remainder__district_name_num_pensinors',
#  'remainder__district_name_kids_%',
#  'remainder__district_name_kids+_%',
#  'remainder__district_name_youths_%',
#  'remainder__district_name_youthAdult_%',
#  'remainder__district_name_adult_%',
#  'remainder__district_name_adults+_%',
#  'remainder__district_name_pensinors_%',
#  'remainder__municipality_name_num_kids',
#  'remainder__municipality_name_num_kids+',
#  'remainder__municipality_name_num_youths',
#  'remainder__municipality_name_num_youthAdult',
#  'remainder__municipality_name_num_adult',
#  'remainder__municipality_name_num_adults+',
#  'remainder__municipality_name_num_pensinors',
#  'remainder__municipality_name_kids_%',
#  'remainder__municipality_name_kids+_%',
#  'remainder__municipality_name_youths_%',
#  'remainder__municipality_name_youthAdult_%',
#  'remainder__municipality_name_adult_%',
#  'remainder__municipality_name_adults+_%',
#  'remainder__municipality_name_pensinors_%',
#  'remainder__grunnkrets_name_couple_children_0_to_5_years',
#  'remainder__grunnkrets_name_couple_children_18_or_above',
#  'remainder__grunnkrets_name_couple_children_6_to_17_years',
#  'remainder__grunnkrets_name_couple_without_children',
#  'remainder__grunnkrets_name_single_parent_children_0_to_5_years',
#  'remainder__grunnkrets_name_single_parent_children_18_or_above',
#  'remainder__grunnkrets_name_single_parent_children_6_to_17_years',
#  'remainder__grunnkrets_name_singles',
#  'remainder__grunnkrets_name_%_dist_of_couple_children_0_to_5_years',
#  'remainder__grunnkrets_name_%_dist_of_couple_children_18_or_above',
#  'remainder__grunnkrets_name_%_dist_of_couple_children_6_to_17_years',
#  'remainder__grunnkrets_name_%_dist_of_couple_without_children',
#  'remainder__grunnkrets_name_%_dist_of_single_parent_children_0_to_5_years',
#  'remainder__grunnkrets_name_%_dist_of_single_parent_children_18_or_above',
#  'remainder__grunnkrets_name_%_dist_of_single_parent_children_6_to_17_years',
#  'remainder__grunnkrets_name_%_dist_of_singles',
#  'remainder__district_name_couple_children_0_to_5_years',
#  'remainder__district_name_couple_children_18_or_above',
#  'remainder__district_name_couple_children_6_to_17_years',
#  'remainder__district_name_couple_without_children',
#  'remainder__district_name_single_parent_children_0_to_5_years',
#  'remainder__district_name_single_parent_children_18_or_above',
#  'remainder__district_name_single_parent_children_6_to_17_years',
#  'remainder__district_name_singles',
#  'remainder__district_name_%_dist_of_couple_children_0_to_5_years',
#  'remainder__district_name_%_dist_of_couple_children_18_or_above',
#  'remainder__district_name_%_dist_of_couple_children_6_to_17_years',
#  'remainder__district_name_%_dist_of_couple_without_children',
#  'remainder__district_name_%_dist_of_single_parent_children_0_to_5_years',
#  'remainder__district_name_%_dist_of_single_parent_children_18_or_above',
#  'remainder__district_name_%_dist_of_single_parent_children_6_to_17_years',
#  'remainder__district_name_%_dist_of_singles',
#  'remainder__municipality_name_couple_children_0_to_5_years',
#  'remainder__municipality_name_couple_children_18_or_above',
#  'remainder__municipality_name_couple_children_6_to_17_years',
#  'remainder__municipality_name_couple_without_children',
#  'remainder__municipality_name_single_parent_children_0_to_5_years',
#  'remainder__municipality_name_single_parent_children_18_or_above',
#  'remainder__municipality_name_single_parent_children_6_to_17_years',
#  'remainder__municipality_name_singles',
#  'remainder__municipality_name_%_dist_of_couple_children_0_to_5_years',
#  'remainder__municipality_name_%_dist_of_couple_children_18_or_above',
#  'remainder__municipality_name_%_dist_of_couple_children_6_to_17_years',
#  'remainder__municipality_name_%_dist_of_couple_without_children',
#  'remainder__municipality_name_%_dist_of_single_parent_children_0_to_5_years',
#  'remainder__municipality_name_%_dist_of_single_parent_children_18_or_above',
#  'remainder__municipality_name_%_dist_of_single_parent_children_6_to_17_years',
#  'remainder__municipality_name_%_dist_of_singles',

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

# Build the H2O frames. Rows from position 1000 onwards are used for
# training (features + store_id + revenue); the first 1000 rows are held out.
train = h2o.H2OFrame(
    pd.concat([train_set[1000:], stores_train_labels[1000:]], axis=1).drop(drop_cols, axis=1)
)
# The hold-out frame carries store_id (but not revenue) so that predictions
# can be matched back to stores; the submission step selects "store_id"
# from this frame after cbind-ing the predictions.
test = h2o.H2OFrame(
    pd.concat([train_set[0:1000], stores_train_labels[["store_id"]][0:1000]], axis=1).drop(drop_cols, axis=1)
)
test_labels = stores_train_labels[0:1000]
# Alternative: score the real test set instead of the hold-out rows.
#test = h2o.H2OFrame(pd.concat([test_set, stores_test_enriched[['store_id']]], axis=1).drop(drop_cols, axis=1))

# Identify predictors and response.
# store_id is a row identifier that arrives via stores_train_labels; it is
# excluded from the predictors so the models are not trained on it.
y = "revenue"
x = [col for col in train.columns if col not in (y, "store_id")]

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
# NOTE(review): only the last expression of a cell is rendered, so this call
# has no visible output here; move it to its own cell to see the full board.
lb.head(rows=lb.nrows)  # All rows instead of the default (10 rows)

# The leader model is stored here
aml.leader

In [None]:
# Score the `test` frame through the AutoML object itself.
# NOTE(review): H2OAutoML.predict is documented as using the leader model, so
# this presumably equals preds_best rather than an average — confirm.
preds_avg = aml.predict(test)

# Score the same frame with the leader model explicitly.
preds_best = aml.leader.predict(test)

In [None]:
# Attach the leader model's predictions to the scored frame and convert the
# combined H2O frame to pandas.
df = test.cbind(preds_best).as_data_frame(use_pandas=True)

# Keep only the store id and the prediction, renamed to the submission headers.
result = df.loc[:, ["store_id", "predict"]]
submission = result.rename(columns={"store_id": "id", "predict": "predicted"})
submission.to_csv("StackedEnsembleBestOfFamily2.csv", index=False)

submission

In [None]:
# Fetch the model whose id sits in leaderboard row 1 and show its variable
# importances (use_pandas=True requests a pandas result).
# NOTE(review): presumably rows are 0-indexed, which would make this the
# second leaderboard entry rather than the leader — confirm that is intended.
m = h2o.get_model(lb[1,"model_id"])
m.varimp(use_pandas=True)

In [None]:
# Write the engineered training features together with (store_id, revenue)
# to CSV, without the index column.
pd.concat(
    [train_set, stores_train_labels],
    axis=1,
).to_csv("train_set.csv", index=False)

# Write the engineered test features together with store_id to CSV, without
# the index column.
pd.concat(
    [test_set, stores_test_enriched[["store_id"]]],
    axis=1,
).to_csv("test_set.csv", index=False)

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns of the assembled training feature frame.
num_cols = train_set.select_dtypes(include=[np.number])

# Standardise the numeric columns.
# Fixed: the original call passed `housing_num`, which is never defined in
# this notebook (NameError); the frame selected just above is the input.
# The result keeps its original variable name so later references still resolve.
# NOTE(review): train_set is built from the column transformer's array output;
# if its columns come out as object dtype, num_cols is empty — confirm dtypes.
std_scaler = StandardScaler()
housing_num_std_scaled = std_scaler.fit_transform(num_cols)