In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer, make_column_selector

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.bus_stop_features import *
from feature_engineering.utils import *
import h2o
from h2o.automl import H2OAutoML

# Bus-stop importance labels, passed to bus_stops_distance_by_importance below.
stop_importance_levels = [
    'Mangler viktighetsnivå',
    'Standard holdeplass',
    'Lokalt knutepunkt',
    'Nasjonalt knutepunkt',
    'Regionalt knutepunkt',
    'Annen viktig holdeplass',
]

# Store-hierarchy description columns, passed to store_closest_by_store_groups below.
store_types = [
    'lv1_desc',
    'lv2_desc',
    'lv3_desc',
    'lv4_desc',
]

# Geographic grouping keys used by the population-density and age-distribution features.
geo_groups = [
    'grunnkrets_id',
    't_district',
    'municipality_name',
]

In [2]:
# Single place to point the notebook at the raw CSV files; change this when
# running on another machine instead of editing every read below.
RAW_DATA_DIR = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw"


def _read_raw(filename):
    """Read one CSV file from RAW_DATA_DIR into a DataFrame."""
    return pd.read_csv(f"{RAW_DATA_DIR}/{filename}")


# Store tables (train / test / extra), each passed through set_year_2016.
stores_train_df = set_year_2016(_read_raw("stores_train.csv"))
stores_test_df = set_year_2016(_read_raw("stores_test.csv"))
stores_extra_df = set_year_2016(_read_raw("stores_extra.csv"))

# Grunnkrets-level tables, also passed through set_year_2016.
income_dist = set_year_2016(_read_raw("grunnkrets_income_households.csv"))
age_dist = set_year_2016(_read_raw("grunnkrets_age_distribution.csv"))
household_dist = set_year_2016(_read_raw("grunnkrets_households_num_persons.csv"))
grunnkrets_df = combine_keys(set_year_2016(_read_raw("grunnkrets_norway_stripped.csv")))

# Lookup table joined onto the stores via plaace_hierarchy_id.
plaace_df = _read_raw("plaace_hierarchy.csv")

# Bus stops, post-processed by bus_stops_lat_lon.
bus_stops_df = bus_stops_lat_lon(_read_raw("busstops_norway.csv"))


  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [3]:
def _attach_plaace_hierarchy(stores):
    """Left-join plaace_df onto a stores frame via plaace_hierarchy_id."""
    return stores.merge(plaace_df, on="plaace_hierarchy_id", how="left")


# The training variant drops the target so it has the same shape as test/extra.
stores_train_merged = _attach_plaace_hierarchy(stores_train_df).drop(columns=['revenue'])
stores_test_merged = _attach_plaace_hierarchy(stores_test_df)
stores_extra_merged = _attach_plaace_hierarchy(stores_extra_df)

In [4]:
def new_pop_density(stores_df, age_dist, grunnkrets_df, geo_groups):
    """Population-density features with missing values replaced by the mean.

    Delegates to population_density_grouped_by_geo_group with the four
    arguments unchanged, then fills every missing entry with the result of
    calling ``.mean()`` on that same output.
    """
    density = population_density_grouped_by_geo_group(
        stores_df, age_dist, grunnkrets_df, geo_groups
    )
    fill_values = density.mean()
    return density.fillna(fill_values)

In [5]:

def new_age_dist(stores_df, age_df, grunnkrets_df, geo_groups):
    """Age-distribution features with missing values replaced by the mean.

    Calls age_dist_by_geo_group with the arguments it was given and fills
    missing entries with the result of ``.mean()`` on that output.

    The stores and age tables are taken from the parameters rather than from
    the notebook-level ``stores_train_df`` / ``age_dist`` globals, so calling
    this for the test stores yields features for the test stores.
    """
    age_features = age_dist_by_geo_group(stores_df, age_df, grunnkrets_df, geo_groups)
    return age_features.fillna(age_features.mean())

In [6]:
# Fit the location-cluster transformer on the training stores' coordinates.
# The training revenue is handed to the constructor as sample_weight.
cluster_simil = ClusterSimilarity(
    n_clusters=100,
    gamma=1.,
    random_state=42,
    sample_weight=stores_train_df[['revenue']],
)
# NOTE(review): this holds whatever fit() returns, presumably the fitted
# transformer rather than a similarity matrix; confirm before using the name.
similarities = cluster_simil.fit(stores_train_df[["lat", "lon"]])

def new_clustering(cluster_simil, stores_df):
    """Transform store coordinates with a fitted cluster transformer.

    Args:
        cluster_simil: object exposing ``transform`` and
            ``get_feature_names_out``.
        stores_df: frame with ``lat``, ``lon`` and ``store_id`` columns.

    Returns:
        DataFrame with ``store_id`` as a regular column followed by one
        column per name returned by ``get_feature_names_out``.
    """
    coordinates = stores_df[['lat', 'lon']]
    similarity_frame = pd.DataFrame(
        cluster_simil.transform(coordinates),
        columns=cluster_simil.get_feature_names_out(),
        index=stores_df.store_id,
    )
    # Move store_id out of the index so callers can merge on it as a column.
    return similarity_frame.reset_index()

In [7]:
def _build_store_features(stores_df, stores_with_hierarchy_df, all_stores_with_hierarchy_df):
    """Join every engineered feature table onto one stores frame.

    The same merge order is used for every data set, so that any column names
    pandas has to disambiguate with ``_x`` / ``_y`` suffixes come out
    identical for train and test.

    Args:
        stores_df: stores to build features for (train or test).
        stores_with_hierarchy_df: the same stores joined with plaace_df.
        all_stores_with_hierarchy_df: every known store joined with plaace_df,
            handed to store_closest_by_store_groups as the reference set.

    Returns:
        ``stores_df`` left-joined with the income, grunnkrets, bus-stop,
        closest-store, population-density and age-distribution features.
    """
    bus_stop_distances = bus_stops_distance_by_importance(
        stores_df, bus_stops_df, stop_importance_levels
    ).reset_index(level=0)
    closest_stores = store_closest_by_store_groups(
        stores_with_hierarchy_df, all_stores_with_hierarchy_df, store_types
    )
    return (
        stores_df
        .merge(income_dist, on="grunnkrets_id", how="left")
        .merge(grunnkrets_df, on="grunnkrets_id", how="left")
        .merge(bus_stop_distances, on="store_id", how="left")
        .merge(closest_stores, on="store_id", how="left")
        .merge(new_pop_density(stores_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left")
        .merge(new_age_dist(stores_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left")
        # Cluster-similarity features are currently disabled:
        # .merge(new_clustering(cluster_simil, stores_df), on="store_id", how="left")
    )


# Reference set for the closest-store features: train, test and extra stores.
all_stores_merged = pd.concat([stores_train_merged, stores_test_merged, stores_extra_merged])

merged_stores_train = _build_store_features(stores_train_df, stores_train_merged, all_stores_merged)
merged_stores_test = _build_store_features(stores_test_df, stores_test_merged, all_stores_merged)

In [None]:
# Keep the target next to its store id, then remove it from the feature frame.
# NOTE: re-running this cell fails because 'revenue' is already gone.
target_labels = merged_stores_train.loc[:, ['store_id', 'revenue']].copy()
merged_stores_train = merged_stores_train.drop(columns='revenue')

In [None]:
def new_transformer(merged_stores_df, fit_on=None):
    """Mean-impute and power-transform the numeric columns of a frame.

    Numeric columns go through ``SimpleImputer(strategy="mean")`` followed by
    ``PowerTransformer()``; every other column is passed through unchanged.
    Output columns are named by ``get_feature_names_out()`` of the column
    transformer.

    Args:
        merged_stores_df: frame to transform.
        fit_on: optional frame to fit the preprocessing on. When omitted the
            preprocessing is fitted on ``merged_stores_df`` itself, which is
            the original behaviour. Pass the training frame here when
            transforming test data so both share the same fitted parameters.

    Returns:
        DataFrame of transformed values with the index of ``merged_stores_df``.
    """
    # Imported explicitly so the function does not rely on a star import
    # happening to expose SimpleImputer.
    from sklearn.impute import SimpleImputer

    yeo_pipeline = make_pipeline(
        SimpleImputer(strategy="mean"),
        PowerTransformer()
    )
    preprocessing = make_column_transformer(
        (yeo_pipeline, make_column_selector(dtype_include=np.number)),
        remainder="passthrough"
    )
    if fit_on is None:
        transformed = preprocessing.fit_transform(merged_stores_df)
    else:
        transformed = preprocessing.fit(fit_on).transform(merged_stores_df)
    return pd.DataFrame(
        transformed,
        columns=preprocessing.get_feature_names_out(),
        index=merged_stores_df.index,
    )

In [None]:
# Preprocess both feature frames in place of the raw merged frames.
# NOTE(review): each call fits its own preprocessing, so train and test are
# imputed/scaled with different fitted parameters - confirm this is intended.
merged_stores_train, merged_stores_test = [
    new_transformer(frame)
    for frame in (merged_stores_train, merged_stores_test)
]

In [None]:
# Power-transform the target. `pt` is kept because the submission cell uses
# pt.inverse_transform to map predictions back to the original revenue scale.
pt = PowerTransformer()
rev_transformed = pt.fit_transform(target_labels.loc[:, ["revenue"]])
merged_stores_train["revenue"] = rev_transformed

In [None]:
# Passthrough columns (note the 'remainder__' prefix added by the column
# transformer) that are dropped in the cells below.
drop_cols = [
    'remainder__geometry',
    'remainder__grunnkrets_name',
    'remainder__district_name',
]

In [None]:
# Display-only: the result is not assigned, so merged_stores_train itself
# still contains drop_cols after this cell.
merged_stores_train.drop(columns=drop_cols)

In [None]:
# Display-only: the result is not assigned, so merged_stores_test itself
# still contains drop_cols after this cell.
merged_stores_test.drop(columns=drop_cols)

In [None]:
# Start the H2O cluster (locally)
h2o.init()

# Upload the preprocessed train/test frames to H2O. drop_cols is applied here
# because the earlier drop cells only display their result and never assign
# it; errors="ignore" keeps this working if the columns are already gone.
train = h2o.H2OFrame(merged_stores_train.drop(columns=drop_cols, errors="ignore"))
test = h2o.H2OFrame(merged_stores_test.drop(columns=drop_cols, errors="ignore"))

# Response is the (power-transformed) revenue; every other column is a predictor.
y = "revenue"
x = [col for col in train.columns if col != y]

# Run AutoML for 20 base models, excluding deep learning
aml = H2OAutoML(max_models=20, seed=1, exclude_algos=['deeplearning'])
aml.train(x=x, y=y, training_frame=train)

In [None]:
# Leaderboard of the models produced by the AutoML run above.
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# Display the leader model of the AutoML run (aml.leader).
aml.leader

In [None]:
# Fetch the model whose id sits at row index 3 of the leaderboard and show its
# variable importances as a pandas DataFrame.
# NOTE(review): the hard-coded row 3 depends on the leaderboard ordering of
# this particular run - confirm it still points at the intended model.
m = h2o.get_model(lb[3,"model_id"])
m.varimp(use_pandas=True)

In [None]:
# Predict on the test frame, once via the AutoML object and once via its leader.
# NOTE(review): per the H2O docs, H2OAutoML.predict uses the leader model, so
# preds_avg is presumably identical to preds_best rather than an average -
# confirm, and consider renaming.
preds_avg = aml.predict(test)
preds_best = aml.leader.predict(test)

In [None]:
# Attach the leader's predictions to the test frame and pull it back to pandas.
df = test.cbind(preds_best).as_data_frame(use_pandas=True)

# Columns produced by new_transformer carry a column-transformer prefix (as in
# 'remainder__geometry'), so the id column may not be named plain "store_id".
# Accept either the bare name or any "<prefix>__store_id".
store_id_col = next(
    (col for col in df.columns if col == "store_id" or str(col).endswith("__store_id")),
    None,
)
if store_id_col is None:
    raise KeyError("no store id column found in the prediction frame")

result = df.loc[:, [store_id_col, "predict"]]
submission = result.rename(columns={store_id_col: "id", "predict": "predicted"})

# Predictions are on the power-transformed scale; map them back to revenue
# with the transformer that was fitted on the training target.
submission['predicted'] = pt.inverse_transform(submission[['predicted']]).ravel()
submission.to_csv("StackedEnsembleBestOfFamily4.csv", index = False)

submission