In [28]:
import sys
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.cluster import KMeans

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.bus_stop_features import *
from feature_engineering.utils import *
import h2o
from h2o.automl import H2OAutoML

stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
geo_groups = ['grunnkrets_id','t_district','municipality_name']

In [29]:
stores_train_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/impuded/stores_train.csv")
stores_test_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/impuded/stores_test.csv")
stores_extra_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_extra.csv"))

income_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_income_households.csv"))
age_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_age_distribution.csv"))
household_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_households_num_persons.csv"))
grunnkrets_df = combine_keys(set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_norway_stripped.csv")))
plaace_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/plaace_hierarchy.csv")

bus_stops_df = bus_stops_lat_lon(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/busstops_norway.csv"))


  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [3]:
stores_train_merged = encode_levels(stores_train_df.merge(plaace_df, on="plaace_hierarchy_id", how="left"))
stores_test_merged = encode_levels(stores_test_df.merge(plaace_df, on="plaace_hierarchy_id", how="left"))
stores_extra_merged = encode_levels(stores_extra_df.merge(plaace_df, on="plaace_hierarchy_id", how="left"))

In [4]:
compare_train_df = pd.concat([stores_train_merged, stores_extra_merged], ignore_index=True)
compare_test_df = pd.concat([stores_test_merged, stores_extra_merged], ignore_index=True)

In [5]:
def stores_in_radius_new(stores_merged, compare_df, radius=0.05):
    lv_1 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv1_desc')
    lv_2 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv2_desc')
    lv_3 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv3_desc')
    lv_4 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv4_desc')
    all_count = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group=None)
    
    return lv_1.merge(lv_2, on="store_id", how="inner").merge(lv_3, on="store_id", how="inner").merge(lv_4, on="store_id", how="inner").merge(all_count, on="store_id", how="inner")

In [6]:
def new_pop_density(stores_df, age_dist, grunnkrets_df, geo_groups):
    population_density = population_density_grouped_by_geo_group(stores_df, age_dist, grunnkrets_df, geo_groups)
    return population_density.fillna(population_density.mean())

In [7]:

def new_age_dist(stores_df, age_df, grunnkrets_df, geo_groups):
    _age_dist = age_dist_by_geo_group(stores_train_df, age_dist, grunnkrets_df, geo_groups)
    return _age_dist.fillna(_age_dist.mean())

In [8]:
cluster_simil = ClusterSimilarity(n_clusters=40, gamma=1., random_state=42, sample_weight=stores_train_merged[['revenue']])
similarities = cluster_simil.fit(stores_train_merged[["lat", "lon"]])

def new_clustering(cluster_simil, stores_df):
    return pd.DataFrame(cluster_simil.transform(stores_df[['lat', 'lon']]), columns=cluster_simil.get_feature_names_out(), index=stores_df.store_id).reset_index()

In [9]:
# kmeans = KMeans(n_clusters=1200, init='k-means++')
# kmeans.fit(stores_train_merged[['lat', 'lon']], sample_weight=stores_train_merged['revenue'])

# def new_kmeans_weighted(dataframe):
#     dataframe['cluster_label'] = kmeans.predict(dataframe[['lat', 'lon']])
#     dataframe['cluster_label_str'] = dataframe['cluster_label'].astype(str) + '_cluster'
#     dataframe.drop('cluster_label', axis=1, inplace=True)


In [10]:
# new_kmeans_weighted(stores_train_df)
# new_kmeans_weighted(stores_test_df)

In [11]:
def distance_to_closest_group(stores_df, compare_df, group):
    """
        Mall or chain
    """
    mat = cdist(
        stores_df[['lat', 'lon']],
        compare_df[compare_df[group].notna()][['lat', 'lon']], metric="euclidean"
    )
    
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=compare_df[compare_df[group].notna()]['store_id']
    )
    
    new_df = new_df[new_df > 0]
    
    stores = new_df.index
    # closest = new_df.idxmin(axis=1)
    distance = new_df.min(axis=1)
    
    return pd.DataFrame({'store_id': stores.values, f'distance_closest_{group}': distance.values})

In [12]:
merged_stores_train = stores_train_merged \
    # .merge(grunnkrets_df, on="grunnkrets_id", how="left") \
    .merge(income_dist, on="grunnkrets_id", how="left") \
    .merge(store_closest_by_store_groups(stores_train_merged, compare_train_df, store_types), on="store_id", how="left") \
    # .merge(stores_in_radius_new(stores_train_merged, compare_train_df), on="store_id", how="left") \
    .merge(new_pop_density(stores_train_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    # .merge(average_household_income_by_geo_groups(stores_train_df, geo_groups, income_dist, household_dist, grunnkrets_df)) \
    .merge(bus_stops_distance_by_importance(stores_train_df, bus_stops_df, stop_importance_levels).reset_index(level=0), on="store_id", how="left") \
    # .merge(new_age_dist(stores_train_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    # .merge(distance_to_closest_group(stores_train_merged, compare_train_df, "mall_name"), on="store_id", how="left") \
    # .merge(distance_to_closest_group(stores_train_merged, compare_train_df, "chain_name"), on="store_id", how="left") 
    # .merge(new_clustering(cluster_simil, stores_train_df), on="store_id", how="left") \
        
merged_stores_test = stores_test_merged \
    .merge(grunnkrets_df, on="grunnkrets_id", how="left") \
    .merge(income_dist, on="grunnkrets_id", how="left") \
    .merge(store_closest_by_store_groups(stores_test_merged, compare_test_df, store_types), on="store_id", how="left") \
    .merge(stores_in_radius_new(stores_test_merged, compare_test_df), on="store_id", how="left") \
    .merge(new_pop_density(stores_test_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    .merge(average_household_income_by_geo_groups(stores_test_df, geo_groups, income_dist, household_dist, grunnkrets_df)) \
    .merge(bus_stops_distance_by_importance(stores_test_df, bus_stops_df, stop_importance_levels).reset_index(level=0), on="store_id", how="left") \
    .merge(new_age_dist(stores_test_df, age_dist, grunnkrets_df, geo_groups), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_test_merged, compare_test_df, "mall_name"), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_test_merged, compare_test_df, "chain_name"), on="store_id", how="left") 
    # .merge(new_clustering(cluster_simil, stores_test_df), on="store_id", how="left") \

  return population_density.fillna(population_density.mean())
  return _age_dist.fillna(_age_dist.mean())
  return population_density.fillna(population_density.mean())
  return _age_dist.fillna(_age_dist.mean())


In [13]:
target_labels = merged_stores_train[['store_id', 'revenue']].copy()
merged_stores_train = merged_stores_train.copy().drop('revenue', axis=1)

In [14]:
# pd.DataFrame(merged_stores_train.columns).to_csv("features.csv")

In [15]:
merged_stores_train['store_name_first'] = merged_stores_train['store_name'].str.split(' ').str[0]
merged_stores_train['store_name_last'] = merged_stores_train['store_name'].str.split(' ').str[-1]
merged_stores_train['address_first'] = merged_stores_train['address'].str.extract('(\D+)\s+(\d+)\s?(.*)')[0] + "_" + merged_stores_train['municipality_name']

merged_stores_test['store_name_first'] = merged_stores_test['store_name'].str.split(' ').str[0]
merged_stores_test['store_name_last'] = merged_stores_test['store_name'].str.split(' ').str[-1]
merged_stores_test['address_first'] = merged_stores_test['address'].str.extract('(\D+)\s+(\d+)\s?(.*)')[0] + "_" + merged_stores_test['municipality_name']

In [16]:
merged_stores_train['is_chain'] = merged_stores_train['chain_name'].isna()
merged_stores_train['is_mall'] = merged_stores_train['mall_name'].isna()

merged_stores_test['is_chain'] = merged_stores_test['chain_name'].isna()
merged_stores_test['is_mall'] = merged_stores_test['mall_name'].isna()

In [17]:
merged_stores_train['store_type2_chain'] = merged_stores_train['lv2_desc'] + '_' + merged_stores_train['is_chain'].astype(str)
merged_stores_train['store_type2_mall'] = merged_stores_train['lv2_desc'] + '_' + merged_stores_train['is_mall'].astype(str)

merged_stores_test['store_type2_chain'] = merged_stores_test['lv2_desc'] + '_' + merged_stores_test['is_chain'].astype(str)
merged_stores_test['store_type2_mall'] = merged_stores_test['lv2_desc'] + '_' + merged_stores_test['is_mall'].astype(str)

In [18]:
merged_stores_train['store_type3_chain'] = merged_stores_train['lv3_desc'] + '_' + merged_stores_train['is_chain'].astype(str)
merged_stores_train['store_type3_mall'] = merged_stores_train['lv3_desc'] + '_' + merged_stores_train['is_mall'].astype(str)

merged_stores_test['store_type3_chain'] = merged_stores_test['lv3_desc'] + '_' + merged_stores_test['is_chain'].astype(str)
merged_stores_test['store_type3_mall'] = merged_stores_test['lv3_desc'] + '_' + merged_stores_test['is_mall'].astype(str)

In [19]:
merged_stores_train['store_type_district'] = merged_stores_train['lv2_desc'] + '_' + merged_stores_train['t_district']
merged_stores_test['store_type_district'] = merged_stores_test['lv2_desc'] + '_' + merged_stores_test['t_district']

In [20]:
bus_col_names = [
    'distance_to_mangler_viktighetsnivå',
    'distance_to_standard_holdeplass',
    'distance_to_lokalt_knutepunkt',
    'distance_to_nasjonalt_knutepunkt',
    'distance_to_regionalt_knutepunkt',
    'distance_to_annen_viktig_holdeplass'
]
merged_stores_train["bus_distance_to_store"] = merged_stores_train[bus_col_names].min(axis = 1)
merged_stores_test["bus_distance_to_store"] = merged_stores_train[bus_col_names].min(axis = 1)

In [21]:
# pd.DataFrame(merged_stores_train).to_csv("train_set.csv", index=False)

In [22]:
# pd.DataFrame(merged_stores_test).to_csv("test_set.csv", index=False)

In [23]:
inc_cols = [
    'store_id',
    # 'store_name_first',
    # 'store_name_last',
    
    # 'plaace_hierarchy_id',
    # 'chain_name',
    # 'is_chain',
    # 'mall_name',
    # 'is_mall',
    'store_type2_chain',
    'store_type2_mall',
    'store_type3_chain',
    'store_type3_mall',
    'sales_channel_name_x',
    # 'store_type_district',
    
    # 'grunnkrets_id',
    'municipality_name',
    't_district',
    # 'address',
    'address_first',
    
    'lv1_desc',
    'lv2_desc',
    'lv3_desc',
    'lv4_desc',
    # 'lv4',
]

yeo_cols = [
    'all_households',
    # 'singles',
    # 'couple_without_children',
    # 'couple_with_children',
    # 'other_households',
    # 'single_parent_with_children',
    
    'avg_household_income_t_district',
    'avg_household_income_municipality_name',
    # 't_district_density',
    'municipality_name_density',
    
    'all_stores_in_radius',
    # 'lv1_desc_in_radius',
    'lv2_desc_in_radius',
    'lv3_desc_in_radius',
    'lv4_desc_in_radius',
    
    # 't_district_kids_%',
    # 't_district_kids+_%',
    # 't_district_youths_%',
    # 't_district_youthAdult_%',
    # 't_district_adult_%',
    # 't_district_adults+_%',
    # 't_district_pensinors_%',
]

box_cols = [
    # 'area_km2',
    # 'distance_to_lv1_desc',
    'distance_to_lv2_desc',
    'distance_to_lv3_desc',
    'distance_to_lv4_desc',
    
    # 'grunnkrets_id_density',
    't_district_density',
    'municipality_name_density',
    
    'distance_to_lokalt_knutepunkt',
    'distance_to_regionalt_knutepunkt',
    'distance_to_annen_viktig_holdeplass',
    'distance_to_nasjonalt_knutepunkt',
    # 'distance_to_mangler_viktighetsnivå',
    # 'distance_to_standard_holdeplass',
    
    'distance_closest_mall_name',
    'distance_closest_chain_name',
    # 'bus_distance_to_store',
]

# Try with inc_cols
# yeo_cols += [f'Cluster {i} similarity' for i in range(500)]

_merged_stores_train = merged_stores_train.filter(inc_cols+yeo_cols+box_cols)
_merged_stores_test = merged_stores_test.filter(inc_cols+yeo_cols+box_cols)

In [24]:
# _merged_stores_train['grunnkrets_id'] = _merged_stores_train['grunnkrets_id'].astype('str')

In [25]:
# _merged_stores_test['grunnkrets_id'] = _merged_stores_test['grunnkrets_id'].astype('str')

In [26]:
yeo_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    PowerTransformer()
)
box_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    PowerTransformer(method="box-cox")
)

preprocessing = make_column_transformer(
        (yeo_pipeline, yeo_cols),
        (box_pipeline, box_cols),
        remainder="passthrough"
    )

def new_transformer(merged_stores_df, preprocessing):
    return pd.DataFrame(preprocessing.fit_transform(merged_stores_df), columns=preprocessing.get_feature_names_out(), index=merged_stores_df.index)

In [27]:
# preprocessing.fit(_merged_stores_train)

_merged_stores_train = new_transformer(_merged_stores_train, preprocessing)
_merged_stores_test = new_transformer(_merged_stores_test, preprocessing)

ValueError: Selected columns, ['all_households', 'avg_household_income_t_district', 'avg_household_income_municipality_name', 'municipality_name_density', 'all_stores_in_radius', 'lv2_desc_in_radius', 'lv3_desc_in_radius', 'lv4_desc_in_radius'], are not unique in dataframe

In [None]:
pt = PowerTransformer()
rev_transformed = pt.fit_transform(target_labels[["revenue"]])
_merged_stores_train["revenue"] = rev_transformed

In [None]:
_merged_stores_train = _merged_stores_train[(_merged_stores_train.revenue > -1.8888)]

In [None]:
# _merged_stores_train["z_score"] = (_merged_stores_train.revenue - _merged_stores_train.revenue.mean()) / _merged_stores_train.revenue.std()
# train_no_outliers = (_merged_stores_train[(_merged_stores_train.z_score > -2)])

# _merged_stores_train.drop(['z_score'], axis=1, inplace=True)

In [None]:
_merged_stores_train

In [None]:
# Start the H2O cluster (locally)
h2o.init()

# Import a sample binary outcome train/test set into H2O
train = h2o.H2OFrame(_merged_stores_train)
test = h2o.H2OFrame(_merged_stores_test)

cat_vars = inc_cols

cat_vars = [f'remainder__{i}' for i in cat_vars if i != 'store_id']

for cat in cat_vars:
    train[cat] = train[cat].asfactor()
    test[cat] = test[cat].asfactor()


#test = h2o.H2OFrame(pd.concat([test_set, stores_test_enriched[['store_id']]], axis=1).drop(drop_cols, axis=1))

In [None]:
# Identify predictors and response
x = train.columns
y = "revenue"
x.remove(y)

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1, exclude_algos=['deeplearning'])
aml.train(x=x, y=y, training_frame=train)

In [None]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

In [None]:
# The leader model is stored here
aml.leader

In [None]:
m = h2o.get_model(lb[3,"model_id"])
m.varimp(use_pandas=True)

In [None]:
preds_avg = aml.predict(test)
preds_best = aml.leader.predict(test)

In [None]:
df = test.cbind(preds_best)
df = df.as_data_frame(use_pandas=True)
result = df.loc[:,("remainder__store_id", 'predict')]
submission = result.rename(columns = {"remainder__store_id": "id",  "predict" : "predicted"})
submission['predicted'] = pt.inverse_transform(submission[['predicted']])
submission.to_csv("StackedEnsembleBestOfFamily21.csv", index = False)

submission