In [22]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import cdist
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer, make_column_selector

import h2o
from h2o.automl import H2OAutoML

In [23]:
def raw_read(path, file_name, dtype=None):
    """
        Load ``<path><file_name>.csv`` into a DataFrame.

        `path` is prepended verbatim (no separator is inserted), so it has to
        end with one already. `dtype` is forwarded unchanged to `pd.read_csv`.
    """
    csv_location = "{}{}.csv".format(path, file_name)
    return pd.read_csv(csv_location, dtype=dtype)

def deduplicate_year(raw_df, deduplicate_column="grunnkrets_id"):
    """
        Keep one row per `deduplicate_column` value: the row with the highest
        `year`. The `year` column is removed from the result.

        The input frame is left untouched.
    """
    ordered_by_year = raw_df.sort_values(by='year')
    # After the ascending sort the last occurrence of a key is its latest year.
    latest_rows = ordered_by_year.drop_duplicates(subset=[deduplicate_column], keep='last')
    return latest_rows.drop(columns='year')

def combine_keys(dataframe):
    """
        Return a copy of `dataframe` with an extra `t_district` column: the
        `district_name` value immediately followed by the `municipality_name`
        value. The input frame is not modified.
    """
    district_key = dataframe['district_name'] + dataframe['municipality_name']
    return dataframe.assign(t_district=district_key)

def bus_stops_lat_lon(bus_stops_df):
    """
    Extract latitude and longitude as separate columns.

    The text inside the first pair of parentheses of `geometry` is split on
    its first space; the left part becomes `lon`, the right part `lat`, both
    converted to numbers.

    Note: the helper column `lng_lat` and the `lon`/`lat` columns are written
    onto the frame that is passed in (as before); the returned frame holds
    only the selected bus stop columns.
    """
    bus_stops_df['lng_lat'] = bus_stops_df['geometry'].str.extract(
        r'\((.*?)\)', expand=False)
    # `n` has to be given by keyword: passing it positionally is deprecated in
    # pandas 1.x and rejected by pandas 2.x.
    coordinates = bus_stops_df['lng_lat'].str.split(" ", n=1, expand=True)
    bus_stops_df[['lon', 'lat']] = coordinates.apply(pd.to_numeric)
    return bus_stops_df[['busstop_id', 'stopplace_type', 'importance_level', 'side_placement', 'geometry', 'lat', 'lon']]

In [24]:
def population(dataset_age):
    """
        Calculate total population of grunnkrets.

        Sums every column except `grunnkrets_id` row by row and returns a new
        two-column frame (`grunnkrets_id`, `population_count`).

        The input frame is not modified. The previous version stored
        `population_count` on the input, so every later call on the same frame
        added that column into the sum again and inflated the totals. A
        `population_count` column left behind by such an earlier call is
        ignored here for the same reason.
    """
    age_counts = dataset_age.drop(
        columns=["grunnkrets_id", "population_count"], errors="ignore")
    return pd.DataFrame({
        "grunnkrets_id": dataset_age["grunnkrets_id"],
        "population_count": age_counts.sum(axis=1),
    })

def population_grouped(data_age, data_geography, grouping_element):
    """
        Total population per value of `grouping_element`.

        Per-grunnkrets totals from `population` are left-joined to
        `data_geography` on `grunnkrets_id` and summed within each group.
        Returns a frame with the grouping column and `population_count`.
    """
    per_grunnkrets = population(data_age).merge(
        data_geography, how="left", on="grunnkrets_id")
    by_group = per_grunnkrets.groupby([grouping_element], as_index=False)
    return by_group["population_count"].sum()

def population_count_grouped_by_geo_group(stores_df, age_df, grunnkrets_df, geo_groups):
    """
        Population of every geographical grouping, one row per store.

        For each entry of `geo_groups` the grouped population is joined onto
        the stores and exposed as `<geo_group>_population_count`. The result
        has `store_id` as a regular column.
    """
    stores_with_geo = stores_df.merge(grunnkrets_df, how="left", on="grunnkrets_id")
    wanted_columns = ["store_id", "population_count"]

    def _per_store(level):
        # Population of the store's own group at this geographical level.
        level_totals = population_grouped(age_df, grunnkrets_df, level)
        joined = stores_with_geo.merge(level_totals, how="left", on=level)[wanted_columns]
        return joined.set_index("store_id").add_prefix(f'{level}_')

    per_level = [_per_store(level) for level in geo_groups]
    return pd.concat(per_level, axis=1).reset_index()

def population_density(age_df, geo_df, grouping_element):
    """
        Population density per value of `grouping_element`.

        Population and `area_km2` are summed within each group and the
        density is their ratio. Returns the grouping column together with
        `population_count`, `area_km2` and `density`.
    """
    with_area = population(age_df).merge(geo_df, how="left", on="grunnkrets_id")
    summed = with_area.groupby([grouping_element], as_index=False)[
        ["population_count", "area_km2"]].sum()
    return summed.assign(density=summed["population_count"] / summed["area_km2"])

def population_density_grouped_by_geo_group(stores_df, age_df, grunnkrets_df, geo_groups):
    """
        Population density of every geographical grouping, one row per store.

        For each entry of `geo_groups` the grouped density is joined onto the
        stores and exposed as `<geo_group>_density`. The result has `store_id`
        as a regular column.
    """
    stores_with_geo = stores_df.merge(grunnkrets_df, how="left", on="grunnkrets_id")
    wanted_columns = ["store_id", "density"]

    def _per_store(level):
        # Density of the store's own group at this geographical level.
        level_density = population_density(age_df, grunnkrets_df, level)
        joined = stores_with_geo.merge(level_density, how="left", on=level)[wanted_columns]
        return joined.set_index("store_id").add_prefix(f'{level}_')

    per_level = [_per_store(level) for level in geo_groups]
    return pd.concat(per_level, axis=1).reset_index()

def new_pop_density(stores_df, age_dist, grunnkrets_df, geo_groups):
    """
        Per-store population density for every geographical grouping, with
        missing densities replaced by the mean of their column.
    """
    density_df = population_density_grouped_by_geo_group(stores_df, age_dist, grunnkrets_df, geo_groups)
    # The frame also carries the non-numeric `store_id` column, so the mean
    # has to be limited to numeric columns explicitly: the implicit dropping
    # of non-numeric columns is deprecated in pandas 1.x and removed in 2.x.
    return density_df.fillna(density_df.mean(numeric_only=True))

def bus_stops_closest(stores_df, bus_stops_df, importance_level="Regionalt knutepunkt"):
    """
    Id and distance of the closest bus stop to all stores.

    Only bus stops whose `importance_level` equals the given value are
    considered. Distances are Euclidean distances computed directly on the
    `lat`/`lon` columns. Returns one row per store with the columns
    `store_id`, `closest_bus_stop` and `distance`.
    """
    coord_cols = ['lat', 'lon']
    level_mask = bus_stops_df['importance_level'] == importance_level
    stops = bus_stops_df[level_mask]

    # Rows are stores, columns are bus stops.
    pairwise = pd.DataFrame(
        cdist(stores_df[coord_cols], stops[coord_cols], metric='euclidean'),
        index=stores_df['store_id'],
        columns=stops['busstop_id'],
    )

    return pd.DataFrame({
        'store_id': stores_df.store_id.values,
        'closest_bus_stop': pairwise.idxmin(axis=1).values,
        'distance': pairwise.min(axis=1).values,
    })

def bus_stops_in_radius(stores_df, bus_stops_df, radius=0.1, importance_level=None):
    """
    Number of bus stops within a given radius. The importance level of bus stops can be specified.

    A stop counts when its Euclidean distance on the `lat`/`lon` columns is
    strictly below `radius`. Returns one row per store with the columns
    `store_id` and `count`.
    """
    stops = bus_stops_df
    if importance_level is not None:
        stops = stops[stops['importance_level'] == importance_level]

    coord_cols = ['lat', 'lon']
    # Rows are stores, columns are bus stops.
    pairwise = pd.DataFrame(
        cdist(stores_df[coord_cols], stops[coord_cols], metric='euclidean'),
        index=stores_df['store_id'],
        columns=stops['busstop_id'],
    )
    # Distances outside the radius become NaN and are skipped by count().
    within = pairwise.where(pairwise < radius).count(axis=1)
    return within.to_frame('count').reset_index()

# Relevant feature engineering functions.
def bus_stops_distance_by_importance(stores_df, bus_stops_df, stop_importance_levels):
    """
    Distance for each store to the closest bus stop of each importance_level

    Produces one `distance_to_<level>` column per importance level, where
    `<level>` is the level name lower-cased with spaces replaced by
    underscores. The result is indexed by `store_id`.
    """
    per_level = []
    for level in stop_importance_levels:
        column = 'distance_to_' + level.lower().replace(" ", "_")
        nearest = bus_stops_closest(stores_df, bus_stops_df, importance_level=level)
        nearest = nearest.rename(columns={'distance': column})
        per_level.append(nearest[['store_id', column]].set_index('store_id'))

    return pd.concat(per_level, axis=1)

def bus_stops_in_radius_by_importance(stores_df, bus_stops_df, stop_importance_levels, radius=0.01):
    """
    Number of bus stops in radius of store for each importance level.

    The first column, `number_of_all_stop_types`, counts stops of any level.
    It is followed by one `number_of_<level>` column per importance level
    (level name lower-cased, spaces replaced by underscores). The result is
    indexed by `store_id`.
    """
    any_level = bus_stops_in_radius(stores_df, bus_stops_df, radius=radius)
    frames = [any_level.rename(columns={'count': 'number_of_all_stop_types'})]

    for level in stop_importance_levels:
        column = 'number_of_' + level.lower().replace(" ", "_")
        counted = bus_stops_in_radius(stores_df, bus_stops_df, importance_level=level, radius=radius)
        frames.append(counted.rename(columns={'count': column})[['store_id', column]])

    return pd.concat([frame.set_index('store_id') for frame in frames], axis=1)

def store_closest(stores_df, compare_df, store_type_group="lv4_desc"):
    """
    Id and distance of the closest store of same type in the same group.

    For every distinct value of `store_type_group` in `stores_df`, the stores
    of that type are compared with the stores of the same type in
    `compare_df` using the Euclidean distance on the `lat`/`lon` columns.
    Distances that are not strictly positive are ignored, which removes a
    store's match with itself.

    Returns a frame with the columns `store_id`, `closest_store` and
    `distance`. A store without any other store of its type gets NaN in both
    `closest_store` and `distance`. An empty `stores_df` yields an empty frame
    with those three columns.
    """
    result_columns = ['store_id', 'closest_store', 'distance']
    df_list = []
    for store_type in stores_df[store_type_group].unique():
        stores_by_type = stores_df[stores_df[store_type_group] == store_type]
        stores_comp_by_type = compare_df[compare_df[store_type_group] == store_type]

        mat = cdist(stores_by_type[['lat', 'lon']], stores_comp_by_type[['lat', 'lon']], metric='euclidean')
        distances = pd.DataFrame(
            mat, index=stores_by_type['store_id'], columns=stores_comp_by_type['store_id'])

        # Mask zero distances so a store is never reported as its own neighbour.
        distances = distances[distances > 0]

        # idxmin on a row without any valid distance is deprecated (and an
        # error in newer pandas), so it is only evaluated for rows that still
        # have at least one candidate; the rest keep NaN.
        has_neighbour = distances.notna().any(axis=1).to_numpy(dtype=bool)
        closest = np.full(len(distances), np.nan, dtype=object)
        if has_neighbour.any():
            closest[has_neighbour] = distances.loc[has_neighbour].idxmin(axis=1).to_numpy()

        df_list.append(pd.DataFrame({
            'store_id': distances.index.values,
            'closest_store': closest,
            'distance': distances.min(axis=1).values,
        }))

    if not df_list:
        # pd.concat refuses an empty list; return an empty, well-formed result.
        return pd.DataFrame(columns=result_columns)
    return pd.concat(df_list, ignore_index=True)


def store_closest_by_store_groups(stores_df, compare_df, store_type_groups):
    """
        Closest store across all store levels.

        Produces one `distance_to_<store_type_group>` column per entry of
        `store_type_groups`; `store_id` is returned as a regular column.
    """
    per_level = []
    for level in store_type_groups:
        column = f'distance_to_{level}'
        nearest = store_closest(stores_df, compare_df, store_type_group=level)
        nearest = nearest.rename(columns={'distance': column})
        per_level.append(nearest[['store_id', column]].set_index('store_id'))

    return pd.concat(per_level, axis=1).reset_index()

def distance_to_closest_group(stores_df, compare_df, group):
    """
        Mall or chain

        Distance from every store to the closest store in `compare_df` whose
        `group` column is not missing. Distances are Euclidean on the
        `lat`/`lon` columns and only strictly positive distances count.
        Returns the columns `store_id` and `distance_closest_<group>`.
    """
    coord_cols = ['lat', 'lon']
    members = compare_df[compare_df[group].notna()]

    # Rows are stores, columns are the stores that belong to some group.
    pairwise = pd.DataFrame(
        cdist(stores_df[coord_cols], members[coord_cols], metric="euclidean"),
        index=stores_df['store_id'],
        columns=members['store_id'],
    )
    positive_only = pairwise.where(pairwise > 0)

    return pd.DataFrame({
        'store_id': positive_only.index.values,
        f'distance_closest_{group}': positive_only.min(axis=1).values,
    })

In [25]:
# Location of the raw CSV files. NOTE(review): this is an absolute,
# machine-specific path and has to be adjusted on any other machine.
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

# grunnkrets ids are read as strings in every file that carries them.
grunnkrets_as_str = {'grunnkrets_id': str}

raw_stores_train, raw_stores_test, raw_stores_extra = (
    raw_read(raw_path, name, grunnkrets_as_str)
    for name in ("stores_train", "stores_test", "stores_extra")
)

raw_income_dist, raw_age_dist, raw_households_dist, raw_grunnkrets = (
    raw_read(raw_path, name, grunnkrets_as_str)
    for name in (
        "grunnkrets_income_households",
        "grunnkrets_age_distribution",
        "grunnkrets_households_num_persons",
        "grunnkrets_norway_stripped",
    )
)

raw_plaace = raw_read(raw_path, "plaace_hierarchy", {'lv1': str, 'lv2': str})
raw_bus_stops = raw_read(raw_path, "busstops_norway")

# Keep a single (latest-year) row per grunnkrets in the yearly tables.
dedup_income_dist, dedup_age_dist, dedup_households_dist = map(
    deduplicate_year, (raw_income_dist, raw_age_dist, raw_households_dist))
dedup_grunnkrets = combine_keys(deduplicate_year(raw_grunnkrets))

enriched_bus_stops = bus_stops_lat_lon(raw_bus_stops)

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [26]:
# Attach the plaace hierarchy descriptions to every store table.
stores_train_merged = pd.merge(raw_stores_train, raw_plaace, how="left", on="plaace_hierarchy_id")
stores_test_merged = pd.merge(raw_stores_test, raw_plaace, how="left", on="plaace_hierarchy_id")
stores_extra_merged = pd.merge(raw_stores_extra, raw_plaace, how="left", on="plaace_hierarchy_id")

# Stores used as neighbour candidates: the set itself plus the extra stores.
compare_train_df = pd.concat([stores_train_merged, stores_extra_merged], ignore_index=True)
compare_test_df = pd.concat([stores_test_merged, stores_extra_merged], ignore_index=True)

# Store type levels, geographical grouping columns and bus stop importance
# levels for which features are generated.
stores_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
geo_groups = ['grunnkrets_id', 't_district', 'municipality_name']
stop_importance_levels = [
    'Mangler viktighetsnivå',
    'Standard holdeplass',
    'Lokalt knutepunkt',
    'Nasjonalt knutepunkt',
    'Regionalt knutepunkt',
    'Annen viktig holdeplass',
]

In [27]:
def _assemble_store_features(stores_merged, stores_raw, compare_df):
    """
        Left-join every engineered feature table onto `stores_merged`:
        household income distribution (by `grunnkrets_id`), distance to the
        closest store per store type level, population density per
        geographical grouping and distance to the closest bus stop per
        importance level (all by `store_id`).
    """
    return (
        stores_merged
        .merge(dedup_income_dist, on="grunnkrets_id", how="left")
        .merge(store_closest_by_store_groups(stores_merged, compare_df, stores_types), on="store_id", how="left")
        .merge(new_pop_density(stores_raw, dedup_age_dist, dedup_grunnkrets, geo_groups), on="store_id", how="left")
        .merge(bus_stops_distance_by_importance(stores_raw, enriched_bus_stops, stop_importance_levels).reset_index(level=0), on="store_id", how="left")
    )


merged_stores_train = _assemble_store_features(stores_train_merged, raw_stores_train, compare_train_df)
merged_stores_test = _assemble_store_features(stores_test_merged, raw_stores_test, compare_test_df)

  return population_density.fillna(population_density.mean())
  return population_density.fillna(population_density.mean())


In [38]:
# Bare reference to the merged training frame. It is not the last expression
# of this cell (function definitions follow), so it renders no output.
merged_stores_train

def firstname(string):
    """Return `string` without its last space-separated word (unchanged if it has no space)."""
    if " " not in string:
        return string
    # Everything before the final space.
    leading_part, _, _ = string.rpartition(" ")
    return leading_part

def lastname(string):
    """
    Return the last space-separated word of `string`, or the word before it
    when the last word is exactly 'AS'. A string without spaces is returned
    unchanged.
    """
    if " " not in string:
        return string
    words = string.split(" ")
    return words[-2] if words[-1] == 'AS' else words[-1]


def address(string):
    """
    Return the part of `string` that precedes its first digit.

    A string without any digit (including the empty string) is returned
    unchanged; previously the function fell off the end of the loop and
    returned None in that case.
    """
    for position, char in enumerate(string):
        if char.isdigit():
            return string[:position]
    return string

def district(string):
    """Lower-case `string`; collapse anything containing 'sentrum' to just 'sentrum'."""
    lowered = string.lower()
    return 'sentrum' if 'sentrum' in lowered else lowered

In [28]:
# Split the target off the training features; the id is kept with the labels.
target_labels = merged_stores_train.loc[:, ['store_id', 'revenue']].copy()
merged_stores_train = merged_stores_train.drop(columns='revenue')

In [29]:
# Columns that are kept as they are (identifiers, text and categorical data).
inc_cols = [
    'store_id', 'store_name', 'sales_channel_name_x', 'plaace_hierarchy_id',
    'grunnkrets_id', 'address', 'lat', 'lon', 'chain_name', 'mall_name',
    'lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc',
]

# Columns routed through `yeo_pipeline`.
yeo_cols = [
    'all_households', 'singles', 'couple_without_children',
    'couple_with_children', 'other_households', 'single_parent_with_children',
]

# Columns routed through `box_pipeline`.
box_cols = [
    # distance to the closest store of the same type, per hierarchy level
    'distance_to_lv1_desc', 'distance_to_lv2_desc',
    'distance_to_lv3_desc', 'distance_to_lv4_desc',
    # population density per geographical grouping
    'grunnkrets_id_density', 't_district_density', 'municipality_name_density',
    # distance to the closest bus stop, per importance level
    'distance_to_lokalt_knutepunkt', 'distance_to_regionalt_knutepunkt',
    'distance_to_annen_viktig_holdeplass', 'distance_to_nasjonalt_knutepunkt',
    'distance_to_mangler_viktighetsnivå', 'distance_to_standard_holdeplass',
]

# DataFrame.filter keeps only the listed columns that actually exist.
selected_cols = inc_cols + yeo_cols + box_cols
_merged_stores_train = merged_stores_train.filter(items=selected_cols)
_merged_stores_test = merged_stores_test.filter(items=selected_cols)

In [30]:
def _impute_then_power(method):
    """Median imputation followed by a power transform of the given method."""
    return make_pipeline(
        SimpleImputer(strategy="median"),
        PowerTransformer(method=method),
    )


yeo_pipeline = _impute_then_power("yeo-johnson")
box_pipeline = _impute_then_power("box-cox")

# Columns outside yeo_cols/box_cols are passed through untouched.
preprocessing = make_column_transformer(
    (yeo_pipeline, yeo_cols),
    (box_pipeline, box_cols),
    remainder="passthrough",
)

def new_transformer(merged_stores_df, preprocessing):
    """
    Fit `preprocessing` on `merged_stores_df`, transform it and wrap the
    result in a DataFrame that uses the transformer's output feature names
    as columns and keeps the input's index.
    """
    transformed = preprocessing.fit_transform(merged_stores_df)
    return pd.DataFrame(
        transformed,
        index=merged_stores_df.index,
        columns=preprocessing.get_feature_names_out(),
    )

In [31]:
# Fit the preprocessing (imputation + power transforms) on the training set
# only, then apply the already-fitted transformer to the test set. Re-fitting
# on the test set would impute and scale it with its own statistics, so its
# features would no longer be on the scale the model is trained on.
_merged_stores_train = new_transformer(_merged_stores_train, preprocessing)
_merged_stores_test = pd.DataFrame(
    preprocessing.transform(_merged_stores_test),
    columns=preprocessing.get_feature_names_out(),
    index=_merged_stores_test.index,
)

In [32]:
# Power-transform the revenue target and store it as the `revenue` column of
# the training frame. `pt` is kept because the submission cell calls
# pt.inverse_transform to bring predictions back to the original scale.
pt = PowerTransformer()
rev_transformed = pt.fit_transform(target_labels[["revenue"]])
_merged_stores_train["revenue"] = rev_transformed

In [33]:
# Keep only rows whose transformed revenue lies above the cutoff (the value is
# in transformed units, not in the original revenue scale).
_merged_stores_train = _merged_stores_train.loc[_merged_stores_train["revenue"] > -1.8888]

In [34]:
h2o.init()

# Upload the preprocessed train and test frames to H2O.
train, test = h2o.H2OFrame(_merged_stores_train), h2o.H2OFrame(_merged_stores_test)

# Passed-through columns carry the "remainder__" prefix after preprocessing.
# All of them except the store id are converted to factors.
cat_vars = ['remainder__' + column for column in inc_cols if column != 'store_id']

for cat in cat_vars:
    for frame in (train, test):
        frame[cat] = frame[cat].asfactor()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_301"; Java(TM) SE Runtime Environment (build 1.8.0_301-b09); Java HotSpot(TM) 64-Bit Server VM (build 25.301-b09, mixed mode)
  Starting server from /Users/nwong/opt/anaconda3/envs/sklearn-env/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/z1/l03w8mpn5xz3mghrk0j2w5gr0000gn/T/tmp31pontms
  JVM stdout: /var/folders/z1/l03w8mpn5xz3mghrk0j2w5gr0000gn/T/tmp31pontms/h2o_nwong_started_from_python.out
  JVM stderr: /var/folders/z1/l03w8mpn5xz3mghrk0j2w5gr0000gn/T/tmp31pontms/h2o_nwong_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_nwong_ekvwsk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.549 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Column remainder__sales_channel_name not found
  Request: POST /99/Rapids
    data: {'ast': "(tmp= py_2_sid_9e47 (cols_py (tmp= py_1_sid_9e47 (:= Key_Frame__upload_a7d34d808fb027f7707a788862e74773.hex (as.factor (cols_py Key_Frame__upload_a7d34d808fb027f7707a788862e74773.hex 'remainder__store_name')) 20 [])) 'remainder__sales_channel_name'))", 'session_id': '_sid_9e47'}


In [None]:
# Predictors are all columns of the training frame except the target.
y = "revenue"
x = train.columns
del x[x.index(y)]

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1, exclude_algos=['deeplearning'])
aml.train(x=x, y=y, training_frame=train)

NameError: name 'train' is not defined

In [None]:
# Score the test frame through the AutoML object and through its leader model.
# NOTE(review): only `preds_best` is used by the submission cell below;
# `preds_avg` is not referenced in the visible cells — confirm it is still needed.
preds_avg = aml.predict(test)
preds_best = aml.leader.predict(test)

NameError: name 'aml' is not defined

In [None]:
# Put the predictions next to the test rows and bring them into pandas.
df = test.cbind(preds_best).as_data_frame(use_pandas=True)

# Keep the store id and the prediction, named as the submission file expects.
result = df.loc[:, ("remainder__store_id", 'predict')]
submission = result.rename(
    columns={"remainder__store_id": "id", "predict": "predicted"})

# Predictions are on the power-transformed scale; map them back to revenue.
submission['predicted'] = pt.inverse_transform(submission[['predicted']])
submission.to_csv("StackedEnsembleBestOfFamily21.csv", index=False)