In [90]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
from scipy.spatial.distance import cdist
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer, make_column_selector

import h2o
from h2o.automl import H2OAutoML

In [91]:
def raw_read(path, file_name, dtype=None):
    """
        Utility function to simplify reading of files from local machine.
    """
    return pd.read_csv(f"{path}{file_name}.csv", dtype=dtype)

def deduplicate_year(raw_df, deduplicate_column="grunnkrets_id"):
    """
        Use 2016 values by default. If exist in 2015, merge together. Drop year.
    """
    raw_df = raw_df.copy()
    return raw_df.sort_values(by='year').drop_duplicates(subset=[deduplicate_column], keep='last').drop('year', axis=1)

def combine_keys(dataframe):
    dataframe = dataframe.copy()
    dataframe['t_district'] = dataframe['district_name'] + dataframe['municipality_name']
    return dataframe

def bus_stops_lat_lon(bus_stops_df):
    """
    Extract latitude and longitude as separate columns.
    """
    bus_stops_df['lng_lat'] = bus_stops_df['geometry'].str.extract(
        r'\((.*?)\)')
    bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
        " ", 1, expand=True)
    bus_stops_df[['lon', 'lat']] = bus_stops_df[[
        'lon', 'lat']].apply(pd.to_numeric)
    return bus_stops_df[['busstop_id', 'stopplace_type', 'importance_level', 'side_placement', 'geometry', 'lat', 'lon']]

In [92]:
def bus_stops_closest(stores_df, bus_stops_df, importance_level="Regionalt knutepunkt"):
    """
    Id and distance of the closest bus stop to all stores.
    """
    bus_stops_df = bus_stops_df[bus_stops_df['importance_level'] == importance_level]
    mat = cdist(stores_df[['lat', 'lon']],
                bus_stops_df[['lat', 'lon']], metric='euclidean')

    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=bus_stops_df['busstop_id'])

    stores = stores_df.store_id
    closest = new_df.idxmin(axis=1)
    distance = new_df.min(axis=1)

    return pd.DataFrame({'store_id': stores.values, 'closest_bus_stop': closest.values, 'distance': distance.values})

def bus_stops_in_radius(stores_df, bus_stops_df, radius=0.1, importance_level=None):
    """
    Number of bus stops within a given radius. The importance level of bus stops can be specified.
    """
    if importance_level is not None:
        bus_stops_df = bus_stops_df[bus_stops_df['importance_level'] == importance_level]

    mat = cdist(stores_df[['lat', 'lon']],
                bus_stops_df[['lat', 'lon']], metric='euclidean')
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=bus_stops_df['busstop_id'])
    count = pd.DataFrame(new_df[new_df < radius].count(axis=1)).reset_index()
    count.rename(columns={0: 'count'}, inplace=True)
    return count

# Relevant feature engineering functions.
def bus_stops_distance_by_importance(stores_df, bus_stops_df, stop_importance_levels):
    """
    Distance for each store to the closest bus stop of each importance_level
    """
    df_list = []
    for importance_level in stop_importance_levels:
        importance_level_cleaned = importance_level.lower().replace(" ", "_")
        df = bus_stops_closest(stores_df, bus_stops_df, importance_level=importance_level)
        df.rename(columns={'distance': f'distance_to_{importance_level_cleaned}'}, inplace=True)
        df_list.append(df[['store_id', f'distance_to_{importance_level_cleaned}']])

    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

def bus_stops_in_radius_by_importance(stores_df, bus_stops_df, stop_importance_levels, radius=0.01):
    """
    Number of bus stops in radius of store for each importance level.
    """
    df_list = []
    df_list.append(bus_stops_in_radius(stores_df, bus_stops_df, radius=radius).rename(columns={'count':'number_of_all_stop_types'})) # All bus stops in radius
    
    for importance_level in stop_importance_levels:
        importance_level_cleaned = importance_level.lower().replace(" ", "_")
        df = bus_stops_in_radius(stores_df, bus_stops_df, importance_level=importance_level, radius=radius)
        df.rename(columns={'count': f'number_of_{importance_level_cleaned}'}, inplace=True)
        df_list.append(df[['store_id', f'number_of_{importance_level_cleaned}']])

    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

def population(dataset_age):
    population = dataset_age.drop(["grunnkrets_id"], axis=1).sum(axis=1)
    dataset_age["population_count"] = population
    return dataset_age[["grunnkrets_id", "population_count"]]

def population_grouped(data_age, data_geography, grouping_element):
    age_df = population(data_age)
    geography_df = data_geography
    population_df = age_df.merge(geography_df, how="left", on="grunnkrets_id")
    grouped_df = population_df.groupby([grouping_element], as_index=False)["population_count"].sum()
    return grouped_df

def population_count_grouped_by_geo_group(stores_df, age_df, grunnkrets_df, geo_groups): 
    combined_df = stores_df.merge(grunnkrets_df, how = "left", on = "grunnkrets_id")

    population_columns = ["population_count"]
    df_list = []

    for geo_group in geo_groups: 
        pop_df = population_grouped(age_df, grunnkrets_df, geo_group)
        merged_df = combined_df.merge(pop_df, how = "left", on = geo_group)[["store_id"] + population_columns]
        merged_df.set_index("store_id", inplace = True)
        merged_df2 = merged_df.add_prefix(f'{geo_group}_')
        df_list.append(merged_df2)

    return pd.concat(df_list, axis = 1).reset_index()

def population_density(age_df, geo_df, grouping_element):
    age_data = population(age_df)
    geo_df = geo_df
    combined_df = age_data.merge(geo_df, how="left", on="grunnkrets_id")
    density_df = combined_df.groupby([grouping_element], as_index=False)[
        ["population_count", "area_km2"]].sum()
    density_df["density"] = density_df["population_count"] / \
        density_df["area_km2"]
    return density_df

def population_density_grouped_by_geo_group(stores_df, age_df, grunnkrets_df, geo_groups):
    grunnkrets_df_2016 = grunnkrets_df
    combined_df = stores_df.merge(grunnkrets_df_2016, how = "left", on = "grunnkrets_id")

    pop_density_columns = ["density"]
    df_list = []

    for geo_group in geo_groups: 
        pop_df = population_density(age_df, grunnkrets_df, geo_group)
        merged_df = combined_df.merge(pop_df, how = "left", on = geo_group)[["store_id"] + pop_density_columns]
        merged_df.set_index("store_id", inplace = True)
        merged_df2 = merged_df.add_prefix(f'{geo_group}_')
        df_list.append(merged_df2)

    return pd.concat(df_list, axis = 1).reset_index()

def age_distrubution(grunnkrets_age_df, geographic_df, grouping_element):
    age_df1 = grunnkrets_age_df
    age_df1["num_kids"] = age_df1.iloc[:, 1:8].sum(axis=1)
    age_df1["num_kids+"] = age_df1.iloc[:, 8:14].sum(axis=1)
    age_df1["num_youths"] = age_df1.iloc[:, 14: 19].sum(axis=1)
    age_df1["num_youthAdult"] = age_df1.iloc[:, 19:27].sum(axis=1)
    age_df1["num_adult"] = age_df1.iloc[:, 27:37].sum(axis=1)
    age_df1["num_adults+"] = age_df1.iloc[:, 37:62].sum(axis=1)
    age_df1["num_pensinors"] = age_df1.iloc[:, 62:92].sum(axis=1)

    age_df2 = age_df1[["grunnkrets_id", "num_kids", "num_kids+", "num_youths",
                       "num_youthAdult", "num_adult", "num_adults+", "num_pensinors"]]

    pop_df = population(grunnkrets_age_df)
    new_geo_df = geographic_df.drop(["geometry", "area_km2"], axis=1)
    combined_df = age_df2.merge(pop_df, how="inner", on="grunnkrets_id").merge(
        new_geo_df, how="inner", on="grunnkrets_id")
    list_columns = ["num_kids", "num_kids+", "num_youths",
                    "num_youthAdult", "num_adult", "num_adults+", "num_pensinors"]
    combined_df2 = combined_df.groupby([grouping_element], as_index=False)[
        list_columns].sum()

    pop_gk = population_grouped(
        grunnkrets_age_df, geographic_df, grouping_element)
    new_df = combined_df2.merge(pop_gk, how="inner", on=grouping_element)

    new_df["kids_%"] = new_df["num_kids"] / new_df["population_count"]
    new_df["kids+_%"] = new_df["num_kids+"] / new_df["population_count"]
    new_df["youths_%"] = new_df["num_youths"] / new_df["population_count"]
    new_df["youthAdult_%"] = new_df["num_youthAdult"] / \
        new_df["population_count"]
    new_df["adult_%"] = new_df["num_adult"] / new_df["population_count"]
    new_df["adults+_%"] = new_df["num_adults+"] / new_df["population_count"]
    new_df["pensinors_%"] = new_df["num_pensinors"] / \
        new_df["population_count"]

    age_dist_df = new_df.drop(["population_count"]+["num_kids", "num_kids+", "num_youths",
                       "num_youthAdult", "num_adult", "num_adults+", "num_pensinors"], axis=1)
    return age_dist_df

def age_dist_by_geo_group(stores_df, age_df, grunnkrets_norway_df, geo_groups): 
    combined_df = stores_df.merge(grunnkrets_norway_df, how = "left", on = "grunnkrets_id")

    age_columns = ['kids_%', 'kids+_%', 'youths_%',
       'youthAdult_%', 'adult_%', 'adults+_%', 'pensinors_%']

    df_list = []
    for geo_group in geo_groups: 
      age_dist_df = age_distrubution(age_df, grunnkrets_norway_df, geo_group)
      merged_df = combined_df.merge(age_dist_df, how = "left", on = geo_group)[["store_id"] + age_columns]
      merged_df.set_index("store_id", inplace = True)
      merged_df2 = merged_df.add_prefix(f'{geo_group}_')
      df_list.append(merged_df2)
    
    return pd.concat(df_list, axis = 1).reset_index()

def household_type_distrubution(grunnkrets_norway_df, grunnkrets_household_pop_df, grouping_element):
    combined_df = grunnkrets_norway_df.merge(grunnkrets_household_pop_df, how="inner", on="grunnkrets_id")

    list_columns = ["couple_children_0_to_5_years", "couple_children_18_or_above", "couple_children_6_to_17_years",
                    "couple_without_children", "single_parent_children_0_to_5_years", "single_parent_children_18_or_above",
                    "single_parent_children_6_to_17_years", "singles"]

    grouped_df = combined_df.groupby([grouping_element], as_index=False)[
        list_columns].sum()
    grouped_df["tot_pop_count"] = grouped_df.iloc[:, 1:].sum(axis=1)

    grouped_df["%_dist_of_couple_children_0_to_5_years"] = grouped_df["couple_children_0_to_5_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_couple_children_18_or_above"] = grouped_df["couple_children_18_or_above"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_couple_children_6_to_17_years"] = grouped_df["couple_children_6_to_17_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_couple_without_children"] = grouped_df["couple_without_children"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_single_parent_children_0_to_5_years"] = grouped_df["single_parent_children_0_to_5_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_single_parent_children_18_or_above"] = grouped_df["single_parent_children_18_or_above"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_single_parent_children_6_to_17_years"] = grouped_df["single_parent_children_6_to_17_years"] / \
        grouped_df["tot_pop_count"]
    grouped_df["%_dist_of_singles"] = grouped_df["singles"] / \
        grouped_df["tot_pop_count"]

    returned_df = grouped_df.drop(["tot_pop_count"], axis=1)
    return returned_df

def household_dist_by_geo_group(stores_df, grunnkrets_household_pop_df, grunnkrets_norway_df, geo_groups):
    combined_df = stores_df.merge(grunnkrets_norway_df, how = "left", on = "grunnkrets_id")
    
    household_columns = ['couple_children_0_to_5_years', 'couple_children_18_or_above', 'couple_children_6_to_17_years', 'couple_without_children',
       'single_parent_children_0_to_5_years','single_parent_children_18_or_above','single_parent_children_6_to_17_years', 'singles',
       '%_dist_of_couple_children_0_to_5_years','%_dist_of_couple_children_18_or_above','%_dist_of_couple_children_6_to_17_years',
       '%_dist_of_couple_without_children','%_dist_of_single_parent_children_0_to_5_years','%_dist_of_single_parent_children_18_or_above',
       '%_dist_of_single_parent_children_6_to_17_years', '%_dist_of_singles']
       
    df_list = []

    for geo_group in geo_groups: 
        household_type_df = household_type_distrubution(grunnkrets_norway_df, grunnkrets_household_pop_df, geo_group)
        merged_df = combined_df.merge(household_type_df, how = "left", on = geo_group)[["store_id"] + household_columns]
        merged_df.set_index("store_id", inplace = True)
        merged_df2 = merged_df.add_prefix(f'{geo_group}_')
        df_list.append(merged_df2)
    return pd.concat(df_list, axis = 1)

def mean_income_per_capita(grunnkrets_age_df, grunnkrets_household_inc_df):
    "mean income per capita per grunnkrets"
    age_df = population(grunnkrets_age_df)
    age_and_income_df = age_df.merge(grunnkrets_household_inc_df, how='left', on='grunnkrets_id')
    mean_income = age_and_income_df.drop(['singles', 'couple_without_children',
                                         'couple_with_children', 'other_households', 'single_parent_with_children'], axis=1)
    mean_income['mean_income'] = mean_income['all_households'] / \
        mean_income['population_count']
    mean_income = mean_income.drop(['all_households'], axis=1)

    return mean_income

def mean_income_per_capita_grouped(grunnkrets_age_df, grunnkrets_household_inc_df, grunnkrets_norway_df, geo_group, agg_name):
    # gets data from mean_income_per_capita functino
    data_mean_income = mean_income_per_capita(grunnkrets_age_df, grunnkrets_household_inc_df)
    # gets data from geography set and makes sure we only use data for 2016
    # gets the data of mean income with the geography data
    mean_income_geo_df = data_mean_income.merge(
        grunnkrets_norway_df, how='left', on='grunnkrets_id')
    # sum the number of people based on grouping element
    grouped_population_df = mean_income_geo_df.groupby(
        [geo_group], as_index=False)["population_count"].sum()
    # merge this with the grunnkrets to see both total population per selected area and grunnkrets
    total_grouped_df = mean_income_geo_df.merge(
        grouped_population_df, how='left', on=geo_group)
    portion_income_df = total_grouped_df
    # find ration of grunnkrets to total population and multiply this with grunnkrets mean income
    portion_income_df['mean_income'] = total_grouped_df['mean_income'] * \
        total_grouped_df['population_count_x'] / \
        total_grouped_df['population_count_y']
    # add these incomes together, should add up to the total mean income for the selected area
    grouped_income_df = portion_income_df.groupby(
        [geo_group])["mean_income"].sum().reset_index(name=agg_name)
    return grouped_income_df

def mean_income_per_capita_by_geo_group(stores_df, grunnkrets_age_df, grunnkrets_household_inc_df, grunnkrets_norway_df, geo_groups):
    merged_df = stores_df.merge(grunnkrets_norway_df, how="left", on="grunnkrets_id")
    
    df_list = []
    for geo_group in geo_groups:
        df = mean_income_per_capita_grouped(grunnkrets_age_df, grunnkrets_household_inc_df, grunnkrets_norway_df, geo_group, agg_name=f'{geo_group}_mean_income')
        df_list.append(merged_df.merge(df, how="left", on=[geo_group])[['store_id', f'{geo_group}_mean_income']])
    
    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1)

def num_households(household_dist):
    household_dist = household_dist.copy()
    population = household_dist.drop(["grunnkrets_id"], axis=1).sum(axis=1)
    household_dist["household_count"] = population
    return household_dist[["grunnkrets_id", "household_count"]]

def num_households_geo(geo_group, household_dist, grunnkrets_df):
    _num_households = num_households(household_dist)
    merged_df = grunnkrets_df.merge(_num_households, on="grunnkrets_id", how="inner")
    return merged_df.groupby([geo_group], as_index=False)['household_count'].sum()

def total_grunnkrets_income(income_dist, household_dist):
    _num_households = num_households(household_dist)
    merged_df = income_dist.merge(_num_households, on="grunnkrets_id", how="inner")[['grunnkrets_id', 'household_count', 'all_households']]
    merged_df['total_income'] = merged_df['household_count'] * merged_df['all_households']
    return merged_df[['grunnkrets_id', 'total_income']]    

def total_income_geo(geo_group, income_dist, household_dist, grunnkrets_df):
    grunnkrets_income = total_grunnkrets_income(income_dist, household_dist)
    merged_df = grunnkrets_df.merge(grunnkrets_income, on="grunnkrets_id", how="inner")
    return merged_df.groupby([geo_group], as_index=False)['total_income'].sum()

def average_household_income_geo(geo_group, income_dist, household_dist, grunnkrets_df):
    income = total_income_geo(geo_group, income_dist, household_dist, grunnkrets_df)
    households = num_households_geo(geo_group, household_dist, grunnkrets_df)
    
    merged_df = income.merge(households, on=geo_group, how="inner")
    merged_df[f'avg_household_income_{geo_group}'] = merged_df['total_income'] / merged_df['household_count']
    return merged_df[[geo_group, f'avg_household_income_{geo_group}']]
    
def average_household_income_by_geo_groups(stores_df, geo_groups, income_dist, household_dist, grunnkrets_df):
    merged_df = stores_df.merge(grunnkrets_df, how="left", on="grunnkrets_id")
    
    df_list = []
    for geo_group in geo_groups:
        df = average_household_income_geo(geo_group, income_dist, household_dist, grunnkrets_df)
        df_list.append(merged_df.merge(df, how="left", on=[geo_group])[['store_id', f'avg_household_income_{geo_group}']])
    
    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1).reset_index()

def stores_in_radius(stores_df, compare_df, radius=0.1, store_type_group=None):
    mat = cdist(stores_df[['lat', 'lon']],
                compare_df[['lat', 'lon']], metric="euclidean")
    
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=compare_df['store_id']
    )
    
    if store_type_group is None:
        count = new_df[(new_df < radius) & (new_df > 0)].count(axis=1)
        return count.to_frame(name="all_stores_in_radius").reset_index()
    
    else:
        test_df = new_df[(new_df < radius) & (new_df > 0)]
        store_count = {}
        
        for index, row in test_df.iterrows():
            nearby_stores = row.dropna().index.values
            index_type = compare_df.loc[compare_df['store_id'] == index, store_type_group].iat[0]
            
            number_same = compare_df[(compare_df['store_id'].isin(nearby_stores)) & (
                compare_df[store_type_group] == index_type)]['store_id'].count()
            
            store_count[index] = number_same
        
        df = pd.DataFrame.from_dict(store_count, orient='index', columns=[f"{store_type_group}_in_radius"])
        df.index.rename('store_id', inplace=True)
        return df.reset_index()

def store_closest(stores_df, compare_df, store_type_group="lv4_desc"):
    """
    Id and distance of the closest store of same type in the same group.
    """
    
    store_types_in_group = stores_df[store_type_group].unique()
    df_list = []
    for store_type in store_types_in_group:
        stores_by_type = stores_df[stores_df[store_type_group] == store_type]
        stores_comp_by_type = compare_df[compare_df[store_type_group] == store_type]
        
        mat = cdist(stores_by_type[['lat', 'lon']], stores_comp_by_type[['lat', 'lon']], metric='euclidean')
        
        df = pd.DataFrame(
            mat, index=stores_by_type['store_id'], columns=stores_comp_by_type['store_id'])
        
        df = df[df > 0]
        
        stores = df.index
        closest = df.idxmin(axis=1)
        distance = df.min(axis=1)
        
        new_df = pd.DataFrame({'store_id': stores.values, 'closest_store': closest.values, 'distance': distance.values})
        df_list.append(new_df)
        
    
    return pd.concat(df_list, ignore_index=True)


def store_closest_by_store_groups(stores_df, compare_df, store_type_groups):
    df_list = []
    
    for store_type_group in store_type_groups:
        df = store_closest(stores_df, compare_df, store_type_group=store_type_group)
        df.rename(columns={'distance': f'distance_to_{store_type_group}'}, inplace=True)
        df_list.append(df[['store_id', f'distance_to_{store_type_group}']])

    dfs = [df.set_index('store_id') for df in df_list]
    return pd.concat(dfs, axis=1).reset_index()

def encode_levels(stores_df):
    stores_df['level2'] = stores_df['lv1'].astype(str) + "," + stores_df['lv2'].astype(str)
    stores_df['level3'] = stores_df['lv1'].astype(str) + "," + stores_df['lv2'].astype(str) + "," +stores_df['lv3'].astype(str)
    
    return stores_df


def new_age_dist(stores_df, age_df, grunnkrets_df, geo_groups):
    _age_dist = age_dist_by_geo_group(stores_df, age_df, grunnkrets_df, geo_groups)
    return _age_dist.fillna(_age_dist.mean())

def new_pop_density(stores_df, age_dist, grunnkrets_df, geo_groups):
    population_density = population_density_grouped_by_geo_group(stores_df, age_dist, grunnkrets_df, geo_groups)
    return population_density.fillna(population_density.mean())

def stores_in_radius_new(stores_merged, compare_df, radius=0.05):
    lv_1 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv1_desc')
    lv_2 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv2_desc')
    lv_3 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv3_desc')
    lv_4 = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group='lv4_desc')
    all_count = stores_in_radius(stores_merged, compare_df, radius=radius, store_type_group=None)
    
    return lv_1.merge(lv_2, on="store_id", how="inner").merge(lv_3, on="store_id", how="inner").merge(lv_4, on="store_id", how="inner").merge(all_count, on="store_id", how="inner")

def distance_to_closest_group(stores_df, compare_df, group):
    """
        Mall or chain
    """
    mat = cdist(
        stores_df[['lat', 'lon']],
        compare_df[compare_df[group].notna()][['lat', 'lon']], metric="euclidean"
    )
    
    new_df = pd.DataFrame(
        mat, index=stores_df['store_id'], columns=compare_df[compare_df[group].notna()]['store_id']
    )
    
    new_df = new_df[new_df > 0]
    
    stores = new_df.index
    # closest = new_df.idxmin(axis=1)
    distance = new_df.min(axis=1)
    
    return pd.DataFrame({'store_id': stores.values, f'distance_closest_{group}': distance.values})

In [93]:
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

raw_stores_train = raw_read(raw_path, "stores_train", {'grunnkrets_id':str})
raw_stores_test = raw_read(raw_path, "stores_test", {'grunnkrets_id':str})
raw_stores_extra = raw_read(raw_path,"stores_extra", {'grunnkrets_id':str})

raw_income_dist = raw_read(raw_path, "grunnkrets_income_households", {'grunnkrets_id':str})
raw_age_dist = raw_read(raw_path, "grunnkrets_age_distribution", {'grunnkrets_id':str})
raw_households_dist = raw_read(raw_path, "grunnkrets_households_num_persons", {'grunnkrets_id':str})
raw_grunnkrets = raw_read(raw_path, "grunnkrets_norway_stripped", {'grunnkrets_id':str})

raw_plaace = raw_read(raw_path, "plaace_hierarchy", {'lv1':str, 'lv2':str})
raw_bus_stops = raw_read(raw_path, "busstops_norway")

dedup_income_dist = deduplicate_year(raw_income_dist)
dedup_age_dist = deduplicate_year(raw_age_dist)
dedup_households_dist = deduplicate_year(raw_households_dist)
dedup_grunnkrets = combine_keys(deduplicate_year(raw_grunnkrets))

enriched_bus_stops = bus_stops_lat_lon(raw_bus_stops)

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [94]:
stores_train_merged = raw_stores_train.merge(raw_plaace, on="plaace_hierarchy_id", how="left")
stores_test_merged = raw_stores_test.merge(raw_plaace, on="plaace_hierarchy_id", how="left")
stores_extra_merged = raw_stores_extra.merge(raw_plaace, on="plaace_hierarchy_id", how="left")

compare_train_df = pd.concat([stores_train_merged, stores_extra_merged], ignore_index=True)
compare_test_df = pd.concat([stores_test_merged, stores_extra_merged], ignore_index=True)

stores_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
geo_groups = ['grunnkrets_id', 't_district', 'municipality_name']
stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']

In [95]:
train_df = stores_train_merged
test_df = stores_test_merged 

def firstname(string): 
    if " " in string: 
        newstring = string.split(" ")[:-1]
        return " ".join(newstring)
    else: 
        return string

def lastname(string):
    if " " in string: 
        if string.split(" ")[-1] == 'AS': 
            return string.split(" ")[-2]
        else: 
            return string.split(" ")[-1]  
    else: 
        return string


def address(string): 
    newstring = ""
    for char in string: 
        if not char.isdigit(): 
            newstring += char
        else: 
            return newstring

def district(string): 
    string = string.lower()
    if 'sentrum' in string:
        return 'sentrum'
    else: 
        return string

train_df['lastname'] = train_df["store_name"].apply(lastname)
test_df['lastname'] = test_df["store_name"].apply(lastname)

train_df['firstname'] = train_df["store_name"].apply(firstname)
test_df['firstname'] = test_df["store_name"].apply(firstname)

train_df['AS'] = (train_df['lastname'] == 'AS').astype(str)

train_df['lastname'] = train_df["store_name"].apply(lastname)

test_df['AS'] = (test_df['lastname'] == 'AS').astype(str)

test_df['lastname'] = test_df["store_name"].apply(lastname)


train_df['last_count'] = train_df.groupby('lastname')['lastname'].transform('count')
train_df['first_count'] = train_df.groupby('firstname')['firstname'].transform('count')

test_df['last_count'] = test_df.groupby('lastname')['lastname'].transform('count')
test_df['first_count'] = test_df.groupby('firstname')['firstname'].transform('count')

In [96]:
merged_stores_train = train_df \
    .merge(dedup_grunnkrets, on="grunnkrets_id", how="left") \
    .merge(dedup_income_dist, on="grunnkrets_id", how="left") \
    .merge(store_closest_by_store_groups(stores_train_merged, compare_train_df, stores_types), on="store_id", how="left") \
    .merge(stores_in_radius_new(stores_train_merged, compare_train_df), on="store_id", how="left") \
    .merge(new_pop_density(raw_stores_train, dedup_age_dist, dedup_grunnkrets, geo_groups), on="store_id", how="left") \
    .merge(average_household_income_by_geo_groups(raw_stores_train, geo_groups, raw_income_dist, dedup_households_dist, dedup_grunnkrets)) \
    .merge(bus_stops_distance_by_importance(raw_stores_train, enriched_bus_stops, stop_importance_levels).reset_index(level=0), on="store_id", how="left") \
    .merge(new_age_dist(raw_stores_train, dedup_age_dist, dedup_grunnkrets, geo_groups), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_train_merged, compare_train_df, "mall_name"), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_train_merged, compare_train_df, "chain_name"), on="store_id", how="left") 
    
merged_stores_test = test_df \
    .merge(dedup_grunnkrets, on="grunnkrets_id", how="left") \
    .merge(dedup_income_dist, on="grunnkrets_id", how="left") \
    .merge(store_closest_by_store_groups(stores_test_merged, compare_test_df, stores_types), on="store_id", how="left") \
    .merge(stores_in_radius_new(stores_test_merged, compare_test_df), on="store_id", how="left") \
    .merge(new_pop_density(raw_stores_test, dedup_age_dist, dedup_grunnkrets, geo_groups), on="store_id", how="left") \
    .merge(average_household_income_by_geo_groups(raw_stores_test, geo_groups, raw_income_dist, dedup_households_dist, dedup_grunnkrets)) \
    .merge(bus_stops_distance_by_importance(raw_stores_test, enriched_bus_stops, stop_importance_levels).reset_index(level=0), on="store_id", how="left") \
    .merge(new_age_dist(raw_stores_test, dedup_age_dist, dedup_grunnkrets, geo_groups), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_test_merged, compare_test_df, "mall_name"), on="store_id", how="left") \
    .merge(distance_to_closest_group(stores_test_merged, compare_test_df, "chain_name"), on="store_id", how="left") 

  return population_density.fillna(population_density.mean())
  return _age_dist.fillna(_age_dist.mean())
  return population_density.fillna(population_density.mean())
  return _age_dist.fillna(_age_dist.mean())


In [97]:
target_labels = merged_stores_train[['store_id', 'revenue']].copy()
merged_stores_train = merged_stores_train.copy().drop('revenue', axis=1)

In [98]:
merged_stores_train["lv2"] = merged_stores_train["lv2"].apply(lambda x: x.replace(".","_"))
merged_stores_train["lv3"] = merged_stores_train["lv3"].apply(lambda x: x.replace(".","_"))
merged_stores_train["lv4"] = merged_stores_train["lv4"].apply(lambda x: x.replace(".","_"))

merged_stores_test["lv2"] = merged_stores_test["lv2"].apply(lambda x: x.replace(".","_"))
merged_stores_test["lv3"] = merged_stores_test["lv3"].apply(lambda x: x.replace(".","_"))
merged_stores_test["lv4"] = merged_stores_test["lv4"].apply(lambda x: x.replace(".","_"))

In [108]:
inc_cols = [
    'store_id',
    "plaace_hierarchy_id",
    'chain_name',
    'mall_name',
    'sales_channel_name_x',
    'municipality_name',
    "grunnkrets_id",
    "t_district",
    "grunnkrets_name",
    "address",
    'lv1',
    'lv2',
    'lv3',
    'lv4',  
    
    'firstname',
    'lastname',
    'AS',
]

yeo_cols = [
    "lv1_desc_in_radius",
    "lv2_desc_in_radius",
    "lv3_desc_in_radius",
    "lv4_desc_in_radius",
    "all_stores_in_radius", 
    "all_households",
    "singles",
    "couple_without_children",
    "couple_with_children",
    "other_households",
    "single_parent_with_children",
    'last_count',
    'first_count',  
]

box_cols = [
    "distance_closest_mall_name",
    "distance_closest_chain_name",
    "distance_to_lv1_desc",
    "distance_to_lv2_desc",
    "distance_to_lv3_desc",
    "distance_to_lv4_desc",
    'distance_to_mangler_viktighetsnivå',
    'distance_to_standard_holdeplass',
    'distance_to_lokalt_knutepunkt',
    'distance_to_nasjonalt_knutepunkt',
    'distance_to_regionalt_knutepunkt',
    'distance_to_annen_viktig_holdeplass',
    
    "grunnkrets_id_density",
    "t_district_density",
    "municipality_name_density", 
]

_merged_stores_train = merged_stores_train.filter(inc_cols+yeo_cols+box_cols)
_merged_stores_test = merged_stores_test.filter(inc_cols+yeo_cols+box_cols)

In [100]:
yeo_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    PowerTransformer()
)
box_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    PowerTransformer(method="box-cox")
)

preprocessing = make_column_transformer(
        (yeo_pipeline, yeo_cols),
        (box_pipeline, box_cols),
        remainder="passthrough"
    )

def new_transformer(merged_stores_df, preprocessing):
    return pd.DataFrame(preprocessing.fit_transform(merged_stores_df), columns=preprocessing.get_feature_names_out(), index=merged_stores_df.index)

In [101]:
_merged_stores_train = new_transformer(_merged_stores_train, preprocessing)
_merged_stores_test = new_transformer(_merged_stores_test, preprocessing)

In [102]:
pt = PowerTransformer()
rev_transformed = pt.fit_transform(target_labels[["revenue"]])
_merged_stores_train["revenue"] = rev_transformed

In [103]:
_merged_stores_train = _merged_stores_train[(_merged_stores_train.revenue > -1.888)]

In [104]:
h2o.init()

# Import a sample binary outcome train/test set into H2O
train = h2o.H2OFrame(_merged_stores_train)
test = h2o.H2OFrame(_merged_stores_test)

cat_vars = inc_cols

cat_vars = [f'remainder__{i}' for i in cat_vars if i != 'store_id']

for cat in cat_vars:
    train[cat] = train[cat].asfactor()
    test[cat] = test[cat].asfactor()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 32 mins
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_nwong_ekvwsk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.736 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [105]:
x = train.columns
y = "revenue"
x.remove(y)

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1, exclude_algos=['deeplearning'])
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |
22:59:32.233: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

███
22:59:49.40: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

███
23:00:10.910: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]


23:00:15.693: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

██
23:00:28.487: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

█████████
23:03:17.757: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]
23:03:21.872: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

█
23:03:27.683: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

██
23:03:38.932: _train param, Dropping bad and constant columns: [remainder__AS, remainder__store_id]

█
23:03:48.398: _train param, Dropping bad and 

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.540311,0.0049839,0.536622,0.5448842,0.5464153,0.5380719,0.5355617
mean_residual_deviance,0.4914147,0.0056905,0.4853008,0.4972618,0.4965665,0.4920937,0.4858508
mse,0.4914147,0.0056905,0.4853008,0.4972618,0.4965665,0.4920937,0.4858508
null_deviance,2409.9536,78.479866,2393.7407,2432.2131,2502.3323,2433.1992,2288.2827
r2,0.4842008,0.005715,0.4771154,0.4801295,0.4897929,0.4840578,0.4899082
residual_deviance,1242.0419,43.10652,1251.5908,1264.0394,1275.6793,1251.8864,1167.0137
rmse,0.7010004,0.0040599,0.6966354,0.7051679,0.7046747,0.7014939,0.69703
rmsle,,0.0,,,,,


In [109]:
preds_avg = aml.predict(test)
preds_best = aml.leader.predict(test)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |



███████████████████████████████████████████| (done) 100%


In [110]:
df = test.cbind(preds_best)
df = df.as_data_frame(use_pandas=True)
result = df.loc[:,("remainder__store_id", 'predict')]
submission = result.rename(columns = {"remainder__store_id": "id",  "predict" : "predicted"})
submission['predicted'] = pt.inverse_transform(submission[['predicted']])
submission.to_csv("sun_2233_submit.csv", index = False)

Feature names unseen at fit time:
- predicted
Feature names seen at fit time, yet now missing:
- revenue

