# Short Notebook 
### TDT4173

#### Haakon Paaske (haakopaa) and Julia Lövgren (julialov) <br>
#### Kaggle team: Group 046

### Feature Engineering

In [1]:
import math
import geopy.distance
from dis import dis
import math
import geopandas as gpd
import numpy
from shapely import wkt
from shapely import wkb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shapely.geometry import Point
from os import path
from scipy.spatial import cKDTree
from shapely.geometry import Point


train = pd.read_csv('data/stores_train.csv')
test= pd.read_csv('data/stores_test.csv')
busstops = pd.read_csv('data/busstops_norway.csv')
grunnkrets_age = pd.read_csv('data/grunnkrets_age_distribution.csv')
grunnkrets_households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv('data/grunnkrets_income_households.csv')
grunnkrets_stripped = pd.read_csv('data/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv('data/plaace_hierarchy.csv')

In [2]:
# CREATING FUNCTIONS

In [3]:
def rev_to_log(train_data):
    """ Since we are trying to minimize the rmsle and not the rmse, a log transfom on the revenue would help the algorithms who dont have rmsle as a target metric  """
    if train_data._get_value(1, 'revenue') > 15:
        train_data['revenue'] = np.log1p(train_data['revenue'])
    return train_data

def remove_zero_revenue(train_data):
    """ Found out by just trying to remove all the stores with 0.1 or lower revenue in the training set gave better score on the test set """
    train_data = train_data.loc[train_data["revenue"] > 0.1]
    train_data = train_data.reset_index(drop=True)
    return train_data
    
def dist_to_all_km(lat, lon, df):
    """ A haversine function we use a lot to get the distances between stores, busstops etc """
    # coordinates in radians
    lat1 = lat*math.pi/180
    lon1 = lon*math.pi/180
    lat2 = df['lat']*math.pi/180 # go through whole lat column
    lon2 = df['lon']*math.pi/180 # go through whole lon column

    # store original coordinates in new dataframe
    distances = pd.DataFrame()
    distances['lat'] = df['lat'].copy()
    distances['lon'] = df['lon'].copy()

    # calculate cartesian coordinates
    R = 6371 # Earth radius in km
    df['x'] = R*np.cos(lat2)*np.cos(lon2)
    df['y'] = R*np.cos(lat2)*np.sin(lon2)
    df['z'] = R*np.sin(lat2)
    x1 = R*np.cos(lat1)*np.cos(lon1)
    y1 = R*np.cos(lat1)*np.sin(lon1)
    z1 = R*np.sin(lat1)

    # calculate distance, store as new column in the distances dataframe
    distances['dist'] = np.sqrt(np.square(df['x']-x1)+np.square(df['y']-y1)+np.square(df['z']-z1))

    return distances['dist'].squeeze()

In [4]:
# Creating the initial dataset with every feature we want

def lat_long_busstop():
    """ Need lat lon coordinate for all the shapely points in the busstop dataset. """
    busstops['geometry'] = gpd.GeoSeries.from_wkt(busstops['geometry'])
    busstops['lat'] = busstops.geometry.apply(lambda x: x.y)
    busstops['lon'] = busstops.geometry.apply(lambda x: x.x)


def convert_nan(train_data):
    """ Need a way to handle nan values in chain and mall. Decided to turn NaN in both these categories into "no mall" and "no chain" """
    # Replace NaN in mall_name and chain_name columns with 'No mall' and 'No chain'
    train_data.mall_name = train_data.mall_name.fillna('No mall')
    train_data.chain_name = train_data.chain_name.fillna('No chain')
    train_data.address = train_data.address.fillna('No Address')
    train_data['mall_name']= train_data['mall_name'].astype('category')
    train_data['chain_name']= train_data['chain_name'].astype('category')
    return train_data

def combine_grunnkrets_and_data(train_data, test_data):
    """ Map every grunnkrets to its municipality. Found out that grunnkrets are so small that it would not make too much sense to use it as a platform for further features. """
    df = pd.DataFrame(train_data, columns=['store_id', 'lat', 'lon', 'grunnkrets_id'])
    df = pd.merge(df, grunnkrets_stripped[['grunnkrets_id', 'municipality_name']], on='grunnkrets_id', how='left')
    df = df.drop_duplicates(subset=['store_id'], keep='first').reset_index(drop=True)

    # df.isnull().sum() # 30 NaNs in municipality_name

    index_of_NaN, idy = np.where(pd.isnull(df))

    for i in index_of_NaN:
        lat = df._get_value(i,'lat')
        lon = df._get_value(i,'lon')
        dist_to_all = dist_to_all_km(lat, lon, df).to_frame()
        dist_to_all = dist_to_all.sort_values('dist').reset_index()
        dist_to_all = dist_to_all.rename(columns={'index':'index_in_train_data'})

        # Find index of closest store that isn't itself and that has a grunnkrets_id
        index_of_closest_store = 0
        while dist_to_all.iloc[index_of_closest_store]['dist'] == 0 or pd.isna(df.at[dist_to_all.iloc[index_of_closest_store]['index_in_train_data'], 'municipality_name']):
            index_of_closest_store = index_of_closest_store + 1
        
        # Set municipality_name as the municipality name of the closest store that has one
        df.at[i,'municipality_name'] = df.at[dist_to_all.iloc[index_of_closest_store]['index_in_train_data'], 'municipality_name']

        # Add municipality names
    train_data = pd.merge(train_data, df[['grunnkrets_id', 'municipality_name']], on='grunnkrets_id', how='left')
    # we get a bunch of duplicates of store_ids...? Remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # For test
    test_data = pd.merge(test_data, df[['grunnkrets_id', 'municipality_name']], on='grunnkrets_id', how='left')
    # we get a bunch of duplicates of store_ids...? Remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')

    return train_data, test_data


def create_lvl(train_data):
    """ Makes sure the model interprets the levels as category and not int/float. important for lv1 and lv2. """
    global plaace_hierarchy

    train_data = pd.merge(train_data, plaace_hierarchy[['plaace_hierarchy_id', 'lv1', 'lv2', 'lv3', 'lv4']], on='plaace_hierarchy_id', how='outer')
    train_data['lv1']= train_data['lv1'].astype('category')
    train_data['lv2']= train_data['lv2'].astype('category')
    train_data['lv3']= train_data['lv3'].astype('category')
    train_data['lv4']= train_data['lv4'].astype('category')
    
    #drop the last broken columns
    train_data = train_data.dropna(subset=['store_id'])
    return train_data


In [5]:
# BUSSTOP FEATURES

def busstops_wihin_distances(train_data): 
    """ Decided to calculate the number of busstops, excluding importance level, within many different distances. We can the chose which distances works best for our model """
    train_data['busstops_within_50m'] = 0
    train_data['busstops_within_100m'] = 0
    train_data['busstops_within_400m'] = 0
    train_data['busstops_within_800m'] = 0
    train_data['busstops_within_1500m'] = 0
    train_data['busstops_within_3000m'] = 0
    train_data['busstops_within_6000m'] = 0
    


    iter = 0

    for index in range(len(train_data)):
        one_to_all = dist_to_all_km(train_data._get_value(index, 'lat'), train_data._get_value(index, 'lon'), busstops)
        one_to_all = one_to_all.to_frame()
        one_to_all.rename( columns={0 :'a'}, inplace=True)
        count50 = (one_to_all < 0.05).sum()
        count100 = (one_to_all < 0.1).sum()
        count400 = (one_to_all < 0.4).sum()
        count800 = (one_to_all < 0.8).sum()
        count1500 = (one_to_all < 1.5).sum()
        count3000 = (one_to_all < 3).sum()
        count6000 = (one_to_all < 6).sum()

        iter += 1
        print(str(iter) + "/" + str(len(train_data)))
        train_data._set_value(index, 'busstops_within_50m', count50)
        train_data._set_value(index, 'busstops_within_100m', count100)
        train_data._set_value(index, 'busstops_within_400m', count400)
        train_data._set_value(index, 'busstops_within_800m', count800)
        train_data._set_value(index, 'busstops_within_1500m', count1500)
        train_data._set_value(index, 'busstops_within_3000m', count3000)
        train_data._set_value(index, 'busstops_within_6000m', count6000)
    return train_data


In [6]:
# GRUNNKRETS AND MUNICIPALITY FEATURES

def prep_gk():
    """ Just a function which fills nan values and combines some datasets that we need for later """
    global grunnkrets_age
    global grunnkrets_stripped
    global train
    grunnkrets_age['total_nbr_people'] = 0
    grunnkrets_age = grunnkrets_age.drop_duplicates(subset=['grunnkrets_id'], keep='last') # if there is value for 2016 we keep it, otherwise 2015
    grunnkrets_age = grunnkrets_age.fillna(0)
    grunnkrets_age = grunnkrets_age.drop('year',axis=1)
    grunnkrets_age['grunnkrets_id'] = grunnkrets_age['grunnkrets_id'].astype(str)
    grunnkrets_age['total_nbr_people'] = grunnkrets_age.sum(axis=1) # total number of inhabitants
    grunnkrets_age['grunnkrets_id'] = grunnkrets_age['grunnkrets_id'].astype(int)

    number_stores = train['grunnkrets_id'].value_counts().rename_axis('grunnkrets_id').reset_index(name='store_counts_total') # Not including NaN (stores without a grunnkrets_id)
    grunnkrets_stripped = pd.merge(grunnkrets_stripped, number_stores[['grunnkrets_id', 'store_counts_total']], on='grunnkrets_id', how='left')
    grunnkrets_stripped.store_counts_total = grunnkrets_stripped.store_counts_total.fillna(0)
    grunnkrets_stripped = pd.merge(grunnkrets_stripped, grunnkrets_age[['grunnkrets_id', 'total_nbr_people']], on='grunnkrets_id', how='left')
    grunnkrets_stripped['nbr_people_per_store_in_grunnkrets'] = grunnkrets_stripped['total_nbr_people']/grunnkrets_stripped['store_counts_total']


def people_per_gk(train_data):
    """ Calculates the number of people that lives in each grunnkrets """
    train_data = pd.merge(train_data, grunnkrets_age[['grunnkrets_id', 'total_nbr_people']], on='grunnkrets_id', how='left')
    train_data['total_nbr_people'] = train_data['total_nbr_people'].fillna(0)
    return train_data

def people_per_store_in_each_gk(train_data):
    """ Calculates how many people there is per store in every grunnkrets.  """
    train_data = pd.merge(train_data, grunnkrets_stripped[['grunnkrets_id', 'nbr_people_per_store_in_grunnkrets']], on='grunnkrets_id', how='left')

    #dont know if we need this
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    train_data['nbr_people_per_store_in_grunnkrets'] = train_data['nbr_people_per_store_in_grunnkrets'].fillna(0)
    return train_data


def people_per_store_with_same_lvl2_in_each_gk(train_data):
    """ Calculates the number of people in a grunnkrets devided by the number of stores with same level 2 in that grunnkrets. """
    # Number of people per store in each grunnkrets in lv2
    counts = train_data[["store_id", "grunnkrets_id", "lv2"]].groupby(
        ["grunnkrets_id", "lv2"]
    ).count().reset_index()
    counts.columns = ["grunnkrets_id", "lv2", "counts_gr_lv2"]
    train_data = train_data.merge(counts, how="left", on=["grunnkrets_id", "lv2"])

    #dont know if we need this
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data
    
def people_per_municipality(train_data, test_data):
    """ Calculates the number of people in every municipality and creates size groups of municipalities based on number of people.
    The splits into size groups are determined based on plotting the distribution of number of people in each municipality.
    The purpose of creating size groups is to include revenue based features per municipality size group to avoid the potential data leakage of including for example mean revenue per single municipality or grunnkrets, where there might be very few stores in each municipality/grunnkrets """
    # Total nbr people in each municipality
    municipalities = train_data[["municipality_name", "total_nbr_people"]].groupby(
        ["municipality_name"]
    ).sum().reset_index()

    municipalities = municipalities.rename(columns={'total_nbr_people':'nbr_people_in_municipality'})
    municipalities = municipalities.sort_values('nbr_people_in_municipality', ascending = False)

    # Print distribution to check relevant division into small/medium/large size municipality groups
    # print(municipalities['nbr_people_in_municipality'].describe())
    # plt.figure()
    # sns.histplot(data=municipalities, x='nbr_people_in_municipality') # We see that two municipalities are very large, plot again without them
    # plt.figure()
    # sns.histplot(data=municipalities[municipalities['nbr_people_in_municipality']<500000], x='nbr_people_in_municipality') # excluding the two largest
    # The above prints show reasonable splits for size groups, used below in conditions

    # Make new column in municipalities for municipality size category, assign categories. Values are based on the above EDA of the distribution of nbr_people_in_municipality.
    conditions = [
        (municipalities['nbr_people_in_municipality'] < 25000),
        (municipalities['nbr_people_in_municipality'] >= 25000) & (municipalities['nbr_people_in_municipality'] < 75000),
        (municipalities['nbr_people_in_municipality'] >= 75000) & (municipalities['nbr_people_in_municipality'] < 150000),
        (municipalities['nbr_people_in_municipality'] >= 150000) & (municipalities['nbr_people_in_municipality'] < 500000),
        (municipalities['nbr_people_in_municipality'] >= 500000),
    ]

    values = ['below25k', 'between25and75k', 'between75and150k', 'between150and500k', 'above500k']
    municipalities['municipality_size_group'] = np.select(conditions, values)

    municipalities['municipality_size_group'].value_counts()

    # merge to train data
    train_data = pd.merge(train_data, municipalities[['municipality_name', 'municipality_size_group']], on='municipality_name', how='outer')

    # merge to test data
    test_data = pd.merge(test_data, municipalities[['municipality_name', 'municipality_size_group']], on='municipality_name', how='outer')
    return train_data, test_data

def people_per_sotre_with_same_lvl2_in_each_muninicipality(train_data):
    """ Number of people per store with the same level 2 plaaace hierarchy in each municipality. """
    nbr_in_municipality = train_data[["store_id","municipality_name", "lv2"]].groupby(
        ["municipality_name", "lv2"]
    ).count().reset_index()
    nbr_in_municipality.columns = ["municipality_name", "lv2", "counts_municipality_lv2"]
    train_data = train_data.merge(nbr_in_municipality, how="left", on=["municipality_name", "lv2"])

    #dont know if we need this
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data

# 
#
def mean_rev_size_group(train_data, test_data):
    """ Mean revenue of each municipality size group """
    mean_rev_munic = train_data[["municipality_size_group", "revenue"]].groupby(
    ["municipality_size_group"]
    ).mean().reset_index()
    mean_rev_munic = mean_rev_munic.rename(columns={'revenue':'mean_revenue_for_municipality_size_group'})

    # merge to train data
    train_data = train_data.merge(mean_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def median_rev_size_group(train_data, test_data):
    """ Median revenue of each municipality size group """
    median_rev_munic = train_data[["municipality_size_group", "revenue"]].groupby(
        ["municipality_size_group"]
    ).median().reset_index()
    median_rev_munic = median_rev_munic.rename(columns={'revenue':'median_revenue_for_municipality_size_group'})

    # merge to train data
    train_data = train_data.merge(median_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    # merge to test data
    test_data = test_data.merge(median_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def std_red_size_group(train_data, test_data):
    """ Standard deviation of revenue of each municipality size group """
    stdev_munic = train_data[["municipality_size_group", "revenue"]].groupby(
        ["municipality_size_group"]
    ).std().reset_index()
    stdev_munic = stdev_munic.rename(columns={'revenue':'st_dev_of_revenue_for_municipality_size_group'})
    
    # merge to train data
    train_data = train_data.merge(stdev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    # merge to test data
    test_data = test_data.merge(stdev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data


def mean_rev_municipality(train_data, test_data):
    """ Calculates the mean revenue in each municipality and creates groups of municipalities based on their revenue.
    The splits into revenue groups are determined based on plotting the distribution of mean revenue in each municipality.
    The purpose of creating revenue groups is to include revenue based features per municipality revenue group to avoid the potential data leakage of including for example mean revenue per single municipality or grunnkrets, where there might be very few stores in each municipality/grunnkrets """

    # Mean revenue for municipality
    municipalities_rev = train_data[["municipality_name", "revenue"]].groupby(
        ["municipality_name"]
    ).mean().reset_index()
    municipalities_rev = municipalities_rev.rename(columns={'revenue':'mean_revenue_for_municipality'})

    # Print distribution to check relevant division into low/medium/high revenue municipality groups
    #print(municipalities_rev['mean_revenue_for_municipality'].describe())
    #plt.figure()
    #sns.histplot(data=municipalities_rev, x='mean_revenue_for_municipality')
    # The above prints show reasonable splits for mean revenue groups, used below in conditions

    # Make new column in municipalities_rev for municipality revenue category, assign categories. Values are based on the above EDA of the distribution of mean_revenue_for_municipality.
    conds = [
          (municipalities_rev['mean_revenue_for_municipality'] < 0.7),
          (municipalities_rev['mean_revenue_for_municipality'] >= 0.7) & (municipalities_rev['mean_revenue_for_municipality'] < 1.2),
          (municipalities_rev['mean_revenue_for_municipality'] >= 1.2) & (municipalities_rev['mean_revenue_for_municipality'] < 2),
          (municipalities_rev['mean_revenue_for_municipality'] >= 2) & (municipalities_rev['mean_revenue_for_municipality'] < 2.7),
          (municipalities_rev['mean_revenue_for_municipality'] >= 2.7)
    ]
    vals = ['rev_below_0.7', 'rev_0.7_to_1.2', 'rev_1.2_to_2', 'rev_2_to_2.7', 'rev_above_2.7']
    municipalities_rev['municipality_rev_group'] = np.select(conds, vals)

    # merge to train data
    train_data = pd.merge(train_data, municipalities_rev[['municipality_name', 'municipality_rev_group']], on='municipality_name', how='outer')

    # merge to test data
    test_data = pd.merge(test_data, municipalities_rev[['municipality_name', 'municipality_rev_group']], on='municipality_name', how='outer')
    
    return train_data, test_data

def mean_rev_rev_group(train_data, test_data):
    """ Mean revenue of each municipality revenue group """
    mean_rev_munic = train_data[["municipality_rev_group", "revenue"]].groupby(
        ["municipality_rev_group"]
    ).mean().reset_index()
    mean_rev_munic = mean_rev_munic.rename(columns={'revenue':'mean_revenue_for_municipality_rev_group'})

    # merge to train data
    train_data = train_data.merge(mean_rev_munic, how="left", on=["municipality_rev_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev_munic, how="left", on=["municipality_rev_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')

    return train_data, test_data

def median_rev_rev_group(train_data, test_data):
    """ Median revenue of each municipality revenue group """
    median_rev_munic = train_data[["municipality_rev_group", "revenue"]].groupby(
        ["municipality_rev_group"]
    ).median().reset_index()
    median_rev_munic = median_rev_munic.rename(columns={'revenue':'median_revenue_for_municipality_rev_group'})

    # merge to train data
    train_data = train_data.merge(median_rev_munic, how="left", on=["municipality_rev_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(median_rev_munic, how="left", on=["municipality_rev_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')

    return train_data, test_data


In [7]:
# STORE TO STORE FEATURES

""" Below are some functions that calculates the number of stores based on criteria within different distances for every store in the training set and testing set. 
We decided to use stores extra, stores_train and stores_test concatinated into a single dataframe.
We then calculated de distance from all the stores in the train set and test set to all the stores in the concatinated set. 
We did this with criterias lv2, lv3, and lv4. 
We also calculated the distance to the closest competitor for each lvl.  """

def store_dist_lvl2(df):
    """ Number of stores in the same hierarchy level 2 within 100 m, 500 m, 1 km, 5 km, 10 km and 20 km respectively,
    and the distance to the closest competitor, i. e. the closest store with the same hierarchy level 2. """
    # create a datafram with all stores extra
    test = pd.read_csv('data/stores_test.csv')
    train = pd.read_csv('data/stores_train.csv')
    extra = pd.read_csv('data/stores_extra.csv')
    extra_df = pd.DataFrame()
    extra_df = extra_df.append(test).append(train).append(extra)
    extra_df = create_lvl(extra_df)

    df['num_stores_within_100m_and_same_lvl2'] = 0
    df['num_stores_within_500m_and_same_lvl2'] = 0
    df['num_stores_within_1km_and_same_lvl2'] = 0
    df['num_stores_within_5km_and_same_lvl2'] = 0
    df['num_stores_within_10km_and_same_lvl2'] = 0
    df['num_stores_within_20km_and_same_lvl2'] = 0
    df['closest_competitor_lv2'] = 10.0
    num = 0 

    for index in range(len(df)):
        lat = df._get_value(index,'lat')
        lon = df._get_value(index,'lon')
        lvl = df._get_value(index, 'lv2')
        dist_to_all = dist_to_all_km(lat, lon, extra_df)

        count01 = 0
        count05 = 0
        count1 = 0
        count5 = 0
        count10 = 0
        count20 = 0
        closest = 100.0

        iter = 0
        for number in dist_to_all:
            if number < 0.1 and extra_df._get_value(iter, 'lv2') == lvl:
                count01 += 1
            if number < 0.5 and extra_df._get_value(iter, 'lv2') == lvl:
                count05 += 1
            if number < 1 and extra_df._get_value(iter, 'lv2') == lvl:
                count1 += 1
            if number < 5 and extra_df._get_value(iter, 'lv2') == lvl:
                count5 += 1
            if number < 10 and extra_df._get_value(iter, 'lv2') == lvl:
                count10 +=1
            if number < 20 and extra_df._get_value(iter, 'lv2') == lvl:
                count20 += 1
            if number < closest and extra_df._get_value(iter, 'lv2') == lvl and number != 0.0:
                closest = number
            iter += 1

        df._set_value(index, 'num_stores_within_100m_and_same_lvl2', count01)
        df._set_value(index, 'num_stores_within_500m_and_same_lvl2', count05)
        df._set_value(index, 'num_stores_within_1km_and_same_lvl2', count1)
        df._set_value(index, 'num_stores_within_5km_and_same_lvl2', count5)
        df._set_value(index, 'num_stores_within_10km_and_same_lvl2', count10)
        df._set_value(index, 'num_stores_within_20km_and_same_lvl2', count20)
        df._set_value(index, 'closest_competitor_lv2', float(closest))

        num+=1 #for runtime tracking only
        print(num)
    return df


def store_dist_lvl3(df):
    """ Number of stores in the same hierarchy level 3 within 100 m, 500 m, 1 km, 5 km, 10 km and 20 km respectively,
    and the distance to the closest competitor, i. e. the closest store with the same hierarchy level 3. """
    # create a datafram with all stores extra
    test = pd.read_csv('data/stores_test.csv')
    train = pd.read_csv('data/stores_train.csv')
    extra = pd.read_csv('data/stores_extra.csv')
    extra_df = pd.DataFrame()
    extra_df = extra_df.append(test).append(train).append(extra)
    extra_df = create_lvl(extra_df)

    df['num_stores_within_100m_and_same_lvl3'] = 0
    df['num_stores_within_500m_and_same_lvl3'] = 0
    df['num_stores_within_1km_and_same_lvl3'] = 0
    df['num_stores_within_5km_and_same_lvl3'] = 0
    df['num_stores_within_10km_and_same_lvl3'] = 0
    df['num_stores_within_20km_and_same_lvl3'] = 0
    df['closest_competitor_lv3'] = 10.0
    num = 0 

    for index in range(len(df)):
        lat = df._get_value(index,'lat')
        lon = df._get_value(index,'lon')
        lvl = df._get_value(index, 'lv3')
        dist_to_all = dist_to_all_km(lat, lon, extra_df)

        count01 = 0
        count05 = 0
        count1 = 0
        count5 = 0
        count10 = 0
        count20 = 0
        closest = 100.0

        iter = 0
        for number in dist_to_all:
            if number < 0.1 and extra_df._get_value(iter, 'lv3') == lvl:
                count01 += 1
            if number < 0.5 and extra_df._get_value(iter, 'lv3') == lvl:
                count05 += 1
            if number < 1 and extra_df._get_value(iter, 'lv3') == lvl:
                count1 += 1
            if number < 5 and extra_df._get_value(iter, 'lv3') == lvl:
                count5 += 1
            if number < 10 and extra_df._get_value(iter, 'lv3') == lvl:
                count10 +=1
            if number < 20 and extra_df._get_value(iter, 'lv3') == lvl:
                count20 += 1
            if number < closest and extra_df._get_value(iter, 'lv3') == lvl and number != 0:
                closest = number
            iter += 1

        df._set_value(index, 'num_stores_within_100m_and_same_lvl3', count01)
        df._set_value(index, 'num_stores_within_500m_and_same_lvl3', count05)
        df._set_value(index, 'num_stores_within_1km_and_same_lvl3', count1)
        df._set_value(index, 'num_stores_within_5km_and_same_lvl3', count5)
        df._set_value(index, 'num_stores_within_10km_and_same_lvl3', count10)
        df._set_value(index, 'num_stores_within_20km_and_same_lvl3', count20)
        df._set_value(index, 'closest_competitor_lv3', float(closest))

        num+=1 #for runtime tracking only
        print(num)
    return df

def store_dist_lvl4(df):
    """ Number of stores in the same hierarchy level 4 within 100 m, 500 m, 1 km, 5 km, 10 km and 20 km respectively,
    and the distance to the closest competitor, i. e. the closest store with the same hierarchy level 4. """
    # create a datafram with all stores extra
    test = pd.read_csv('data/stores_test.csv')
    train = pd.read_csv('data/stores_train.csv')
    extra = pd.read_csv('data/stores_extra.csv')
    extra_df = pd.DataFrame()
    extra_df = extra_df.append(test).append(train).append(extra)
    extra_df = create_lvl(extra_df)

    df['num_stores_within_100m_and_same_lvl4'] = 0
    df['num_stores_within_500m_and_same_lvl4'] = 0
    df['num_stores_within_1km_and_same_lvl4'] = 0
    df['num_stores_within_5km_and_same_lvl4'] = 0
    df['num_stores_within_10km_and_same_lvl4'] = 0
    df['num_stores_within_20km_and_same_lvl4'] = 0
    df['closest_competitor_lv4'] = 10.0
    num = 0 

    for index in range(len(df)):
        lat = df._get_value(index,'lat')
        lon = df._get_value(index,'lon')
        lvl = df._get_value(index, 'lv4')
        dist_to_all = dist_to_all_km(lat, lon, extra_df)

        count01 = 0
        count05 = 0
        count1 = 0
        count5 = 0
        count10 = 0
        count20 = 0
        closest = 100.0

        iter = 0
        for number in dist_to_all:
            if number < 0.1 and extra_df._get_value(iter, 'lv4') == lvl:
                count01 += 1
            if number < 0.5 and extra_df._get_value(iter, 'lv4') == lvl:
                count05 += 1
            if number < 1 and extra_df._get_value(iter, 'lv4') == lvl:
                count1 += 1
            if number < 5 and extra_df._get_value(iter, 'lv4') == lvl:
                count5 += 1
            if number < 10 and extra_df._get_value(iter, 'lv4') == lvl:
                count10 +=1
            if number < 20 and extra_df._get_value(iter, 'lv4') == lvl:
                count20 += 1
            if number < closest and extra_df._get_value(iter, 'lv4') == lvl and number != 0:
                closest = number
            iter += 1

        df._set_value(index, 'num_stores_within_100m_and_same_lvl4', count01)
        df._set_value(index, 'num_stores_within_500m_and_same_lvl4', count05)
        df._set_value(index, 'num_stores_within_1km_and_same_lvl4', count1)
        df._set_value(index, 'num_stores_within_5km_and_same_lvl4', count5)
        df._set_value(index, 'num_stores_within_10km_and_same_lvl4', count10)
        df._set_value(index, 'num_stores_within_20km_and_same_lvl4', count20)
        df._set_value(index, 'closest_competitor_lv4', float(closest))

        num+=1 #for runtime tracking only
        print(num)
    return df

In [8]:
# MEAN AND MEDIAN REVENUE FEATURES

""" Below are functions which computes the mean and median revenue for all the different stores that belongs to the different hierarchy levels. 
We also calculated the mean revenue for chains. """

def mean_rev_lv3(train_data, test_data):
    """ Mean revenue for each hierarchy level 3 category """
    mean_rev = train_data[["lv3", "revenue"]].groupby(
    ["lv3"]
    ).mean().reset_index()
    mean_rev = mean_rev.rename(columns={'revenue':'mean_revenue_lv3'})

    # merge to train data
    train_data = train_data.merge(mean_rev, how="left", on=["lv3"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev, how="left", on=["lv3"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def mean_rev_lv2(train_data, test_data):
    """ Mean revenue for each hierarchy level 2 category """
    mean_rev = train_data[["lv2", "revenue"]].groupby(
    ["lv2"]
    ).mean().reset_index()
    mean_rev = mean_rev.rename(columns={'revenue':'mean_revenue_lv2'})
    # hej
    # merge to train data
    train_data = train_data.merge(mean_rev, how="left", on=["lv2"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev, how="left", on=["lv2"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def median_rev_lv2(train_data, test_data):
    """ Median revenue for each hierarchy level 2 category """
    median_rev = train_data[["lv2", "revenue"]].groupby(
    ["lv2"]
    ).median().reset_index()
    median_rev = median_rev.rename(columns={'revenue':'median_revenue_lv2'})

    # merge to train data
    train_data = train_data.merge(median_rev, how="left", on=["lv2"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(median_rev, how="left", on=["lv2"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def median_rev_lv3(train_data, test_data):
    """ Median revenue for each hierarchy level 3 category """
    median_rev = train_data[["lv3", "revenue"]].groupby(
    ["lv3"]
    ).median().reset_index()
    median_rev = median_rev.rename(columns={'revenue':'median_revenue_lv3'})

    # merge to train data
    train_data = train_data.merge(median_rev, how="left", on=["lv3"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(median_rev, how="left", on=["lv3"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def median_rev_lv4(train_data, test_data):
    """ Median revenue for each hierarchy level 4 category """
    median_rev = train_data[["lv4", "revenue"]].groupby(
    ["lv4"]
    ).median().reset_index()
    median_rev = median_rev.rename(columns={'revenue':'median_revenue_lv4'})

    # merge to train data
    train_data = train_data.merge(median_rev, how="left", on=["lv4"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(median_rev, how="left", on=["lv4"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def mean_rev_lv4(train_data, test_data):
    """ Mean revenue for each hierarchy level 4 category """
    mean_rev = train_data[["lv4", "revenue"]].groupby(
    ["lv4"]
    ).mean().reset_index()
    mean_rev = mean_rev.rename(columns={'revenue':'mean_revenue_lv4'})
    # hej
    # merge to train data
    train_data = train_data.merge(mean_rev, how="left", on=["lv4"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev, how="left", on=["lv4"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def mean_rev_chain(train_data, test_data):
    """ Mean revenue for each chain """
    mean = train_data[["chain_name", "revenue"]].groupby(["chain_name"]).mean().reset_index()
    mean = mean.rename(columns={'revenue':'mean_revenue_chain'})
    train_data = train_data.merge(mean, how="left", on=["chain_name"])
    # we get a bunch of duplicates of store_ids...? Remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # For test
    test_data = test_data.merge(mean, how="left", on=["chain_name"])
    # we get a bunch of duplicates of store_ids...? Remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data


In [9]:
# RUNNING THE FUNCTIONS DEFINED ABOVE

In [10]:
#only run once for both train and test data
train = rev_to_log(train)
lat_long_busstop()
prep_gk()

In [11]:
# GRUNNKRETS AND MUNICIPALITY FEATURES

#train
train = convert_nan(train)
train = create_lvl(train)
train, test = combine_grunnkrets_and_data(train, test)
train = people_per_gk(train)
train = people_per_store_in_each_gk(train)
train = people_per_store_with_same_lvl2_in_each_gk(train)
train = people_per_sotre_with_same_lvl2_in_each_muninicipality(train)

#test
test = convert_nan(test)
test = create_lvl(test)
test = people_per_gk(test)
test = people_per_store_in_each_gk(test)
test = people_per_store_with_same_lvl2_in_each_gk(test)
test = people_per_sotre_with_same_lvl2_in_each_muninicipality(test)
train, test = people_per_municipality(train, test)

In [12]:
# BUSSTOP AND STORE FEATURES

#train
train = busstops_wihin_distances(train)
train = store_dist_lvl2(train)
train = store_dist_lvl3(train)
train = store_dist_lvl4(train)

#test
test = busstops_wihin_distances(test)
test = store_dist_lvl2(test)
test = store_dist_lvl3(test)
test = store_dist_lvl4(test)


1/12859
2/12859
3/12859
4/12859
5/12859
6/12859
7/12859
8/12859
9/12859
10/12859
11/12859
12/12859
13/12859
14/12859
15/12859
16/12859
17/12859
18/12859
19/12859
20/12859
21/12859
22/12859
23/12859
24/12859
25/12859
26/12859
27/12859
28/12859
29/12859
30/12859
31/12859
32/12859
33/12859
34/12859
35/12859
36/12859
37/12859
38/12859
39/12859
40/12859
41/12859
42/12859
43/12859
44/12859
45/12859
46/12859
47/12859
48/12859
49/12859
50/12859
51/12859
52/12859
53/12859
54/12859
55/12859
56/12859
57/12859
58/12859
59/12859
60/12859
61/12859
62/12859
63/12859
64/12859
65/12859
66/12859
67/12859
68/12859
69/12859
70/12859
71/12859
72/12859
73/12859
74/12859
75/12859
76/12859
77/12859
78/12859
79/12859
80/12859
81/12859
82/12859
83/12859
84/12859
85/12859
86/12859
87/12859
88/12859
89/12859
90/12859
91/12859
92/12859
93/12859
94/12859
95/12859
96/12859
97/12859
98/12859
99/12859
100/12859
101/12859
102/12859
103/12859
104/12859
105/12859
106/12859
107/12859
108/12859
109/12859
110/12859
111/1285

In [13]:
#REVENUE FEATURES FOR MUNICIPALITY GROUPS

train, test = mean_rev_size_group(train, test)
train, test = median_rev_size_group(train, test)
train, test = std_red_size_group(train, test)
train, test = mean_rev_municipality(train, test)
train, test = mean_rev_rev_group(train, test)
train, test = median_rev_rev_group(train, test)

train, test = mean_rev_lv2(train, test)
train, test = mean_rev_lv3(train, test)
train, test = median_rev_lv2(train, test)
train, test = median_rev_lv3(train, test)
train, test = mean_rev_chain(train, test)
train, test = median_rev_lv4(train, test)
train, test = mean_rev_lv4(train, test)

In [14]:
# Decided to also check if mean and median income for all the stores in each municipality was a good feature. 

def mean_income_municipality(train_data, test_data):
    """ Mean income in each municipality """
    mean_rev = train_data[["municipality_name", "revenue"]].groupby(["municipality_name"]).mean().reset_index()
    mean_rev = mean_rev.rename(columns={'revenue':'mean_income_municipality'})

    train_data = train_data.merge(mean_rev, how="left", on=["municipality_name"])
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev, how="left", on=["municipality_name"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def median_income_municipality(train_data, test_data):
    """ Median income in each municipality """
    mean_rev = train_data[["municipality_name", "revenue"]].groupby(["municipality_name"]).median().reset_index()
    mean_rev = mean_rev.rename(columns={'revenue':'median_income_municipality'})
    
    train_data = train_data.merge(mean_rev, how="left", on=["municipality_name"])
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev, how="left", on=["municipality_name"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

train, test = mean_income_municipality(train, test)
train, test = median_income_municipality(train, test)

# Also decided to check if number of stores in the same chain was a good feature.
count_chain = train[["store_id", "chain_name"]].groupby(["chain_name"]).count().reset_index()
count_chain.columns = ["chain_name", "counts_chain"]
train.merge(count_chain, how="left", on=["chain_name"])

count_chain1 = test[["store_id", "chain_name"]].groupby(["chain_name"]).count().reset_index()
count_chain1.columns = ["chain_name", "counts_chain"]
test.merge(count_chain1, how="left", on=["chain_name"])

Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,...,mean_revenue_lv2,mean_revenue_lv3,median_revenue_lv2,median_revenue_lv3,mean_revenue_chain,median_revenue_lv4,mean_revenue_lv4,mean_income_municipality,median_income_municipality,counts_chain
0,914206820-914239427-717245,2016.0,VÅLERENGA HALAL BURGER AS,1.1.1.0,Hamburger restaurants,3012704.0,STRØMSVEIEN 25 A,59.908672,10.787031,No chain,...,1.505243,2.096344,1.492117,2.042388,1.263696,2.042388,2.096344,1.573874,1.470289,6099.0
1,913341082-977479363-2948,2016.0,BURGER KING STOVNER,1.1.1.0,Hamburger restaurants,3013917.0,STOVNER SENTER 3,59.962146,10.924524,BURGER KING,...,1.505243,2.096344,1.492117,2.042388,1.830112,2.042388,2.096344,1.573874,1.470289,9.0
2,997991699-998006945-417222,2016.0,VULKAN BURGERBAR,1.1.1.0,Hamburger restaurants,3014305.0,AUD SCHØNEMANNS VEI 15,59.921102,10.785123,VULKAN BURGERBAR,...,1.505243,2.096344,1.492117,2.042388,3.397223,2.042388,2.096344,1.573874,1.470289,1.0
3,914931487-815162862-756427,2016.0,BURGER KING OSLO S,1.1.1.0,Hamburger restaurants,3013105.0,EKEBERGVEIEN 235,59.861083,10.799907,BURGER KING,...,1.505243,2.096344,1.492117,2.042388,1.830112,2.042388,2.096344,1.573874,1.470289,9.0
4,914631734-914748119-740036,2016.0,KVERNERIET & FRIENDS,1.1.1.0,Hamburger restaurants,3010909.0,KIRKEVEIEN 64 A,59.929256,10.715470,No chain,...,1.505243,2.096344,1.492117,2.042388,1.263696,2.042388,2.096344,1.573874,1.470289,6099.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8573,915493076-988458864-239310,2016.0,BUVIK FRISØR AS,3.2.1.0,Hairdressers,16570206.0,GJERDESGARDEN 6,63.308618,10.158853,No chain,...,0.967281,1.138065,0.888714,1.103600,1.263696,1.103600,1.138065,1.608211,1.119588,6099.0
8574,915321275-915542816-771670,2016.0,IRENS HÅRSTUDIO AS,3.2.4.0,Spas,8220204.0,GVARVGATA 47,59.388267,9.176232,No chain,...,0.967281,0.851122,0.888714,0.723191,1.263696,0.723191,0.851122,1.524025,1.681380,6099.0
8575,984215533-984234902-388305,2016.0,NATURLIG HELSE AS,3.2.4.0,Spas,11420103.0,SMIDALEN 1,59.101770,5.702666,No chain,...,0.967281,0.851122,0.888714,0.723191,1.263696,0.723191,0.851122,1.251160,1.174029,6099.0
8576,976870352-977103460-224357,2016.0,TRAINING OF LIFE AS,3.2.4.0,Spas,11420101.0,HEGGLANDSLIA 9,59.087327,5.761732,No chain,...,0.967281,0.851122,0.888714,0.723191,1.263696,0.723191,0.851122,1.251160,1.174029,6099.0


In [15]:
# SAVE RAW DATA TO FILE
test.to_csv('testing_set.csv', index=False)
train.to_csv('training_set.csv', index=False)

### Feature Selection

In [16]:
# BEST IMPLEMENTATION
import pandas as pd
import numpy as np
train = pd.read_csv('training_set.csv')
test = pd.read_csv('testing_set.csv')

def drop_all(train_data):
    train_data = train_data.drop(['year', 'store_name', 'plaace_hierarchy_id', 'sales_channel_name', 'grunnkrets_id', 'lv1'], axis=1)
    train_data = train_data.drop(['address', 'lat', 'lon', 'municipality_name'], axis=1)
    train_data = train_data.drop(['busstops_within_50m', 'num_stores_within_100m_and_same_lvl3', 'busstops_within_100m', 'num_stores_within_100m_and_same_lvl2'], axis = 1)
    train_data = train_data.drop(['num_stores_within_500m_and_same_lvl3', 'num_stores_within_10km_and_same_lvl3', 'counts_gr_lv2'], axis=1)
    train_data = train_data.drop(['counts_municipality_lv2', 'num_stores_within_1km_and_same_lvl3', 'num_stores_within_10km_and_same_lvl2', 'num_stores_within_5km_and_same_lvl3'], axis=1)
    train_data = train_data.drop(['busstops_within_400m', 'num_stores_within_1km_and_same_lvl2', 'num_stores_within_5km_and_same_lvl2', 'busstops_within_800m'], axis=1)
    train_data = train_data.drop(['municipality_size_group', 'st_dev_of_revenue_for_municipality_size_group', 'mean_revenue_for_municipality_size_group', 'median_revenue_for_municipality_size_group'], axis=1)
    train_data = train_data.drop(['mean_income_municipality', 'median_income_municipality'], axis=1)
    return train_data
    
def remove_zero_revenue(train_data):
    train_data = train_data.loc[train_data["revenue"] > 0.1]
    train_data = train_data.reset_index(drop=True)
    return train_data

def remove_high_revenue(train_data):
    train_data = train_data.loc[train_data["revenue"] < 5]
    train_data = train_data.reset_index(drop=True)
    return train_data

def log_all(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for c in [c for c in df.columns if df[c].dtype in numerics]:
        df[c] = np.log1p(df[c])

    #log back revenue
    if 'revenue' in df.columns:
        df['revenue'] = np.expm1(df['revenue'])
    return df

test = drop_all(test)
train = drop_all(train)
train = log_all(train)
test = log_all(test)
train = remove_zero_revenue(train)
train = remove_high_revenue(train)
test.to_csv('testing_set_dropped_best.csv', index=False)
train.to_csv('training_set_dropped_best.csv', index=False)


### Model

In [17]:
# Import and init h2o lib. Running on java
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
h2o.init()
col_types_best = {
 'chain_name': 'enum',
 'mall_name': 'enum',
 'revenue': 'numeric',
 'lv2': 'enum',
 'lv3': 'enum'
 }

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 39 mins
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_paaske_fu3vy1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.890 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [18]:
# Reading csv to h2o dataframe 

train_data = pd.read_csv('training_set_dropped_best.csv')
test_data = pd.read_csv('testing_set_dropped_best.csv')

# Save store ids, then remove this column
store_ids = test_data['store_id'].to_numpy()
ids = pd.DataFrame(store_ids)
ids.rename(columns={0 :'id'}, inplace=True)

train_data = train_data.drop('store_id', axis=1)
test_data = test_data.drop('store_id', axis=1)


train_data.to_csv('training_set_dropped_no_id.csv', index=False)
test_data.to_csv('testing_set_dropped_no_id.csv', index=False)

train_data = h2o.import_file('training_set_dropped_no_id.csv', col_types=col_types_best)
del col_types_best['revenue']
test_data = h2o.import_file('testing_set_dropped_no_id.csv', col_types=col_types_best)

train_data.types

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


{'chain_name': 'enum',
 'mall_name': 'enum',
 'revenue': 'real',
 'lv2': 'enum',
 'lv3': 'enum',
 'lv4': 'enum',
 'total_nbr_people': 'real',
 'nbr_people_per_store_in_grunnkrets': 'real',
 'busstops_within_1500m': 'real',
 'busstops_within_3000m': 'real',
 'busstops_within_6000m': 'real',
 'num_stores_within_500m_and_same_lvl2': 'real',
 'num_stores_within_20km_and_same_lvl2': 'real',
 'closest_competitor_lv2': 'real',
 'num_stores_within_20km_and_same_lvl3': 'real',
 'closest_competitor_lv3': 'real',
 'num_stores_within_100m_and_same_lvl4': 'real',
 'num_stores_within_500m_and_same_lvl4': 'real',
 'num_stores_within_1km_and_same_lvl4': 'real',
 'num_stores_within_5km_and_same_lvl4': 'real',
 'num_stores_within_10km_and_same_lvl4': 'real',
 'num_stores_within_20km_and_same_lvl4': 'real',
 'closest_competitor_lv4': 'real',
 'municipality_rev_group': 'enum',
 'mean_revenue_for_municipality_rev_group': 'real',
 'median_revenue_for_municipality_rev_group': 'real',
 'mean_revenue_lv2': 're

In [19]:
# Removing labels from train set
y = "revenue"
x = train_data.columns
x.remove(y)
test_data

chain_name,mall_name,lv2,lv3,lv4,total_nbr_people,nbr_people_per_store_in_grunnkrets,busstops_within_1500m,busstops_within_3000m,busstops_within_6000m,num_stores_within_500m_and_same_lvl2,num_stores_within_20km_and_same_lvl2,closest_competitor_lv2,num_stores_within_20km_and_same_lvl3,closest_competitor_lv3,num_stores_within_100m_and_same_lvl4,num_stores_within_500m_and_same_lvl4,num_stores_within_1km_and_same_lvl4,num_stores_within_5km_and_same_lvl4,num_stores_within_10km_and_same_lvl4,num_stores_within_20km_and_same_lvl4,closest_competitor_lv4,municipality_rev_group,mean_revenue_for_municipality_rev_group,median_revenue_for_municipality_rev_group,mean_revenue_lv2,mean_revenue_lv3,median_revenue_lv2,median_revenue_lv3,mean_revenue_chain,median_revenue_lv4,mean_revenue_lv4
No chain,No mall,0.741937,1.1.1,1.1.1.0,6.86797,6.86797,4.70953,5.76519,6.83626,2.63906,7.58579,0.0321771,3.91202,0.861841,0.693147,0.693147,0.693147,3.3673,3.63759,3.91202,0.861841,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,0.816999,1.11264,1.13022
BURGER KING,Stovner Senter,0.741937,1.1.1,1.1.1.0,7.79235,4.85531,3.80666,5.0876,6.17794,2.30259,7.54697,0.000516631,3.82864,1.29872,0.693147,0.693147,0.693147,2.19722,2.94444,3.82864,1.29872,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,1.04032,1.11264,1.13022
VULKAN BURGERBAR,No mall,0.741937,1.1.1,1.1.1.0,8.50147,6.20071,4.77068,6.01127,6.87316,1.79176,7.58477,0.30168,3.89182,0.823647,0.693147,0.693147,0.693147,3.43399,3.63759,3.89182,0.823647,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,1.48097,1.11264,1.13022
BURGER KING,No mall,0.741937,1.1.1,1.1.1.0,6.93828,5.33272,4.02535,5.20949,6.13556,2.07944,7.5984,0.0432923,3.93183,1.00078,0.693147,0.693147,0.693147,1.09861,3.46574,3.93183,1.00078,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,1.04032,1.11264,1.13022
No chain,No mall,0.741937,1.1.1,1.1.1.0,7.36771,4.33573,4.35671,5.66988,6.7334,4.04305,7.6004,0.0603135,3.89182,0.0843221,1.09861,1.38629,1.38629,3.3322,3.55535,3.89182,0.0843221,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,0.816999,1.11264,1.13022
No chain,No mall,0.741937,1.1.1,1.1.1.0,4.77912,1.04512,4.82831,5.86079,6.80683,5.39363,7.5994,0.0562585,3.91202,0.166935,0.693147,1.79176,2.63906,3.29584,3.55535,3.91202,0.166935,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,0.816999,1.11264,1.13022
No chain,No mall,0.741937,1.1.1,1.1.1.0,7.31788,4.84024,4.23411,5.64191,6.76619,1.60944,7.6039,0.0413104,3.93183,0.632861,0.693147,0.693147,1.09861,3.21888,3.55535,3.93183,0.632861,rev_1.2_to_2,0.953556,0.911095,0.918386,1.13022,0.913133,1.11264,0.816999,1.11264,1.13022
No chain,No mall,0.741937,1.1.2,1.1.2.0,7.19369,5.11949,4.85981,5.80513,6.76619,4.45435,7.6014,0.0168698,5.01064,0.107576,0.693147,2.30259,3.2581,4.61512,4.81218,5.01064,0.107576,rev_1.2_to_2,0.953556,0.911095,0.918386,0.951516,0.913133,0.90762,0.816999,0.90762,0.951516
No chain,No mall,0.741937,1.1.2,1.1.2.0,6.49979,4.56286,4.43082,5.57215,6.70319,4.17439,7.6039,0.0295874,5.00395,0.128105,1.09861,2.07944,3.04452,4.59512,4.81218,5.00395,0.128105,rev_1.2_to_2,0.953556,0.911095,0.918386,0.951516,0.913133,0.90762,0.816999,0.90762,0.951516
No chain,No mall,0.741937,1.1.2,1.1.2.0,8.50147,6.20071,4.7185,5.99146,6.86066,2.30259,7.58426,0.290387,4.99721,0.33885,0.693147,1.09861,1.38629,4.65396,4.79579,4.99721,0.33885,rev_1.2_to_2,0.953556,0.911095,0.918386,0.951516,0.913133,0.90762,0.816999,0.90762,0.951516


In [20]:
# Running fit/training on train set with cross validation n=5
aml = H2OAutoML(max_models = 30, seed = 2, stopping_metric = "RMSE")
aml.train(x = x, y = y, training_frame = train_data)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,0.4987331,0.0118125,0.508216,0.5057749,0.4973718,0.4788671,0.5034357
mean_residual_deviance,0.4292242,0.02105,0.4488407,0.4433082,0.4216951,0.3962628,0.4360144
mse,0.4292242,0.02105,0.4488407,0.4433082,0.4216951,0.3962628,0.4360144
null_deviance,2476.749,97.58557,2593.3472,2497.2578,2507.8208,2459.624,2325.695
r2,0.5677556,0.0178062,0.5583124,0.5544742,0.5728604,0.5968311,0.5562999
residual_deviance,1070.1089,61.404324,1145.4414,1111.3737,1071.1056,991.4495,1031.1742
rmse,0.6549916,0.0162094,0.6699557,0.6658139,0.6493806,0.6294941,0.6603139
rmsle,0.2757734,0.0049062,0.2805395,0.2721107,0.2770253,0.2693132,0.2798784


In [21]:
lb = aml.leaderboard

In [22]:
# Showing best perforimg models
lb.head()

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_2_20221113_173446,0.655179,0.429259,0.498456,0.275737,0.429259
StackedEnsemble_BestOfFamily_1_AutoML_2_20221113_173446,0.656257,0.430673,0.499899,0.276269,0.430673
DeepLearning_grid_1_AutoML_2_20221113_173446_model_2,0.66248,0.43888,0.505761,0.279385,0.43888
XGBoost_grid_1_AutoML_2_20221113_173446_model_1,0.66277,0.439264,0.505379,0.27844,0.439264
XGBoost_grid_1_AutoML_2_20221113_173446_model_3,0.662819,0.439329,0.505891,0.278455,0.439329
XGBoost_grid_1_AutoML_2_20221113_173446_model_5,0.663344,0.440025,0.506039,0.278763,0.440025
GLM_1_AutoML_2_20221113_173446,0.66401,0.44091,0.508231,0.278932,0.44091
DeepLearning_grid_2_AutoML_2_20221113_173446_model_2,0.664961,0.442173,0.510466,0.280475,0.442173
DeepLearning_1_AutoML_2_20221113_173446,0.665809,0.443302,0.506778,0.27903,0.443302
DeepLearning_grid_3_AutoML_2_20221113_173446_model_1,0.667021,0.444917,0.5124,0.280395,0.444917


In [23]:
preds = aml.predict(test_data)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




In [24]:
#print variable importance
corr = aml.varimp(use_pandas = True)
corr['sum'] = corr.sum(axis=1)
corr = corr.sort_values(by=['sum'])
corr.head(21)

Unnamed: 0,DeepLearning_1,DeepLearning_grid_3_model_1,DeepLearning_grid_1_model_2,DeepLearning_grid_1_model_1,DeepLearning_grid_2_model_2,DeepLearning_grid_3_model_2,DeepLearning_grid_2_model_1,GLM_1,GBM_grid_1_model_4,GBM_3,...,DRF_1,XRT_1,GBM_1,GBM_grid_1_model_2,GBM_grid_1_model_1,GBM_4,GBM_2,GBM_grid_1_model_5,GBM_5,sum
median_revenue_for_municipality_rev_group,0.001403,0.000394,0.000362,0.000262,0.000404,0.000474,0.000399,0.001601,0.000605,0.001446,...,0.001481,0.001643,0.000479,0.00239,0.001089,0.001186,0.001905,0.001942,0.001639,0.041642
mean_revenue_for_municipality_rev_group,0.001363,0.000487,0.000438,0.000245,0.000447,0.000602,0.0005,0.001666,0.000345,0.001224,...,0.001456,0.001826,0.00095,0.0009,0.002146,0.002053,0.001082,0.001114,0.001452,0.049639
municipality_rev_group,0.005828,0.005473,0.0067,0.006314,0.005769,0.004952,0.005324,0.002614,0.000395,0.001528,...,0.002165,0.001103,0.001188,0.001189,0.000588,0.001152,0.001437,0.001175,0.001861,0.068668
num_stores_within_100m_and_same_lvl4,0.001237,0.000892,0.000672,0.000968,0.000855,0.000743,0.000898,0.000919,0.001015,0.001544,...,0.007431,0.005442,0.001507,0.003479,0.004259,0.00357,0.001636,0.002659,0.0008,0.073793
num_stores_within_5km_and_same_lvl4,0.001911,0.000369,0.000517,0.000292,0.000572,0.000557,0.000545,0.002257,0.003204,0.004288,...,0.012382,0.009203,0.003244,0.011597,0.011459,0.00648,0.003363,0.00886,0.002891,0.196084
closest_competitor_lv3,0.001612,0.000416,0.000387,0.000359,0.000395,0.000337,0.000726,0.001314,0.00184,0.003868,...,0.014882,0.010452,0.002493,0.009367,0.00945,0.005737,0.001521,0.00725,0.001222,0.244543
lv2,0.023513,0.012758,0.013315,0.012432,0.016696,0.015781,0.015791,0.033546,0.000616,0.000795,...,0.005896,0.010667,0.000885,0.004907,0.04734,0.002238,0.001197,0.005077,0.001171,0.290754
median_revenue_lv3,0.001093,0.000415,0.000399,0.000369,0.000774,0.000671,0.000597,0.001728,1.2e-05,0.000192,...,0.006969,0.014151,0.009722,0.06097,0.03619,0.001061,9.1e-05,0.009776,0.003182,0.295252
busstops_within_1500m,0.000675,0.000529,0.000592,0.000498,0.000664,0.000496,0.000467,0.000654,0.003689,0.006702,...,0.02238,0.014271,0.005154,0.019605,0.011426,0.014397,0.006106,0.013849,0.006229,0.333846
num_stores_within_500m_and_same_lvl2,0.001729,0.000585,0.000585,0.000583,0.00092,0.000826,0.000704,0.004387,0.008532,0.011519,...,0.02069,0.014206,0.011469,0.018343,0.016501,0.016211,0.012222,0.018685,0.010634,0.353512


In [25]:
#preds = h2o.as_list(preds)
print(preds)

  predict
  1.62723
  1.68007
  3.29146
  1.75239
  1.79959
  2.13673
  1.78773
  1.66027
  1.79654
  1.52193
[8578 rows x 1 column]



### Save Submission

In [26]:
# Convert result to desired format

import pandas as pd
import numpy as np


pandas_preds = preds.as_data_frame()
pandas_preds.rename(columns={'predict' :'predicted'}, inplace=True)
pandas_preds['predicted'] = np.expm1(pandas_preds['predicted'])

def save_submission(pred):
    predicted = pd.DataFrame(pred, columns = ['predicted'])
    output = pd.concat([ids,predicted],axis=1)
    output.drop(output.tail(1).index,inplace=True)
    output.to_csv('h2o.csv', index=False)
    
save_submission(pandas_preds)