In [None]:
import math
import geopy.distance
from dis import dis
import math
import geopandas as gpd
import numpy
from shapely import wkt
from shapely import wkb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim

%matplotlib inline

# Load the raw CSV inputs. All paths are relative to the notebook's working
# directory, so the notebook must be run from a folder that has ../data next to it.
# Store-level tables (train_data is the one that carries the `revenue` column used below).
train_data = pd.read_csv('../data/stores_train.csv')
test_data = pd.read_csv('../data/stores_test.csv')
# Auxiliary tables; contents presumably match the file names -- TODO confirm schemas.
busstops = pd.read_csv('../data/busstops_norway.csv')
# grunnkrets_* tables are keyed by `grunnkrets_id` (used as merge key further down).
grunnkrets_age = pd.read_csv('../data/grunnkrets_age_distribution.csv')
grunnkrets_households = pd.read_csv('../data/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv('../data/grunnkrets_income_households.csv')
# Provides the grunnkrets_id -> municipality_name mapping merged into the store tables.
grunnkrets_stripped = pd.read_csv('../data/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv('../data/plaace_hierarchy.csv')

In [None]:
def _prepare_store_columns(stores):
    """Return a copy of a store table with basic categorical clean-up applied.

    Steps (identical for the train and the test table):
      * `mall_dummy` is 1.0 when the store has a mall name and 0.0 when it is missing.
        The flag is derived from missingness directly instead of a substring search
        for "No mall" on the filled column, so a real mall whose name happens to
        contain that text is no longer misclassified.
      * Missing `chain_name` values become 'No chain'.
      * `mall_name` (now encoded by `mall_dummy`) and the redundant
        'store_name', 'sales_channel_name' and 'address' columns are removed.
        NOTE(review): an earlier comment also listed 'year' as removed, but it was
        never dropped; it is deliberately kept here to preserve behaviour.

    The input frame is not modified; `mall_dummy` is appended as the last column.
    """
    has_mall = stores['mall_name'].notna()
    return (
        stores
        .assign(
            chain_name=stores['chain_name'].fillna('No chain'),
            mall_dummy=has_mall.astype(float),  # float to match the previous 0.0 / 1.0 encoding
        )
        .drop(columns=['mall_name', 'store_name', 'sales_channel_name', 'address'])
    )


train_data = _prepare_store_columns(train_data)
# For test
test_data = _prepare_store_columns(test_data)

In [None]:
# Add municipality names.
# The lookup table apparently holds several rows per grunnkrets_id (a plain left merge
# produced duplicated store_ids), so collapse it to one row per id *before* merging.
# The first occurrence wins, which is the row the previous merge + keep='first' kept.
municipality_lookup = (
    grunnkrets_stripped[['grunnkrets_id', 'municipality_name']]
    .drop_duplicates(subset=['grunnkrets_id'], keep='first')
)

# validate='many_to_one' makes pandas raise if the lookup is ever non-unique again,
# instead of silently multiplying store rows.
train_data = pd.merge(train_data, municipality_lookup, on='grunnkrets_id', how='left', validate='many_to_one')
# Still guard against store_ids that are duplicated in the raw file itself.
train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
train_data['municipality_name'] = train_data['municipality_name'].fillna('No municipality name')

# For test
test_data = pd.merge(test_data, municipality_lookup, on='grunnkrets_id', how='left', validate='many_to_one')
test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
test_data['municipality_name'] = test_data['municipality_name'].fillna('No municipality name')

In [None]:
# Total nbr people in each grunnkrets.
# Keep one row per grunnkrets_id, preferring the most recent year. Sorting by year first
# makes keep='last' pick the latest year regardless of the row order in the CSV.
grunnkrets_age = (
    grunnkrets_age
    .sort_values('year', kind='stable')
    .drop_duplicates(subset=['grunnkrets_id'], keep='last')
    .drop(columns='year')
    .fillna(0)  # a missing age bucket counts as zero inhabitants
)

# Sum every remaining column except the id to get the total number of inhabitants.
# (Previously the id was cast to str so that sum() would skip it; with pandas >= 2.0
# row-wise sum no longer drops non-numeric columns and raises a TypeError instead.)
grunnkrets_age['total_nbr_people'] = grunnkrets_age.drop(columns='grunnkrets_id').sum(axis=1)

# NOTE(review): only train_data receives total_nbr_people; test_data does not.
# Confirm that this asymmetry is intended before using the column as a model feature.
train_data = pd.merge(train_data, grunnkrets_age[['grunnkrets_id', 'total_nbr_people']], on='grunnkrets_id', how='left')

In [None]:
######### MUNICIPALITY SIZE GROUPS #########

In [None]:
# Total nbr people in each municipality, aggregated from the training stores.
# NOTE(review): this sums total_nbr_people once per *store*, so a grunnkrets with several
# stores is counted several times and a grunnkrets without stores not at all. The cut-offs
# below were tuned on exactly this distribution, so the aggregation is left unchanged here.
municipalities = (
    train_data[["municipality_name", "total_nbr_people"]]
    .groupby(["municipality_name"])
    .sum()
    .reset_index()
    .rename(columns={'total_nbr_people': 'nbr_people_in_municipality'})
)

# Stores without a known municipality must not form a size group of their own.
# .copy() so that adding a column below does not write into a filtered view.
municipalities = municipalities[municipalities['municipality_name'] != 'No municipality name'].copy()

# Hard-coded cut-offs; they look like the 25% / 50% / 75% / max values printed by
# municipalities['nbr_people_in_municipality'].describe() -- TODO confirm and recompute
# if the input data changes.
SIZE_CUT_SMALL = 1.612750e+03
SIZE_CUT_MEDIUM = 5.731000e+03
SIZE_CUT_LARGE = 1.717325e+04
SIZE_CUT_TOP = (2.109973e+06) - 1

population = municipalities['nbr_people_in_municipality']
conditions = [
    (population < SIZE_CUT_SMALL),
    (population >= SIZE_CUT_SMALL) & (population < SIZE_CUT_MEDIUM),
    (population >= SIZE_CUT_MEDIUM) & (population < SIZE_CUT_LARGE),
    (population >= SIZE_CUT_LARGE) & (population < SIZE_CUT_TOP),
    (population >= SIZE_CUT_TOP),
]
# Groups '1'-'4' are increasing size; '0' is the catch-all for sums at or above the top cut-off.
values = ['1', '2', '3', '4', '0']
# Explicit string default: the implicit default is the integer 0, which newer NumPy
# versions refuse to combine with string choices.
municipalities['municipality_size_group'] = np.select(conditions, values, default='0')

size_group_lookup = municipalities[['municipality_name', 'municipality_size_group']]

# Left merges keep exactly the stores that were already in each table (and their order).
# An outer merge would append a row with no store_id for every municipality that occurs
# in the training data but not in the table being merged into.
# merge to train data
train_data = pd.merge(train_data, size_group_lookup, on='municipality_name', how='left')

# merge to test data
test_data = pd.merge(test_data, size_group_lookup, on='municipality_name', how='left')


In [None]:
# Average revenue of the training stores within each municipality size group.
mean_rev_munic = (
    train_data
    .groupby("municipality_size_group")["revenue"]
    .mean()
    .rename("mean_revenue_for_municipality_size_group")
    .reset_index()
)

# Attach the group-level mean to the training stores, then keep one row per store_id.
train_data = pd.merge(train_data, mean_rev_munic, on="municipality_size_group", how="left")
train_data = train_data.drop_duplicates(subset="store_id", keep="first")

# Same for the test stores (the statistic itself always comes from the training data).
test_data = pd.merge(test_data, mean_rev_munic, on="municipality_size_group", how="left")
test_data = test_data.drop_duplicates(subset="store_id", keep="first")

In [None]:
# Median revenue of the training stores within each municipality size group.
median_rev_munic = (
    train_data
    .groupby("municipality_size_group")["revenue"]
    .median()
    .rename("median_revenue_for_municipality_size_group")
    .reset_index()
)

# Attach the group-level median to the training stores, then keep one row per store_id.
train_data = pd.merge(train_data, median_rev_munic, on="municipality_size_group", how="left")
train_data = train_data.drop_duplicates(subset="store_id", keep="first")

# Same for the test stores (the statistic itself always comes from the training data).
test_data = pd.merge(test_data, median_rev_munic, on="municipality_size_group", how="left")
test_data = test_data.drop_duplicates(subset="store_id", keep="first")

In [None]:
# Standard deviation of revenue of the training stores within each municipality size group.
stdev_munic = (
    train_data
    .groupby("municipality_size_group")["revenue"]
    .std()
    .rename("st_dev_of_revenue_for_municipality_size_group")
    .reset_index()
)

# Attach the group-level spread to the training stores, then keep one row per store_id.
train_data = pd.merge(train_data, stdev_munic, on="municipality_size_group", how="left")
train_data = train_data.drop_duplicates(subset="store_id", keep="first")

# Same for the test stores (the statistic itself always comes from the training data).
test_data = pd.merge(test_data, stdev_munic, on="municipality_size_group", how="left")
test_data = test_data.drop_duplicates(subset="store_id", keep="first")

In [None]:
######### MUNICIPALITY REVENUE GROUPS #########

In [None]:
# Mean revenue per municipality



In [None]:
# mean revenue per municipality size group: DONE
# std dev of revenue per municipality size group: DONE
# median revenue per municipality size group: DONE
# number of inhabitants => large/medium/small municipality => mean/median/std dev within each size group: DONE

# TODO: group municipalities by mean revenue => high-revenue / medium-revenue / low-revenue municipalities
