In [None]:
import math
import geopy.distance
from dis import dis
import math
import geopandas as gpd
import numpy
from shapely import wkt
from shapely import wkb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from geopy.geocoders import Nominatim

%matplotlib inline

# Load the raw input tables from ../data, one DataFrame per CSV file.
# The target names and the paths below are paired by position.
(
    train_data,
    busstops,
    grunnkrets_age,
    grunnkrets_households,
    grunnkrets_income,
    grunnkrets_stripped,
    plaace_hierarchy,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/stores_train.csv',
        '../data/busstops_norway.csv',
        '../data/grunnkrets_age_distribution.csv',
        '../data/grunnkrets_households_num_persons.csv',
        '../data/grunnkrets_income_households.csv',
        '../data/grunnkrets_norway_stripped.csv',
        '../data/plaace_hierarchy.csv',
    )
)

In [None]:
# Preview the first five rows of the raw training table.
train_data.head(n=5)

In [None]:
# Put the target on a logarithmic scale: revenue -> log(1 + revenue).
# NOTE: running this cell again applies the transform a second time.
train_data = train_data.assign(revenue=np.log1p(train_data['revenue']))
train_data.head()
    

In [None]:
# Disabled helper: convert 'revenue' back from the log scale to the normal scale.
# np.expm1 is the exact inverse of the np.log1p transform and avoids the
# precision loss of np.exp(x) - 1 for values close to zero.
# Uncomment to use:
# train_data['revenue'] = np.expm1(train_data['revenue'])
# train_data.head()

In [None]:
# A missing mall or chain name is replaced with an explicit 'No mall' / 'No chain' label.
train_data['mall_name'] = train_data['mall_name'].fillna('No mall')
train_data['chain_name'] = train_data['chain_name'].fillna('No chain')

# mall_dummy is 0.0 when the mall name contains "No mall" and 1.0 otherwise.
outside_mall = train_data['mall_name'].str.contains("No mall", na=False)
train_data['mall_dummy'] = (~outside_mall).astype(float)

# mall_name is now encoded by mall_dummy; 'store_name', 'year',
# 'sales_channel_name' and 'address' are treated as redundant and removed too.
train_data = train_data.drop(
    columns=['mall_name', 'store_name', 'year', 'sales_channel_name', 'address']
)


In [None]:
# Attach the municipality name of each store's grunnkrets.
# grunnkrets_stripped appears to hold several rows per grunnkrets_id (joining it
# directly produced duplicated store_ids), so reduce it to one row per
# grunnkrets_id before the join instead of multiplying the store rows.
municipality_lookup = grunnkrets_stripped[['grunnkrets_id', 'municipality_name']].drop_duplicates(
    subset=['grunnkrets_id'], keep='first'
)
train_data = pd.merge(train_data, municipality_lookup, on='grunnkrets_id', how='left')
# Kept as a safeguard so every store_id still appears at most once afterwards.
train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

In [None]:
# Mean (log-scaled) revenue per chain, attached to every store of that chain.
# Stores without a chain were labelled 'No chain' earlier and form their own group.
# NOTE: the mean includes each store's own revenue, i.e. it is derived from the target.
chains = (
    train_data.groupby('chain_name')[['revenue']]
    .mean()
    .rename(columns={'revenue': 'mean_revenue_for_chain'})
)
# The merge returns a new frame; it must be assigned back, otherwise the
# 'mean_revenue_for_chain' column is computed but never added to train_data.
train_data = train_data.merge(chains, how="left", on=["chain_name"])
train_data.head()


In [None]:
# Attach the three hierarchy level labels to each store.
# A left join keeps exactly the rows of train_data in their original order; an
# outer join would also append hierarchy entries that no store uses, creating
# rows whose store_id (and every other store column) is NaN.
hierarchy_levels = plaace_hierarchy[['plaace_hierarchy_id', 'lv1', 'lv2', 'lv3']]
train_data = pd.merge(train_data, hierarchy_levels, on='plaace_hierarchy_id', how='left')
# Store the level labels as pandas categoricals.
train_data = train_data.astype({'lv1': 'category', 'lv2': 'category', 'lv3': 'category'})

In [None]:
# Keep one row per grunnkrets, preferring the most recent year. Sorting by year
# first makes keep='last' pick the newest row regardless of the file's row order.
grunnkrets_age = (
    grunnkrets_age
    .sort_values('year', kind='stable')
    .drop_duplicates(subset=['grunnkrets_id'], keep='last')
    .fillna(0)
    .drop('year', axis=1)
)

# Total number of inhabitants = sum over every column except the id.
# The id is excluded explicitly instead of being cast to str and relying on
# sum() to skip non-numeric columns, which is not guaranteed across pandas versions.
age_columns = grunnkrets_age.columns.drop('grunnkrets_id')
grunnkrets_age['total_nbr_people'] = grunnkrets_age[age_columns].sum(axis=1, numeric_only=True)

# Ten-year age-band features (group1..group9, built from iloc column slices) were
# tried previously and are disabled; only the total is merged into train_data.
train_data = pd.merge(
    train_data,
    grunnkrets_age[['grunnkrets_id', 'total_nbr_people']],
    on='grunnkrets_id',
    how='left',
)

In [None]:
# Number of people per store in each grunnkrets, regardless of hierarchy.
# value_counts() ignores NaN, so stores without a grunnkrets_id are not counted.
number_stores = (
    train_data['grunnkrets_id']
    .value_counts()
    .rename_axis('grunnkrets_id')
    .reset_index(name='store_counts_total')
)
grunnkrets_stripped = pd.merge(grunnkrets_stripped, number_stores, on='grunnkrets_id', how='left')
# A grunnkrets with no store in train_data gets a count of 0.
grunnkrets_stripped['store_counts_total'] = grunnkrets_stripped['store_counts_total'].fillna(0)
grunnkrets_stripped = pd.merge(
    grunnkrets_stripped,
    grunnkrets_age[['grunnkrets_id', 'total_nbr_people']],
    on='grunnkrets_id',
    how='left',
)
# Rows with a store count of 0 divide by zero here (inf, or NaN for 0/0).
grunnkrets_stripped['nbr_people_per_store_in_grunnkrets'] = (
    grunnkrets_stripped['total_nbr_people'] / grunnkrets_stripped['store_counts_total']
)

# grunnkrets_stripped appears to contain several rows per grunnkrets_id; merging it
# as-is would duplicate store rows and inflate every count and mean computed
# afterwards. Join against exactly one row per grunnkrets_id instead.
people_per_store = grunnkrets_stripped[
    ['grunnkrets_id', 'nbr_people_per_store_in_grunnkrets']
].drop_duplicates(subset=['grunnkrets_id'], keep='first')
train_data = pd.merge(
    train_data,
    people_per_store,
    on='grunnkrets_id',
    how='left',
    validate='many_to_one',
)


In [None]:
# Number of stores in the same lv2 category within each grunnkrets.
# nunique() counts every store once even if its row was duplicated by an earlier
# merge; observed=True restricts the grouping to combinations that actually
# occur instead of every grunnkrets x lv2-category pair.
counts = (
    train_data.groupby(["grunnkrets_id", "lv2"], observed=True)["store_id"]
    .nunique()
    .reset_index(name="counts_gr_lv2")
)
train_data = train_data.merge(counts, how="left", on=["grunnkrets_id", "lv2"])

In [None]:
#train_data.update(train_data[['total_nbr_people','group1','group2','group3','group4','group5','group6','group7','group8','group9']].fillna(0))
#train_data.update(train_data[['nbr_people_per_store_in_grunnkrets','nbr_people_per_km2']].fillna(0))


In [None]:
# Number of stores in the same lv2 category within each municipality.
# nunique() counts every store once even if its row was duplicated by an earlier
# merge; observed=True restricts the grouping to combinations that actually occur.
nbr_in_municipality = (
    train_data.groupby(["municipality_name", "lv2"], observed=True)["store_id"]
    .nunique()
    .reset_index(name="counts_municipality_lv2")
)
train_data = train_data.merge(nbr_in_municipality, how="left", on=["municipality_name", "lv2"])

In [None]:
# Mean revenue for each lv1 category in each municipality.
# The mean is taken over one row per store so that rows duplicated by earlier
# merges do not weight a store more than once; observed=True skips
# municipality x category combinations that never occur.
# NOTE: the mean includes each store's own revenue, i.e. it is derived from the target.
unique_stores = train_data.drop_duplicates(subset=['store_id'], keep='first')
municipalities = (
    unique_stores.groupby(["municipality_name", "lv1"], observed=True)["revenue"]
    .mean()
    .reset_index(name='mean_revenue_for_municipality_and_level1')
)
train_data = train_data.merge(municipalities, how="left", on=["municipality_name", "lv1"])
# Keep a single row per store_id.
train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
train_data.head()

In [None]:
# Mean revenue for each lv2 category in each municipality.
# The mean is taken over one row per store so that duplicated rows cannot weight
# a store more than once; observed=True skips municipality x category
# combinations that never occur.
# NOTE: the mean includes each store's own revenue, i.e. it is derived from the target.
unique_stores = train_data.drop_duplicates(subset=['store_id'], keep='first')
municipalities = (
    unique_stores.groupby(["municipality_name", "lv2"], observed=True)["revenue"]
    .mean()
    .reset_index(name='mean_revenue_for_municipality_and_level2')
)
train_data = train_data.merge(municipalities, how="left", on=["municipality_name", "lv2"])
# Keep a single row per store_id.
train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
train_data.head()


In [None]:
# Mean revenue for each lv3 category in each municipality.
# The mean is taken over one row per store so that duplicated rows cannot weight
# a store more than once; observed=True skips municipality x category
# combinations that never occur.
# NOTE: the mean includes each store's own revenue, i.e. it is derived from the target.
unique_stores = train_data.drop_duplicates(subset=['store_id'], keep='first')
municipalities = (
    unique_stores.groupby(["municipality_name", "lv3"], observed=True)["revenue"]
    .mean()
    .reset_index(name='mean_revenue_for_municipality_and_level3')
)
train_data = train_data.merge(municipalities, how="left", on=["municipality_name", "lv3"])
# Keep a single row per store_id.
train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
train_data.head()

In [None]:
# Reorder the columns so that the target ('revenue') comes last.
col_at_end = ['revenue']
leading_columns = [name for name in train_data.columns if name not in col_at_end]
trailing_columns = [name for name in col_at_end if name in train_data.columns]
train_data = train_data[leading_columns + trailing_columns]

In [None]:
# Preview the first five rows after moving 'revenue' to the last column.
train_data.head(n=5)

In [None]:
# Show every column name on one line (the frame preview may truncate wide tables).
print(train_data.columns.tolist())

In [None]:
# Remove the columns that are not used by the model.
# 'store_id' and 'grunnkrets_id' are deliberately kept.
unused_features = ['plaace_hierarchy_id', 'chain_name', 'municipality_name', 'lv3']
train_data = train_data.drop(columns=unused_features)

In [None]:
# Persist the engineered feature table.
# NOTE: the DataFrame index is written as the first (unnamed) CSV column,
# because to_csv is called with its default index setting.
output_path = '../data/modified_data.csv'
train_data.to_csv(output_path)
train_data.head()
