In [167]:
import math
import geopy.distance
from dis import dis
import math
import geopandas as gpd
import numpy
from shapely import wkt
from shapely import wkb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shapely.geometry import Point
from geopy.geocoders import Nominatim
from os import path
from scipy.spatial import cKDTree
from shapely.geometry import Point


train = pd.read_csv('data/stores_train.csv')
test= pd.read_csv('data/stores_test.csv')
busstops = pd.read_csv('data/busstops_norway.csv')
grunnkrets_age = pd.read_csv('data/grunnkrets_age_distribution.csv')
grunnkrets_households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
grunnkrets_income = pd.read_csv('data/grunnkrets_income_households.csv')
grunnkrets_stripped = pd.read_csv('data/grunnkrets_norway_stripped.csv')
plaace_hierarchy = pd.read_csv('data/plaace_hierarchy.csv')


In [168]:
def rev_to_log(train_data):
    if train_data._get_value(1, 'revenue') > 15:
        train_data['revenue'] = np.log1p(train_data['revenue'])
    return train_data

def remove_zero_revenue(train_data):
    train_data = train_data.loc[train_data["revenue"] > 0.1]
    train_data = train_data.reset_index(drop = True)
    return train_data

def remove_high_revenue():
    train_data = train_data.loc[train_data["revenue"] < 5]
    train_data = train_data.reset_index()
    return train_data

def dist_to_all_km(lat, lon, df):

    # coordinates in radians
    lat1 = lat*math.pi/180
    lon1 = lon*math.pi/180
    lat2 = df['lat']*math.pi/180 # go through whole lat column
    lon2 = df['lon']*math.pi/180 # go through whole lon column

    # store original coordinates in new dataframe
    distances = pd.DataFrame()
    distances['lat'] = df['lat'].copy()
    distances['lon'] = df['lon'].copy()

    # calculate cartesian coordinates
    R = 6371 # Earth radius in km
    df['x'] = R*np.cos(lat2)*np.cos(lon2)
    df['y'] = R*np.cos(lat2)*np.sin(lon2)
    df['z'] = R*np.sin(lat2)
    x1 = R*np.cos(lat1)*np.cos(lon1)
    y1 = R*np.cos(lat1)*np.sin(lon1)
    z1 = R*np.sin(lat1)

    # calculate distance, store as new column in the distances dataframe
    distances['dist'] = np.sqrt(np.square(df['x']-x1)+np.square(df['y']-y1)+np.square(df['z']-z1))

    return distances['dist'].squeeze()


In [169]:
# Creating the initial dataset with every feature we want

def lat_long_busstop():
    busstops['geometry'] = gpd.GeoSeries.from_wkt(busstops['geometry'])
    busstops['lat'] = busstops.geometry.apply(lambda x: x.y)
    busstops['lon'] = busstops.geometry.apply(lambda x: x.x)


def convert_nan(train_data):
    # Replace NaN in mall_name and chain_name columns with 'No mall' and 'No chain'
    train_data.mall_name = train_data.mall_name.fillna('No mall')
    train_data.chain_name = train_data.chain_name.fillna('No chain')
    train_data.address = train_data.address.fillna('No Address')
    train_data['mall_name']= train_data['mall_name'].astype('category')
    train_data['chain_name']= train_data['chain_name'].astype('category')
    return train_data

def combine_grunnkrets_and_data(train_data):
    train_data = pd.merge(train_data, grunnkrets_stripped[['grunnkrets_id', 'municipality_name']], on='grunnkrets_id', how='left')
    train_data.municipality_name = train_data.municipality_name.fillna('No municipality')
    # we get a bunch of duplicates of store_ids...? Remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data


def create_lvl(train_data):
    global plaace_hierarchy

    train_data = pd.merge(train_data, plaace_hierarchy[['plaace_hierarchy_id', 'lv1', 'lv2', 'lv3']], on='plaace_hierarchy_id', how='outer')
    train_data['lv1']= train_data['lv1'].astype('category')
    train_data['lv2']= train_data['lv2'].astype('category')
    train_data['lv3']= train_data['lv3'].astype('category')
    
    #drop the last broken columns
    train_data = train_data.dropna(subset=['store_id'])
    return train_data
    
def remove_zero_rev(train_data):
    i = train_data[(train_data.revenue == 0)].index
    train_data.drop(i)
    train_data = train_data.reset_index()
    return train_data
    


In [170]:
# BUSSTOP FEATURES
def busstops_wihin_distances(train_data): 
    train_data['busstops_within_50m'] = 0
    train_data['busstops_within_100m'] = 0
    train_data['busstops_within_400m'] = 0
    train_data['busstops_within_800m'] = 0
    train_data['busstops_within_1500m'] = 0
    #train_data['closest_busstop_distance'] = 0

    iter = 0

    for index in range(len(train_data)):
        one_to_all = dist_to_all_km(train_data._get_value(index, 'lat'), train_data._get_value(index, 'lon'), busstops)
        one_to_all = one_to_all.to_frame()
        one_to_all.rename( columns={0 :'a'}, inplace=True)
        count50 = (one_to_all < 0.05).sum()
        count100 = (one_to_all < 0.1).sum()
        count400 = (one_to_all < 0.4).sum()
        count800 = (one_to_all < 0.8).sum()
        count1500 = (one_to_all < 1.5).sum()
        #closest = one_to_all.min()
        iter += 1
        print(str(iter) + "/" + str(len(train_data)))
        train_data._set_value(index, 'busstops_within_50m', count50)
        train_data._set_value(index, 'busstops_within_100m', count100)
        train_data._set_value(index, 'busstops_within_400m', count400)
        train_data._set_value(index, 'busstops_within_800m', count800)
        train_data._set_value(index, 'busstops_within_1500m', count1500)
        #train_data._set_value(index, 'closest_busstop_distance', closest)
    return train_data


In [171]:
# GRUNNKRETS FEATURES

def prep_gk():
    global grunnkrets_age
    global grunnkrets_stripped
    global train
    grunnkrets_age['total_nbr_people'] = 0
    grunnkrets_age = grunnkrets_age.drop_duplicates(subset=['grunnkrets_id'], keep='last') # if there is value for 2016 we keep it, otherwise 2015
    grunnkrets_age = grunnkrets_age.fillna(0)
    grunnkrets_age = grunnkrets_age.drop('year',axis=1)
    grunnkrets_age['grunnkrets_id'] = grunnkrets_age['grunnkrets_id'].astype(str)
    grunnkrets_age['total_nbr_people'] = grunnkrets_age.sum(axis=1) # total number of inhabitants
    grunnkrets_age['grunnkrets_id'] = grunnkrets_age['grunnkrets_id'].astype(int)

    number_stores = train['grunnkrets_id'].value_counts().rename_axis('grunnkrets_id').reset_index(name='store_counts_total') # Not including NaN (stores without a grunnkrets_id)
    grunnkrets_stripped = pd.merge(grunnkrets_stripped, number_stores[['grunnkrets_id', 'store_counts_total']], on='grunnkrets_id', how='left')
    grunnkrets_stripped.store_counts_total = grunnkrets_stripped.store_counts_total.fillna(0)
    grunnkrets_stripped = pd.merge(grunnkrets_stripped, grunnkrets_age[['grunnkrets_id', 'total_nbr_people']], on='grunnkrets_id', how='left')
    grunnkrets_stripped['nbr_people_per_store_in_grunnkrets'] = grunnkrets_stripped['total_nbr_people']/grunnkrets_stripped['store_counts_total']


def people_per_gk(train_data):
    train_data = pd.merge(train_data, grunnkrets_age[['grunnkrets_id', 'total_nbr_people']], on='grunnkrets_id', how='left')
    train_data['total_nbr_people'] = train_data['total_nbr_people'].fillna(0)
    return train_data

def people_per_store_in_each_gk(train_data):
    train_data = pd.merge(train_data, grunnkrets_stripped[['grunnkrets_id', 'nbr_people_per_store_in_grunnkrets']], on='grunnkrets_id', how='left')

    #dont know if we need this
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    train_data['nbr_people_per_store_in_grunnkrets'] = train_data['nbr_people_per_store_in_grunnkrets'].fillna(0)
    return train_data


def people_per_store_with_same_lvl2_in_each_gk(train_data):
    # Number of people per store in each grunnkrets in lv2
    counts = train_data[["store_id", "grunnkrets_id", "lv2"]].groupby(
        ["grunnkrets_id", "lv2"]
    ).count().reset_index()
    counts.columns = ["grunnkrets_id", "lv2", "counts_gr_lv2"]
    train_data = train_data.merge(counts, how="left", on=["grunnkrets_id", "lv2"])

    #dont know if we need this
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data
    
def people_per_municipality(train_data, test_data):
    municipalities = train_data[["municipality_name", "total_nbr_people"]].groupby(
    ["municipality_name"]
    ).sum().reset_index()
    municipalities = municipalities.rename(columns={'total_nbr_people':'nbr_people_in_municipality'})

    # Print distribution to check relevant division into small/medium/large municipality
    municipalities = municipalities[municipalities['municipality_name'] != 'No municipality name'] # remove No municipality name (NaN)

    #print(municipalities['nbr_people_in_municipality'].describe())
    #ax = municipalities.plot.bar(x='municipality_name', y='nbr_people_in_municipality', rot=0)
    #print(municipalities)

    # Make new column in municipalities for municipality size category, assign categories
    conditions = [
        (municipalities['nbr_people_in_municipality'] < np.log1p(1.612750e+03)),
        (municipalities['nbr_people_in_municipality'] >= np.log1p(1.612750e+03)) & (municipalities['nbr_people_in_municipality'] < np.log1p(5.731000e+03)),
        (municipalities['nbr_people_in_municipality'] >= np.log1p(5.731000e+03)) & (municipalities['nbr_people_in_municipality'] < np.log1p(1.717325e+04)),
        (municipalities['nbr_people_in_municipality'] >= np.log1p(1.717325e+04)) & (municipalities['nbr_people_in_municipality'] < np.log1p((2.109973e+06)-1)),
        (municipalities['nbr_people_in_municipality'] >= np.log1p((2.109973e+06)-1)),
    ]
    values = ['1', '2', '3', '4', '0']
    municipalities['municipality_size_group'] = np.select(conditions, values)
    #print(municipalities)
    # municipalities['municipality_size_group'].value_counts() # four size categories of 102-103 municipalities in each, category 0 is the 'No municipality name' one

    # merge to train data
    train_data = pd.merge(train_data, municipalities[['municipality_name', 'municipality_size_group']], on='municipality_name', how='outer')
    # merge to test data
    test_data = pd.merge(test_data, municipalities[['municipality_name', 'municipality_size_group']], on='municipality_name', how='outer')
    return train_data, test_data



def people_per_sotre_with_same_lvl2_in_each_muninicipality(train_data):

    nbr_in_municipality = train_data[["store_id","municipality_name", "lv2"]].groupby(
        ["municipality_name", "lv2"]
    ).count().reset_index()
    nbr_in_municipality.columns = ["municipality_name", "lv2", "counts_municipality_lv2"]
    train_data = train_data.merge(nbr_in_municipality, how="left", on=["municipality_name", "lv2"])

    #dont know if we need this
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data

def mean_rev_size_group(train_data, test_data):
    mean_rev_munic = train_data[["municipality_size_group", "revenue"]].groupby(
    ["municipality_size_group"]
    ).mean().reset_index()
    mean_rev_munic = mean_rev_munic.rename(columns={'revenue':'mean_revenue_for_municipality_size_group'})

    # merge to train data
    train_data = train_data.merge(mean_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')

    # merge to test data
    test_data = test_data.merge(mean_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def median_rev_size_group(train_data, test_data):
    # median rev per municipality size group
    median_rev_munic = train_data[["municipality_size_group", "revenue"]].groupby(
        ["municipality_size_group"]
    ).median().reset_index()
    median_rev_munic = median_rev_munic.rename(columns={'revenue':'median_revenue_for_municipality_size_group'})

    # merge to train data
    train_data = train_data.merge(median_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    # merge to test data
    test_data = test_data.merge(median_rev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

def std_red_size_group(train_data, test_data):
    # st dev per municipality size group
    stdev_munic = train_data[["municipality_size_group", "revenue"]].groupby(
        ["municipality_size_group"]
    ).std().reset_index()
    stdev_munic = stdev_munic.rename(columns={'revenue':'st_dev_of_revenue_for_municipality_size_group'})
    

    # merge to train data
    train_data = train_data.merge(stdev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    train_data = train_data.drop_duplicates(subset=['store_id'], keep='first')
    # merge to test data
    test_data = test_data.merge(stdev_munic, how="left", on=["municipality_size_group"])
    # In case of duplicates, remove them.
    test_data = test_data.drop_duplicates(subset=['store_id'], keep='first')
    return train_data, test_data

In [172]:
# STORE TO STORE FEATURES
def store_dist_lvl2(df):
    # create a datafram with all stores extra
    test = pd.read_csv('data/stores_test.csv')
    train = pd.read_csv('data/stores_train.csv')
    extra = pd.read_csv('data/stores_extra.csv')
    extra_df = pd.DataFrame()
    extra_df = extra_df.append(test).append(train).append(extra)
    extra_df = create_lvl(extra_df)

    df['num_stores_within_100m_and_same_lvl2'] = 0
    df['num_stores_within_500m_and_same_lvl2'] = 0
    df['num_stores_within_1km_and_same_lvl2'] = 0
    df['num_stores_within_5km_and_same_lvl2'] = 0
    df['num_stores_within_10km_and_same_lvl2'] = 0
    df['num_stores_within_20km_and_same_lvl2'] = 0
    df['closest_competitor'] = 100
    num = 0 

    for index in range(len(df)):
        lat = df._get_value(index,'lat')
        lon = df._get_value(index,'lon')
        lvl = df._get_value(index, 'lv2')
        dist_to_all = dist_to_all_km(lat, lon, extra_df)


        count01 = 0
        count05 = 0
        count1 = 0
        count5 = 0
        count10 = 0
        count20 = 0
        closest = 100

        iter = 0
        for number in dist_to_all:
            if number < 0.1 and extra_df._get_value(iter, 'lv2') == lvl:
                count01 += 1
            if number < 0.5 and extra_df._get_value(iter, 'lv2') == lvl:
                count05 += 1
            if number < 1 and extra_df._get_value(iter, 'lv2') == lvl:
                count1 += 1
            if number < 5 and extra_df._get_value(iter, 'lv2') == lvl:
                count5 += 1
            if number < 10 and extra_df._get_value(iter, 'lv2') == lvl:
                count10 +=1
            if number < 20 and extra_df._get_value(iter, 'lv2') == lvl:
                count20 += 1
            if number < closest and extra_df._get_value(iter, 'lv2') == lvl and number != 0:
                closest = number
            iter += 1

        df._set_value(index, 'num_stores_within_100m_and_same_lvl2', count01)
        df._set_value(index, 'num_stores_within_500m_and_same_lvl2', count05)
        df._set_value(index, 'num_stores_within_1km_and_same_lvl2', count1)
        df._set_value(index, 'num_stores_within_5km_and_same_lvl2', count5)
        df._set_value(index, 'num_stores_within_10km_and_same_lvl2', count10)
        df._set_value(index, 'num_stores_within_20km_and_same_lvl2', count20)
        df._set_value(index, 'closest_competitor', closest)

        num+=1 #for runtime tracking only
        print(num)
    return df


def store_dist_lvl3(df):
    # create a datafram with all stores extra
    test = pd.read_csv('data/stores_test.csv')
    train = pd.read_csv('data/stores_train.csv')
    extra = pd.read_csv('data/stores_extra.csv')
    extra_df = pd.DataFrame()
    extra_df = extra_df.append(test).append(train).append(extra)
    extra_df = create_lvl(extra_df)

    df['num_stores_within_100m_and_same_lvl3'] = 0
    df['num_stores_within_500m_and_same_lvl3'] = 0
    df['num_stores_within_1km_and_same_lvl3'] = 0
    df['num_stores_within_5km_and_same_lvl3'] = 0
    df['num_stores_within_10km_and_same_lvl3'] = 0
    df['num_stores_within_20km_and_same_lvl3'] = 0
    df['closest_competitor'] = 100
    num = 0 

    for index in range(len(df)):
        lat = df._get_value(index,'lat')
        lon = df._get_value(index,'lon')
        lvl = df._get_value(index, 'lv3')
        dist_to_all = dist_to_all_km(lat, lon, extra_df)


        count01 = 0
        count05 = 0
        count1 = 0
        count5 = 0
        count10 = 0
        count20 = 0
        closest = 100

        iter = 0
        for number in dist_to_all:
            if number < 0.1 and extra_df._get_value(iter, 'lv3') == lvl:
                count01 += 1
            if number < 0.5 and extra_df._get_value(iter, 'lv3') == lvl:
                count05 += 1
            if number < 1 and extra_df._get_value(iter, 'lv3') == lvl:
                count1 += 1
            if number < 5 and extra_df._get_value(iter, 'lv3') == lvl:
                count5 += 1
            if number < 10 and extra_df._get_value(iter, 'lv3') == lvl:
                count10 +=1
            if number < 20 and extra_df._get_value(iter, 'lv3') == lvl:
                count20 += 1
            if number < closest and extra_df._get_value(iter, 'lv3') == lvl and number != 0:
                closest = number
            iter += 1

        df._set_value(index, 'num_stores_within_100m_and_same_lvl3', count01)
        df._set_value(index, 'num_stores_within_500m_and_same_lvl3', count05)
        df._set_value(index, 'num_stores_within_1km_and_same_lvl3', count1)
        df._set_value(index, 'num_stores_within_5km_and_same_lvl3', count5)
        df._set_value(index, 'num_stores_within_10km_and_same_lvl3', count10)
        df._set_value(index, 'num_stores_within_20km_and_same_lvl3', count20)
        df._set_value(index, 'closest_competitor', closest)

        num+=1 #for runtime tracking only
        print(num)
    return df


In [173]:
#------------------------TRAIN DATA BELOW----------------------------

In [174]:
#only run once for both train and test data
train = rev_to_log(train)
lat_long_busstop()
prep_gk()

train = convert_nan(train)
train = combine_grunnkrets_and_data(train)
train = create_lvl(train)
train = people_per_gk(train)
train = people_per_store_in_each_gk(train)
train = people_per_store_with_same_lvl2_in_each_gk(train)
train = people_per_sotre_with_same_lvl2_in_each_muninicipality(train)
train = busstops_wihin_distances(train)
train = store_dist_lvl2(train)
train = store_dist_lvl3(train)


1/12642
2/12642
3/12642
4/12642
5/12642
6/12642
7/12642
8/12642
9/12642
10/12642
11/12642
12/12642
13/12642
14/12642
15/12642
16/12642
17/12642
18/12642
19/12642
20/12642
21/12642
22/12642
23/12642
24/12642
25/12642
26/12642
27/12642
28/12642
29/12642
30/12642
31/12642
32/12642
33/12642
34/12642
35/12642
36/12642
37/12642
38/12642
39/12642
40/12642
41/12642
42/12642
43/12642
44/12642
45/12642
46/12642
47/12642
48/12642
49/12642
50/12642
51/12642
52/12642
53/12642
54/12642
55/12642
56/12642
57/12642
58/12642
59/12642
60/12642
61/12642
62/12642
63/12642
64/12642
65/12642
66/12642
67/12642
68/12642
69/12642
70/12642
71/12642
72/12642
73/12642
74/12642
75/12642
76/12642
77/12642
78/12642
79/12642
80/12642
81/12642
82/12642
83/12642
84/12642
85/12642
86/12642
87/12642
88/12642
89/12642
90/12642
91/12642
92/12642
93/12642
94/12642
95/12642
96/12642
97/12642
98/12642
99/12642
100/12642
101/12642
102/12642
103/12642
104/12642
105/12642
106/12642
107/12642
108/12642
109/12642
110/12642
111/1264

In [175]:
#------------------------TEST DATA BELOW----------------------------

train

Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,...,num_stores_within_5km_and_same_lvl2,num_stores_within_10km_and_same_lvl2,num_stores_within_20km_and_same_lvl2,closest_competitor,num_stores_within_100m_and_same_lvl3,num_stores_within_500m_and_same_lvl3,num_stores_within_1km_and_same_lvl3,num_stores_within_5km_and_same_lvl3,num_stores_within_10km_and_same_lvl3,num_stores_within_20km_and_same_lvl3
0,983540538-974187930-44774,2016.0,MCDONALD'S BRAGERNES TORG MAGASINET,1.1.1.0,Hamburger restaurants,6020303.0,BRAGERNES TORG 13,59.743104,10.204928,MCDONALDS,...,127,152,270,0,1,1,1,4,5,7
1,987074191-973117734-44755,2016.0,MCDONALD'S KLINGENBERGGATA,1.1.1.0,Hamburger restaurants,3010306.0,No Address,59.913759,10.734031,MCDONALDS,...,1363,1635,2007,0,1,4,12,26,34,49
2,984890265-981157303-64491,2016.0,BURGER KING HØNEFOSS,1.1.1.0,Hamburger restaurants,6050102.0,KONG RINGS GATE 1,60.164751,10.254656,BURGER KING,...,37,42,51,0,1,1,1,1,1,1
3,914057442-992924179-126912,2016.0,BURGER KING GLASSHUSPASSASJEN,1.1.1.0,Hamburger restaurants,18040102.0,STORGATA 12,67.283669,14.379796,BURGER KING,...,77,81,83,0,1,1,1,2,2,2
4,913018583-913063538-668469,2016.0,BURGER KING TILLERTORGET,1.1.1.0,Hamburger restaurants,16017414.0,No Address,63.358068,10.374832,BURGER KING,...,60,322,353,0,1,2,2,3,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12637,915789943-915806929-781991,2016.0,MEIERIGÅRDEN BRYGGERIUTSALG,2.8.11.2,Beer and soda shop,7010705.0,THUEGATA 2,59.416276,10.480970,No chain,...,28,41,212,0,1,2,2,3,4,14
12638,917921733-917982368-868081,2016.0,GULATING ØLUTSALG CC GJØVIK,2.8.11.2,Beer and soda shop,5020406.0,No Address,60.799991,10.693635,GULATING GRUPPEN,...,31,43,101,0,2,2,2,2,3,8
12639,911721961-911764474-496764,2016.0,GULATING ØLUTSALG STRØMMEN,2.8.11.2,Beer and soda shop,2310803.0,STØPERIVEIEN 6,59.946562,11.007659,GULATING GRUPPEN,...,120,248,1080,0,2,2,2,6,12,42
12640,914337046-914343372-721294,2016.0,DET GODE BRYGG,2.8.11.2,Beer and soda shop,11020113.0,VÅGSGATA 16,58.850261,5.735674,No chain,...,77,124,294,0,1,2,2,3,5,15


In [176]:
test = convert_nan(test)
test = combine_grunnkrets_and_data(test)
test = create_lvl(test)
test = people_per_gk(test)
test = people_per_store_in_each_gk(test)
test = people_per_store_with_same_lvl2_in_each_gk(test)
test = people_per_sotre_with_same_lvl2_in_each_muninicipality(test)
test = busstops_wihin_distances(test)
test = store_dist_lvl2(test)
test = store_dist_lvl3(test)

1/8577
2/8577
3/8577
4/8577
5/8577
6/8577
7/8577
8/8577
9/8577
10/8577
11/8577
12/8577
13/8577
14/8577
15/8577
16/8577
17/8577
18/8577
19/8577
20/8577
21/8577
22/8577
23/8577
24/8577
25/8577
26/8577
27/8577
28/8577
29/8577
30/8577
31/8577
32/8577
33/8577
34/8577
35/8577
36/8577
37/8577
38/8577
39/8577
40/8577
41/8577
42/8577
43/8577
44/8577
45/8577
46/8577
47/8577
48/8577
49/8577
50/8577
51/8577
52/8577
53/8577
54/8577
55/8577
56/8577
57/8577
58/8577
59/8577
60/8577
61/8577
62/8577
63/8577
64/8577
65/8577
66/8577
67/8577
68/8577
69/8577
70/8577
71/8577
72/8577
73/8577
74/8577
75/8577
76/8577
77/8577
78/8577
79/8577
80/8577
81/8577
82/8577
83/8577
84/8577
85/8577
86/8577
87/8577
88/8577
89/8577
90/8577
91/8577
92/8577
93/8577
94/8577
95/8577
96/8577
97/8577
98/8577
99/8577
100/8577
101/8577
102/8577
103/8577
104/8577
105/8577
106/8577
107/8577
108/8577
109/8577
110/8577
111/8577
112/8577
113/8577
114/8577
115/8577
116/8577
117/8577
118/8577
119/8577
120/8577
121/8577
122/8577
123/8577
1

In [178]:
train, test = people_per_municipality(train, test)
train, test = mean_rev_size_group(train, test)
train, test = median_rev_size_group(train, test)
train, test = std_red_size_group(train, test)
train.head()

Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name,grunnkrets_id,address,lat,lon,chain_name,...,num_stores_within_100m_and_same_lvl3,num_stores_within_500m_and_same_lvl3,num_stores_within_1km_and_same_lvl3,num_stores_within_5km_and_same_lvl3,num_stores_within_10km_and_same_lvl3,num_stores_within_20km_and_same_lvl3,municipality_size_group,mean_revenue_for_municipality_size_group,median_revenue_for_municipality_size_group,st_dev_of_revenue_for_municipality_size_group
0,983540538-974187930-44774,2016.0,MCDONALD'S BRAGERNES TORG MAGASINET,1.1.1.0,Hamburger restaurants,6020303.0,BRAGERNES TORG 13,59.743104,10.204928,MCDONALDS,...,1,1,1,4,5,7,0,1.616047,1.498283,1.012403
1,999125026-999139256-472734,2016.0,KATANA SUSHI,1.1.2.0,Sushi and wok restaurants,6020509.0,GULDLISTEN 35,59.742529,10.157972,No chain,...,1,1,1,6,7,14,0,1.616047,1.498283,1.012403
2,898083292-898135322-538507,2016.0,FUJI SUSHI DRAMMEN,1.1.2.0,Sushi and wok restaurants,6020604.0,BLICHS GATE 4,59.7382,10.203061,FUJI SUSHI,...,1,1,5,6,6,14,0,1.616047,1.498283,1.012403
3,916744765-916758723-821461,2016.0,SUSHI OF NORWAY KONNERUDGATA,1.1.2.0,Sushi and wok restaurants,6020602.0,KONNERUDGATA 34,59.738563,10.192774,No chain,...,1,1,5,6,6,14,0,1.616047,1.498283,1.012403
4,916139195-998469740-540508,2016.0,WASABI SUSHI,1.1.2.0,Sushi and wok restaurants,6020303.0,ENGENE 4,59.74423,10.206481,No chain,...,1,3,5,6,6,14,0,1.616047,1.498283,1.012403


In [179]:
test.to_csv('feature_data/testing_set.csv', index=False)
train.to_csv('feature_data/training_set.csv', index=False)


In [180]:
def drop_all(train_data):
    train_data = train_data.drop(['year', 'store_name', 'plaace_hierarchy_id', 'sales_channel_name', 'grunnkrets_id'], axis=1)
    train_data = train_data.drop(['address', 'lat', 'lon', 'municipality_name'], axis=1)
    train_data = train_data.reset_index(drop=True)
    return train_data
    
test = drop_all(test)
train = drop_all(train)
train = remove_zero_revenue(train)
train = remove_high_revenue(train)

In [181]:
test.to_csv('feature_data/testing_set_dropped.csv', index=False)
train.to_csv('feature_data/training_set_dropped.csv', index=False)
