In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler 
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [2]:
# Store tables: train and test splits.
df_train = pd.read_csv("data/stores_train.csv")
df_test = pd.read_csv("data/stores_test.csv")
# Bus stops. This "fixed" file is the same path the grunnkrets-matching cell writes back to.
df_bus_stops = pd.read_csv("data/busstops_norway_fixed.csv")
# Grunnkrets-level tables: age distribution, household sizes, household income, and the
# stripped table whose 'grunnkrets_id' / 'geometry' columns are used for bus stop matching.
df_grunnkrets_age_dist = pd.read_csv("data/grunnkrets_age_distribution.csv")
df_grunnkrets_house_pers = pd.read_csv("data/grunnkrets_households_num_persons.csv")
df_grunnkrets_income_house = pd.read_csv("data/grunnkrets_income_households.csv")
df_grunnkrets_stripped = pd.read_csv("data/grunnkrets_norway_stripped.csv")
# plaace hierarchy table and the extra stores table.
df_plaace_hierarchy = pd.read_csv("data/plaace_hierarchy.csv")
df_extra = pd.read_csv("data/stores_extra.csv")

In [3]:
# Keep only the 2016 rows of every grunnkrets table.
df_grunnkrets_stripped = df_grunnkrets_stripped.loc[df_grunnkrets_stripped["year"] == 2016]
df_grunnkrets_age_dist = df_grunnkrets_age_dist.loc[df_grunnkrets_age_dist["year"] == 2016]
df_grunnkrets_house_pers = df_grunnkrets_house_pers.loc[df_grunnkrets_house_pers["year"] == 2016]
df_grunnkrets_income_house = df_grunnkrets_income_house.loc[df_grunnkrets_income_house["year"] == 2016]

# Rescale the store coordinates (lat * 11.112 * 10, lon * 6.4757 * 10), the same factors
# the bus stop feature code applies with a "value in km" note.
# Presumably degrees -> km; TODO confirm the factors.
# NOTE: the columns are overwritten, so re-running this cell rescales a second time.
for _stores in (df_train, df_test):
    _stores["lat"] = _stores["lat"] * 11.112 * 10
    _stores["lon"] = _stores["lon"] * 6.4757 * 10

# Feature engineering

## First Bus stop approach

In [None]:
from shapely.geometry import Point, LineString, Polygon
from shapely.wkt import loads

import time

def parsing_bus_grunnkrets(bus_df, grunnkrets):
    """Assign each bus stop to the grunnkrets polygon that contains it.

    Parameters
    ----------
    bus_df : pandas.Series or pandas.DataFrame
        WKT point strings, one per bus stop. When a 2-D object is passed,
        only its first column is read.
    grunnkrets : pandas.DataFrame
        Must provide a 'grunnkrets_id' column and a 'geometry' column holding
        WKT strings.

    Returns
    -------
    list
        One entry per bus stop, in input order: the id of the first polygon
        (in table order) that contains the stop, or 0 when none does.
    """
    start = time.time()

    # Parse every polygon once, keyed by id (a repeated id keeps its last geometry).
    polygons = {
        gk_id: loads(wkt)
        for gk_id, wkt in zip(np.asarray(grunnkrets['grunnkrets_id']),
                              np.asarray(grunnkrets['geometry']))
    }

    # Accept both a Series and a (single-column) DataFrame of WKT strings.
    stop_wkts = np.asarray(bus_df, dtype=object)
    if stop_wkts.ndim == 2:
        stop_wkts = stop_wkts[:, 0]

    grunnkrets_id = []
    for i, wkt in enumerate(stop_wkts):
        loc = loads(wkt)
        # Improvement -> Remove bus stop not in any grunnkret
        for gk_id, polygon in polygons.items():
            if polygon.contains(loc):
                grunnkrets_id.append(gk_id)
                break
        else:
            # Bus stop isn't in any grunnkret (also reached when there are no polygons),
            # so the output always has exactly one entry per stop.
            grunnkrets_id.append(0)
        if i % 500 == 0:
            print(f"{i+1} bus stops done in : {round(time.time()-start, 2)}s")

    return grunnkrets_id



# Keep only the 2016 rows (a no-op when the frame was already filtered by the earlier cell).
df_grunnkrets_stripped = df_grunnkrets_stripped[df_grunnkrets_stripped.year == 2016]

# To test on a shorter sample
# df_bus_stops = df_bus_stops.head(100)

# The point-in-polygon scan checks every bus stop against every grunnkrets polygon, so it is
# only run when the loaded bus stop file does not already carry the result. This also avoids
# rewriting the input CSV on every run.
if 'grunnkrets_id' not in df_bus_stops.columns:
    df_bus_stops['grunnkrets_id'] = parsing_bus_grunnkrets(df_bus_stops['geometry'], df_grunnkrets_stripped[['grunnkrets_id', 'geometry']])

    # Save the new dataset so the scan does not need to run every time
    df_bus_stops.to_csv('data/busstops_norway_fixed.csv', index=False, header=True)

df_bus_stops.head()

In [4]:
# Inspect the bus stops frame, including its grunnkrets_id column.
df_bus_stops

Unnamed: 0,busstop_id,stopplace_type,importance_level,side_placement,geometry,grunnkrets_id
0,853cb081-cc32-4880-aa3e-26e96870d874,Plattform og lomme,Mangler viktighetsnivå,LEFT_AND_RIGHT,POINT(6.54064723379766 61.0618887850553),14170108
1,156b052b-2771-497a-b4f4-97fed59e1aca,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,LEFT_AND_RIGHT,POINT(5.89980086113255 60.1421872817075),0
2,7312a280-e14f-4b09-a421-02e8fe1bc63e,,Mangler viktighetsnivå,MIDDLE_LEFT,POINT(10.7781327278563 59.9299988828761),3012106
3,d9cda2c7-355a-49c1-b56c-a33180d2a82e,,Mangler viktighetsnivå,MIDDLE_LEFT,POINT(10.7781496457324 59.9301044643692),3012106
4,f803bcd3-182d-450b-bbb4-113c6ca885c2,,Mangler viktighetsnivå,MIDDLE_RIGHT,POINT(10.7781676444854 59.9302099995898),3012106
...,...,...,...,...,...,...
68390,12edd887-a122-44e2-896b-a5f663917f88,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,MIDDLE,POINT(5.85431348276154 60.2029816170228),0
68391,38dd0aee-b497-43c2-943b-51ab5c6d0eb7,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,MIDDLE,POINT(5.85387810987475 60.2029250857799),0
68392,3803f0fe-d6dc-4211-a44d-bbc3453af289,"Lomme og skilt, ikke plattform",Mangler viktighetsnivå,MIDDLE,POINT(10.6495319726142 62.9425599428163),16480209
68393,5b177625-d0dc-40ae-b554-e97f7d400e24,Plattform og lomme,Lokalt knutepunkt,MIDDLE,POINT(9.16421204735413 61.0634296716317),5440207


## Second Bus stop approach

In [26]:
from scipy.spatial.distance import cdist

def find_closest_bus_stop(df, bus_stops=None):
    """
    Build per-store bus stop distance features, one pair of columns per importance level.

    For each category in the fixed importance-level list below:
    - 'BS_closest_<category>'  : distance from the store to the nearest bus stop of that category
    - 'BS_mean_1km_<category>' : mean distance to the bus stops of that category that are
      strictly closer than 1 (in the scaled coordinate units, treated as km)

    Parameters
    ----------
    df : pandas.DataFrame
        Stores with 'store_id', 'lat' and 'lon' columns. 'lat' / 'lon' must already be
        scaled the same way as the bus stops are here (lat * 11.112 * 10, lon * 6.4757 * 10).
    bus_stops : pandas.DataFrame, optional
        Bus stops with 'busstop_id', 'importance_level' and 'geometry' columns, where
        'geometry' is text whose first number is read as longitude and second as latitude.
        Defaults to the notebook-level `df_bus_stops`.

    Returns
    -------
    pandas.DataFrame
        'store_id' followed by the two feature columns of every category. Missing values
        (no stop of that category, or none closer than 1) are replaced by 0.
    """
    if bus_stops is None:
        bus_stops = df_bus_stops

    categories = ['Mangler viktighetsnivå',
                  'Standard holdeplass',
                  'Lokalt knutepunkt',
                  'Nasjonalt knutepunkt',
                  'Regionalt knutepunkt',
                  'Annen viktig holdeplass']

    # Parse and scale the coordinates once for all bus stops rather than once per category.
    # Group names follow the order in the text: first number = lon, second = lat.
    coords = bus_stops['geometry'].str.extract(
        r'(?P<lon>[0-9]*[.]?[0-9]+)\s(?P<lat>[0-9]*[.]?[0-9]+)', expand=True)
    stops_km = pd.DataFrame({
        'busstop_id': bus_stops['busstop_id'],
        'importance_level': bus_stops['importance_level'],
        'lon': pd.to_numeric(coords['lon']) * 6.4757 * 10,    # value in km
        'lat': pd.to_numeric(coords['lat']) * 11.112 * 10,    # value in km
    })
    store_coords = df[['lat', 'lon']]

    new_bs_features = pd.DataFrame(df.store_id)
    for category in categories:
        suffix = category.lower().replace(' ', '_')
        closest_col = 'BS_closest_' + suffix
        mean_col = 'BS_mean_1km_' + suffix

        stops = stops_km[stops_km['importance_level'] == category]
        if stops.empty:
            # No stop of this category: both features take the same 0 that fillna would give.
            new_bs_features[closest_col] = 0.0
            new_bs_features[mean_col] = 0.0
            continue

        # Rows = bus stops of this category, columns = stores.
        mat = cdist(stops[['lat', 'lon']], store_coords, metric='euclidean')
        distances = pd.DataFrame(mat, index=stops['busstop_id'], columns=df['store_id'])

        # Both reductions are indexed by store_id, which is what the merges key on.
        closest = distances.min().to_frame(closest_col)
        mean_within_1km = distances[distances < 1].mean().to_frame(mean_col)
        new_bs_features = pd.merge(new_bs_features, closest, on='store_id', how='left')
        new_bs_features = pd.merge(new_bs_features, mean_within_1km, on='store_id', how='left')

    # NOTE: 0 therefore means either "distance 0" or "no stop available / within range".
    return new_bs_features.fillna(0)

In [27]:
# Compute the bus stop features for each split and write them next to the raw data.
for _stores, _out_path in (
    (df_train, 'data/new_bs_features_train.csv'),
    (df_test, 'data/new_bs_features_test.csv'),
):
    find_closest_bus_stop(_stores).to_csv(_out_path, index=False, header=True)

## Preprocessing

In [None]:
def remove_outliers(df):
    """Placeholder: evaluates the largest `revenue` value and discards it.

    Nothing is filtered or returned yet; the function returns None and leaves
    `df` untouched.
    """
    df["revenue"].max()

def preprocessing():
    