In [2]:

import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.cluster_similarity_features import *
from feature_engineering.store_features import *
from feature_engineering.utils import *

In [3]:
# Import datasets
stores_train_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_train.csv")
stores_test_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_test.csv")
stores_extra_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_extra.csv")
bus_stops_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/busstops_norway.csv")

# Grouping mechanism 
geo_groups = ['grunnkrets_name','district_name','municipality_name']
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']

In [4]:
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/" # Folder with the raw data files

stores_extra_merged = enrich_keys(stores_extra_df, raw_path=raw_path)
stores_train_enriched = data_enricher(stores_train_df, raw_path=raw_path, geo_groups=geo_groups, importance_levels=stop_importance_levels)
stores_test_enriched = data_enricher(stores_test_df, raw_path=raw_path, geo_groups=geo_groups, importance_levels=stop_importance_levels)

stores_train_data = stores_train_enriched.drop("revenue", axis=1)
stores_train_labels = stores_train_enriched['revenue']

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [6]:
# Impuding number of stores in same grunnkrets with the same type (without stores extra).
num_stores_grunnkrets_lv1 = AggTransformer(
    agg_cols=["grunnkrets_id", "lv1_desc"], 
    agg_name="store_count", 
    calculations=store_count)
num_stores_grunnkrets_lv1.fit_transform(stores_train_data)

Unnamed: 0,store_count
0,20.0
1,37.0
2,4.0
3,14.0
4,5.0
...,...
12854,18.0
12855,11.0
12856,32.0
12857,4.0


In [7]:
# Impuding number of stores in same grunnkrets with the same type (with stores extra).
num_stores_grunnkrets_lv1_extra = AggTransformer(
    agg_cols=["grunnkrets_id", "lv1_desc"], 
    agg_name="store_count", calculations=store_count, 
    stores_extra=stores_extra_merged)
num_stores_grunnkrets_lv1_extra.fit_transform(stores_train_data)

Unnamed: 0,store_count
0,36.0
1,83.0
2,7.0
3,29.0
4,13.0
...,...
12854,73.0
12855,73.0
12856,166.0
12857,32.0


In [8]:
avg_rev_mall = AggTransformer(
    agg_cols=["mall_name"], 
    agg_name="avg_revenue", 
    calculations=average_revenue, 
    sample_revenue=stores_train_labels)
avg_rev_mall.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,3.527700
1,
2,8.057700
3,6.640667
4,3.831000
...,...
12854,
12855,8.471471
12856,10.366205
12857,3.042250


In [9]:
avg_rev_chain = AggTransformer(
    agg_cols=["chain_name"], 
    agg_name="avg_revenue", 
    calculations=average_revenue, 
    sample_revenue=stores_train_labels)
avg_rev_chain.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,23.642250
1,23.642250
2,7.013545
3,7.013545
4,7.013545
...,...
12854,
12855,15.284667
12856,15.284667
12857,


In [10]:
# Impuding average revenue of store in same grunnkrets of the same type. DO NOT USE stores_extra
avg_rev_store_grunnkrets_lv1 = AggTransformer(
    agg_cols=["grunnkrets_id", "lv1_desc"], 
    agg_name="avg_revenue", 
    calculations=average_revenue,
    sample_revenue=stores_train_labels)
avg_rev_store_grunnkrets_lv1.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,10.330250
1,11.242757
2,7.382250
3,8.675714
4,10.011000
...,...
12854,5.073278
12855,9.228182
12856,12.938906
12857,2.762750


In [11]:
# Impuding average revenue of store in same grunnkrets of the same type. DO NOT USE stores_extra
avg_rev_store_lv4 = AggTransformer(
    agg_cols=["lv4_desc"], 
    agg_name="avg_revenue", 
    calculations=average_revenue,
    sample_revenue=stores_train_labels)
avg_rev_store_lv4.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,10.341043
1,10.341043
2,10.341043
3,10.341043
4,10.341043
...,...
12854,6.928231
12855,6.928231
12856,6.928231
12857,6.928231


In [12]:
# Stores in radius irrespective of store type
num_stores_radius = StoresInRadiusTransformer(
    radius=0.1)
num_stores_radius.fit_transform(stores_train_data)

Unnamed: 0,count
0,245
1,1694
2,73
3,127
4,381
...,...
12854,70
12855,76
12856,255
12857,278


In [13]:
# Stores in radius irrespective of store type
num_stores_radius_extra = StoresInRadiusTransformer(
    radius=0.1,
    stores_extra=stores_extra_merged)
num_stores_radius_extra.fit_transform(stores_train_data)

Unnamed: 0,count
0,739
1,5258
2,201
3,403
4,1249
...,...
12854,211
12855,251
12856,1146
12857,1041


In [14]:
# Stores of matching lv1_type within radius. THIS DOES NOT WORK.
# num_stores_radius_lv1_extra = StoresInRadiusTransformer(
#     radius=0.1, 
#     store_type_group='lv1_desc')
# num_stores_radius_lv1_extra.fit_transform()

In [15]:
stores_extra_merged

Unnamed: 0,store_id,year,store_name,plaace_hierarchy_id,sales_channel_name_x,grunnkrets_id,address,lat,lon,chain_name,...,area_km2,sales_channel_name_y,lv1,lv1_desc,lv2,lv2_desc,lv3,lv3_desc,lv4,lv4_desc
0,911669196-973140302-27020,2016,BURGER KING STRANDGATEN,1.1.1.0,Hamburger restaurants,12010115,STRANDGATEN 5,60.393979,5.323851,BURGER KING,...,0.051651,Hamburger restaurants,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants
1,913155726-992998792-5832,2016,BURGER KING JESSHEIM STORSENTER,1.1.1.0,Hamburger restaurants,2350205,STORGATA 6,60.142760,11.171834,BURGER KING,...,0.264436,Hamburger restaurants,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants
2,988252905-981108604-47887,2016,BURGER KING LANGNES,1.1.1.0,Hamburger restaurants,19020419,HULDERVEGEN 6,69.671483,18.920483,BURGER KING,...,0.366438,Hamburger restaurants,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants
3,995194546-891354622-45262,2016,BURGER KING MAGNETEN LEVANGER,1.1.1.0,Hamburger restaurants,17190701,,63.732791,11.281785,BURGER KING,...,1.484765,Hamburger restaurants,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants
4,881339692-979954964-3203,2016,BURGER KING ULLEVAAL STADION,1.1.1.0,Hamburger restaurants,3014511,SOGNSVEIEN 75,59.948558,10.732823,BURGER KING,...,0.393890,Hamburger restaurants,1,Dining and Experiences,1.1,Restaurant,1.1.1,Hamburger restaurants,1.1.1.0,Hamburger restaurants
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28679,931186744-982303729-41793,2016,COOP PRIX FRAMNES,2.8.1.0,Grocery stores,7061108,VESTERØYVEIEN 27,59.123102,10.236432,COOP PRIX,...,0.366177,Grocery stores,2,Retail,2.8,Food and drinks,2.8.1,Grocery stores,2.8.1.0,Grocery stores
28680,931186744-983814964-6308,2016,KIWI MINI PRIS VALENTINLYST,2.8.1.0,Grocery stores,16012118,ANDERS ESTENSTADS VEG 4,63.424115,10.442566,KIWI NORGE,...,0.095770,Grocery stores,2,Retail,2.8,Food and drinks,2.8.1,Grocery stores,2.8.1.0,Grocery stores
28681,931186744-990018995-100187,2016,EXTRA MAASESKJÆRET,2.8.1.0,Grocery stores,12010617,SANDVIKSVEIEN 94,60.414032,5.320277,COOP EXTRA,...,0.142441,Grocery stores,2,Retail,2.8,Food and drinks,2.8.1,Grocery stores,2.8.1.0,Grocery stores
28682,931186744-971707283-20794,2016,EXTRA MANDAL,2.8.1.0,Grocery stores,10020110,KASTELLGATA 14,58.022656,7.451600,COOP EXTRA,...,0.080268,Grocery stores,2,Retail,2.8,Food and drinks,2.8.1,Grocery stores,2.8.1.0,Grocery stores


In [16]:
cluster_simil = ClusterSimilarity(
    n_clusters=20, 
    gamma=1., 
    random_state=42, 
    sample_weight=stores_train_labels)

cluster_simil.fit_transform(stores_train_data[['lat', 'lon']])

array([[4.15993735e-001, 2.28184731e-022, 3.05050361e-011, ...,
        1.27255607e-009, 1.87744408e-006, 9.47820280e-035],
       [6.58447910e-001, 3.91643711e-020, 1.56994904e-013, ...,
        1.64321709e-008, 4.41921377e-008, 8.94150476e-032],
       [2.72222070e-001, 5.43463624e-020, 2.61402943e-011, ...,
        4.67020550e-008, 2.27850682e-006, 8.38354491e-032],
       ...,
       [7.01844646e-001, 2.23423426e-019, 7.74435248e-015, ...,
        3.07233726e-008, 4.65290479e-009, 1.05194236e-030],
       [3.92771905e-013, 2.22898868e-048, 8.89650906e-002, ...,
        4.26019752e-027, 1.91325282e-002, 2.46544310e-066],
       [1.19308921e-074, 1.43317305e-020, 3.22463852e-120, ...,
        6.40260694e-039, 7.59631812e-103, 7.32416677e-012]])

In [17]:
preprocessing = ColumnTransformer([
        ("mall", is_null_pipeline(), ["mall_name"]),
        ("chain", is_null_pipeline(), ['chain_name']),
        ("mall_", avg_rev_mall, ['mall_name']),
        ("chain_", avg_rev_chain, ['chain_name']),
        ("stores_gk_lv1", num_stores_grunnkrets_lv1, ['grunnkrets_id', 'lv1_desc']),
        ("stores_gk_lv1_extra", num_stores_grunnkrets_lv1_extra, ['grunnkrets_id', 'lv1_desc']),
        ("rev_stores_gk_lv1", avg_rev_store_grunnkrets_lv1, ['grunnkrets_id', 'lv1_desc']),
        ("rev_stores_lv4", avg_rev_store_lv4, ['grunnkrets_id', 'lv4_desc']),
        ("stores_radius", num_stores_radius, ['store_id', 'lat', 'lon']),
        ("stores_radius_extra", num_stores_radius_extra, ['store_id', 'lat', 'lon']),
        ("geo", cluster_simil, ['lat', 'lon']),
        #("mall_name", one_hot_encode_pipeline(), ['mall_name']),
    ], remainder='passthrough')

preprocessing.fit(stores_train_data)

In [18]:
stores_train_prepped = preprocessing.transform(stores_train_data)

In [19]:
stores_test_prepped = preprocessing.transform(stores_test_enriched)

ValueError: columns are missing: {'revenue'}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
random_forest = RandomForestClassifier()
random_forest.fit(stores_train_prepped, stores_train_labels)
y_pred = random_forest.predict(stores_test_prepped)

In [20]:
stores_train_data['revenue']

0        17.998
1        23.828
2        16.099
3         9.296
4         4.528
          ...  
12854     0.088
12855     1.816
12856    38.225
12857     3.642
12858     2.328
Name: revenue, Length: 12859, dtype: float64