In [1]:

import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.cluster_similarity_features import *
from feature_engineering.store_features import *
from feature_engineering.utils import *

In [2]:
# Import datasets
# Every raw CSV is read from the same folder, so the location is written once;
# moving the project to another machine then needs a single edit here.
RAW_DATA_DIR = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

stores_train_df = pd.read_csv(RAW_DATA_DIR + "stores_train.csv")
stores_test_df = pd.read_csv(RAW_DATA_DIR + "stores_test.csv")
stores_extra_df = pd.read_csv(RAW_DATA_DIR + "stores_extra.csv")
bus_stops_df = pd.read_csv(RAW_DATA_DIR + "busstops_norway.csv")

# Grouping keys and category labels.
# Geographic grouping names; passed to data_enricher as `geo_groups`.
geo_groups = ["grunnkrets_name", "district_name", "municipality_name"]

# The four store-type description levels: lv1_desc through lv4_desc.
store_types = [f"lv{level}_desc" for level in range(1, 5)]

# Stop importance labels; passed to data_enricher as `importance_levels`.
# The spelling and the order of the labels are kept exactly as they were.
stop_importance_levels = [
    "Mangler viktighetsnivå",
    "Standard holdeplass",
    "Lokalt knutepunkt",
    "Nasjonalt knutepunkt",
    "Regionalt knutepunkt",
    "Annen viktig holdeplass",
]

In [3]:
# Folder with the raw data files; the enrichment helpers receive it as `raw_path`.
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

# Train and test stores go through data_enricher with exactly the same settings.
enricher_options = {
    "raw_path": raw_path,
    "geo_groups": geo_groups,
    "importance_levels": stop_importance_levels,
}

stores_extra_merged = enrich_keys(stores_extra_df, raw_path=raw_path)
stores_train_enriched = data_enricher(stores_train_df, **enricher_options)
stores_test_enriched = data_enricher(stores_test_df, **enricher_options)

# Split the enriched training frame: the labels keep store_id next to revenue,
# the feature frame is everything except revenue.
stores_train_labels = stores_train_enriched[["store_id", "revenue"]]
stores_train_data = stores_train_enriched.drop(columns="revenue")

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [4]:
# Store-count feature keyed on (grunnkrets_id, lv1_desc), i.e. stores in the same
# grunnkrets with the same level-1 type. stores_extra is not supplied here.
num_stores_grunnkrets_lv1 = AggTransformer(
    calculations=store_count,
    agg_name="store_count",
    agg_cols=["grunnkrets_id", "lv1_desc"],
)
num_stores_grunnkrets_lv1.fit_transform(stores_train_data)

Unnamed: 0,store_count
0,20.0
1,37.0
2,4.0
3,14.0
4,5.0
...,...
12854,18.0
12855,11.0
12856,32.0
12857,4.0


In [5]:
# Store-count feature keyed on (grunnkrets_id, lv1_desc), this time with the
# enriched stores_extra frame handed to the transformer as well.
num_stores_grunnkrets_lv1_extra = AggTransformer(
    calculations=store_count,
    agg_name="store_count",
    agg_cols=["grunnkrets_id", "lv1_desc"],
    stores_extra=stores_extra_merged,
)
num_stores_grunnkrets_lv1_extra.fit_transform(stores_train_data)

Unnamed: 0,store_count
0,36.0
1,83.0
2,7.0
3,29.0
4,13.0
...,...
12854,73.0
12855,73.0
12856,166.0
12857,32.0


In [6]:
# Average-revenue feature keyed on mall_name; the revenue values are supplied
# through sample_revenue (the training labels).
avg_rev_mall = AggTransformer(
    calculations=average_revenue,
    agg_name="avg_revenue",
    agg_cols=["mall_name"],
    sample_revenue=stores_train_labels,
)
avg_rev_mall.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,3.527700
1,
2,8.057700
3,6.640667
4,3.831000
...,...
12854,
12855,8.471471
12856,10.366205
12857,3.042250


In [7]:
# Average-revenue feature keyed on chain_name; the revenue values are supplied
# through sample_revenue (the training labels).
avg_rev_chain = AggTransformer(
    calculations=average_revenue,
    agg_name="avg_revenue",
    agg_cols=["chain_name"],
    sample_revenue=stores_train_labels,
)
avg_rev_chain.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,23.642250
1,23.642250
2,7.013545
3,7.013545
4,7.013545
...,...
12854,
12855,15.284667
12856,15.284667
12857,


In [8]:
# Average revenue of stores in the same grunnkrets with the same level-1 type,
# keyed on (grunnkrets_id, lv1_desc). DO NOT USE stores_extra here.
avg_rev_store_grunnkrets_lv1 = AggTransformer(
    calculations=average_revenue,
    agg_name="avg_revenue",
    agg_cols=["grunnkrets_id", "lv1_desc"],
    sample_revenue=stores_train_labels,
)
avg_rev_store_grunnkrets_lv1.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,10.330250
1,11.242757
2,7.382250
3,8.675714
4,10.011000
...,...
12854,5.073278
12855,9.228182
12856,12.938906
12857,2.762750


In [9]:
# Average revenue of stores with the same level-4 type, keyed on lv4_desc only
# (no geographic key). DO NOT USE stores_extra here.
avg_rev_store_lv4 = AggTransformer(
    calculations=average_revenue,
    agg_name="avg_revenue",
    agg_cols=["lv4_desc"],
    sample_revenue=stores_train_labels,
)
avg_rev_store_lv4.fit_transform(stores_train_data)

Unnamed: 0,avg_revenue
0,10.341043
1,10.341043
2,10.341043
3,10.341043
4,10.341043
...,...
12854,6.928231
12855,6.928231
12856,6.928231
12857,6.928231


In [10]:
# Number of stores within the given radius, irrespective of store type.
# NOTE(review): the unit of `radius` is not visible in this notebook — confirm in the transformer.
num_stores_radius = StoresInRadiusTransformer(radius=0.1)
num_stores_radius.fit_transform(stores_train_data)

Unnamed: 0,count
0,245
1,1694
2,73
3,127
4,381
...,...
12854,70
12855,76
12856,255
12857,278


In [11]:
# Number of stores within the given radius, irrespective of store type, with the
# enriched stores_extra frame handed to the transformer as well.
num_stores_radius_extra = StoresInRadiusTransformer(
    stores_extra=stores_extra_merged,
    radius=0.1,
)
num_stores_radius_extra.fit_transform(stores_train_data)

Unnamed: 0,count
0,739
1,5258
2,201
3,403
4,1249
...,...
12854,211
12855,251
12856,1146
12857,1041


In [12]:
# FIXME: counting stores of the matching lv1_desc type within the radius does not work yet, so the code below is left disabled.
# num_stores_radius_lv1_extra = StoresInRadiusTransformer(
#     radius=0.1, 
#     store_type_group='lv1_desc')
# num_stores_radius_lv1_extra.fit_transform()

In [13]:
# Cluster-similarity features computed from the store coordinates (lat, lon),
# using 20 clusters and a fixed random_state so the result is repeatable.
# NOTE(review): sample_weight receives the whole labels frame (store_id and
# revenue) — confirm that ClusterSimilarity picks out the revenue column itself.
cluster_simil = ClusterSimilarity(
    sample_weight=stores_train_labels,
    random_state=42,
    gamma=1.0,
    n_clusters=20,
)

cluster_simil.fit_transform(stores_train_data[["lat", "lon"]])

array([[4.15993735e-001, 2.28184731e-022, 3.05050361e-011, ...,
        1.27255607e-009, 1.87744408e-006, 9.47820280e-035],
       [6.58447910e-001, 3.91643711e-020, 1.56994904e-013, ...,
        1.64321709e-008, 4.41921377e-008, 8.94150476e-032],
       [2.72222070e-001, 5.43463624e-020, 2.61402943e-011, ...,
        4.67020550e-008, 2.27850682e-006, 8.38354491e-032],
       ...,
       [7.01844646e-001, 2.23423426e-019, 7.74435248e-015, ...,
        3.07233726e-008, 4.65290479e-009, 1.05194236e-030],
       [3.92771905e-013, 2.22898868e-048, 8.89650906e-002, ...,
        4.26019752e-027, 1.91325282e-002, 2.46544310e-066],
       [1.19308921e-074, 1.43317305e-020, 3.22463852e-120, ...,
        6.40260694e-039, 7.59631812e-103, 7.32416677e-012]])

In [14]:
# num_pipeline = Pipeline([
#     ("impute", SimpleImputer(strategy="median")),
#     ("standardize", StandardScaler()),
# ])

# cat_pipeline = make_pipeline(
#     SimpleImputer(strategy="most_frequent"),
#     OneHotEncoder(handle_unknown="ignore"))


# kmeans_pipeline = make_pipeline(
#     StandardScaler(),
#     ClusterSimilarity()
# )

# (name, transformer, input columns) triples for the ColumnTransformer.
feature_steps = [
    # Boolean flag columns derived from mall_name / chain_name.
    ("mall", is_null_pipeline(), ["mall_name"]),
    ("chain", is_null_pipeline(), ["chain_name"]),
    # Average revenue keyed on mall and on chain.
    ("mall_", avg_rev_mall, ["mall_name"]),
    ("chain_", avg_rev_chain, ["chain_name"]),
    # Store counts keyed on (grunnkrets_id, lv1_desc), without and with stores_extra.
    ("stores_gk_lv1", num_stores_grunnkrets_lv1, ["grunnkrets_id", "lv1_desc"]),
    ("stores_gk_lv1_extra", num_stores_grunnkrets_lv1_extra, ["grunnkrets_id", "lv1_desc"]),
    # Average revenue keyed on (grunnkrets_id, lv1_desc) and on lv4_desc.
    ("rev_stores_gk_lv1", avg_rev_store_grunnkrets_lv1, ["grunnkrets_id", "lv1_desc"]),
    # NOTE(review): avg_rev_store_lv4 aggregates on lv4_desc only, yet grunnkrets_id
    # is passed in as well — confirm whether that column is actually needed here.
    ("rev_stores_lv4", avg_rev_store_lv4, ["grunnkrets_id", "lv4_desc"]),
    # Stores-in-radius counts, without and with stores_extra.
    ("stores_radius", num_stores_radius, ["store_id", "lat", "lon"]),
    ("stores_radius_extra", num_stores_radius_extra, ["store_id", "lat", "lon"]),
    # Cluster-similarity features from the coordinates.
    ("geo", cluster_simil, ["lat", "lon"]),
    # Disabled steps, kept for reference:
    # ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
    # ("cat", num_pipeline, make_column_selector(dtype_include=object))
    # ("mall_name", one_hot_encode_pipeline(), ['mall_name']),
]

# Columns not named in any step above are passed through unchanged.
preprocessing = ColumnTransformer(transformers=feature_steps, remainder="passthrough")

preprocessing.fit(stores_train_data)

In [15]:
# Run the fitted preprocessing on the training features.
train_ = preprocessing.transform(stores_train_data)

In [16]:
# Run the same preprocessing (fitted on the training features) on the enriched test stores.
test_ = preprocessing.transform(stores_test_enriched)

In [17]:
# Wrap the transformed training matrix in a DataFrame that carries the
# ColumnTransformer's feature names and the original row index.
# The displayed frame shows booleans, integers and floats side by side, which
# suggests the matrix arrives as a single object-dtype array; built that way every
# column would stay dtype `object`. infer_objects() restores a proper dtype per
# column and leaves genuinely non-numeric columns as they are.
train_set = pd.DataFrame(
    train_,
    columns=preprocessing.get_feature_names_out(),
    index=stores_train_data.index,
).infer_objects()

In [18]:
# Wrap the transformed test matrix in a DataFrame that carries the
# ColumnTransformer's feature names and the original row index.
# The displayed frame shows booleans, integers and floats side by side, which
# suggests the matrix arrives as a single object-dtype array; built that way every
# column would stay dtype `object`. infer_objects() restores a proper dtype per
# column and leaves genuinely non-numeric columns as they are.
test_set = pd.DataFrame(
    test_,
    columns=preprocessing.get_feature_names_out(),
    index=stores_test_enriched.index,
).infer_objects()

In [19]:
# Show the training labels (store_id and revenue).
stores_train_labels

Unnamed: 0,store_id,revenue
0,983540538-974187930-44774,17.998
1,987074191-973117734-44755,23.828
2,984890265-981157303-64491,16.099
3,914057442-992924179-126912,9.296
4,913018583-913063538-668469,4.528
...,...,...
12854,915789943-915806929-781991,0.088
12855,917921733-917982368-868081,1.816
12856,911721961-911764474-496764,38.225
12857,914337046-914343372-721294,3.642


In [20]:
# Show the preprocessed training features.
train_set

Unnamed: 0,mall__is,chain__is,mall___avg_revenue,chain___avg_revenue,stores_gk_lv1__store_count,stores_gk_lv1_extra__store_count,rev_stores_gk_lv1__avg_revenue,rev_stores_lv4__avg_revenue,stores_radius__in_radius,stores_radius_extra__in_radius,...,remainder__number_of_lokalt_knutepunkt,remainder__number_of_nasjonalt_knutepunkt,remainder__number_of_regionalt_knutepunkt,remainder__number_of_annen_viktig_holdeplass,remainder__distance_to_mangler_viktighetsnivå,remainder__distance_to_standard_holdeplass,remainder__distance_to_lokalt_knutepunkt,remainder__distance_to_nasjonalt_knutepunkt,remainder__distance_to_regionalt_knutepunkt,remainder__distance_to_annen_viktig_holdeplass
0,True,True,3.5277,23.64225,20.0,17.0,10.33025,10.341043,245,739,...,25,0,1,0,0.005315,0.0126,0.005467,0.294614,0.016952,0.155313
1,False,True,,23.64225,37.0,47.0,11.242757,10.341043,1694,5258,...,74,10,13,0,0.001579,0.045599,0.001257,0.020375,0.001559,0.116319
2,True,True,8.0577,7.013545,4.0,4.0,7.38225,10.341043,73,201,...,12,0,5,7,0.000371,0.011161,0.025783,0.326362,0.01733,0.054763
3,True,True,6.640667,7.013545,14.0,16.0,8.675714,10.341043,127,403,...,21,2,9,0,0.005329,0.010858,0.016854,0.012352,0.000522,5.257526
4,True,True,3.831,7.013545,5.0,9.0,10.011,10.341043,381,1249,...,25,0,0,6,0.003096,0.002093,0.00337,0.319223,0.644539,0.006737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854,False,False,,,18.0,56.0,5.073278,6.928231,70,211,...,6,0,4,3,0.001349,0.008576,0.006203,0.176835,0.005978,0.009062
12855,True,True,8.471471,15.284667,11.0,63.0,9.228182,6.928231,76,251,...,18,0,2,0,0.002951,0.118968,0.022918,0.333546,0.051408,0.168095
12856,True,True,10.366205,15.284667,32.0,135.0,12.938906,6.928231,255,1146,...,13,0,15,24,0.001587,0.102015,0.025107,0.249433,0.032679,0.037453
12857,True,False,3.04225,,4.0,29.0,2.76275,6.928231,278,1041,...,4,0,0,0,0.001097,0.005468,0.080072,0.116255,0.144443,0.101411


In [21]:
# Show the preprocessed test features.
test_set

Unnamed: 0,mall__is,chain__is,mall___avg_revenue,chain___avg_revenue,stores_gk_lv1__store_count,stores_gk_lv1_extra__store_count,rev_stores_gk_lv1__avg_revenue,rev_stores_lv4__avg_revenue,stores_radius__in_radius,stores_radius_extra__in_radius,...,remainder__number_of_lokalt_knutepunkt,remainder__number_of_nasjonalt_knutepunkt,remainder__number_of_regionalt_knutepunkt,remainder__number_of_annen_viktig_holdeplass,remainder__distance_to_mangler_viktighetsnivå,remainder__distance_to_standard_holdeplass,remainder__distance_to_lokalt_knutepunkt,remainder__distance_to_nasjonalt_knutepunkt,remainder__distance_to_regionalt_knutepunkt,remainder__distance_to_annen_viktig_holdeplass
0,False,False,,,1.0,1.0,4.304,10.341043,1655,5399,...,64,10,13,0,0.000231,0.030165,0.010282,0.026472,0.013886,0.159301
1,False,True,,7.013545,,1.0,,10.341043,201,636,...,3,0,7,0,0.022422,0.001014,0.019396,0.489463,0.019258,0.135651
2,True,True,3.358375,7.013545,6.0,9.0,1.964333,10.341043,272,1309,...,20,0,2,9,0.00369,0.017462,0.012602,0.171475,0.078346,0.024432
3,False,True,,7.013545,,5.0,,10.341043,372,1253,...,15,0,0,4,0.002887,0.003348,0.019146,0.409014,0.746042,0.033637
4,False,True,,28.881,3.0,3.0,17.063667,10.341043,1647,5345,...,64,10,13,0,0.000919,0.022926,0.007982,0.026247,0.017349,0.155829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8572,False,True,,15.284667,,,,6.928231,32,84,...,3,0,0,0,0.003276,0.100738,0.028465,0.161441,0.121447,0.471161
8573,False,False,,,,1.0,,6.928231,380,1354,...,20,8,2,8,0.004102,0.002065,0.011969,0.014388,0.031321,0.013542
8574,False,True,,15.284667,1.0,8.0,45.264,6.928231,18,90,...,3,0,2,0,0.002042,0.023005,0.073009,0.271889,0.001239,0.103923
8575,False,False,,,1.0,3.0,39.642,6.928231,104,359,...,2,0,6,0,0.001162,0.060774,0.075504,0.160333,0.091335,0.125114
