In [1]:

import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.utils import *

In [2]:
# Import datasets
# Single base folder for the raw CSV files, so that relocating the data is a
# one-line change instead of four. The trailing slash is required because the
# file names below are appended by plain string concatenation.
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

stores_train_df = pd.read_csv(raw_path + "stores_train.csv")
stores_test_df = pd.read_csv(raw_path + "stores_test.csv")
stores_extra_df = pd.read_csv(raw_path + "stores_extra.csv")
bus_stops_df = pd.read_csv(raw_path + "busstops_norway.csv")

# Grouping mechanism
# Geographic columns; handed to `data_enricher` as `geo_groups`.
geo_groups = ['grunnkrets_name', 'district_name', 'municipality_name']

# Store-type description columns; handed to `ClosestStore` as
# `store_type_groups` and appended to its input column list.
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']

# Bus-stop importance labels; handed to `data_enricher` as `importance_levels`.
# The strings are runtime values and must stay exactly as written.
stop_importance_levels = [
    'Mangler viktighetsnivå',
    'Standard holdeplass',
    'Lokalt knutepunkt',
    'Nasjonalt knutepunkt',
    'Regionalt knutepunkt',
    'Annen viktig holdeplass',
]

In [3]:
# Folder with the raw data files
raw_path = "/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/"

# Keyword arguments that both `data_enricher` calls below have in common.
enrich_kwargs = dict(
    raw_path=raw_path,
    geo_groups=geo_groups,
    importance_levels=stop_importance_levels,
)

stores_extra_merged = enrich_keys(stores_extra_df, raw_path=raw_path)
stores_train_enriched = data_enricher(stores_train_df, **enrich_kwargs)
stores_test_enriched = data_enricher(stores_test_df, **enrich_kwargs)

# Training features: the enriched frame without the `revenue` column.
stores_train_data = stores_train_enriched.drop(columns="revenue")
# Training labels: `revenue` kept together with `store_id`.
stores_train_labels = stores_train_enriched.loc[:, ['store_id', 'revenue']]
stores_train_labels = stores_train_enriched[['store_id', 'revenue']]

  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(
  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [4]:
def _store_count_transformer(geo_col, type_col):
    """Build an `AggTransformer` that applies `store_count` over two columns.

    Parameters
    ----------
    geo_col : str
        Name of the geographic grouping column.
    type_col : str
        Name of the store-type description column.

    Returns
    -------
    AggTransformer
        Configured with `agg_cols=[geo_col, type_col]`,
        `agg_name="store_count"`, `calculations=store_count` and
        `stores_extra=stores_extra_merged`.
    """
    return AggTransformer(
        agg_cols=[geo_col, type_col],
        agg_name="store_count",
        calculations=store_count,
        stores_extra=stores_extra_merged,
    )


# District x store-type level (lv1..lv3)
num_stores_district_lv1 = _store_count_transformer("district_name", "lv1_desc")
num_stores_district_lv2 = _store_count_transformer("district_name", "lv2_desc")
num_stores_district_lv3 = _store_count_transformer("district_name", "lv3_desc")

# Municipality x store-type level (lv1..lv3)
num_stores_municipality_lv1 = _store_count_transformer("municipality_name", "lv1_desc")
num_stores_municipality_lv2 = _store_count_transformer("municipality_name", "lv2_desc")
num_stores_municipality_lv3 = _store_count_transformer("municipality_name", "lv3_desc")

In [5]:
# Average-revenue aggregations.
# Every transformer in this cell uses the `average_revenue` calculation and
# receives the training labels (`stores_train_labels`) through
# `sample_revenue`. They differ only in the grouping columns (`agg_cols`) and
# in the label passed as `agg_name`.

# --- Grouped by mall / chain -------------------------------------------------
avg_rev_mall = AggTransformer(
    agg_cols=["mall_name"],
    agg_name="avg_revenue",
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

avg_rev_mall_lv2 = AggTransformer(
    agg_cols=["mall_name", "lv2_desc"],
    agg_name="avg_revenue",
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

avg_rev_chain = AggTransformer(
    agg_cols=["chain_name"],
    agg_name="avg_revenue",
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# --- Grouped by geography ----------------------------------------------------
avg_rev_district = AggTransformer(
    agg_cols=['district_name'],
    agg_name='avg_rev_district',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# Fixed: this transformer groups by municipality but was labelled
# 'avg_rev_district' (copy-paste from the district transformer above). The
# label now matches the grouping column, in line with
# `avg_rev_municipality_lv3` below.
# NOTE(review): this transformer is not listed in the `preprocessing` column
# transformer further down in this notebook; confirm whether the omission is
# intentional.
avg_rev_municipality = AggTransformer(
    agg_cols=['municipality_name'],
    agg_name='avg_rev_municipality',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# --- Grouped by geography x lv3 store type -----------------------------------
avg_rev_district_lv3 = AggTransformer(
    agg_cols=['district_name', 'lv3_desc'],
    agg_name='avg_rev_district',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

avg_rev_municipality_lv3 = AggTransformer(
    agg_cols=['municipality_name', 'lv3_desc'],
    agg_name='avg_rev_municipality',
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

# --- Grouped by store type only ----------------------------------------------
avg_rev_lv3 = AggTransformer(
    agg_cols=["lv3_desc"],
    agg_name="avg_rev_lv3",
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

avg_rev_lv4 = AggTransformer(
    agg_cols=["lv4_desc"],
    agg_name="avg_rev_lv4",
    calculations=average_revenue,
    sample_revenue=stores_train_labels
)

In [6]:
# Stores in radius irrespective of store type
# Two variants with the same radius: the first is built from the radius alone,
# the second is additionally given `stores_extra_merged` via `stores_extra`.
# NOTE(review): the unit of `radius` is not visible in this notebook - confirm
# against StoresInRadiusTransformer.
num_stores_radius = StoresInRadiusTransformer(radius=0.1)

num_stores_radius_extra = StoresInRadiusTransformer(
    stores_extra=stores_extra_merged,
    radius=0.1,
)

In [7]:
# Cluster-similarity transformer over the store coordinates.
# NOTE(review): `sample_weight` receives the whole labels frame
# (`store_id` + `revenue`) - confirm this is the shape ClusterSimilarity expects.
cluster_simil = ClusterSimilarity(
    n_clusters=20, gamma=1.0, random_state=42, sample_weight=stores_train_labels,
)

# Fit on the lat/lon columns and display the transformed array as the cell output.
store_coordinates = stores_train_data[['lat', 'lon']]
cluster_simil.fit_transform(store_coordinates)

array([[4.15993735e-001, 2.28184731e-022, 3.05050361e-011, ...,
        1.27255607e-009, 1.87744408e-006, 9.47820280e-035],
       [6.58447910e-001, 3.91643711e-020, 1.56994904e-013, ...,
        1.64321709e-008, 4.41921377e-008, 8.94150476e-032],
       [2.72222070e-001, 5.43463624e-020, 2.61402943e-011, ...,
        4.67020550e-008, 2.27850682e-006, 8.38354491e-032],
       ...,
       [7.01844646e-001, 2.23423426e-019, 7.74435248e-015, ...,
        3.07233726e-008, 4.65290479e-009, 1.05194236e-030],
       [3.92771905e-013, 2.22898868e-048, 8.89650906e-002, ...,
        4.26019752e-027, 1.91325282e-002, 2.46544310e-066],
       [1.19308921e-074, 1.43317305e-020, 3.22463852e-120, ...,
        6.40260694e-039, 7.59631812e-103, 7.32416677e-012]])

In [8]:
# Closest-store transformer: one store-type column per entry of `store_types`,
# with `stores_extra_merged` supplied as the extra store set.
closest_stores = ClosestStore(store_type_groups=store_types, stores_extra=stores_extra_merged)

# Fit on the training features and display the result as the cell output.
closest_stores.fit_transform(stores_train_data)

Unnamed: 0,distance_to_lv1_desc,distance_to_lv2_desc,distance_to_lv3_desc,distance_to_lv4_desc
0,6.266641e-04,6.266641e-04,4.695977e-02,0.046960
1,3.596392e-07,3.151592e-04,1.864898e-03,0.001865
2,8.339069e-04,8.339069e-04,3.571459e-01,0.357146
3,3.480443e-04,3.480443e-04,4.327842e-02,0.043278
4,3.908446e-07,1.208005e-03,2.808519e-03,0.002809
...,...,...,...,...
12854,2.546594e-04,1.679672e-03,5.398634e-03,0.010215
12855,3.666254e-07,3.666254e-07,3.666254e-07,0.259925
12856,5.596648e-04,5.596648e-04,5.596648e-04,0.059335
12857,3.492682e-04,4.838586e-04,3.165082e-03,0.144909


In [9]:
# NOTE(review): SimpleImputer, StandardScaler and OneHotEncoder are not
# imported explicitly in the import cell; presumably they arrive through the
# wildcard imports from `feature_engineering` - confirm.

# Numeric columns: median imputation followed by standardisation.
num_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

# Categorical columns: most-frequent imputation, then one-hot encoding that
# ignores categories unseen during fit. The step names are the ones
# `make_pipeline` generates (lower-cased class names).
cat_pipeline = Pipeline([
    ("simpleimputer", SimpleImputer(strategy="most_frequent")),
    ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
])

# Standardise, then apply ClusterSimilarity with its default arguments.
kmeans_pipeline = Pipeline([
    ("standardscaler", StandardScaler()),
    ("clustersimilarity", ClusterSimilarity()),
])

In [10]:

# (transformer, input columns) pairs, listed in the exact order in which they
# are handed to `make_column_transformer`. The order matters: the generated
# transformer names (e.g. `aggtransformer-1`, `aggtransformer-2`, ...) are
# numbered by position.
feature_steps = [
    # Null-indicator pipelines
    (is_null_pipeline(), ['mall_name']),
    (is_null_pipeline(), ['chain_name']),
    # Store-count aggregations
    (num_stores_district_lv1, ['district_name', 'lv1_desc']),
    (num_stores_district_lv2, ['district_name', 'lv2_desc']),
    (num_stores_district_lv3, ['district_name', 'lv3_desc']),
    (num_stores_municipality_lv1, ["municipality_name", "lv1_desc"]),
    (num_stores_municipality_lv2, ["municipality_name", "lv2_desc"]),
    (num_stores_municipality_lv3, ["municipality_name", "lv3_desc"]),
    # Average-revenue aggregations
    (avg_rev_mall, ['mall_name']),
    (avg_rev_mall_lv2, ['mall_name', 'lv2_desc']),
    (avg_rev_chain, ['chain_name']),
    (avg_rev_district, ['district_name']),
    (avg_rev_district_lv3, ['district_name', 'lv3_desc']),
    (avg_rev_municipality_lv3, ['municipality_name', 'lv3_desc']),
    (avg_rev_lv3, ['lv3_desc']),
    (avg_rev_lv4, ['lv4_desc']),
    # Location-based transformers
    (num_stores_radius_extra, ['store_id', 'lat', 'lon']),
    (cluster_simil, ['lat', 'lon']),
    (closest_stores, ['store_id', 'lat', 'lon'] + store_types),
]

# Columns not claimed by any pair above are passed through unchanged.
preprocessing = make_column_transformer(*feature_steps, remainder="passthrough")

# Fit on the training features and display the transformed array as the cell output.
preprocessing.fit_transform(stores_train_data)

array([[True, True, 28.0, ..., 0.29461364885847413, 0.016952059455969913,
        0.15531349260113408],
       [False, True, 129.0, ..., 0.02037523831893929,
        0.001558611343048421, 0.11631916573449938],
       [True, True, 25.0, ..., 0.32636190477615207, 0.017329648090058877,
        0.05476340966692611],
       ...,
       [True, True, 157.0, ..., 0.2494327456176235, 0.03267858307296947,
        0.03745339062900866],
       [True, False, 785.0, ..., 0.1162554115969373, 0.14444260028473968,
        0.10141074003809641],
       [False, False, 2.0, ..., 0.08618008181076015, 0.06296454401599076,
        8.577262006921943]], dtype=object)

In [11]:
# Transform the training features with `preprocessing`, which was fitted in the
# previous cell; only `transform` is called here, so nothing is refitted.
train_ = preprocessing.transform(stores_train_data)

In [12]:
# Apply the same fitted `preprocessing` transformer to the enriched test stores
# (transform only - the test data is never used for fitting here).
test_ = preprocessing.transform(stores_test_enriched)

In [13]:
# Wrap the transformed training array in a DataFrame: columns are the feature
# names reported by `preprocessing`, rows keep the index of the input frame.
train_feature_names = preprocessing.get_feature_names_out()
train_set = pd.DataFrame(
    data=train_,
    index=stores_train_data.index,
    columns=train_feature_names,
)

In [14]:
# Wrap the transformed test array in a DataFrame: columns are the feature names
# reported by `preprocessing`, rows keep the index of the enriched test frame.
test_feature_names = preprocessing.get_feature_names_out()
test_set = pd.DataFrame(
    data=test_,
    index=stores_test_enriched.index,
    columns=test_feature_names,
)

In [21]:
# Display the preprocessed training feature frame as the cell output.
# NOTE(review): this cell's execution count (21) does not follow the previous
# cell's (14); re-run the notebook top to bottom to confirm the output is current.
train_set

Unnamed: 0,pipeline-1__is,pipeline-2__is,aggtransformer-1__store_count,aggtransformer-2__store_count,aggtransformer-3__store_count,aggtransformer-4__store_count,aggtransformer-5__store_count,aggtransformer-6__store_count,aggtransformer-7__avg_revenue,aggtransformer-8__avg_revenue,...,remainder__number_of_lokalt_knutepunkt,remainder__number_of_nasjonalt_knutepunkt,remainder__number_of_regionalt_knutepunkt,remainder__number_of_annen_viktig_holdeplass,remainder__distance_to_mangler_viktighetsnivå,remainder__distance_to_standard_holdeplass,remainder__distance_to_lokalt_knutepunkt,remainder__distance_to_nasjonalt_knutepunkt,remainder__distance_to_regionalt_knutepunkt,remainder__distance_to_annen_viktig_holdeplass
0,True,True,28.0,19.0,1.0,78.0,58.0,3.0,3.5277,8.999,...,25,0,1,0,0.005315,0.0126,0.005467,0.294614,0.016952,0.155313
1,False,True,129.0,91.0,3.0,1220.0,856.0,25.0,,,...,74,10,13,0,0.001579,0.045599,0.001257,0.020375,0.001559,0.116319
2,True,True,25.0,19.0,1.0,28.0,21.0,1.0,8.0577,16.099,...,12,0,5,7,0.000371,0.011161,0.025783,0.326362,0.01733,0.054763
3,True,True,240.0,174.0,7.0,62.0,46.0,1.0,6.640667,8.918,...,21,2,9,0,0.005329,0.010858,0.016854,0.012352,0.000522,5.257526
4,True,True,18.0,10.0,2.0,252.0,170.0,4.0,3.831,4.528,...,25,0,0,6,0.003096,0.002093,0.00337,0.319223,0.644539,0.006737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854,False,False,785.0,143.0,11.0,92.0,21.0,3.0,,,...,6,0,4,3,0.001349,0.008576,0.006203,0.176835,0.005978,0.009062
12855,True,True,785.0,143.0,11.0,149.0,28.0,2.0,8.471471,1.816,...,18,0,2,0,0.002951,0.118968,0.022918,0.333546,0.051408,0.168095
12856,True,True,157.0,12.0,2.0,324.0,41.0,4.0,10.366205,25.9074,...,13,0,15,24,0.001587,0.102015,0.025107,0.249433,0.032679,0.037453
12857,True,False,785.0,143.0,11.0,343.0,57.0,3.0,3.04225,3.642,...,4,0,0,0,0.001097,0.005468,0.080072,0.116255,0.144443,0.101411


In [24]:
# Display the preprocessed test feature frame as the cell output.
# NOTE(review): this cell's execution count (24) is not consecutive with the
# cells above; re-run the notebook top to bottom to confirm the output is current.
test_set

Unnamed: 0,pipeline-1__is,pipeline-2__is,aggtransformer-1__store_count,aggtransformer-2__store_count,aggtransformer-3__store_count,aggtransformer-4__store_count,aggtransformer-5__store_count,aggtransformer-6__store_count,aggtransformer-7__avg_revenue,aggtransformer-8__avg_revenue,...,remainder__number_of_lokalt_knutepunkt,remainder__number_of_nasjonalt_knutepunkt,remainder__number_of_regionalt_knutepunkt,remainder__number_of_annen_viktig_holdeplass,remainder__distance_to_mangler_viktighetsnivå,remainder__distance_to_standard_holdeplass,remainder__distance_to_lokalt_knutepunkt,remainder__distance_to_nasjonalt_knutepunkt,remainder__distance_to_regionalt_knutepunkt,remainder__distance_to_annen_viktig_holdeplass
0,False,False,9.0,4.0,,1220.0,856.0,25.0,,,...,64,10,13,0,0.000231,0.030165,0.010282,0.026472,0.013886,0.159301
1,False,True,3.0,1.0,,37.0,29.0,1.0,,,...,3,0,7,0,0.022422,0.001014,0.019396,0.489463,0.019258,0.135651
2,True,True,13.0,7.0,,1220.0,856.0,25.0,3.358375,1.307667,...,20,0,2,9,0.00369,0.017462,0.012602,0.171475,0.078346,0.024432
3,False,True,15.0,7.0,,252.0,170.0,4.0,,,...,15,0,0,4,0.002887,0.003348,0.019146,0.409014,0.746042,0.033637
4,False,True,17.0,14.0,,1220.0,856.0,25.0,,,...,64,10,13,0,0.000919,0.022926,0.007982,0.026247,0.017349,0.155829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8572,False,True,18.0,3.0,1.0,149.0,32.0,2.0,,,...,3,0,0,0,0.003276,0.100738,0.028465,0.161441,0.121447,0.471161
8573,False,False,24.0,13.0,,472.0,91.0,5.0,,,...,20,8,2,8,0.004102,0.002065,0.011969,0.014388,0.031321,0.013542
8574,False,True,9.0,3.0,,17.0,8.0,,,,...,3,0,2,0,0.002042,0.023005,0.073009,0.271889,0.001239,0.103923
8575,False,False,5.0,3.0,,60.0,14.0,1.0,,,...,2,0,6,0,0.001162,0.060774,0.075504,0.160333,0.091335,0.125114
