In [32]:
import sys
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.cluster import KMeans

sys.path.append("/Users/nwong/Workspace/Projects/tdt4173_project/src")

from feature_engineering.sklearn_transformers import *
from feature_engineering.store_features import *
from feature_engineering.bus_stop_features import *
from feature_engineering.utils import *

stop_importance_levels = ['Mangler viktighetsnivå',
                          'Standard holdeplass',
                          'Lokalt knutepunkt',
                          'Nasjonalt knutepunkt',
                          'Regionalt knutepunkt',
                          'Annen viktig holdeplass']
store_types = ['lv1_desc', 'lv2_desc', 'lv3_desc', 'lv4_desc']
geo_groups = ['grunnkrets_id','t_district','municipality_name']

In [33]:
stores_train_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_train.csv"))
stores_test_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_test.csv"))
stores_extra_df = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/stores_extra.csv"))

income_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_income_households.csv"))
age_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_age_distribution.csv"))
household_dist = set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_households_num_persons.csv"))
grunnkrets_df = combine_keys(set_year_2016(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/grunnkrets_norway_stripped.csv")))
plaace_df = pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/plaace_hierarchy.csv")

bus_stops_df = bus_stops_lat_lon(pd.read_csv("/Users/nwong/Workspace/Projects/tdt4173_project/data/raw/busstops_norway.csv"))


  bus_stops_df[['lon', 'lat']] = bus_stops_df['lng_lat'].str.split(


In [58]:
stores_train_merged = stores_train_df.merge(plaace_df, on="plaace_hierarchy_id", how="left")
stores_test_merged = stores_extra_df.merge(plaace_df, on="plaace_hierarchy_id", how="left")

store_closest(stores_train_merged, pd.concat([stores_train_merged, stores_test_merged]), store_type_group="lv4_desc")

Unnamed: 0,store_id,closest_store,distance
0,983540538-974187930-44774,983610846-996267245-8088,0.046960
1,987074191-973117734-44755,914526647-981909100-55309,0.001865
2,984890265-981157303-64491,998614821-984886063-78205,0.357146
3,914057442-992924179-126912,915953964-916000669-789430,0.043278
4,913018583-913063538-668469,916476612-975790215-15244,0.002809
...,...,...,...
12854,915789943-915806929-781991,916717504-916738250-820525,0.010215
12855,917921733-917982368-868081,983559042-972015385-44630,0.259925
12856,911721961-911764474-496764,915466648-973084135-58149,0.059335
12857,914337046-914343372-721294,977056934-972208477-43421,0.144909


In [59]:
stores_train_merged[stores_train_merged['store_id'] == '983540538-974187930-44774'][['store_id', 'lv4_desc', 'address']]

Unnamed: 0,store_id,lv4_desc,address
0,983540538-974187930-44774,Hamburger restaurants,BRAGERNES TORG 13


In [61]:
stores_test_merged[stores_test_merged['store_id'] == '983610846-996267245-8088'][['store_id', 'lv4_desc', 'address']]

Unnamed: 0,store_id,lv4_desc,address
11986,983610846-996267245-8088,Hamburger restaurants,GULDLISTEN 35


In [62]:
store_closest_by_store_groups(stores_train_merged, pd.concat([stores_train_merged, stores_test_merged]), store_types)

Unnamed: 0,store_id,distance_to_lv1_desc,distance_to_lv2_desc,distance_to_lv3_desc,distance_to_lv4_desc
0,983540538-974187930-44774,6.266641e-04,0.000627,0.046960,0.046960
1,987074191-973117734-44755,3.596392e-07,0.000315,0.001865,0.001865
2,984890265-981157303-64491,8.339069e-04,0.000834,0.357146,0.357146
3,914057442-992924179-126912,3.480443e-04,0.000348,0.043278,0.043278
4,913018583-913063538-668469,3.908446e-07,0.001208,0.002809,0.002809
...,...,...,...,...,...
12854,913913973-913919653-703808,5.323396e-03,0.005323,0.005323,0.005323
12855,971230584-974872617-42523,1.161426e-02,0.067161,0.067161,0.067161
12856,979617615-979639317-58196,9.500240e-03,0.013467,0.013467,0.013467
12857,991015361-991077588-321523,5.392853e-04,0.000539,0.000539,0.000539


In [38]:
def new_kmeans(stores_train_merged, stores_test_merged, groups):
    train_list = []
    test_list = []
    for group in groups:
        _stores_train_merged = stores_train_merged[stores_train_merged['lv2_desc'] == group]
        _stores_test_merged = stores_test_merged[stores_test_merged['lv2_desc'] == group]
        kmeans = KMeans(n_clusters=1, init='k-means++')
        kmeans.fit(_stores_train_merged[['lat', 'lon']], sample_weight=_stores_train_merged['revenue'])
        _stores_train_merged[f'{group}_cluster_label'] = kmeans.predict(_stores_train_merged[['lat', 'lon']])
        _stores_test_merged[f'{group}_cluster_label'] = kmeans.predict(_stores_test_merged[['lat', 'lon']])
        
        train_list.append(_stores_train_merged[['store_id', f'{group}_cluster_label']])
        test_list.append(_stores_test_merged[['store_id', f'{group}_cluster_label']])
    
    return pd.concat(train_list), pd.concat(test_list)
    


In [39]:
groups = plaace_df.lv2_desc.unique()

stores_train_merged = stores_train_df.merge(plaace_df, on="plaace_hierarchy_id", how="inner")
stores_test_merged = stores_test_df.merge(plaace_df, on="plaace_hierarchy_id", how="inner")
new_kmeans(stores_train_merged, stores_test_merged, groups)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _stores_train_merged[f'{group}_cluster_label'] = kmeans.predict(_stores_train_merged[['lat', 'lon']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _stores_test_merged[f'{group}_cluster_label'] = kmeans.predict(_stores_test_merged[['lat', 'lon']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _st

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by KMeans.

In [None]:
stores_train_df