***
## todo list:
- Consider the bssids observed on test for deciding the set of bssids for the whole site

In [1]:
import gc
from glob import glob
from joblib import Parallel,delayed
import numpy as np
import pandas as pd
from tqdm import tqdm
import yaml
import sys

sys.path.append("../utils/")
from iln_io_f import read_data_file

***

In [2]:
train_files = glob("../data/train/*.parquet")
test_files  = glob("../data/test/*.parquet")

In [3]:
with open("../data/bssid_by_site.yml", "r") as file:
    bssid_by_site = yaml.load(file, Loader=yaml.FullLoader)

***
### Creating dataset-1

In [4]:
!mkdir -p ../data/ds1/train
!mkdir -p ../data/ds1/test

In [5]:
# minimum percentaje of observation to be considered as top seen bssid
MIN_PERC = 0.05

In [6]:
def compute_seq_nbr(ts_serie):
    """
    Function to calculate the sequence number: for each path, the 
    sequence number corresponds to the ordered index of each 
    waypoint observation
    """
    mapping = {v:i for i,v in enumerate(np.sort(ts_serie.unique()))}
    return ts_serie.map(mapping)

In [7]:
%%time

all_sites = list()
top_seen_bssids_by_site = dict()

for i,file in enumerate(train_files):
    
    site_id = file.split("/")[-1].split(".")[0]
    all_sites.append(site_id)
    
    print(f"\nProcessing site {i+1}/{len(train_files)}: {site_id}")

    all_bssids = bssid_by_site[site_id]
    df = pd.read_parquet(file)
    df["site"] = site_id
    df["wifi_time_diff"] = np.abs(df.wifi_time_delta)
    df["seq_nbr"] = df.groupby("path")["timestamp"].apply(compute_seq_nbr)
    
    # selects the top seen bssids for the site
    n_wifi_obs = len(df.loc[:,["path","timestamp_wifi"]].drop_duplicates())
    min_obs = int(n_wifi_obs*MIN_PERC)
    count_by_bssid = (df.query("seq_nbr == 1")[all_bssids] > -999).sum(axis=0)
    top_seen_bssids = count_by_bssid[count_by_bssid > min_obs].index.tolist()
    top_seen_bssids_by_site[site_id] = top_seen_bssids
    bssids_to_remove = list(set(all_bssids)-set(top_seen_bssids))
    print("Min number of observations:", min_obs, "from:", n_wifi_obs)
        
    n_top = len(top_seen_bssids)
    n_all = len(all_bssids)
    print(f"Selected {n_top} bssids from {n_all} ({100*n_top/n_all:.2f}%)")
    
    df.drop(bssids_to_remove, axis=1, inplace=True)
    # sanity check
    if (df[top_seen_bssids] == -999).all(axis=0).any():
        cols_to_drop = df[top_seen_bssids].columns[(df[top_seen_bssids] == -999).all(axis=0)]
        df.drop(cols_to_drop, axis=1, inplace=True)
        print("columns with no signal removed in dataset:", len(cols_to_drop))
    if (df[top_seen_bssids] == -999).all(axis=1).any():
        idx_to_drop = df.index[(df[top_seen_bssids] == -999).all(axis=1)]
        df.drop(idx_to_drop, inplace=True)
        print("rows with no signal removed in dataset:", len(idx_to_drop))
        
    df.to_parquet(f"../data/ds1/train/{site_id}.parquet", index=False)
    del df; gc.collect()


Processing site 1/24: 5da1389e4db8ce0c98bd0547
Min number of observations: 315 from: 6312
Selected 225 bssids from 1021 (22.04%)
rows with no signal removed in dataset: 6

Processing site 2/24: 5d27099f03f801723c32511d
Min number of observations: 212 from: 4251
Selected 554 bssids from 925 (59.89%)

Processing site 3/24: 5d2709b303f801723c327472
Min number of observations: 767 from: 15358
Selected 667 bssids from 1913 (34.87%)
rows with no signal removed in dataset: 786

Processing site 4/24: 5dc8cea7659e181adb076a3f
Min number of observations: 782 from: 15655
Selected 716 bssids from 4864 (14.72%)
rows with no signal removed in dataset: 468

Processing site 5/24: 5d2709c303f801723c3299ee
Min number of observations: 504 from: 10083
Selected 2159 bssids from 5831 (37.03%)

Processing site 6/24: 5d2709d403f801723c32bd39
Min number of observations: 501 from: 10027
Selected 564 bssids from 2139 (26.37%)

Processing site 7/24: 5d27097f03f801723c320d97
Min number of observations: 525 from: 

In [8]:
# reads test files in parallel
with Parallel(n_jobs=8) as parallel:
    delayed_func = delayed(pd.read_parquet)
    test_dataframes = parallel(delayed_func(f) for f in tqdm(test_files))

100%|██████████| 626/626 [01:21<00:00,  7.72it/s]


In [9]:
for i,site_id in enumerate(all_sites):
    
    print(f"\nProcessing site {i+1}/{len(all_sites)}: {site_id}")
    
    all_bssids = bssid_by_site[site_id]
    top_seen_bssids = top_seen_bssids_by_site[site_id] 
    bssids_to_remove = list(set(all_bssids)-set(top_seen_bssids))
    
    df = pd.concat(filter(lambda x: x.site.unique()[0] == site_id, test_dataframes), axis=0, ignore_index=True)
    df.drop(bssids_to_remove, axis=1, inplace=True)
    n_pred_points = df.site_path_timestamp.nunique()
    
    # sanity check
    if (df[top_seen_bssids] == -999).all(axis=0).any():
        cols_to_drop = df[top_seen_bssids].columns[(df[top_seen_bssids] == -999).all(axis=0)]
        # columns are not dropped beacuse can be useful for fitting the training set
        #df.drop(cols_to_drop, axis=1, inplace=True)
        print(f"columns with no signal in dataset: {len(cols_to_drop)} from {len(top_seen_bssids)}")
        
        # updates the top_seen_bssids
        #top_seen_bssids = set(top_seen_bssids) - set(cols_to_drop)
        #top_seen_bssids_by_site[site_id] = top_seen_bssids
        
    if (df[top_seen_bssids] == -999).all(axis=1).any():
        idx_to_drop = df.index[(df[top_seen_bssids] == -999).all(axis=1)]
        df.drop(idx_to_drop, inplace=True)
        print(f"rows with no signal removed in dataset: {len(idx_to_drop)} from {len(df)}")
        
        # verifies if there are no missing prediction points
        assert n_pred_points == df.site_path_timestamp.nunique(), "Missing prediction points"
    
    df["wifi_time_diff"] = np.abs(df.wifi_time_delta)
    df["seq_nbr"] = df.groupby("path")["timestamp"].apply(compute_seq_nbr)
    
    df.to_parquet(f"../data/ds1/test/{site_id}.parquet", index=False)


Processing site 1/24: 5da1389e4db8ce0c98bd0547
columns with no signal in dataset: 6 from 225

Processing site 2/24: 5d27099f03f801723c32511d
columns with no signal in dataset: 100 from 554

Processing site 3/24: 5d2709b303f801723c327472
columns with no signal in dataset: 34 from 667
rows with no signal removed in dataset: 544 from 60310

Processing site 4/24: 5dc8cea7659e181adb076a3f
columns with no signal in dataset: 1 from 716

Processing site 5/24: 5d2709c303f801723c3299ee
columns with no signal in dataset: 124 from 2159

Processing site 6/24: 5d2709d403f801723c32bd39

Processing site 7/24: 5d27097f03f801723c320d97
columns with no signal in dataset: 48 from 776

Processing site 8/24: 5d27075f03f801723c2e360f
columns with no signal in dataset: 895 from 1585

Processing site 9/24: 5d27096c03f801723c31e5e0

Processing site 10/24: 5c3c44b80379370013e0fd2b
columns with no signal in dataset: 601 from 1354

Processing site 11/24: 5da138364db8ce0c98bc00f1
columns with no signal in dataset:

In [None]:
#!kaggle datasets version -r zip -p ../data/ds1 -m ""

***
## dataset-v2