***
## todo list:
- Fix the removing of waypoints (should be fixed?)
- Create a dataset of waypoints

In [1]:
import gc
from glob import glob
from joblib import Parallel,delayed
import numpy as np
import pandas as pd
from tqdm import tqdm
import yaml
import sys

sys.path.append("../utils/")
from iln_io_f import read_data_file

***

In [2]:
train_files = glob("../data/train/*.parquet")
test_files  = glob("../data/test/*.parquet")

In [3]:
with open("../data/bssid_by_site.yml", "r") as file:
    bssid_by_site = yaml.load(file, Loader=yaml.FullLoader)
    
all_sites = list(bssid_by_site.keys())

***
### Creating dataset-1

In [4]:
!mkdir -p ../data/ds1/train
!mkdir -p ../data/ds1/test

In [5]:
# minimum percentaje of observation to be considered as top seen bssid
MIN_PERC = 0.05

# params to configure near bssids
MAX_TIME_DIFF = 10000
MIN_RSSI = -70

In [6]:
def compute_seq_nbr(ts_serie):
    """
    Function to calculate the sequence number: for each path, the 
    sequence number corresponds to the ordered index of each 
    waypoint observation
    """
    mapping = {v:i for i,v in enumerate(np.sort(ts_serie.unique()))}
    return ts_serie.map(mapping)

def compute_near_bssids(df, all_bssids, max_time_diff=MAX_TIME_DIFF, min_rssi=MIN_RSSI):
    """
    Function to calculate near bssids for each path,timestamp
    """
    near_bssids = (df.query("wifi_time_diff <= @max_time_diff | index == 0")[all_bssids] >= min_rssi).any()
    near_bssids = near_bssids[near_bssids]
    return near_bssids.index.tolist()

In [7]:
# reads test files in parallel
with Parallel(n_jobs=8) as parallel:
    delayed_func = delayed(pd.read_parquet)
    test_dataframes = parallel(delayed_func(f) for f in tqdm(test_files))

100%|██████████| 626/626 [01:19<00:00,  7.86it/s]


In [8]:
# finds the bssids that must be kept, based on test observations
def compute_bssids_to_keep(test_dataframes, site_id):
    all_bssids = bssid_by_site[site_id]
    df = pd.concat(filter(lambda x: x.site.unique()[0] == site_id, test_dataframes), axis=0, ignore_index=True)
    df["wifi_time_diff"] = np.abs(df.wifi_time_delta)
    
    keep_bssids = np.concatenate(df.groupby(["path","timestamp"]).apply(compute_near_bssids, all_bssids).values)
    keep_bssids = list(set(keep_bssids))
    return (site_id, keep_bssids)

with Parallel(n_jobs=8) as parallel:
    delayed_func = delayed(compute_bssids_to_keep)
    keep_bssids_by_site = parallel(delayed_func(test_dataframes, site) for site in tqdm(all_sites))
keep_bssids_by_site = {k:v for k,v in keep_bssids_by_site}

100%|██████████| 24/24 [03:46<00:00,  9.43s/it]


In [16]:
for k,v in keep_bssids_by_site.items():
    print(k, len(v))

5a0546857ecc773753327266 1809
5c3c44b80379370013e0fd2b 511
5d27075f03f801723c2e360f 487
5d27096c03f801723c31e5e0 667
5d27097f03f801723c320d97 591
5d27099f03f801723c32511d 283
5d2709a003f801723c3251bf 618
5d2709b303f801723c327472 893
5d2709bb03f801723c32852c 1215
5d2709c303f801723c3299ee 3273
5d2709d403f801723c32bd39 947
5d2709e003f801723c32d896 624
5da138274db8ce0c98bbd3d2 162
5da1382d4db8ce0c98bbe92e 1316
5da138314db8ce0c98bbf3a0 760
5da138364db8ce0c98bc00f1 271
5da1383b4db8ce0c98bc11ab 638
5da138754db8ce0c98bca82f 550
5da138764db8ce0c98bcaa46 863
5da1389e4db8ce0c98bd0547 212
5da138b74db8ce0c98bd4774 1415
5da958dd46f8266d0737457b 2228
5dbc1d84c1eb61796cf7c010 2733
5dc8cea7659e181adb076a3f 1237


In [20]:
%%time

top_seen_bssids_by_site = dict()

for i,file in enumerate(train_files):
    
    site_id = file.split("/")[-1].split(".")[0]
    print(f"\nProcessing site {i+1}/{len(train_files)}: {site_id}")

    df = pd.read_parquet(file)
    df["site"] = site_id
    df["wifi_time_diff"] = np.abs(df.wifi_time_delta)
    df["seq_nbr"] = df.groupby("path")["timestamp"].apply(compute_seq_nbr)
    
    all_bssids = bssid_by_site[site_id]
    keep_bssids = keep_bssids_by_site[site_id]

    # selects the top seen bssids for the site
    n_wifi_obs = len(df.loc[:,["path","timestamp_wifi"]].drop_duplicates())
    min_obs = int(n_wifi_obs*MIN_PERC)
    count_by_bssid = (df.query("seq_nbr == 0")[all_bssids] > -999).sum(axis=0)
    top_seen_bssids = count_by_bssid[count_by_bssid > min_obs].index.tolist()
    top_seen_bssids = list(set(top_seen_bssids) | set(keep_bssids))
    top_seen_bssids_by_site[site_id] = top_seen_bssids
    bssids_to_remove = list(set(all_bssids) - set(top_seen_bssids))
    print("Min number of observations:", min_obs, "from", n_wifi_obs)
    
    n_top = len(top_seen_bssids)
    n_all = len(all_bssids)
    print(f"Selected {n_top} bssids from {n_all} ({100*n_top/n_all:.2f}%)")
    
    df.drop(bssids_to_remove, axis=1, inplace=True)
    n_waypoints = len(df.loc[:,["path","timestamp"]].drop_duplicates())
    
    # sanity check
    if (df[top_seen_bssids] == -999).all(axis=0).any():
        cols_to_drop = df[top_seen_bssids].columns[(df[top_seen_bssids] == -999).all(axis=0)]
        df.drop(cols_to_drop, axis=1, inplace=True)
        print("columns with no signal removed in dataset:", len(cols_to_drop))
    if (df[top_seen_bssids] == -999).all(axis=1).any():
        idx_to_drop = df.index[(df[top_seen_bssids] == -999).all(axis=1)]
        df.drop(idx_to_drop, inplace=True)
        print("rows with no signal removed in dataset:", len(idx_to_drop))
        
        # verifies if there are no missing waypoints
        #assert n_waypoints == len(df.loc[:,["path","timestamp"]].drop_duplicates()), "Missing waypoints"
        n_waypoints_new = len(df.loc[:,["path","timestamp"]].drop_duplicates())
        if n_waypoints_new != n_waypoints:
            print("waypoints prev:", n_waypoints, "waypoints now:", n_waypoints_new, "diff:", n_waypoints-n_waypoints_new)
        
    df.to_parquet(f"../data/ds1/train/{site_id}.parquet", index=False)
    del df; gc.collect()


Processing site 1/24: 5da1389e4db8ce0c98bd0547
Min number of observations: 315 from 6312
Selected 265 bssids from 1021 (25.95%)

Processing site 2/24: 5d27099f03f801723c32511d
Min number of observations: 212 from 4251
Selected 567 bssids from 925 (61.30%)

Processing site 3/24: 5d2709b303f801723c327472
Min number of observations: 767 from 15358
Selected 991 bssids from 1913 (51.80%)

Processing site 4/24: 5dc8cea7659e181adb076a3f
Min number of observations: 782 from 15655
Selected 1396 bssids from 4864 (28.70%)
rows with no signal removed in dataset: 326

Processing site 5/24: 5d2709c303f801723c3299ee
Min number of observations: 504 from 10083
Selected 3852 bssids from 5831 (66.06%)

Processing site 6/24: 5d2709d403f801723c32bd39
Min number of observations: 501 from 10027
Selected 1016 bssids from 2139 (47.50%)

Processing site 7/24: 5d27097f03f801723c320d97
Min number of observations: 525 from 10507
Selected 855 bssids from 2490 (34.34%)
rows with no signal removed in dataset: 2704
w

In [21]:
# reads test files in parallel
with Parallel(n_jobs=8) as parallel:
    delayed_func = delayed(pd.read_parquet)
    test_dataframes = parallel(delayed_func(f) for f in tqdm(test_files))

100%|██████████| 626/626 [01:22<00:00,  7.57it/s]


In [22]:
for i,site_id in enumerate(all_sites):
    
    print(f"\nProcessing site {i+1}/{len(all_sites)}: {site_id}")
    
    all_bssids = bssid_by_site[site_id]
    top_seen_bssids = top_seen_bssids_by_site[site_id] 
    bssids_to_remove = list(set(all_bssids)-set(top_seen_bssids))
    
    df = pd.concat(filter(lambda x: x.site.unique()[0] == site_id, test_dataframes), axis=0, ignore_index=True)
    df["site"] = site_id
    df["wifi_time_diff"] = np.abs(df.wifi_time_delta)
    df["seq_nbr"] = df.groupby("path")["timestamp"].apply(compute_seq_nbr)
    df.drop(bssids_to_remove, axis=1, inplace=True)
    n_pred_points = df.site_path_timestamp.nunique()
    
    # sanity check
    if (df[top_seen_bssids] == -999).all(axis=0).any():
        cols_to_drop = df[top_seen_bssids].columns[(df[top_seen_bssids] == -999).all(axis=0)]
        # columns are not dropped beacuse can be useful for fitting the training set
        print(f"columns with no signal in dataset: {len(cols_to_drop)} from {len(top_seen_bssids)}")
        
    if (df[top_seen_bssids] == -999).all(axis=1).any():
        idx_to_drop = df.index[(df[top_seen_bssids] == -999).all(axis=1)]
        df.drop(idx_to_drop, inplace=True)
        print(f"rows with no signal removed in dataset: {len(idx_to_drop)} from {len(df)}")
        
        # verifies if there are no missing prediction points
        assert n_pred_points == df.site_path_timestamp.nunique(), "Missing prediction points"
    
    df.to_parquet(f"../data/ds1/test/{site_id}.parquet", index=False)


Processing site 1/24: 5a0546857ecc773753327266
columns with no signal in dataset: 103 from 2122

Processing site 2/24: 5c3c44b80379370013e0fd2b
columns with no signal in dataset: 601 from 1495

Processing site 3/24: 5d27075f03f801723c2e360f
columns with no signal in dataset: 895 from 1886

Processing site 4/24: 5d27096c03f801723c31e5e0

Processing site 5/24: 5d27097f03f801723c320d97
columns with no signal in dataset: 48 from 855

Processing site 6/24: 5d27099f03f801723c32511d
columns with no signal in dataset: 100 from 567

Processing site 7/24: 5d2709a003f801723c3251bf
columns with no signal in dataset: 26 from 735

Processing site 8/24: 5d2709b303f801723c327472
columns with no signal in dataset: 34 from 991

Processing site 9/24: 5d2709bb03f801723c32852c
columns with no signal in dataset: 57 from 1348

Processing site 10/24: 5d2709c303f801723c3299ee
columns with no signal in dataset: 124 from 3852

Processing site 11/24: 5d2709d403f801723c32bd39

Processing site 12/24: 5d2709e003f80

In [23]:
!kaggle datasets version -r zip -p ../data/ds1 -m "Adds bssids near to prediction points in test dataset"

Starting upload for file test.zip
100%|██████████████████████████████████████| 33.3M/33.3M [00:01<00:00, 20.2MB/s]
Upload successful: test.zip (33MB)
Starting upload for file train.zip
100%|████████████████████████████████████████| 164M/164M [00:02<00:00, 58.5MB/s]
Upload successful: train.zip (164MB)
Dataset version is being created. Please check progress at https://www.kaggle.com/mavillan/iln-dataset1


***
## dataset-v2