In [1]:
#from google.colab import drive
#from google.colab import files
#drive.mount('/content/drive', force_remount=True)

In [2]:
#!pip install --upgrade kaggle > /dev/null 2>&1
#!pip install optuna  > /dev/null 2>&1

In [3]:
# move kaggle.json into the folder where the API expects to find it
#!mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

***

In [4]:
from glob import glob
import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.sparse
from tqdm import tqdm
from joblib import Parallel,delayed
import yaml

import optuna
#optuna.logging.set_verbosity(optuna.logging.ERROR)

***

In [5]:
def mean_position_error(labels, preds):
    """Return the mean Euclidean distance between labelled and predicted (x, y).

    Rows are compared positionally, so `labels` and `preds` must hold the
    same waypoints in the same order.
    """
    err_x = labels.x.values - preds.x.values
    err_y = labels.y.values - preds.y.values
    distances = np.sqrt(np.square(err_x) + np.square(err_y))
    return distances.mean()

def correct_path(wp_preds, delta_preds, lambda1=0.1, lambda2=0.5, lambda3=2):
    """Refine the waypoint predictions of one path using relative displacements.

    The corrected positions solve the sparse linear system ``Q @ xy = c`` with
    ``Q = A + D.T @ B @ D`` and ``c = A @ xy_preds + D.T @ B @ delta_xy``:

    * ``A = lambda1 * I`` weights the absolute waypoint predictions;
    * ``D`` is the first-difference operator between consecutive waypoints;
    * ``B`` weights each displacement between consecutive waypoints by
      ``(1 - lambda1) / (1 + lambda2 * delta_t * 1e-3) ** lambda3``, where
      ``delta_t`` is the timestamp gap between the two waypoints.

    Parameters
    ----------
    wp_preds : pandas.DataFrame
        Waypoint predictions of a single path with columns ``site``, ``path``,
        ``timestamp``, ``x`` and ``y``. Rows are assumed to be ordered by
        timestamp (the last row is treated as the end of the path).
    delta_preds : pandas.DataFrame
        Relative displacement predictions with columns ``timestamp``, ``dx``
        and ``dy``; they are accumulated and interpolated at the waypoint
        timestamps.
    lambda1, lambda2, lambda3 : float
        Weighting parameters described above.

    Returns
    -------
    pandas.DataFrame
        Columns ``site``, ``path``, ``timestamp`` copied from `wp_preds`, plus
        the corrected ``x`` and ``y``.
    """
    # Import the solver explicitly instead of relying on some other import
    # having loaded the `scipy.sparse.linalg` submodule as a side effect.
    from scipy.sparse.linalg import spsolve

    T_ref = wp_preds['timestamp'].values
    xy_preds = wp_preds[['x', 'y']].values
    delta_preds = delta_preds.loc[:, ["timestamp", "dx", "dy"]].values

    # Anchor the cumulative displacement at t=0. If the waypoints extend past
    # the last delta, append a zero-displacement row at the final waypoint
    # timestamp so the interpolation range covers every entry of T_ref.
    segments = [np.array([[0, 0, 0]]), delta_preds]
    if T_ref[-1] > delta_preds[-1, 0]:
        segments.append(np.array([[T_ref[-1], 0, 0]]))
    delta_preds = np.concatenate(segments)

    # Displacement between consecutive waypoints: interpolate the cumulative
    # (dx, dy) at the waypoint timestamps and difference it.
    T_rel = delta_preds[:, 0]
    cumulative_xy = np.cumsum(delta_preds[:, 1:3], axis=0)
    delta_xy_preds = np.diff(
        scipy.interpolate.interp1d(T_rel, cumulative_xy, axis=0)(T_ref),
        axis=0,
    )

    N = xy_preds.shape[0]
    delta_t = np.diff(T_ref)

    alpha = lambda1 * np.ones(N)
    # The displacement weight shrinks as the gap between waypoints grows.
    beta = (1-lambda1) * np.ones(N-1) / (1 + lambda2 * delta_t * 1e-3)**lambda3

    A = scipy.sparse.spdiags(alpha, [0], N, N)
    B = scipy.sparse.spdiags(beta, [0], N-1, N-1)
    # (N-1) x N first-difference operator: row i is -e_i + e_{i+1}.
    D = scipy.sparse.spdiags(np.stack([-np.ones(N), np.ones(N)]), [0, 1], N-1, N)

    Q = A + (D.T @ B @ D)
    c = (A @ xy_preds) + (D.T @ (B @ delta_xy_preds))
    xy_corr = spsolve(Q, c)

    return pd.DataFrame({
        'site' : wp_preds['site'],
        'path' : wp_preds['path'],
        'timestamp' : wp_preds['timestamp'],
        'x' : xy_corr[:, 0],
        'y' : xy_corr[:, 1],
    })

***

In [6]:
# Download and unzip the Kaggle dataset `mavillan/iln-imu-preds` into the working
# directory; --force re-downloads even if a copy is already present.
# NOTE(review): presumably the source of ./delta_preds.csv read below — confirm.
!kaggle datasets download -d mavillan/iln-imu-preds --force --unzip

Downloading iln-imu-preds.zip to /Users/mavillan/Repositories/kg/indoor-location-navigation/tuning
100%|███████████████████████████████████████▉| 471M/471M [00:47<00:00, 11.5MB/s]
100%|████████████████████████████████████████| 471M/471M [00:47<00:00, 10.3MB/s]


In [7]:
# Download and unzip the Kaggle dataset `mavillan/iln-dnn-dset1` into the working
# directory; --force re-downloads even if a copy is already present.
# NOTE(review): presumably provides the ./dnn-ds1-30f-5lt-pl folder used as input_path — confirm.
!kaggle datasets download -d mavillan/iln-dnn-dset1 --force --unzip

Downloading iln-dnn-dset1.zip to /Users/mavillan/Repositories/kg/indoor-location-navigation/tuning
100%|███████████████████████████████████████▉| 152M/152M [00:16<00:00, 11.5MB/s]
100%|████████████████████████████████████████| 152M/152M [00:16<00:00, 9.76MB/s]


In [9]:
input_path = "./dnn-ds1-30f-5lt-pl"
KEY_COLS = ["site","path","timestamp"]

delta_preds = pd.read_csv("./delta_preds.csv")

# Parse the OOF file once; predictions and labels are two column views of it.
oof_preds = pd.read_csv(f"{input_path}/oof_preds.csv")

# Aggregated out-of-fold predictions, renamed to the generic x/y columns.
predictions = (
    oof_preds
    .loc[:, KEY_COLS + ["oof_x_agg","oof_y_agg"]]
    .rename({"oof_x_agg":"x", "oof_y_agg":"y"}, axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Ground-truth waypoint positions.
labels = (
    oof_preds
    .loc[:, KEY_COLS + ["x","y"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# mean_position_error compares the two frames row by row, and each frame is
# de-duplicated independently above, so fail loudly if they no longer line up.
assert predictions[KEY_COLS].equals(labels[KEY_COLS]), \
    "predictions and labels are not aligned on (site, path, timestamp)"

In [10]:
# Put all three frames in the same (site, path, timestamp) order.
sort_keys = ["site","path","timestamp"]
predictions = predictions.sort_values(by=sort_keys, ignore_index=True)
labels = labels.sort_values(by=sort_keys, ignore_index=True)
delta_preds = delta_preds.sort_values(by=sort_keys, ignore_index=True)

In [11]:
# Distinct sites present in the predictions, in order of first appearance.
target_sites = predictions["site"].unique().tolist()

***
## Optimization of parameters: by site

In [22]:
class MinimizeContainer():
    """Bundle the per-path inputs of one site for tuning the path correction.

    Parameters
    ----------
    preds : list of pandas.DataFrame
        Waypoint predictions, one frame per path.
    deltas : list of pandas.DataFrame
        Relative displacement predictions, one frame per path, in the same
        order as `preds`.
    labels : pandas.DataFrame
        Ground-truth waypoints of the site; its first ``site`` value is stored
        as ``self.site``. The corrected paths are concatenated in input order
        and compared to `labels` row by row, so the orders must match.
    """

    def __init__(self, preds, deltas, labels):
        self.pred_inputs = preds
        self.delta_inputs = deltas
        self.labels = labels
        self.site = labels["site"].unique()[0]

    def objective(self, trial):
        """Optuna objective: mean position error after correcting every path.

        `trial` must provide ``suggest_float(name, low, high)``.
        """
        # suggest_float replaces suggest_uniform, which Optuna has deprecated.
        lambda1 = trial.suggest_float("lambda1", 0, 1)
        lambda2 = trial.suggest_float("lambda2", 0, 1)
        lambda3 = trial.suggest_float("lambda3", 0.5, 2)

        corrected = self.correct(lambda1, lambda2, lambda3)
        return mean_position_error(self.labels, corrected)

    def correct(self, lambda1, lambda2, lambda3):
        """Apply `correct_path` to every path and concatenate the results."""
        inputs = zip(self.pred_inputs, self.delta_inputs)
        corrected = [correct_path(wp, delta, lambda1, lambda2, lambda3)
                     for wp, delta in inputs]
        return pd.concat(corrected, ignore_index=True)
    
def minimize(container, n_trials=200, timeout=14400, n_jobs=2):
    """Run an Optuna study that minimises ``container.objective``.

    Parameters
    ----------
    container : object
        Must expose an ``objective(trial)`` method and a ``site`` attribute.
    n_trials : int
        Maximum number of trials.
    timeout : int or None
        Passed to ``study.optimize`` as its time budget (default 14400).
    n_jobs : int
        Passed to ``study.optimize`` as its number of parallel jobs (default 2).

    Returns
    -------
    tuple
        ``(container.site, study.best_value, study.best_params)``.
    """
    study = optuna.create_study(direction='minimize')
    study.optimize(container.objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs)
    return (container.site, study.best_value, study.best_params)

In [23]:
all_containers = list()

for site in target_sites:
    # Filter each frame by site once, so the per-path queries below scan only
    # that site's rows instead of the full frames.
    site_labels = labels.query("site==@site")
    site_preds = predictions.query("site==@site")
    site_deltas = delta_preds.query("site==@site")

    # One entry per path, in the order the paths appear in the labels.
    paths = site_labels.path.unique()
    pred_inputs = [site_preds.query("path==@path") for path in paths]
    delta_inputs = [site_deltas.query("path==@path") for path in paths]

    all_containers.append(MinimizeContainer(pred_inputs, delta_inputs, site_labels))
    

In [24]:
%%time
with Parallel(n_jobs=-1) as parallel:
    delayed_minimize = delayed(minimize)
    all_results = parallel(delayed_minimize(mc, n_trials=500) for mc in tqdm(all_containers))

100%|██████████| 24/24 [00:00<00:00, 31457.28it/s]


CPU times: user 29.2 s, sys: 2.18 s, total: 31.4 s
Wall time: 56min 14s


In [25]:
# Baseline CV score: mean position error of the raw OOF predictions,
# before any path correction is applied.
mean_position_error(labels, predictions)

6.538272938333347

In [32]:
# Re-apply each site's best parameters (the last element of its result tuple)
# to that site's paths, then stack the sites back together.
all_corrected = [
    container.correct(**result[-1])
    for container, result in zip(all_containers, all_results)
]

corrected = pd.concat(all_corrected, ignore_index=True)

In [34]:
# CV score after correcting each site's paths with its own tuned parameters.
mean_position_error(labels, corrected)

5.745694250291014

In [35]:
# Map each site to the best error and parameters found by its study.
all_results_dict = {site:{"error":error,"params":params} for site,error,params in all_results}
# NOTE(review): this is a Colab Google Drive path, but the drive-mount cell at
# the top of the notebook is commented out — confirm the directory exists
# where this notebook is run.
output_path = "/content/drive/MyDrive/kaggle/indoor-location-navigation/output"

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{output_path}/cm_params_by_site.yml', 'w') as params_file:
    yaml.dump(all_results_dict, params_file, default_flow_style=False)

***
## Optimization of parameters: jointly

In [12]:
# Baseline CV score: mean position error of the raw OOF predictions,
# before any path correction is applied.
mean_position_error(labels, predictions)

6.218607637611398

In [13]:
# One (site, path) pair per trajectory, in the order they appear in `labels`.
path_keys = labels.loc[:,["site","path"]].drop_duplicates()

# Parallel lists: the i-th prediction frame and the i-th delta frame belong to
# the same path.
pred_inputs, delta_inputs = [], []
for row in path_keys.itertuples(index=False):
    pred_inputs.append(predictions.query("site==@row.site & path==@row.path"))
    delta_inputs.append(delta_preds.query("site==@row.site & path==@row.path"))

In [14]:
def objective(trial):
    """Optuna objective for the joint search over all paths.

    Samples one (lambda1, lambda2, lambda3) triple, corrects every path in the
    module-level `pred_inputs` / `delta_inputs` with it, and returns the mean
    position error against the module-level `labels`.
    """
    # suggest_float replaces suggest_uniform, which Optuna has deprecated.
    lambda1 = trial.suggest_float("lambda1", 0, 1)
    lambda2 = trial.suggest_float("lambda2", 0, 1)
    lambda3 = trial.suggest_float("lambda3", 0.5, 2)

    inputs = zip(pred_inputs, delta_inputs)
    # Paths are independent of each other, so they are corrected in parallel.
    with Parallel(n_jobs=6) as parallel:
        delayed_correct = delayed(correct_path)
        corrected = parallel(
            delayed_correct(wp, delta, lambda1, lambda2, lambda3)
            for wp,delta in inputs
        )
    corrected = pd.concat(corrected, ignore_index=True)
    return mean_position_error(labels, corrected)

In [15]:
# Joint study: a single parameter set is tuned against all paths at once.
# The search is capped at 500 trials and by timeout=14400
# (presumably seconds, i.e. 4 hours — confirm against the Optuna docs).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500, timeout=14400)

[32m[I 2021-04-25 10:33:33,319][0m A new study created in memory with name: no-name-c872e74e-93e7-4860-b890-ccac97f50825[0m
[32m[I 2021-04-25 10:33:48,736][0m Trial 0 finished with value: 5.618594760185517 and parameters: {'lambda1': 0.24210280840026221, 'lambda2': 0.6226046078675583, 'lambda3': 0.5858799147992659}. Best is trial 0 with value: 5.618594760185517.[0m
[32m[I 2021-04-25 10:34:05,720][0m Trial 1 finished with value: 5.954532323403229 and parameters: {'lambda1': 0.6547211597902424, 'lambda2': 0.29021419348701727, 'lambda3': 0.8158254089699479}. Best is trial 0 with value: 5.618594760185517.[0m
[32m[I 2021-04-25 10:34:20,820][0m Trial 2 finished with value: 5.8636095548714895 and parameters: {'lambda1': 0.7136493458848807, 'lambda2': 0.014745407902319485, 'lambda3': 1.8896705900392965}. Best is trial 0 with value: 5.618594760185517.[0m
[32m[I 2021-04-25 10:34:35,090][0m Trial 3 finished with value: 6.01505574355185 and parameters: {'lambda1': 0.5734744913999641,

In [16]:
# Best parameter set found by the joint study.
study.best_params

{'lambda1': 0.05048006425296692,
 'lambda2': 0.6030121978587605,
 'lambda3': 0.9263460288833916}

In [17]:
# Lowest mean position error reached by the joint study.
study.best_value

5.492992240101407

In [20]:
# Build the corrected OOF predictions by applying the jointly tuned parameters
# to every path (they are written to disk in a later cell).
with Parallel(n_jobs=6) as pool:
    corrected_paths = pool(
        delayed(correct_path)(wp, delta, **study.best_params)
        for wp, delta in zip(pred_inputs, delta_inputs)
    )
corrected = pd.concat(corrected_paths, ignore_index=True)

In [22]:
# CV score after correcting every path with the jointly tuned parameters.
mean_position_error(labels, corrected)

5.492992240101407

In [23]:
# Save the corrected OOF predictions next to the original OOF file.
# NOTE(review): no `index=` argument is passed, so the DataFrame index is
# presumably written as an extra unnamed first column — confirm that
# downstream readers expect it.
corrected.to_csv(f"{input_path}/oof_preds_stg.csv")

***
Previous results (outputs from an earlier run, kept for comparison)

In [31]:
# NOTE(review): the execution count is out of sequence with the cells above, so
# this output presumably comes from an earlier study run — kept for comparison.
study.best_params

{'lambda1': 0.041182343848726044,
 'lambda2': 0.559843900391014,
 'lambda3': 1.0305677022844741}

In [32]:
# NOTE(review): the execution count is out of sequence with the cells above, so
# this output presumably comes from an earlier study run — kept for comparison.
study.best_value

5.78361572908014

***