In [1]:
#from google.colab import drive
#from google.colab import files
#drive.mount('/content/drive', force_remount=True)

In [2]:
#!pip install --upgrade kaggle > /dev/null 2>&1
#!pip install optuna  > /dev/null 2>&1

In [3]:
# move kaggle.json into the folder where the API expects to find it
#!mkdir -p ~/.kaggle/ && cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

***

In [4]:
from glob import glob
import numpy as np
import pandas as pd
import scipy.interpolate
import scipy.sparse
from tqdm import tqdm
from joblib import Parallel,delayed
import yaml

import optuna
#optuna.logging.set_verbosity(optuna.logging.ERROR)

***

In [5]:
def mean_position_error(labels, preds):
    x_label = labels.x.values
    y_label = labels.y.values
    x_pred = preds.x.values
    y_pred = preds.y.values
    return np.mean(np.sqrt((x_label-x_pred)**2 + (y_label-y_pred)**2))

def correct_path(wp_preds, delta_preds, lambda1=0.1, lambda2=0.5, lambda3=2):

    T_ref  = wp_preds['timestamp'].values
    xy_preds = wp_preds[['x', 'y']].values
    delta_preds = delta_preds.loc[:,["timestamp","dx","dy"]].values

    if T_ref[-1] > delta_preds[-1, 0]:
        delta_preds = [np.array([[0, 0, 0]]), delta_preds, np.array([[T_ref[-1], 0, 0]])]
    else:
        delta_preds = [np.array([[0, 0, 0]]), delta_preds]
    delta_preds = np.concatenate(delta_preds)
    
    T_rel = delta_preds[:, 0]
    delta_xy_preds = np.diff(scipy.interpolate.interp1d(T_rel, np.cumsum(delta_preds[:, 1:3], axis=0), axis=0)(T_ref), axis=0)

    N = xy_preds.shape[0]
    delta_t = np.diff(T_ref)
        
    alpha = lambda1 * np.ones(N)
    beta  = (1-lambda1) * np.ones(N-1) / (1 + lambda2 * delta_t * 1e-3)**lambda3
    
    A = scipy.sparse.spdiags(alpha, [0], N, N)
    B = scipy.sparse.spdiags( beta, [0], N-1, N-1)
    D = scipy.sparse.spdiags(np.stack([-np.ones(N), np.ones(N)]), [0, 1], N-1, N)

    Q = A + (D.T @ B @ D)
    c = (A @ xy_preds) + (D.T @ (B @ delta_xy_preds))
    xy_corr = scipy.sparse.linalg.spsolve(Q, c)

    return pd.DataFrame({
        'site' : wp_preds['site'],
        'path' : wp_preds['path'],
        'timestamp' : wp_preds['timestamp'],
        'x' : xy_corr[:, 0],
        'y' : xy_corr[:, 1],
    })

***

In [None]:
!kaggle datasets download -d mavillan/iln-imu-preds --force --unzip

In [None]:
!kaggle datasets download -d mavillan/iln-dnn-dset1 --force --unzip

In [6]:
input_path = "./dnn-ds1-30f-5lt-pl"

delta_preds = pd.read_csv("./delta_preds.csv")

predictions = (
    pd.read_csv(f"{input_path}/oof_preds.csv")
    .loc[:,["site","path","timestamp","oof_x_agg","oof_y_agg"]]
    .rename({"oof_x_agg":"x", "oof_y_agg":"y"}, axis=1)
    .drop_duplicates()
    .reset_index(drop=True)
)

labels = (
    pd.read_csv(f"{input_path}/oof_preds.csv")
    .loc[:,["site","path","timestamp","x","y"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [7]:
predictions = predictions.sort_values(["site","path","timestamp"], ignore_index=True)
labels = labels.sort_values(["site","path","timestamp"], ignore_index=True)
delta_preds = delta_preds.sort_values(["site","path","timestamp"], ignore_index=True)

In [8]:
target_sites = predictions.site.unique().tolist()

In [9]:
mean_position_error(labels, predictions)

6.190770283659762

***
## optimization of parameters: by site

In [None]:
class MinimizeContainer():
    
    def __init__(self, preds, deltas, labels):
        self.pred_inputs = preds
        self.delta_inputs = deltas
        self.labels = labels
        self.site = labels["site"].unique()[0]
        
    def objective(self, trial):
        lambda1 = trial.suggest_uniform("lambda1", 0, 1)
        lambda2 = trial.suggest_uniform("lambda2", 0, 1)
        lambda3 = trial.suggest_uniform("lambda3", 0.5, 2)

        inputs = zip(self.pred_inputs,self.delta_inputs)
        corrected = [correct_path(wp, delta, lambda1, lambda2, lambda3) 
                     for wp,delta in inputs]
        corrected = pd.concat(corrected, ignore_index=True)
        return mean_position_error(self.labels, corrected)

    def correct(self, lambda1, lambda2, lambda3):
        inputs = zip(self.pred_inputs,self.delta_inputs)
        corrected = [correct_path(wp, delta, lambda1, lambda2, lambda3) 
                     for wp,delta in inputs]
        corrected = pd.concat(corrected, ignore_index=True)
        return corrected      
    
def minimize(container, n_trials=200):
    study = optuna.create_study(direction='minimize')
    study.optimize(container.objective, n_trials=n_trials, timeout=14400, n_jobs=2)
    return (container.site, study.best_value, study.best_params)

In [None]:
all_containers = list()

for i,site in enumerate(target_sites):
    paths = labels.query("site==@site").path.unique()
    pred_inputs = [predictions.query("site==@site & path==@path") for path in paths]
    delta_inputs = [delta_preds.query("site==@site & path==@path") for path in paths]
    mc = MinimizeContainer(pred_inputs, delta_inputs, labels.query("site==@site"))
    all_containers.append(mc)
    

In [None]:
%%time
with Parallel(n_jobs=-1) as parallel:
    delayed_minimize = delayed(minimize)
    all_results = parallel(delayed_minimize(mc, n_trials=500) for mc in tqdm(all_containers))

In [None]:
# calculate original cv
mean_position_error(labels, predictions)

In [None]:
all_corrected = list()
for container,result in zip(all_containers,all_results):
    best_params = result[-1]
    corrected = container.correct(**best_params)
    all_corrected.append(corrected)

corrected = pd.concat(all_corrected, ignore_index=True)

In [None]:
mean_position_error(labels, corrected)

In [None]:
all_results_dict = {site:{"error":error,"params":params} for site,error,params in all_results}
output_path = "/content/drive/MyDrive/kaggle/indoor-location-navigation/output"

with open(f'{output_path}/cm_params_by_site.yml', 'w') as file:
    yaml.dump(all_results_dict, file, default_flow_style=False)
    file.close()

***
## optimization of parameters: jointly

In [10]:
# calculate original cv
mean_position_error(labels, predictions)

6.190770283659762

In [11]:
pred_inputs = list()
delta_inputs = list()
for i,row in labels.loc[:,["site","path"]].drop_duplicates().iterrows():
    pred_inputs.append(predictions.query("site==@row.site & path==@row.path"))
    delta_inputs.append(delta_preds.query("site==@row.site & path==@row.path"))

In [12]:
def objective(trial):
    lambda1 = trial.suggest_uniform("lambda1", 0, 1)
    lambda2 = trial.suggest_uniform("lambda2", 0, 1)
    lambda3 = trial.suggest_uniform("lambda3", 0.5, 2)
    
    inputs = zip(pred_inputs, delta_inputs)
    with Parallel(n_jobs=6) as parallel:
        delayed_correct = delayed(correct_path)
        corrected = parallel(
            delayed_correct(wp, delta, lambda1, lambda2, lambda3)
            for wp,delta in inputs
        )
    corrected = pd.concat(corrected, ignore_index=True)
    return mean_position_error(labels, corrected)

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=300, timeout=14400)

[32m[I 2021-04-26 14:22:51,244][0m A new study created in memory with name: no-name-f1adfddd-b992-42b9-83cc-b2c905cefc36[0m
[32m[I 2021-04-26 14:23:14,262][0m Trial 0 finished with value: 5.480252110711861 and parameters: {'lambda1': 0.09795428343832868, 'lambda2': 0.16403757920945872, 'lambda3': 1.485955858532987}. Best is trial 0 with value: 5.480252110711861.[0m
[32m[I 2021-04-26 14:23:31,100][0m Trial 1 finished with value: 5.568176812296558 and parameters: {'lambda1': 0.19366833525953442, 'lambda2': 0.7376968748702268, 'lambda3': 0.6270574095000152}. Best is trial 0 with value: 5.480252110711861.[0m
[32m[I 2021-04-26 14:23:47,700][0m Trial 2 finished with value: 5.9446620339314595 and parameters: {'lambda1': 0.6631606229107168, 'lambda2': 0.24304868510068234, 'lambda3': 0.9658811392408571}. Best is trial 0 with value: 5.480252110711861.[0m
[32m[I 2021-04-26 14:24:05,105][0m Trial 3 finished with value: 5.92535645446788 and parameters: {'lambda1': 0.6188415012307914, 

In [14]:
study.best_params

{'lambda1': 0.049159220117571435,
 'lambda2': 0.6612600699192859,
 'lambda3': 0.8670388019897984}

In [15]:
study.best_value

5.466886047657341

In [16]:
# save the corrected oof predictions
inputs = zip(pred_inputs, delta_inputs)
with Parallel(n_jobs=6) as parallel:
    delayed_correct = delayed(correct_path)
    corrected = parallel(
        delayed_correct(wp, delta, **study.best_params)
        for wp,delta in inputs
    )
corrected = pd.concat(corrected, ignore_index=True)

In [17]:
mean_position_error(labels, corrected)

5.466886047657341

In [18]:
corrected.to_csv(f"{input_path}/oof_preds_stg.csv")

***