In [8]:
import os.path
import random as random

import clean
import numpy as np
import polars as pl
import tqdm
from clean.constants import data_dir

In [3]:
# Test SVD (least-squares fit) with all made kernels, scored on the test split.
splits = clean.get_kernel_splits()
i_train, _, i_test, t_train, _, t_test = (
    splits[key]
    for key in ("i_train", "i_tune", "i_test", "t_train", "t_tune", "t_test")
)

# lstsq returns a tuple; element 0 holds the fitted coefficients.
params = np.linalg.lstsq(i_train, t_train, rcond=None)

predictions = i_test @ params[0]

print("RMSE:", np.sqrt(np.mean(np.square(t_test - predictions))))

RMSE: 18.71248691247243


In [25]:
def base_mse(y_true, y_pred):
    """Return the root of the mean squared difference between targets and predictions.

    Note that, despite the name, the square root is applied, so the value is
    an RMSE rather than a plain MSE.
    """
    residual = y_true - y_pred
    return np.sqrt(np.mean(np.square(residual)))


def directly_weighted_mse(y_true, y_pred, y_true_mean: float):
    """Mean squared error with each residual scaled by its relative target size.

    Every residual ``y_true - y_pred`` is multiplied by ``y_true / y_true_mean``
    before squaring, so samples whose target is larger than ``y_true_mean``
    weigh more and smaller ones weigh less.

    Args:
        y_true: Target values.
        y_pred: Predicted values, broadcastable against ``y_true``.
        y_true_mean: Reference value the targets are divided by. It is a
            real number (the caller passes an array mean), not an integer.

    Returns:
        The mean of the squared, weighted residuals (no square root applied).
    """
    weights = y_true / y_true_mean
    return np.mean(((y_true - y_pred) * weights) ** 2)


def curved_mse(y_true, y_pred):
    """RMSE on signed-square-root-compressed values centred on 180, times 100.

    Both inputs are shifted by 180, passed through ``sign(x) * sqrt(|x|)``
    and then compared with a root-mean-square error that is scaled by 100.

    Args:
        y_true: Target values. Not modified.
        y_pred: Predicted values. Not modified.

    Returns:
        The scaled error as a float.
    """
    # Subtract out of place: an augmented ``-=`` on an ndarray writes into the
    # caller's buffer, which would shift the targets again on every call.
    centered_true = y_true - 180
    centered_pred = y_pred - 180
    curved_true = np.sign(centered_true) * np.sqrt(np.abs(centered_true))
    curved_pred = np.sign(centered_pred) * np.sqrt(np.abs(centered_pred))
    # Same root-mean-square formula as base_mse, written inline.
    return np.sqrt(np.mean((curved_true - curved_pred) ** 2)) * 100

In [46]:
def kernel_tuning(kernels_file: str = "kernels.pq", n_trials: int = 100_000) -> None:
    """Random-search the best kernel subset of every size and log it to CSV.

    For each subset size from 1 up to (but excluding) the number of kernel
    columns, ``n_trials`` random column subsets are drawn. Each subset is fit
    with least squares on the train split and scored on the tune split with
    ``base_mse``, ``directly_weighted_mse`` and ``curved_mse``. The best
    subset per loss is appended to ``kernel_file.csv`` in the working
    directory; the file is truncated at the start of every call.

    Args:
        kernels_file: Parquet file under ``samples/`` whose columns (minus
            ``sensor_voltage``) name the kernels.
        n_trials: Number of random subsets tried per subset size.
    """
    data = pl.scan_parquet(data_dir(f"samples/{kernels_file}"))
    columns = data.columns
    columns.remove("sensor_voltage")
    # NOTE(review): assumes the remaining column order matches the column
    # order of the i_train / i_tune matrices -- confirm against clean.
    col_arr = np.array(columns)

    split = clean.get_kernel_splits()
    i_tr_full, i_tu_full, t_tr, t_tu = (
        split[part] for part in ["i_train", "i_tune", "t_train", "t_tune"]
    )
    target_train_mean = t_tr.mean()

    # Start a fresh result file containing only the header.
    with open("kernel_file.csv", "w") as file:
        file.write("n_kernels,loss_type,loss,kernels\n")

    for i in range(1, len(columns)):
        best_mse, best_wmse, best_cmse = np.inf, np.inf, np.inf
        best_cols_mse, best_cols_wmse, best_cols_cmse = [], [], []

        # Re-seed per size so each size draws a reproducible subset sequence.
        random.seed(42)
        # Boolean column mask with exactly ``i`` selected entries; it is
        # shuffled in place to draw each new subset.
        kernels = [True] * i + [False] * (len(columns) - i)
        for _ in tqdm.trange(n_trials, desc=f"Size {i}"):
            random.shuffle(kernels)
            i_tr, i_tu = i_tr_full[:, kernels], i_tu_full[:, kernels]

            parameters = np.linalg.lstsq(i_tr, t_tr, rcond=None)[0]
            prediction = i_tu @ parameters

            mse = base_mse(t_tu, prediction)
            wmse = directly_weighted_mse(
                t_tu, prediction, y_true_mean=target_train_mean
            )
            # curved_mse shifts its arguments with ``-=``; give it a throwaway
            # copy of the targets so t_tu is not moved by 180 on every trial.
            # It is called last because ``prediction`` may be shifted as well.
            cmse = curved_mse(t_tu.copy(), prediction)

            if wmse < best_wmse:
                best_wmse = wmse
                best_cols_wmse = col_arr[kernels]

            if cmse < best_cmse:
                best_cmse = cmse
                best_cols_cmse = col_arr[kernels]

            if mse < best_mse:
                best_mse = mse
                best_cols_mse = col_arr[kernels]

        # NOTE(review): the column arrays are written with numpy's default
        # repr, which can wrap long arrays over several lines -- check the
        # CSV rows before parsing them.
        with open("kernel_file.csv", "a") as file:
            file.write(f"{i:0>2},MSE ,{best_mse},{best_cols_mse}\n")
            file.write(f"{i:0>2},WMSE,{best_wmse},{best_cols_wmse}\n")
            file.write(f"{i:0>2},CMSE,{best_cmse},{best_cols_cmse}\n")

**TODO:** load `kernel_file.csv`, convert the columns to the proper data types, and save the result as a Parquet (`.pq`) file.

In [14]:
# Load the stored tuning results and show the first 4-kernel row's columns.
kernels_df = pl.read_parquet("../models/kernels.pq")
print(
    "Best kernels:",
    kernels_df.filter(pl.col("n_kernels") == 4).get_column("kernels")[0].to_list(),
)
kernels_df.head()

Best kernels: ['volt_1', 'volt_1*volt_2*distance_to_sensor', 'volt_2*volt_7*distance_to_sensor', 'distance_to_sensor_squared']


n_kernels,loss_type,loss,kernels
u8,cat,f64,list[str]
1,"""MSE""",107.394896,"[""volt_7""]"
1,"""WMSE""",2074400.0,"[""volt_7""]"
1,"""CMSE""",2300.938053,"[""volt_7""]"
2,"""MSE""",102.061866,"[""volt_7"", ""volt_1*volt_2*distance_to_sensor""]"
2,"""WMSE""",1694300.0,"[""volt_7"", ""volt_1*volt_2*distance_to_sensor""]"


In [9]:
# Create the final split only when the cached .npz is missing.
# The kernels come from row index 2 of the 4-kernel rows; given the
# MSE / WMSE / CMSE row order in the preview this is presumably the CMSE
# winner -- TODO confirm.
if not os.path.isfile(data_dir("samples/svd_final_split.npz")):
    clean.split_data(
        "svd_joined.pq",
        kernels_df.filter(pl.col("n_kernels") == 4).get_column("kernels")[2].to_list(),
        "sensor_voltage",
        "svd_final_split.npz",
    )

splits = np.load(data_dir("samples/svd_final_split.npz"))
i_train, _, i_test, t_train, _, t_test = (
    splits[key]
    for key in ("i_train", "i_tune", "i_test", "t_train", "t_tune", "t_test")
)

# lstsq returns a tuple; element 0 holds the fitted coefficients.
params = np.linalg.lstsq(i_train, t_train, rcond=None)

predictions = i_test @ params[0]
# Persist the test-set predictions before reporting the score.
np.save(data_dir("pred/svm.npy"), predictions)
print("RMSE:", np.sqrt(np.mean(np.square(t_test - predictions))))

RMSE: 18.75821330611865
