In [1]:
# !pip install optuna
# import optuna
# optuna.__version__

In [2]:
import polars as pl
import numpy as np
from tqdm import tqdm
import sys
sys.path.append('..')
from tqdm import tqdm
from src import metric_fast
import json
import os

In [3]:
class cfg:
    pass

In [4]:
settings_json = json.load(open('../settings.json', 'r'))
print(settings_json)

for k,v in settings_json.items():
    setattr(cfg, k, v)
    
print(cfg.__dict__)

{'train_events_path': '../data/train_events.csv', 'train_series_path': '../data/train_series.parquet', 'processed_data_path': '../data_processed_models', 'output_dir': '../outputs'}
{'__module__': '__main__', '__dict__': <attribute '__dict__' of 'cfg' objects>, '__weakref__': <attribute '__weakref__' of 'cfg' objects>, '__doc__': None, 'train_events_path': '../data/train_events.csv', 'train_series_path': '../data/train_series.parquet', 'processed_data_path': '../data_processed_models', 'output_dir': '../outputs'}


In [5]:
train_events = pl.read_ipc(os.path.join(settings_json['processed_data_path'], 'train_events.ipc'))



In [6]:
train_events['series_id'].n_unique()

264

In [7]:
df = pl.read_parquet(os.path.join(cfg.output_dir, 'fm-v13-final', 'oof_preds.parquet'))
df_v15 = pl.read_parquet(os.path.join(cfg.output_dir, 'fm-v15-final', 'oof_preds.parquet'))
df_v20 = pl.read_parquet(os.path.join(cfg.output_dir, 'fm-v20-final', 'oof_preds.parquet'))
df_v21 = pl.read_parquet(os.path.join(cfg.output_dir, 'fm-v21-final', 'oof_preds.parquet'))

In [8]:
df_v15 = df[['series_id', 'step']].join(df_v15, on=['series_id', 'step'], how='left')
df_v20 = df[['series_id', 'step']].join(df_v20, on=['series_id', 'step'], how='left')
df_v21 = df[['series_id', 'step']].join(df_v21, on=['series_id', 'step'], how='left')

In [9]:
df = df.with_columns([pl.Series(((df[c]+ df_v15[c] + df_v20[c] + df_v21[c])/4), dtype=pl.Float32).alias(c) for c in ['onset', 'wakeup']])

df = df.rename({'onset': 'onset_oof', 'wakeup': 'wakeup_oof'})

In [10]:
def scale_array(arr, new_min, new_max):
    """
    Scale a numpy array to a new given range [new_min, new_max].

    :param arr: numpy array to be scaled
    :param new_min: new minimum value of the range
    :param new_max: new maximum value of the range
    :return: scaled numpy array
    """
    min_val = np.min(arr)
    max_val = np.max(arr)
    
    # Scale the array
    scaled_arr = ((arr - min_val) / (max_val - min_val)) * (new_max - new_min) + new_min
    
    return scaled_arr


def scale(arr, new_min, new_max, power=1.25):
    
    
    arr = scale_array(arr, new_min, new_max)
    arr = arr**power
    arr = scale_array(arr, new_min, new_max)
    return arr


In [11]:
df = df.with_columns([pl.Series(scale(df[c].to_numpy(), 0, 15, 1)).alias(f'{c}_t') for c in ['wakeup_oof', 'onset_oof']])

In [12]:
print(df['onset_oof'].min(), df['wakeup_oof'].max())

-0.7398790717124939 15.790359497070312


In [13]:
print(df['onset_oof_t'].min(), df['wakeup_oof_t'].max())

0.0 15.0


In [14]:
res_series_ids = []
res_steps = []
res_preds = []

partitions = df.partition_by(by='series_id', maintain_order=True)
for df_sub in tqdm(partitions):
    series_id = df_sub['series_id'][0]
    res_series_ids.append(series_id)
    res_steps.append(df_sub['step'].to_numpy())
    res_preds.append(df_sub[['onset_oof_t', 'wakeup_oof_t']].to_numpy())

100%|██████████| 264/264 [00:00<00:00, 590.31it/s]


In [15]:
def wbf_nikhil(preds_orig, max_thresh=0.1, max_count=600, hyperparams=None):
    k_dist = hyperparams['k_dist']
    log_base = hyperparams['log_base']
    log_scale = hyperparams['log_scale']
    curr_max_power = hyperparams['curr_max_power']
    weight_coeff = hyperparams['weight_coeff']
    convolution_kernel = hyperparams['convolution_kernel']
    section_weight_method = hyperparams['section_weight_method']
    preds_reduction_power = hyperparams['preds_reduction_power']
    overlap_coeff = hyperparams['overlap_coeff']
    min_distance = hyperparams['min_distance']
    
    preds = preds_orig.copy()
    preds = np.convolve(preds, convolution_kernel, mode='same')

    count = 0
    indices = []
    scores = []

    while count < max_count:
        curr_max_idx = np.argmax(preds)
        curr_max = preds[curr_max_idx]

        if curr_max < max_thresh:
            break

        k = int(k_dist - max(min_distance, (curr_max**curr_max_power)))

        start_idx = max(curr_max_idx - k, 0)
        end_idx = min(curr_max_idx + k + 1, len(preds))

        section = preds[start_idx:end_idx]

        # Different weight calculation methods
        distances = np.abs(np.arange(len(section)) - k)
        if section_weight_method == 'logarithmic':
            weights = 1 / (log_base ** (distances / (k * log_scale)))
        elif section_weight_method == 'linear':
            weights = 1 - (distances / k) * weight_coeff
        # Add more methods as needed

        weighted_avg = np.sum(section * weights) / np.sum(weights)

        scores.append(weighted_avg)
        indices.append(curr_max_idx)

        preds[start_idx:end_idx] *= ((1 - weights * overlap_coeff))**preds_reduction_power

        count += 1

    return indices, scores

def get_actual_preds(val_preds, val_series_ids, val_steps, type_, hyperparams):
    times = []
    series_ids = []
    scores = []

    for i in np.arange(len(val_preds)):
        
        vp_i = val_preds[i]
        ser_id = val_series_ids[i]
        
        col_index = 0 if type_ == "onset" else 1

        preds = vp_i[:, col_index] 
        peaks, peak_scores = wbf_nikhil(preds, hyperparams=hyperparams)
    # ...

        times.extend(val_steps[i][peaks])
        scores.extend(list(peak_scores))
        series_ids.extend([ser_id] * len(peaks))

    return np.array(series_ids), np.array(times), np.array(scores)

def post_process_preds(val_events_df, res_series_ids, res_preds, res_steps, hyperparams, get_score=False):

    series_ids_onsets, onsets, scores_onsets = get_actual_preds(res_preds, res_series_ids, res_steps, 'onset', hyperparams)
    series_ids_wakeups, wakeups, scores_wakeups = get_actual_preds(res_preds, res_series_ids, res_steps, 'wakeup', hyperparams)
    
    
    onset_preds = pl.DataFrame().with_columns([pl.Series(series_ids_onsets).alias('series_id'),
                                           pl.Series(onsets).cast(pl.Int64).alias('step'),
                                           pl.lit('onset').alias('event'),
                                           pl.Series(scores_onsets).alias('score')])

    wakeup_preds = pl.DataFrame().with_columns([pl.Series(series_ids_wakeups).alias('series_id'),
                                               pl.Series(wakeups).cast(pl.Int64).cast(pl.Int64).alias('step'),
                                               pl.lit('wakeup').alias('event'),
                                               pl.Series(scores_wakeups).alias('score')])
    
    val_preds_df = pl.concat([onset_preds, wakeup_preds]).sort(by=['series_id', 'step'])
    
    if get_score:
        toleranaces = {'onset': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],'wakeup': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]}
        comp_score = metric_fast.comp_scorer(
        val_events_df,
        val_preds_df,
        tolerances = toleranaces,
        )
        return comp_score
    
    else:
        return val_preds_df

In [16]:
params = {'k_dist': 265,
 'log_base': 14,
 'log_scale': 0.4790984168498803,
 'curr_max_power': 0.7379097334221248,
 'weight_coeff': 1.4190240522794364,
 'score_thresh_coeff': 1.3935907064707698,
 'convolution_kernel': [0.15, 0.7, 0.15],
 'section_weight_method': 'logarithmic',
 'preds_reduction_power': 4.2185511941820355,
 'overlap_coeff': 0.6879294472420623,
 'min_distance': 21}
post_process_preds(train_events, res_series_ids, res_preds, res_steps, params, get_score=True)

0.8230877867171313