In [33]:
from prj.config import DATA_DIR
from prj.data.data_loader import DataConfig, DataLoader
import polars as pl
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from prj.data.data_loader import PARTITIONS_DATE_INFO
import pandas as pd
import lightgbm as lgb
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import gc
from tqdm import tqdm
import numpy as np
from prj.utils import online_iterator, online_iterator_daily
import time
from sklearn.model_selection import train_test_split
SEED = 42
import lleaves

# Base

In [46]:
use_weighted_loss = False
base_params = {
    'verbose': 50,
    'iterations': 717,
    'learning_rate': 0.019678599283449602,
    'depth': 8,
    'has_time': False,
    'bootstrap_type': 'Bernoulli',
    'reg_lambda': 0.00924440304487912,
    'min_data_in_leaf': 72,
    'subsample': 0.63603957073985,
    'task_type': 'GPU',
}

model = CatBoostRegressor()
model_file_path = DATA_DIR / 'models' / 'catboost' / 'catboost_model.cbm'
model.load_model(model_file_path)

model

<catboost.core.CatBoostRegressor at 0x72f9c7947400>

In [47]:
model.get_params()

{'has_time': False,
 'bootstrap_type': 'Bernoulli',
 'verbose': 50,
 'iterations': 717,
 'l2_leaf_reg': 0.009244403045,
 'loss_function': 'RMSE',
 'subsample': 0.6360395707,
 'task_type': 'GPU',
 'depth': 8,
 'min_data_in_leaf': 72,
 'learning_rate': 0.01967859928}

In [35]:
data_args = {'include_time_id': True, 'include_intrastock_norm_temporal': True}
config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=config)
# start_dt, end_dt = 1530, 1698
start_dt, end_dt = 1530, 1600
# start_dt, end_dt = 1600, 1635
test_ds = loader.load(start_dt, end_dt)
X_test, y_test, w_test, _ = loader._build_splits(test_ds)

100%|██████████| 78/78 [00:04<00:00, 17.33it/s]


Skipping 1594-1600
Skipping 1595-1600
Skipping 1596-1600
Skipping 1597-1600
Skipping 1598-1600
Skipping 1599-1600
Skipping 1600-1600


In [36]:
y_hat = model.predict(X_test).clip(-5, 5).flatten()
offline_score = r2_score(y_test, y_hat, sample_weight=w_test)
offline_score

0.00936511603357526

In [37]:
del X_test
gc.collect()

0

In [38]:
features = loader.features
print(len(features))

134


In [39]:
MEAN_FEATURES = [0, 2, 3, 5, 6, 7, 18, 19, 34, 35, 36, 37, 38, 41, 43, 44, 48, 53, 55, 59, 62, 65, 68, 73, 74, 75, 76, 77, 78]
STD_FEATURES = [39, 42, 46, 53, 57, 66]
SKEW_FEATURES = [5, 40, 41, 42, 43, 44]
ZSCORE_FEATURES = [1, 36, 40, 45, 48, 49, 51, 52, 53, 54, 55, 59, 60]

def include_intrastock_norm(df: pl.LazyFrame, responder) -> pl.LazyFrame:
    df = df.with_columns(
        pl.col([f'feature_{j:02d}' for j in set(MEAN_FEATURES + ZSCORE_FEATURES)]).mean().over(['date_id', 'time_id', f'cluster_label_{responder}']).name.suffix(f'_{responder}_mean'),
        pl.col([f'feature_{j:02d}' for j in set(STD_FEATURES + ZSCORE_FEATURES)]).std().over(['date_id', 'time_id', f'cluster_label_{responder}']).name.suffix(f'_{responder}_std'),
        pl.col([f'feature_{j:02d}' for j in SKEW_FEATURES]).skew().over(['date_id', 'time_id', f'cluster_label_{responder}']).name.suffix(f'_{responder}_skew'),
    ).with_columns(
        pl.col(f'feature_{j:02d}').sub(f'feature_{j:02d}_{responder}_mean').truediv(f'feature_{j:02d}_{responder}_std').name.suffix(f'_{responder}_zscore') for j in ZSCORE_FEATURES
    ).drop([f'feature_{j:02d}_{responder}_std' for j in ZSCORE_FEATURES if j not in STD_FEATURES] + \
        [f'feature_{j:02d}_{responder}_mean' for j in ZSCORE_FEATURES if j not in MEAN_FEATURES])
    return df

# Inference

In [40]:
config = DataConfig()
loader = DataLoader(data_dir=DATA_DIR, config=config)
test_ds = loader.load(start_dt-1, end_dt).collect()

In [41]:
from catboost import Pool
MAX_ITERATIONS = 1000
FINE_TUNING_TIME_LIMIT = 50

class CatboostTimeLimitCallback:
    def __init__(self, time_limit):
        self.time_limit = time_limit
        self.start_time = None

    def after_iteration(self, info):
        if self.start_time is None:
            self.start_time = time.time()

        elapsed_time = time.time() - self.start_time
        if elapsed_time > self.time_limit:
            print(f"Stopping training after {elapsed_time:.2f} seconds (time limit reached). Iteration {info.iteration}")
            return False
        
        return True
    
def build_splits(df: pl.DataFrame, features: list):
    X = df.select(features).to_numpy()
    y = df['responder_6'].to_numpy().flatten()
    w = df['weight'].to_numpy().flatten()
    return X, y, w

def train_with_es(init_model: CatBoostRegressor, params: dict, train_df: pl.DataFrame, val_df: pl.DataFrame, use_weighted_loss, es_patience, task_type = 'CPU', max_iters = 1000):
    start_time = time.time()
    _params = params.copy()
    _params.pop('iterations', None)
    _params.pop('task_type', None)
        
    X_train, y_train, w_train = build_splits(train_df, features)
    train_pool = Pool(data=X_train, label=y_train, weight=w_train if use_weighted_loss else None)
    del X_train, y_train, w_train
    gc.collect()
    
    is_early_stopping = val_df is not None and val_df.shape[0] > 0
    
    if is_early_stopping:
        X_val, y_val, w_val = build_splits(val_df, features)
        val_pool = Pool(data=X_val, label=y_val, weight=w_val if use_weighted_loss else None)
        del X_val, y_val, w_val
        gc.collect()

    
    print(f"Learning rate: {_params['learning_rate']:e}")
    model = CatBoostRegressor(
        iterations=max_iters,
        task_type=task_type,
        **_params
    )
        
    model.fit(
        train_pool,
        init_model=init_model,
        eval_set=val_pool if is_early_stopping else None,
        early_stopping_rounds=es_patience if is_early_stopping else None,
        callbacks=[CatboostTimeLimitCallback(50)] if task_type != 'GPU' else None,
    )
    print(f'Train completed in {((time.time() - start_time)/60):.3f} minutes')
    
    
    return model

In [42]:
responder_replay_buffer_config = DataConfig()
responder_replay_buffer_loader = DataLoader(data_dir=DATA_DIR, config=responder_replay_buffer_config)
base_responder_replay_buffer = responder_replay_buffer_loader.load(start_dt-1-loader.window_period, start_dt-2)\
    .select('date_id', 'time_id', 'symbol_id', 'responder_6')\
    .with_columns(pl.col('date_id').sub(start_dt))\
    .collect()
    

TREE_OLD_DATASET_MAX_HISTORY = 30
AUX_COLS = ['date_id', 'time_id', 'symbol_id', 'weight', 'responder_6']

data_config = DataConfig(**data_args)
loader = DataLoader(data_dir=DATA_DIR, config=data_config)
base_old_dataset = loader.load(start_dt-TREE_OLD_DATASET_MAX_HISTORY, start_dt-1)\
    .select(AUX_COLS + features) \
    .with_columns(pl.col('date_id').sub(start_dt)) \
    .collect()
    


100%|██████████| 37/37 [00:01<00:00, 18.88it/s]


Skipping 1523-1529
Skipping 1524-1529
Skipping 1525-1529
Skipping 1526-1529
Skipping 1527-1529
Skipping 1528-1529
Skipping 1529-1529


In [43]:
def evaluate_model(model, X, y, w):
    y_hat = model.predict(X).clip(-5, 5).flatten()
    return r2_score(y, y_hat, sample_weight=w)

In [45]:
from catboost import sum_models
from prj.utils import timeout

verbose=True

class TrainerConfig:
    TREE_FINE_TUNING = True
    
    TREE_REFIT_EVERY = 1000
    TREE_TRAIN_EVERY = 28
    TREE_OLD_DATA_FRACTION = 0.
    TREE_ES_RATIO = 0.15
    TREE_ES_PATIENCE = 50
    TREE_LR_DECAY = 0.8
    TREE_USE_WEIGHTED_LOSS = True
    TREE_MAX_FINE_TUNING_TIME_LIMIT = time.time() + 60 * 60 * 8
    MAX_HISTORY_DAYS = 30
    USE_INTRA_STOCK_NORM = True
    USE_TIME_NORM_ID = True
    
    DEFAULT_MAX_TIME_ID = 967
    DEFAULT_CLUSTER = -1
    INTRASTOCK_WINDOW_PERIOD = 7
    
    INITIAL_ONLINE_LR = 1e-5
    
    
class ModelTrainer:
    def __init__(self, model, params, old_dataset: pl.DataFrame, responder_replay_buffer: pl.DataFrame):
        self.model = model
        self.params = params.copy()
        
        self.config = TrainerConfig()
        
        
        self.params['learning_rate'] = self.config.INITIAL_ONLINE_LR


        self.old_dataset = old_dataset
        self.new_dataset: pl.DataFrame | None = None
        self.current_day_data = None
        self.responder_replay_buffer = responder_replay_buffer
        self.stock_cluster_mapping = {}
        self.stock_max_time_id = {}
        self.date_idx = 0
        self.corr_responder = 'responder_6'
        
        self.catboost_models = []


    def fine_tune_model(self, date_id: int):
        should_retrain =  (self.date_idx + 1) % self.config.TREE_TRAIN_EVERY == 0
        should_refit = (self.date_idx + 1) % self.config.TREE_REFIT_EVERY == 0                   
        if should_retrain or should_refit:
            print(f"Fine tuning model on date {date_id}")
            if should_retrain:
                train_val_days = self.new_dataset['date_id'].unique().sort().to_numpy()     
                len_train_val_days = len(train_val_days)     
                
                
                random_split_type = 'None'
                
                if random_split_type in ['random_days', 'holdout_first', 'holdout_last']:
                    if random_split_type == 'random_days':
                        train_days, val_days = train_test_split(train_val_days, test_size=self.config.TREE_ES_RATIO, random_state=SEED)
                    elif random_split_type == 'holdout_first':
                        split_point = max(int(len_train_val_days * self.config.TREE_ES_RATIO), 1)
                        val_days = train_val_days[:split_point]
                        train_days = train_val_days[split_point:]
                    elif random_split_type == 'holdout_last':
                        split_point = max(int(len_train_val_days * (self.config.TREE_ES_RATIO)), 1)
                        val_days = train_val_days[-split_point:]
                        train_days = train_val_days[:-split_point]
                        
                    val_df = self.new_dataset.filter(pl.col('date_id').is_in(val_days))
                    train_df = self.new_dataset.filter(pl.col('date_id').is_in(train_days))
                elif random_split_type == 'random_samples':
                    np.random.seed(SEED)
                    shuffled_indices = np.random.permutation(len(self.new_dataset))
                    split_index = int(len(self.new_dataset) * (1 - self.config.TREE_ES_RATIO))
                    train_indices = shuffled_indices[:split_index]
                    val_indices = shuffled_indices[split_index:]
                    
                    val_df = self.new_dataset[val_indices]
                    train_df = self.new_dataset[train_indices]                
                elif random_split_type == 'None':
                    train_df = self.new_dataset
                    val_df = self.new_dataset.clear()
                else:
                    raise ValueError(f"Unknown split type: {random_split_type}")
            else:
                train_df = self.new_dataset
                val_df = self.new_dataset.clear()
            

            if verbose:
                old_days = self.old_dataset['date_id'].unique().sort().to_list()
                train_days = train_df['date_id'].unique().sort().to_list()
                val_days = val_df['date_id'].unique().sort().to_list()
                print('Old days: ', old_days)
                print('Train days: ', train_days)
                print('Val days: ', val_days)
            
            
            
            
            if self.config.TREE_OLD_DATA_FRACTION > 0:
                unique_train_val_symbols = self.new_dataset['symbol_id'].unique().to_list()
                filtered_old_dataset = self.old_dataset.filter(pl.col('symbol_id').is_in(unique_train_val_symbols))
                train_df_len = train_df.shape[0]
                old_dataset_len = filtered_old_dataset.shape[0]
                old_data_len = min(int(self.config.TREE_OLD_DATA_FRACTION * train_df_len / (1 - self.config.TREE_OLD_DATA_FRACTION)), old_dataset_len)
                if verbose:
                    print(f"Adding {old_data_len} old data samples to training set, {self.config.TREE_OLD_DATA_FRACTION * 100:.2f}% of the current training set")
                
                train_df = filtered_old_dataset\
                    .sample(n=old_data_len)\
                    .vstack(train_df)
                    
                del filtered_old_dataset
                gc.collect()
                
            if verbose:
                print('Shapes: ', train_df.shape, val_df.shape)
            
            if should_retrain:
                self.params['learning_rate'] = max(self.params['learning_rate'] * self.config.TREE_LR_DECAY, 1e-6)
                self.model = train_with_es(
                    init_model= self.model, 
                    train_df=train_df,
                    val_df=val_df,
                    use_weighted_loss=self.config.TREE_USE_WEIGHTED_LOSS,
                    es_patience=self.config.TREE_ES_PATIENCE,
                    params=self.params,
                    max_iters = 100,
                    task_type='CPU',
                )
                
                # model = train_with_es(
                #     init_model= None,
                #     train_df=train_df,
                #     val_df=val_df,
                #     use_weighted_loss=self.config.TREE_USE_WEIGHTED_LOSS,
                #     es_patience=self.config.TREE_ES_PATIENCE,
                #     params=self.params,
                #     task_type='GPU',
                # )
                # self.catboost_models.append(model)
                
                # if val_df.shape[0] > 0:
                #     X_val, y_val, w_val = build_splits(val_df, features)
                #     catboost_models_scores = [
                #         evaluate_model(m, X_val, y_val, w_val) for m in self.catboost_models
                #     ]
                #     del X_val, y_val, w_val
                #     gc.collect()
                #     total_score = sum(catboost_models_scores)
                #     print('Catboost models scores: ', catboost_models_scores)
                #     weights = [score / total_score for score in catboost_models_scores]
                # else:
                #     weights = [1 / len(self.catboost_models)] * len(self.catboost_models)
                
                # print('Weights: ', weights)
                # self.model = sum_models(self.catboost_models, weights=weights)
                                
                
                
                 
                            
            new_max_old_dataset_date = self.new_dataset['date_id'].max()
            self.old_dataset = pl.concat([
                self.old_dataset,
                self.new_dataset
            ]).filter(
                pl.col('date_id').is_between(new_max_old_dataset_date-TREE_OLD_DATASET_MAX_HISTORY, new_max_old_dataset_date)
            )
            self.new_dataset = None
            
        

    def preprocess_time_norm(self, test: pl.DataFrame, lags: pl.DataFrame | None):
        if lags is not None:
            stock_max_time_id_map = lags.group_by('symbol_id').agg(pl.col('time_id').max())
            self.stock_max_time_id = dict(zip(stock_max_time_id_map['symbol_id'], stock_max_time_id_map['time_id']))
            self.default_max_time_id = max(list(self.stock_max_time_id.values()))

        return test.with_columns(
            pl.col('symbol_id')\
                .replace_strict(
                    self.stock_max_time_id, default=self.config.DEFAULT_MAX_TIME_ID, return_dtype=pl.Int16
                ).alias('max_prev_stock_time_id')
            ).with_columns(
                pl.col('time_id').truediv('max_prev_stock_time_id').alias('time_id_norm')
            ).drop('max_prev_stock_time_id')


    def preprocess_intrastock_norm(self, test: pl.DataFrame, lags: pl.DataFrame | None, corr_responder='responder_6'):
        if lags is not None:
            _lags = lags.select(
                pl.col('date_id').sub(1), 'time_id', 'symbol_id',
                pl.col(f'{corr_responder}_lag_1').alias(corr_responder)
            )
            self.responder_replay_buffer = self.responder_replay_buffer.vstack(_lags).filter(
                pl.col('date_id').is_between(self.date_idx - self.config.INTRASTOCK_WINDOW_PERIOD, self.date_idx)
            )

            pivot = self.responder_replay_buffer.filter(pl.col('date_id') < self.date_idx)\
                .pivot(index=['date_id', 'time_id'], values=[corr_responder], separator='_', on='symbol_id')\
                .sort('date_id', 'time_id')\
                .fill_nan(None)\
                .fill_null(strategy='zero')

            corr_cols = [col for col in pivot.columns if col not in ['date_id', 'time_id']]
            stocks = [int(col) for col in corr_cols]
            df_corr_responder = pivot.select(corr_cols).corr()
            linked = linkage(df_corr_responder, method='ward')
            cluster_labels = fcluster(linked, t=5, criterion='distance')
            self.stock_cluster_mapping = dict(zip(stocks, cluster_labels))

        return test.with_columns(
            pl.col('symbol_id').replace_strict(
                self.stock_cluster_mapping, default=self.config.DEFAULT_CLUSTER, return_dtype=pl.Int8
            ).alias(f'cluster_label_{corr_responder}')
        ).pipe(
            include_intrastock_norm,
            corr_responder
        ).drop(f'cluster_label_{corr_responder}')


    def predict(self, test: pl.DataFrame, lags: pl.DataFrame | None):
        if self.config.USE_TIME_NORM_ID:
            test = self.preprocess_time_norm(test, lags)
            
        if self.config.USE_INTRA_STOCK_NORM:
            test = self.preprocess_intrastock_norm(test, lags)

        FINE_TUNING = self.config.TREE_FINE_TUNING and time.time() < self.config.TREE_MAX_FINE_TUNING_TIME_LIMIT
        if FINE_TUNING:
            if lags is not None:
                if self.current_day_data is not None:
                    _lags = lags.select(
                        pl.col('date_id').sub(1), 'time_id', 'symbol_id',
                        pl.col(f'{self.corr_responder}_lag_1').alias(self.corr_responder)
                    )
                    self.current_day_data = self.current_day_data.join(_lags, on=['date_id', 'time_id', 'symbol_id'], how='left')\
                        .select(AUX_COLS + features)
                    
                    self.new_dataset = self.current_day_data if self.new_dataset is None else self.new_dataset.vstack(self.current_day_data)
                    
                    self.current_day_data = None
                    
            
                curr_date_id = test['date_id'].min()
                self.fine_tune_model(date_id=curr_date_id)
            
            self.current_day_data = test if lags is not None else self.current_day_data.vstack(test)

        if lags is not None:
            self.date_idx += 1
        
        if test['is_scored'].any():
            X = test.select(features).cast(pl.Float32).to_numpy()
            y_hat = self.model.predict(X).clip(-5, 5).flatten()
        else:
            y_hat = np.zeros(test.shape[0])

        return test.select('row_id', pl.Series(y_hat).alias(self.corr_responder))


model = CatBoostRegressor(**base_params)
model.load_model(model_file_path)
params = base_params.copy()

trainer = ModelTrainer(model, params, base_old_dataset, base_responder_replay_buffer)
y_hat_iterator = []

for i, (test, lags) in enumerate(online_iterator_daily(test_ds, show_progress=True)):
    res = trainer.predict(test, lags)
    y_hat_iterator.append(res['responder_6'].to_numpy())

y_hat_iterator = np.concatenate(y_hat_iterator)
online_score = r2_score(y_true=y_test, y_pred=y_hat_iterator, sample_weight=w_test)
gain = online_score - offline_score

print(f'Online score: {online_score:.4f}, Offline score: {offline_score:.4f} -> Gain: {gain:.6f}')

 38%|███▊      | 27/71 [00:03<00:06,  7.19it/s]

Fine tuning model on date 27
Old days:  [-30, -29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1]
Train days:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
Val days:  []
Shapes:  (992200, 139) (0, 139)
Learning rate: 8.000000e-06
0:	learn: 0.7373029	total: 144ms	remaining: 14.3s
50:	learn: 0.7372972	total: 7.9s	remaining: 7.59s


 39%|███▉      | 28/71 [00:22<04:05,  5.71s/it]

99:	learn: 0.7372918	total: 15.4s	remaining: 0us
Train completed in 0.308 minutes


 77%|███████▋  | 55/71 [00:26<00:02,  7.07it/s]

Fine tuning model on date 55
Old days:  [-4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
Train days:  [27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
Val days:  []
Shapes:  (1049312, 139) (0, 139)
Learning rate: 6.400000e-06
0:	learn: 0.8111659	total: 186ms	remaining: 18.4s
50:	learn: 0.8111332	total: 9.01s	remaining: 8.66s


 79%|███████▉  | 56/71 [00:47<01:36,  6.43s/it]

99:	learn: 0.8111005	total: 17.6s	remaining: 0us
Train completed in 0.348 minutes


100%|██████████| 71/71 [00:49<00:00,  1.43it/s]

Online score: 0.0093, Offline score: 0.0094 -> Gain: -0.000069



