# Model Testing

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 12/06/2025   | Martin | Create  | Baseline model with Decision Tree and HistGradientBoosting regressor | 
| 24/06/2025   | Martin | Update  | Completed baseline models. Started on ARIMA models | 

__Exploration Scores__

- Baseline (DecisionTree | Full Data): -0.003280
- Baseline (HistGradientBoosting | Full Data): 0.054610
- Baseline (LightGBM | Full Data): 0.023996

__Submitted Scores__

- Baseline (LightGBM | Full Data): 0.04734

# Content

* [Baseline Models](#baseline-models)
* [ARIMA Models](#arima-models)
* [Make Predictions](#make-predictions)

# Baseline Models

Baseline model serves as the starting point for all future scores. Using a Decision Tree, HistGradientBoostingRegressor, LightGBM with KFold Cross Validation together with entire dataset for testing.

In [1]:
import polars as pl
import numpy as np
import wandb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from lightgbm import LGBMRegressor

from scipy.stats import pearsonr

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mminimartzz[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Global parameters
PROJ_NAME = "drw_cryto_market"

In [4]:
df = pl.read_csv("data/clean/orig_plus.csv")

# Remove timestamp
df = df.drop('timestamp')

In [5]:
df.head()

bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32,…,X863,X864,X865,X866,X867,X868,X869,X870,X871,X872,X873,X874,X875,X876,X877,X878,X879,X880,X881,X882,X883,X884,X885,X886,X887,X888,X889,X890,label,bair,moi,ofr,lpr,obs,vwoi,va,obrsi
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
15.283,8.425,176.405,44.984,221.389,0.121263,-0.41769,0.005399,0.125948,0.058359,0.027359,0.03578,0.068219,1.034825,-0.029575,0.327805,0.485823,0.668596,0.617389,0.770037,0.857631,1.754456,0.572503,0.883229,0.58567,0.816321,0.529973,0.508244,0.448616,1.341892,1.406392,0.953631,1.183991,1.474789,0.774389,0.660586,0.269043,…,0.21857,0.0,1.728155,0.62414,0.0,-0.051211,0.0,0.0,0.0,0.0,0.691754,0.242124,2.096157,3.369195,0.244667,0.286611,0.722679,0.901931,1.000007,1.925423,1.847943,0.005676,0.190791,0.369691,0.37763,0.210153,0.159183,0.530636,0.562539,1.814006,3.921505,0.59362,3.589058,138.279,0.59362,626.407,57.874677
38.59,2.336,525.846,321.95,847.796,0.302841,-0.049576,0.356667,0.481087,0.237954,0.208359,0.217057,0.249624,0.948694,-0.183488,0.150526,0.308421,0.492232,0.529787,0.682958,0.770965,1.686504,0.273357,0.591695,0.442391,0.674792,0.460741,0.439681,0.380399,1.304113,1.003783,0.776628,1.015943,1.312735,0.696895,0.584217,0.231104,…,0.088014,0.0,1.665698,0.622775,0.0,-0.079621,0.0,0.0,0.0,0.0,0.691665,0.242091,2.46103,4.127584,0.321394,0.31246,0.746452,0.912371,1.003153,1.928569,1.849468,0.005227,0.18466,0.363642,0.374515,0.209573,0.158963,0.530269,0.533686,16.519692,1.633316,0.240501,1.74055,240.15,0.240501,626.407,57.874677
0.442,60.25,159.227,136.369,295.596,0.167462,-0.291212,0.083138,0.206881,0.101727,0.072778,0.081564,0.114166,0.896459,-0.261779,0.044571,0.200608,0.384558,0.476229,0.629848,0.718232,1.656707,0.140156,0.457268,0.376524,0.610116,0.429751,0.409316,0.350359,1.28325,0.760801,0.670816,0.917205,1.219124,0.653355,0.541739,0.210095,…,-0.147363,0.0,1.666893,0.621414,0.0,-0.080427,0.0,0.0,0.0,0.0,0.691674,0.242093,2.493249,4.182112,0.326701,0.314636,0.746681,0.911129,1.002502,1.928047,1.849282,0.004796,0.178719,0.357689,0.371424,0.208993,0.158744,0.529901,0.546505,0.007336,1.167619,0.077329,0.812073,-36.95,0.077329,-552.2,57.874677
4.865,21.016,335.742,124.963,460.705,0.072944,-0.43659,-0.102483,0.017551,0.007149,-0.021681,-0.012936,0.019634,0.732634,-0.535845,-0.273947,-0.124959,0.056438,0.311539,0.465377,0.554022,1.663491,0.152084,0.468778,0.383696,0.618529,0.435326,0.415523,0.356895,1.319538,0.955549,0.789646,1.044941,1.353001,0.72392,0.613462,0.246212,…,-0.09459,0.0,1.735322,0.620057,0.0,-0.094702,0.0,0.0,0.0,0.0,0.69121,0.24193,2.525526,4.292975,0.350791,0.32357,0.753829,0.913363,1.002985,1.928621,1.849608,0.004398,0.172967,0.351832,0.368358,0.208416,0.158524,0.529534,0.357703,0.23149,2.686731,0.457514,2.33326,194.628,0.457514,165.109,57.874677
27.158,3.451,98.411,44.407,142.818,0.17382,-0.213489,0.096067,0.215709,0.107133,0.078976,0.087818,0.120426,0.763537,-0.430945,-0.205298,-0.062118,0.117266,0.341493,0.495591,0.584519,1.668419,0.156177,0.472732,0.3871,0.623192,0.439034,0.419868,0.361572,1.324595,0.90546,0.78375,1.047708,1.36188,0.732001,0.622712,0.251095,…,0.162221,0.0,1.712096,0.618703,0.0,-0.091884,0.0,0.0,0.0,0.0,0.691207,0.241928,2.52443,4.306694,0.335599,0.31907,0.747533,0.908904,1.001286,1.927084,1.84895,0.004008,0.167391,0.346066,0.365314,0.207839,0.158304,0.529167,0.362452,7.869603,2.216115,0.378132,2.623783,77.711,0.378132,-317.887,57.874677


In [6]:
# Split into X and y
y = df['label'].to_numpy()
X = df.drop('label').to_numpy()

# Standardise the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

X.shape, y.shape

((525887, 882), (525887,))

Change configuration according to model used

In [None]:
# Define model configuration
config = {
  "num_kfold_splits": 7,
  "boosting_type": "gbdt",
  "num_leaves": 31,
  "max_depth": -1,
  "learning_rate": 0.1,
  "n_estimators": 100,
  "reg_alpha": 0,
  "reg_lambda": 0
}

# Define wandb object
run = wandb.init(
  project=PROJ_NAME,
  notes="Light Gradient Boosting with original data + additional features",
  tags=["baseline", "orig_plus"],
  config=config
)

In [20]:
# Create KFold cross validation and train
kf = KFold(n_splits=run.config['num_kfold_splits'])

split_idx = {}
scores = []

for i, (train_idx, test_idx) in enumerate(kf.split(X)):
  print(f"Fold {i+1}")

  # Split data and store training indices
  train_X, train_y = X[train_idx], y[train_idx]
  test_X, test_y = X[test_idx], y[test_idx]
  split_idx[f"fold_{i+1}"] = train_idx

  # Train model
  # model = DecisionTreeRegressor(
  #   max_depth=run.config['max_depth'],
  #   random_state=43
  # )

  # model = HistGradientBoostingRegressor(
  #   loss=run.config['loss'],
  #   learning_rate=run.config['learning_rate'],
  #   max_iter=run.config['max_iter'],
  #   max_depth=run.config['max_depth'],
  #   l2_regularization=run.config['l2_regularization']
  # )

  model = LGBMRegressor(
    boosting_type=run.config['boosting_type'],
    num_leaves=run.config['num_leaves'],
    max_depth=run.config['max_depth'],
    learning_rate=run.config['learning_rate'],
    n_estimators=run.config['n_estimators'],
    reg_alpha=run.config['reg_alpha'],
    reg_lambda=run.config['reg_lambda']
  )
  model.fit(train_X, train_y)

  # Prediction on val set
  results = model.predict(test_X)

  # Scoring and logging
  pearson_corr = pearsonr(results, test_y).statistic
  metrics = {
    'pearson_corr': pearson_corr,
    'mae': mean_absolute_error(test_y, results),
    'mse': mean_squared_error(test_y, results),
    'r2': r2_score(test_y, results),
  }
  run.log({ **metrics })
  scores.append(pearson_corr)

  print(f"Pearson Coefficient Score: {pearson_corr}")
  print("---------------------------")

print(f"Average Pearson Correlation Score: {np.mean(pearson_corr)}")


Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.563921 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223363
[LightGBM] [Info] Number of data points in the train set: 450760, number of used features: 876
[LightGBM] [Info] Start training from score 0.039411




Pearson Coefficient Score: 0.05302349659897179
---------------------------
Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.484957 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223363
[LightGBM] [Info] Number of data points in the train set: 450760, number of used features: 876
[LightGBM] [Info] Start training from score 0.043508




Pearson Coefficient Score: 0.1564405035878033
---------------------------
Fold 3
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.475298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223377
[LightGBM] [Info] Number of data points in the train set: 450760, number of used features: 876
[LightGBM] [Info] Start training from score 0.037446




Pearson Coefficient Score: 0.04456215794940085
---------------------------
Fold 4
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.522840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223368
[LightGBM] [Info] Number of data points in the train set: 450760, number of used features: 876
[LightGBM] [Info] Start training from score 0.048383




Pearson Coefficient Score: -0.02082677114586489
---------------------------
Fold 5
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.546718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223364
[LightGBM] [Info] Number of data points in the train set: 450760, number of used features: 876
[LightGBM] [Info] Start training from score 0.025289




Pearson Coefficient Score: 0.09251158013259256
---------------------------
Fold 6
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.531671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223367
[LightGBM] [Info] Number of data points in the train set: 450761, number of used features: 876
[LightGBM] [Info] Start training from score 0.031455




Pearson Coefficient Score: 0.11015414436875745
---------------------------
Fold 7
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.563751 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223370
[LightGBM] [Info] Number of data points in the train set: 450761, number of used features: 876
[LightGBM] [Info] Start training from score 0.027389
Pearson Coefficient Score: 0.0239964933769129
---------------------------
Average Pearson Correlation Score: 0.0239964933769129




In [21]:
run.finish()

0,1
mae,▃▁▁▂▃▅█
mse,▄▁▁▅▅▄█
pearson_corr,▄█▄▁▅▆▃
r2,▅█▆▄▇▆▁

0,1
mae,0.814
mse,1.34892
pearson_corr,0.024
r2,-0.27143


---

# ARIMA Models



In [31]:
from statsmodels.tsa.arima.model import ARIMA

In [14]:
df = pl.read_csv("data/clean/perm_impt_45.csv")

# Convert timestamp column to datetime
df = df.with_columns(
  pl.col('timestamp').str.to_datetime("%Y-%m-%dT%H:%M:%S%.f")
)

# Optional: if don't need the timestamp columns
# df = df.drop('timestamp')

In [20]:
VALID_PERC = 0.2
SEED = 43

In [26]:
# Extract the last p% of columns as validation set
np.random.seed(SEED)
num_valid = int(len(df) * VALID_PERC)

# Convert to numpy arrays
y = df['label'].to_numpy()
dt = df['timestamp'].to_numpy()
X = df.drop(['label', 'timestamp']).to_numpy()

# Subset into train valid
X_train, y_train, dt_train = X[:-num_valid], y[:-num_valid], dt[:-num_valid]
X_valid, y_valid, dt_valid = X[-num_valid:], y[-num_valid:], dt[-num_valid:]

In [33]:
model = ARIMA(
  endog=y_train,
  exog=X_train,
  order=(3, 1, 3),
  dates=dt_train,
  freq='min'
)

ValueError: Lengths must match to compare

In [32]:
import pandas as pd
# 1. Create a DatetimeIndex with minute frequency
start_date = '2023-01-01 00:00:00'
end_date = '2023-01-01 00:30:00' # 31 minutes of data
minute_index = pd.date_range(start=start_date, end=end_date, freq='min')

# 2. Create a sample time series (replace with your actual data)
# Let's create some synthetic minute-level data
np.random.seed(42)
data = pd.Series(np.random.randn(len(minute_index)).cumsum() + 100, index=minute_index)

print("Sample Data with Minute Frequency Index:")
print(data)
print("\nIndex Frequency:", data.index.freq)

# Check the frequency directly
print("Index Freq (as string):", data.index.freqstr)

Sample Data with Minute Frequency Index:
2023-01-01 00:00:00    100.496714
2023-01-01 00:01:00    100.358450
2023-01-01 00:02:00    101.006138
2023-01-01 00:03:00    102.529168
2023-01-01 00:04:00    102.295015
2023-01-01 00:05:00    102.060878
2023-01-01 00:06:00    103.640091
2023-01-01 00:07:00    104.407525
2023-01-01 00:08:00    103.938051
2023-01-01 00:09:00    104.480611
2023-01-01 00:10:00    104.017193
2023-01-01 00:11:00    103.551464
2023-01-01 00:12:00    103.793426
2023-01-01 00:13:00    101.880146
2023-01-01 00:14:00    100.155228
2023-01-01 00:15:00     99.592940
2023-01-01 00:16:00     98.580109
2023-01-01 00:17:00     98.894357
2023-01-01 00:18:00     97.986332
2023-01-01 00:19:00     96.574029
2023-01-01 00:20:00     98.039678
2023-01-01 00:21:00     97.813901
2023-01-01 00:22:00     97.881429
2023-01-01 00:23:00     96.456681
2023-01-01 00:24:00     95.912299
2023-01-01 00:25:00     96.023221
2023-01-01 00:26:00     94.872228
2023-01-01 00:27:00     95.247926
2023-01

---

# Make Predictions

In [None]:
test = pl.read_csv("./data/clean/test_plus.csv")

test = test.drop(['timestamp', 'label'])
test = test.to_numpy()
test = scaler.transform(test)

In [23]:
model = LGBMRegressor()
model.fit(X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.592967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223360
[LightGBM] [Info] Number of data points in the train set: 525887, number of used features: 876
[LightGBM] [Info] Start training from score 0.036126


In [24]:
preds = model.predict(test)



In [27]:
pl.DataFrame({
  "ID": range(1, 1+len(test)), 
  "prediction": preds
}).write_csv("results/baseline.csv")