In [8]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import pyarrow
import numpy as np

In [3]:
# Load datasets
train = pd.read_parquet(r'C:\Users\Matt Willer\Downloads\drw-crypto-market-prediction\train.parquet', engine='pyarrow')
test = pd.read_parquet(r'C:\Users\Matt Willer\Downloads\drw-crypto-market-prediction\test.parquet', engine='pyarrow')

# Show heads
print("Train head:")
print(train.head())
print("\nTest head:")
print(test.head())

Train head:
                     bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.181844   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.489497   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.260121   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.099976   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.270893   

                           X2        X3        X4        X5  ...      X772  \
2023-03-01 00:00:00 -0.637860  0.006652  0.136870  0.116698  ...  0.333753   
2023-03-01 00:01:00 -0.075619  0.431594  0.522400  0.475255  ...  0.333657   
2023-03-01 00:02:00 -0.444684  0.100695  0.224729  0.203282  ...  0.333667   
2023-03-01 00:03:00 -0.666728 -0.123858  0.019197  0.014459  ...  0.333174   
2023-03-01 00:04:00 -0.325973  0.116336  0.234311  0.214073  ...  0.333171   

                         X773      X774     

In [6]:
# Prepare features and target
df = train
X = df.drop(columns=['label'])
y = df['label']

# time‑based split
split_idx = int(len(df)*0.8)
X_tr, X_val = X.iloc[:split_idx], X.iloc[split_idx:]
y_tr, y_val = y.iloc[:split_idx], y.iloc[split_idx:]

# fit
model = LinearRegression()
model.fit(X_tr, y_tr)

# predict & evaluate
val_preds = model.predict(X_val)
corr, pval = pearsonr(y_val, val_preds)
print(f'Validation Pearson r = {corr:.4f} (p = {pval:.2g})')

Validation Pearson r = 0.0989 (p = 9.6e-227)


In [10]:
from sklearn.linear_model import RidgeCV, LassoCV
from scipy.stats import pearsonr
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# set up alphas to search
alphas = np.logspace(-4, 2, 30)

# Ridge
ridge = RidgeCV(alphas=alphas, cv=3).fit(X_tr, y_tr)
ridge_preds = ridge.predict(X_val)
print("Ridge  R:", pearsonr(y_val, ridge_preds)[0].round(4))

# build a scaler + lasso pipeline
lasso_pipe = make_pipeline(
    StandardScaler(),
    LassoCV(
        alphas = np.logspace(-4, 2, 30),
        cv = 3,
        max_iter = 10000,      # give it more room to converge
        tol = 1e-4,
        n_jobs = -1
    )
)

# fit on your training slice
lasso_pipe.fit(X_tr, y_tr)

# predict and evaluate on validation
lasso_preds = lasso_pipe.predict(X_val)
r_lasso = pearsonr(y_val, lasso_preds)[0]
print(f"Lasso (scaled)  R = {r_lasso:.4f}")

Ridge  R: 0.0899


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Lasso (scaled)  R = 0.0956


In [None]:
import lightgbm as lgb
from scipy.stats import pearsonr

# build dataset
dtrain = lgb.Dataset(X_tr, label=y_tr)
dval   = lgb.Dataset(X_val, label=y_val)

params = {
    "objective": "regression",
    "metric":    "None",
    "learning_rate": 0.05,
    "num_leaves":    64,
    "min_data_in_leaf": 100,
    "verbose":      -1,
}

gbm = lgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    valid_sets=[dval],
    early_stopping_rounds=50,
    feval=lambda preds, ds: ("pearson", pearsonr(ds.get_label(), preds)[0], True)
)

val_pred_gbm = gbm.predict(X_val)
print("LightGBM R:", pearsonr(y_val, val_pred_gbm)[0].round(4))