In [2]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# kagglehub.login()
# Read the Kaggle credentials from kaggle.json
from kagglehub.config import set_kaggle_credentials

import json
# Parse the local kaggle.json file; the 'username' and 'key' entries are the
# only fields read from it below.
with open('kaggle.json', 'r') as creds_file:
    kaggle_creds = json.loads(creds_file.read())

# Hand the credentials to kagglehub directly (used here in place of the
# commented-out kagglehub.login() call above).
set_kaggle_credentials(
    username=kaggle_creds['username'],
    api_key=kaggle_creds['key'],
)


Kaggle credentials set.




In [3]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# drw_crypto_market_prediction_path = kagglehub.competition_download('drw-crypto-market-prediction')

# print('Data source import complete.')
# Directory holding train.parquet, test.parquet and sample_submission.csv.
# The default is the original local checkout; set the DRW_DATA_DIR environment
# variable to point the notebook at another copy of the data without editing
# this cell.
import os

drw_crypto_market_prediction_path = os.environ.get(
    'DRW_DATA_DIR',
    '/Users/mahta/Projects/Time-Series-Library/data/drw-crypto-market-prediction',
)


In [4]:
drw_crypto_market_prediction_path

'/Users/mahta/Projects/Time-Series-Library/data/drw-crypto-market-prediction'

In [None]:
"""Competition: DRW Crypto Market Prediction | Date: Week 3 | Purpose: Add Back X174"""

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr

def reduce_mem_usage(dataframe, dataset):
    """Downcast numeric columns, in place, to the narrowest dtype that fits.

    For every plain NumPy integer or float column the observed min/max are
    compared against the representable range of each candidate dtype
    (narrowest first) and the column is cast to the first one that fits.
    Signed integers stay signed and unsigned integers stay unsigned.

    Columns that are not plain NumPy int/uint/float (bool, object/string,
    datetime, categorical, nullable extension dtypes, ...) are left untouched.
    A column whose range cannot be checked (empty, all-NaN, or containing
    +/-inf) is also left as it is, so the function never widens a column.

    Note that float columns whose range fits are stored as float16, which
    keeps only about three significant decimal digits.

    Args:
        dataframe: pandas DataFrame to shrink. It is modified in place.
        dataset: Label used only in the progress messages.

    Returns:
        The same ``dataframe`` object, after downcasting.
    """
    print('Reducing memory usage for:', dataset)
    initial_mem_usage = dataframe.memory_usage().sum() / 1024**2

    # Candidate dtypes per NumPy dtype kind, ordered narrowest first.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    for col in dataframe.columns:
        col_type = dataframe[col].dtype

        # Only plain NumPy numeric columns have a range that can be compared
        # against iinfo/finfo; everything else is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = dataframe[col].min()
        c_max = dataframe[col].max()
        limits = np.finfo if col_type.kind == 'f' else np.iinfo

        for candidate in candidates_by_kind[col_type.kind]:
            bounds = limits(candidate)
            # Inclusive bounds: a value exactly at the limit is representable.
            # NaN min/max make both comparisons False, so nothing matches and
            # the column keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                if col_type != candidate:
                    dataframe[col] = dataframe[col].astype(candidate)
                break

    final_mem_usage = dataframe.memory_usage().sum() / 1024**2
    if initial_mem_usage > 0:
        decrease_pct = 100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage
    else:
        # Nothing was allocated to begin with; avoid dividing by zero.
        decrease_pct = 0.0

    print('--- Memory usage before: {:.2f} MB'.format(initial_mem_usage))
    print('--- Memory usage after: {:.2f} MB'.format(final_mem_usage))
    print('--- Decreased memory usage by {:.1f}%\n'.format(decrease_pct))

    return dataframe

def create_time_weights(n_samples, decay_factor=0.95):
    """Create position-based sample weights that favour later positions.

    The raw weight at position ``i`` is ``decay_factor ** (1 - i / (n_samples - 1))``,
    so the first position gets ``decay_factor`` and the last position gets 1.
    The weights are then rescaled so that they sum to ``n_samples``
    (i.e. their mean is 1).

    Args:
        n_samples: Number of positions to generate weights for.
        decay_factor: Raw weight assigned to the first (oldest) position.

    Returns:
        A 1-D float NumPy array of length ``n_samples``. For ``n_samples == 1``
        the single weight is 1.0; for ``n_samples <= 0`` the array is empty.
    """
    if n_samples <= 1:
        # With one sample the position scale (n_samples - 1) is zero and the
        # general formula would evaluate 0/0 and return NaN. A lone sample
        # simply gets weight 1, which already satisfies "sum == n_samples".
        return np.ones(max(int(n_samples), 0))

    positions = np.arange(n_samples)
    normalized_positions = positions / (n_samples - 1)
    weights = decay_factor ** (1 - normalized_positions)
    return weights * n_samples / weights.sum()

# Load data
train = pd.read_parquet(drw_crypto_market_prediction_path + '/train.parquet')
test = pd.read_parquet(drw_crypto_market_prediction_path + '/test.parquet')
sample_submission = pd.read_csv(drw_crypto_market_prediction_path + '/sample_submission.csv')

# Add back X174 (lowest AV importance of removed features)
selected_features = [
    'X863', 'X856', 'X344', 'X598', 'X862', 'X385', 'X852', 'X603', 'X860',
    'X415', 'X345', 'X137', 'X855', 'X178', 'X532', 'X168', 'X174',
    'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume'
]

# Take explicit copies of the column subsets. The frames are modified in place
# afterwards (reduce_mem_usage reassigns columns and a 'label_bins' column is
# added to train); without the copy, pandas versions that do not use
# copy-on-write flag those writes with SettingWithCopyWarning.
train = train[selected_features + ["label"]].copy()
test = test[selected_features].copy()

train = reduce_mem_usage(train, "train")
test = reduce_mem_usage(test, "test")

print("Train=", train.shape)
print("Test=", test.shape)

# Every retained column except the target is a model input.
FEATURES = [c for c in train.columns if c not in ["label"]]
print(f"There are {len(FEATURES)} FEATURES (added back X174)")

# Cross-validation: shuffled, stratified splitter with a fixed seed.
FOLDS = 5
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Create bins for stratification: the continuous label is bucketed into (up to)
# 10 quantile bins, and those bin ids are what the splitter stratifies on.
# The label is cast to float32 just for the binning step
# (presumably because reduce_mem_usage may have stored it as float16 - confirm).
train['label_bins'] = pd.qcut(
    train['label'].astype(np.float32),
    q=10,
    labels=False,
    duplicates='drop',
)

# XGBoost parameters
# Keyword arguments unpacked into xgb.XGBRegressor; the cross-validation loop
# additionally passes early_stopping_rounds=25 when it builds each model.
# NOTE(review): the long decimal values look like the output of an automated
# hyper-parameter search - their provenance is not recorded here; confirm
# before retuning by hand.
xgb_params = {
   "tree_method": "hist",
   # Requests training on a CUDA GPU.
   # NOTE(review): the data path used by this notebook is a macOS home
   # directory - confirm a CUDA device is actually available where this runs.
   "device": "cuda",
   # Column subsampling applied per level, per split node and per tree.
   "colsample_bylevel": 0.4778015829774066,
   "colsample_bynode": 0.362764358742407,
   "colsample_bytree": 0.7107423488010493,
   "gamma": 1.7094857725240398,
   "learning_rate": 0.02213323588455387,
   # NOTE(review): max_leaves=12 is far below what max_depth=20 allows, so the
   # leaf cap is presumably the binding limit on tree size - confirm intent.
   "max_depth": 20,
   "max_leaves": 12,
   "min_child_weight": 16,
   # Upper bound on boosting rounds; early stopping may end training sooner.
   "n_estimators": 1667,
   "n_jobs": -1,
   "random_state": 42,
   # L1 / L2 regularisation strengths.
   "reg_alpha": 39.352415706891264,
   "reg_lambda": 75.44843704068275,
   # Row subsampling: each boosting round sees roughly 6.6% of the rows.
   "subsample": 0.06566669853471274,
   "verbosity": 0,
   "objective": "reg:squarederror"
}

# Define model configurations
# Each entry pairs a display name with "percent": the fraction of the most
# recent training rows that model is fitted on. Model 1 uses the full history;
# Models 2-7 step the window down from 90% to 40% in 10-point increments.
model_configs = [{"name": "Model 1 (100% Full Data)", "percent": 1.00}]
model_configs += [
    {"name": f"Model {rank} ({pct}% Recent)", "percent": pct / 100}
    for rank, pct in enumerate((90, 80, 70, 60, 50, 40), start=2)
]

# Initialize predictions: one out-of-fold buffer and one test buffer per model.
n_models = len(model_configs)
n_train_rows, n_test_rows = len(train), len(test)
oof_preds_all = [np.zeros(n_train_rows) for _ in model_configs]
test_preds_all = [np.zeros(n_test_rows) for _ in model_configs]

# Generate sample weights for full data
sample_weights_full = create_time_weights(n_train_rows, decay_factor=0.95)
print(f"\nFull data sample weights range: [{sample_weights_full.min():.4f}, {sample_weights_full.max():.4f}]")

# Calculate cutoffs: the first row position each model is allowed to train on.
# The full-history model starts at row 0 and prints nothing; every other model
# skips the oldest (1 - percent) share of rows and reports its window size.
cutoffs = []
for config in model_configs:
    uses_full_history = config["percent"] == 1.00
    cutoff_idx = 0 if uses_full_history else int(n_train_rows * (1 - config["percent"]))
    cutoffs.append(cutoff_idx)
    if not uses_full_history:
        print(f"{config['name']} - Using most recent {n_train_rows - cutoff_idx} samples")

# Cross-validation loop
#
# For every fold, one XGBoost model per entry of model_configs is fitted on the
# fold's training rows that lie at or after that model's cutoff, weighted by
# create_time_weights over the model's window. Out-of-fold predictions are
# written into oof_preds_all and test predictions are summed into
# test_preds_all (they are averaged over folds after the loop).
#
# NOTE(review): early stopping is monitored on the same validation fold whose
# predictions are stored as out-of-fold predictions, and that eval set is the
# whole fold even for models trained on a recent window - the resulting OOF
# scores are therefore likely optimistic. Confirm this is intended.

# The test matrix is identical for every fold and model, so build it once.
X_test = test[FEATURES]

# Time weights depend only on the window length (len(train) - cutoff), not on
# the fold, so compute them once per model instead of once per fold x model.
# Index j of a window's weights corresponds to training row (cutoff + j).
window_weights = [
    create_time_weights(len(train) - cutoff, decay_factor=0.95)
    for cutoff in cutoffs
]

for fold_num, (train_idx, valid_idx) in enumerate(kf.split(train, train['label_bins'])):
    print("\n" + "#" * 50)
    print(f"### Fold {fold_num + 1}")
    print("#" * 50)

    X_valid = train.iloc[valid_idx][FEATURES]
    y_valid = train.iloc[valid_idx]["label"]

    # Train each model
    for model_idx, (config, cutoff) in enumerate(zip(model_configs, cutoffs)):
        print(f"\n--- {config['name']} ---")

        # Keep only this fold's training rows inside the model's window.
        # With cutoff == 0 (the full-data model) every training row is kept,
        # so no separate full-data branch is needed.
        train_idx_recent = train_idx[train_idx >= cutoff]
        X_train = train.iloc[train_idx_recent][FEATURES]
        y_train = train.iloc[train_idx_recent]["label"]
        train_weights = window_weights[model_idx][train_idx_recent - cutoff]

        # Train model
        model = xgb.XGBRegressor(**xgb_params, early_stopping_rounds=25)
        model.fit(
            X_train, y_train,
            sample_weight=train_weights,
            eval_set=[(X_valid, y_valid)],
            verbose=200
        )

        # Make predictions: validation rows inside the window are scored by
        # this model ...
        valid_idx_in_range = valid_idx[valid_idx >= cutoff]
        if len(valid_idx_in_range) > 0:
            X_valid_subset = train.iloc[valid_idx_in_range][FEATURES]
            oof_preds_all[model_idx][valid_idx_in_range] = model.predict(X_valid_subset)

        # ... while validation rows older than the window reuse the first
        # model's out-of-fold predictions for the same rows (already filled in
        # earlier in this fold, because model index 0 is trained first).
        valid_idx_out_range = valid_idx[valid_idx < cutoff]
        if len(valid_idx_out_range) > 0:
            oof_preds_all[model_idx][valid_idx_out_range] = oof_preds_all[0][valid_idx_out_range]

        test_preds_all[model_idx] += model.predict(X_test)

# Average test predictions across folds (the CV loop accumulated per-fold sums).
for fold_sum in test_preds_all:
    fold_sum /= FOLDS

# Calculate individual model scores: Pearson correlation between the label and
# each model's out-of-fold predictions.
pearson_scores = [pearsonr(train["label"], oof)[0] for oof in oof_preds_all]

banner = "=" * 50
print("\n" + banner)
print("INDIVIDUAL MODEL PERFORMANCE (+X174)")
print(banner)
for config, score in zip(model_configs, pearson_scores):
    print(f"{config['name']} Pearson Correlation: {score:.4f}")

# Create ensemble predictions: plain mean across models.
ensemble_oof_preds = np.mean(oof_preds_all, axis=0)
ensemble_test_preds = np.mean(test_preds_all, axis=0)

ensemble_pearson_score = pearsonr(train["label"], ensemble_oof_preds)[0]
print(f"\nEnsemble (Equal Weight) Pearson Correlation: {ensemble_pearson_score:.4f}")

# Weighted ensemble: each model's weight is its share of the summed scores.
total_score = sum(pearson_scores)
weights = [score / total_score for score in pearson_scores]

weighted_ensemble_oof = np.zeros(len(train))
weighted_ensemble_test = np.zeros(len(test))

for weight, oof_pred, test_pred in zip(weights, oof_preds_all, test_preds_all):
    weighted_ensemble_oof += weight * oof_pred
    weighted_ensemble_test += weight * test_pred

weighted_ensemble_score = pearsonr(train["label"], weighted_ensemble_oof)[0]
print(f"Weighted Ensemble Pearson Correlation: {weighted_ensemble_score:.4f}")

# Use the better ensemble: the weighted blend only wins on a strictly higher
# out-of-fold score; otherwise the simple average is kept.
use_weighted = weighted_ensemble_score > ensemble_pearson_score
final_test_preds = weighted_ensemble_test if use_weighted else ensemble_test_preds
if use_weighted:
    print("\nUsing weighted ensemble for final predictions")
else:
    print("\nUsing simple average ensemble for final predictions")


In [8]:

# Save predictions
# Read the sample submission from the same data directory used for the
# train/test files (instead of repeating an absolute path) and reuse its first
# column as the row identifier.
sample_sub = pd.read_csv(drw_crypto_market_prediction_path + '/sample_submission.csv')
submission = pd.DataFrame({
    sample_sub.columns[0]: sample_sub.iloc[:, 0],
    'prediction': final_test_preds
})
submission.to_csv("submission_with_X174.csv", index=False)
print("\nSubmission saved to submission_with_X174.csv!")

# Gap-first validation
# Per-model RMSE of the out-of-fold predictions; the spread across models is
# used as a stability indicator.
cv_scores = [np.sqrt(np.mean((train["label"] - oof_pred)**2)) for oof_pred in oof_preds_all]
cv_mean = np.mean(cv_scores)
cv_std = np.std(cv_scores)

# Report and gate on the score of the ensemble that was actually written to
# the submission: the weighted blend is only selected when it is strictly
# better, otherwise the simple average is used.
if weighted_ensemble_score > ensemble_pearson_score:
    selected_ensemble_score = weighted_ensemble_score
else:
    selected_ensemble_score = ensemble_pearson_score

print("\n=== GAP-FIRST VALIDATION ===")
print(f"Ensemble Pearson: {selected_ensemble_score:.4f}")
print(f"Model RMSE Std: {cv_std:.6f}")
print(f"Features: {len(FEATURES)} (added back X174)")

if cv_std < 0.01 and selected_ensemble_score > 0.4:
    print("✅ SUBMIT: High correlation and stable models")
else:
    print("⚠️  CAUTION: Review before submission")


Submission saved to submission_with_X174.csv!

=== GAP-FIRST VALIDATION ===
Ensemble Pearson: 0.5100
Model RMSE Std: 0.004741
Features: 22 (added back X174)
✅ SUBMIT: High correlation and stable models


## Timeseries Time!