In [None]:
import polars as pl
import numpy as np
import warnings
import matplotlib.pyplot as plt

# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# One ten-thousandth (1 / 10000), stored as a plain fraction.
commission = 1e-4

In [None]:
import os
# When truthy, load the pre-computed frame from disk instead of recomputing it.
LOAD_FILE_NO_CALC = 1

# Parquet file to load. Earlier candidates that used to be assigned (and then
# immediately overwritten) here: 'data/result_hour.parquet',
# 'data/rolling_factors.parquet'.
result_hour_path = "data/agg_data_hour_to_day_alpha101.parquet"

if LOAD_FILE_NO_CALC:
    # Raise a real exception instead of `assert 0`: assertions are stripped when
    # Python runs with -O, and the message should name the missing path.
    if not os.path.exists(result_hour_path):
        raise FileNotFoundError(f"miss file: {result_hour_path}")
    result_hour = pl.read_parquet(result_hour_path)

# NOTE(review): when LOAD_FILE_NO_CALC is falsy nothing in this cell defines
# `result_hour`; the line below then relies on a value left in the kernel.
result_hour

In [None]:

# Factor columns to use: `amihud` followed by a hand-picked subset of the
# `alpha<N>` columns, in ascending order of N.
FACTOR_COMBINATION_LIST = ['amihud'] + [
    f"alpha{alpha_id}"
    for alpha_id in (13, 15, 16, 30, 33, 34, 35, 36, 45, 50, 51, 54, 55, 64, 71, 74, 99)
]

In [None]:
# Rows with open_time on or after this instant form the evaluation period.
date_threshold = pl.datetime(2023, 1, 4)

# Evaluation-period slice of result_hour, restricted to the identifying
# columns, `close`, and the factors currently listed in FACTOR_COMBINATION_LIST.
origin_xgb_x_eval = (
    result_hour
    .filter(pl.col('open_time') >= date_threshold)
    .select(['open_time', 'symbol', 'close', *FACTOR_COMBINATION_LIST])
)
origin_xgb_x_eval

In [None]:
# NOTE(review): this replaces the factor list built in an earlier cell. Frames
# that were already selected with the earlier list (e.g. origin_xgb_x_eval) keep
# the old factor columns and are not updated by this assignment.
FACTOR_COMBINATION_LIST = [ 'return_skew',  'amihud', 'return_auto_corr_1_pearson_lag1', 'return_auto_corr_1_pearson_lag10', 'return_auto_corr_1_spearman_lag2']

# Add forward-return columns to result_hour.
# future_{h}day_return = percentage change of `close` from the current row to the
# row h positions later within the same symbol.
UPDATE_POSITION_TIME = 7

# Compute every horizon from the *unfilled* frame. Where the forward close does
# not exist (the last h rows of each symbol) or a close is null, the return stays
# null instead of being turned into a fabricated 0% label, so the downstream
# filter on the label column can actually discard those rows.
future_return_columns = result_hour.select(
    [
        ((pl.col("close").shift(-horizon) / pl.col("close") - 1) * 100)
        .over("symbol")  # shift inside each symbol group, never across symbols
        .alias(f"future_{horizon}day_return")
        for horizon in range(1, UPDATE_POSITION_TIME + 1)
    ]
).get_columns()

# Existing columns keep the original zero-fill of nulls; only the freshly
# computed return columns are exempt from it.
result_hour = result_hour.fill_null(0).with_columns(future_return_columns)
result_hour

## XGBoost 组合

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error

def objective(trial, xgb_x_train, xgb_y_train, xgb_x_eval, xgb_y_eval):
    """Optuna objective: train one XGBoost regressor and score it on the eval set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        xgb_x_train: Training features, passed straight to ``xgb.DMatrix``.
        xgb_y_train: Training labels.
        xgb_x_eval: Validation features, passed straight to ``xgb.DMatrix``.
        xgb_y_eval: Validation labels.

    Returns:
        RMSE of the model's predictions on the validation set, using the trees
        up to the best early-stopping iteration.
    """
    # Sample the hyperparameters for this trial.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    param = {
        'tree_method': 'gpu_hist',  # Use GPU acceleration
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1, 0.5, 0.7, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_categorical('max_depth', [3, 5, 7, 9]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'objective': 'reg:squarederror',
        'verbosity': 1,
        'eval_metric': 'rmse'
    }

    # `n_estimators` is a scikit-learn-wrapper name that xgb.train() ignores; the
    # native API takes the round budget as `num_boost_round` (default 10). Without
    # it the 2000-round early-stopping patience could never be reached.
    num_boost_round = 20000

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(xgb_x_train, label=xgb_y_train)
    dval = xgb.DMatrix(xgb_x_eval, label=xgb_y_eval)

    # Train model
    model = xgb.train(
        param,
        dtrain,
        num_boost_round=num_boost_round,
        evals=[(dval, 'eval')],
        early_stopping_rounds=2000,
    )

    # The native Booster predicts with every trained tree by default, i.e. the
    # model as of the last round. Restrict prediction to the best iteration so the
    # score reflects the early-stopped model.
    preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    rmse = np.sqrt(mean_squared_error(xgb_y_eval, preds))

    return rmse

# NOTE(review): xgb_x_train / xgb_y_train / xgb_x_eval / xgb_y_eval are not
# defined in any cell above this one; they must already exist in the kernel.

# Create a study object
study = optuna.create_study(direction='minimize')

# Optuna calls the objective with the trial only, so bind the datasets here;
# passing `objective` directly fails because it requires four more arguments.
study.optimize(
    lambda trial: objective(trial, xgb_x_train, xgb_y_train, xgb_x_eval, xgb_y_eval),
    n_trials=100,
)

# Fetch the best parameters
best_params = study.best_params
print("Best params:", best_params)

# study.best_params only contains the sampled values; add the fixed settings
# that the objective used so the final model is trained the same way.
best_params['tree_method'] = 'gpu_hist'  # Ensure GPU usage
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'

dtrain = xgb.DMatrix(xgb_x_train, label=xgb_y_train)
dval = xgb.DMatrix(xgb_x_eval, label=xgb_y_eval)

# Give the final fit an explicit round budget and early stopping; xgb.train()
# otherwise stops after its default of 10 boosting rounds.
final_model = xgb.train(
    best_params,
    dtrain,
    num_boost_round=20000,
    evals=[(dval, 'eval')],
    early_stopping_rounds=2000,
)


In [None]:
from model.xgb import xgb_model
import xgboost as xgb
from sklearn.metrics import r2_score
import numpy as np
import time

# Features used for the saved predictions. They are read from result_hour with
# the same date filter that produced origin_xgb_x_eval, because
# origin_xgb_x_eval only holds the factor columns that were listed in
# FACTOR_COMBINATION_LIST at the time it was built, which need not match the
# list the models are trained on now.
prediction_features = (
    result_hour
    .filter(pl.col('open_time') >= date_threshold)
    .select(FACTOR_COMBINATION_LIST)
)
# The predictions are attached to origin_xgb_x_eval by position, so the two
# frames must describe the same rows.
if prediction_features.height != origin_xgb_x_eval.height:
    raise ValueError(
        f"prediction rows ({prediction_features.height}) do not match "
        f"origin_xgb_x_eval rows ({origin_xgb_x_eval.height})"
    )
d_eval = xgb.DMatrix(prediction_features)

for cur_update_position_time in range(1, UPDATE_POSITION_TIME + 1):
    label_column = f"future_{cur_update_position_time}day_return"

    # Keep only the rows whose label for this horizon passes the NaN filter.
    non_nan_result_hour = result_hour.filter(pl.col(label_column).is_not_nan())

    print (f'min date: {non_nan_result_hour["open_time"].min()} == max date: {non_nan_result_hour["open_time"].max()}')

    # Split the rows once by date and take features AND labels from the same
    # split. Slicing the labels by position (head/tail) only lines up with the
    # date-filtered features when the frame is globally sorted by open_time.
    train_rows = non_nan_result_hour.filter(pl.col('open_time') < date_threshold)
    eval_rows = non_nan_result_hour.filter(pl.col('open_time') >= date_threshold)

    train_size = train_rows.height
    eval_size = eval_rows.height
    ratio = train_size / eval_size if eval_size > 0 else float('inf')  # Avoid division by zero

    xgb_x_train = train_rows.select(FACTOR_COMBINATION_LIST)
    xgb_x_eval = eval_rows.select(FACTOR_COMBINATION_LIST)
    xgb_y_train = train_rows.select(label_column)
    xgb_y_eval = eval_rows.select(label_column)

    print("Training set size:", train_size)
    print("Evaluation set size:", eval_size)
    print("Ratio (Train:Eval):", ratio)

    log_period = 100 # check this later

    # Keyword arguments forwarded to the project's xgb_model wrapper.
    fit_parameters = {
        "num_boost_round": 20000,
        "early_stopping_rounds": 2000,
        "verbose_eval":  log_period
    }
    # Alternatives tried previously: tree_method "hist", objective
    # "reg:pseudohubererror", max_depth 4, subsample 0.5, colsample_bytree 0.5,
    # min_child_weight 0.5 / 200.
    xgb_parameters = {
        "tree_method": "gpu_hist",
        "device": "cuda:0",
        "nthread": os.cpu_count(),
        "objective": "reg:squarederror",
        "max_depth": 7,
        "subsample": 0.8,
        "colsample_bytree": 0.1,
        "reg_alpha": 0.98,
        "reg_lambda": 0.98,
        "eval_metric": "rmse",
        # Wall-clock seed: differs on every run, so results are not reproducible.
        "seed": int(time.time()),
        "num_parallel_tree": 7,
        "learning_rate": 1E-2,
        "verbosity": 1
    }

    model = xgb_model(
        x_train=xgb_x_train, y_train=xgb_y_train,
        x_val=xgb_x_eval, y_val=xgb_y_eval,
        xgb_para=xgb_parameters, **fit_parameters
    )

    # Train the model
    model.train()

    # NOTE(review): the predictions below come from `final_model`, which is
    # defined in another cell, not from `model` trained just above, so every
    # horizon column receives the same model's output. The prediction API of the
    # xgb_model wrapper is not visible here -- TODO predict with the model
    # trained for this horizon.
    predictions = final_model.predict(d_eval)

    # Add the predictions as a new column to the DataFrame
    predictions_series = pl.Series(predictions)
    origin_xgb_x_eval = origin_xgb_x_eval.with_columns(predictions_series.alias(f'xgb_compound_factor_{cur_update_position_time}day'))

    print (cur_update_position_time, origin_xgb_x_eval)
    # break


In [None]:
# Persist the evaluation frame (including any xgb_compound_factor_* columns
# added to it) to parquet, then display it as the cell output.
origin_xgb_x_eval.write_parquet('data/xgb_predictions.parquet')
origin_xgb_x_eval
