**原始方案：** https://www.kaggle.com/c/jpx-tokyo-stock-exchange-prediction/discussion/361482

**方案解读**：  本次分享的方案非常简单，手动构建特征+lightgbm回归，最终得分0.356，获得了第二名。

**代码作者**： aa

**结构说明**：

第一部分: 安装并导入依赖包;

第二部分：加载处理数据;

第三部分：训练模型，进行预测。


# 第一部分: 安装并导入依赖包

1. 安装依赖包

In [1]:
!pip install numpy pandas scipy sklearn && pip install -U lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


2 导入依赖包,并固定随机数种子

In [2]:
import numpy as np
import pandas as pd
import math
import os
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED=42
seed_everything(SEED)

# 第二部分: 加载数据，并进行特征生成

3. 读取数据

In [3]:
train_data = pd.read_csv("./stock_prices.csv",parse_dates=["Date"])
train_data = train_data.drop(columns=['RowId','ExpectedDividend','AdjustmentFactor','SupervisionFlag']).dropna().reset_index(drop=True)

test_data = pd.read_csv("./stock_prices_test.csv",parse_dates=["Date"])
test_data =test_data.drop(columns=['RowId','ExpectedDividend','AdjustmentFactor','SupervisionFlag'])


4. 生成新的特征，删除不用的特征，并对Nan值和Inf值进行处理

In [4]:
# 生成特征
def add_features(feats):
    # 20个工作日收盘价涨幅
    feats["return_1month"] = feats["Close"].pct_change(20)
    # 40个工作日收盘价涨幅
    feats["return_2month"] = feats["Close"].pct_change(40)
    # 60个工作日收盘价涨幅
    feats["return_3month"] = feats["Close"].pct_change(60)
    # 20个工作日收盘价波动率
    feats["volatility_1month"] = (
        np.log(feats["Close"]).diff().rolling(20).std()
    )
    # 40个工作日收盘价波动率
    feats["volatility_2month"] = (
        np.log(feats["Close"]).diff().rolling(40).std()
    )
    # 60个工作日收盘价波动率
    feats["volatility_3month"] = (
        np.log(feats["Close"]).diff().rolling(60).std()
    )
    # 收盘价除以20个工作日收盘价移动平均线
    feats["MA_gap_1month"] = feats["Close"] / (
        feats["Close"].rolling(20).mean()
    )
    # 收盘价除以40个工作日收盘价移动平均线
    feats["MA_gap_2month"] = feats["Close"] / (
        feats["Close"].rolling(40).mean()
    )
    # 收盘价除以60个工作日收盘价移动平均线
    feats["MA_gap_3month"] = feats["Close"] / (
        feats["Close"].rolling(60).mean()
    )
    
    return feats

# 使用0代理Nan值和Inf值
def fill_nan_inf(df):
    df = df.fillna(0)
    df = df.replace([np.inf, -np.inf], 0)
    return df

# 定义要使用的特征
features =['High','Low','Open','Close','Volume', 'return_1month', 'return_2month', 'return_3month', 'volatility_1month', 'volatility_2month', 'volatility_3month', 'MA_gap_1month', 'MA_gap_2month', 'MA_gap_3month']


train_data = add_features(train_data)
test_data = add_features(test_data)
train_data=fill_nan_inf(train_data)
test_data=fill_nan_inf(test_data)

# 第三部分: 模型训练

5. 定义指标计算函数

In [5]:
# 计算RMSE
def feval_rmse(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'rmse', mean_squared_error(y_true, y_pred), False
# 计算Pearsonr
def feval_pearsonr(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'pearsonr', stats.pearsonr(y_true, y_pred)[0], True
# 计算每天的收益
def calc_spread_return_per_day(df, portfolio_size=200, toprank_weight_ratio=2):
    assert df['Rank'].min() == 0
    assert df['Rank'].max() == len(df['Rank']) - 1
    weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
    short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
    return purchase - short
# 计算每天的Sharpe
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size=200, toprank_weight_ratio=2):
    buf = df.groupby('Date').apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio#, buf
# 按收益率对资产进行排序
def add_rank(df):
    df["Rank"] = df.groupby("Date")["Target"].rank(ascending=False, method="first") - 1 
    df["Rank"] = df["Rank"].astype("int")
    return df
# 计算得分
def check_score(df,preds,Securities_filter=[]):
    tmp_preds=df[['Date','SecuritiesCode']].copy()
    tmp_preds['Target']=preds
    
    #Rank Filter. Calculate median for this date and assign this value to the list of Securities to filter.
    tmp_preds['target_mean']=tmp_preds.groupby("Date")["Target"].transform('median')
    tmp_preds.loc[tmp_preds['SecuritiesCode'].isin(Securities_filter),'Target']=tmp_preds['target_mean']
    
    tmp_preds = add_rank(tmp_preds)
    df['Rank']=tmp_preds['Rank']
    score=round(calc_spread_return_sharpe(df, portfolio_size= 200, toprank_weight_ratio= 2),5)
    score_mean=round(df.groupby('Date').apply(calc_spread_return_per_day, 200, 2).mean(),5)
    score_std=round(df.groupby('Date').apply(calc_spread_return_per_day, 200, 2).std(),5)
    print(f'Competition_Score:{score}, rank_score_mean:{score_mean}, rank_score_std:{score_std}')

6. 构建训练数据集和验证数据集

In [6]:
list_spred_h=list((train_data.groupby('SecuritiesCode')['Target'].max()-train_data.groupby('SecuritiesCode')['Target'].min()).sort_values()[:1000].index)
list_spred_l=list((train_data.groupby('SecuritiesCode')['Target'].max()-train_data.groupby('SecuritiesCode')['Target'].min()).sort_values()[1000:].index)

train_dataset = lgb.Dataset(train_data[train_data['SecuritiesCode'].isin(list_spred_h)][features],train_data[train_data['SecuritiesCode'].isin(list_spred_h)]["Target"],feature_name = features )
validate_dataset = lgb.Dataset(train_data[train_data['SecuritiesCode'].isin(list_spred_l)][features], train_data[train_data['SecuritiesCode'].isin(list_spred_l)]["Target"],feature_name = features)

7. 训练模型

In [7]:
params_lgb = {'learning_rate': 0.005,'metric':'None','objective': 'regression','boosting': 'gbdt','verbosity': 0,'n_jobs': -1,'force_col_wise':True}  
model = lgb.train(params = params_lgb, 
                train_set = train_dataset, 
                valid_sets = [train_dataset, validate_dataset], 
                num_boost_round = 3000, 
                feval=feval_pearsonr,
                callbacks=[ lgb.early_stopping(stopping_rounds=300, verbose=True), lgb.log_evaluation(period=100)])    

Training until validation scores don't improve for 300 rounds
[100]	training's pearsonr: 0.0564218	valid_1's pearsonr: 0.0108141
[200]	training's pearsonr: 0.0680828	valid_1's pearsonr: 0.0133135
[300]	training's pearsonr: 0.0762279	valid_1's pearsonr: 0.0141654
[400]	training's pearsonr: 0.0823793	valid_1's pearsonr: 0.0145849
[500]	training's pearsonr: 0.0879873	valid_1's pearsonr: 0.0147099
[600]	training's pearsonr: 0.0934746	valid_1's pearsonr: 0.0149388
[700]	training's pearsonr: 0.0980855	valid_1's pearsonr: 0.0147947
[800]	training's pearsonr: 0.102637	valid_1's pearsonr: 0.0146655
Early stopping, best iteration is:
[568]	training's pearsonr: 0.0918688	valid_1's pearsonr: 0.0150002


8. 在验证集上测试模型

In [8]:
preds=model.predict(test_data[features])
print(math.sqrt(mean_squared_error(preds,test_data.Target)))
check_score(test_data,preds)
check_score(test_data,preds,list_spred_h)
check_score(test_data,preds,list_spred_l)

0.023836449735498907
Competition_Score:0.12494, rank_score_mean:0.10759, rank_score_std:0.86111
Competition_Score:0.132, rank_score_mean:0.12325, rank_score_std:0.93367
Competition_Score:0.21668, rank_score_mean:0.12284, rank_score_std:0.5669
