<center><font size=6> xgboost预测实验 </font></center>
主要是考虑能否通过xgboost来找到合适的买点。

流程：
1. 整理训练数据
2. 直接训练

重要：
1. 需要更长的时间历史数据，保留7月的数据用于回测。数据集应该划分为 train/val/test
    - 训练集可以随机采样，但是测试集需要不重合地划分。
2. 看不出交易量是否有用，所以需要做对照试验


结论：
1. IC计算的结果不稳定，尤其是受数据量的影响较大。
2. 用xgboost进行回归的结果是非常不可靠的，所以接下来还是转化为分类问题处理。

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

In [102]:
class Data_Loader(object):
    """Read header-less price/volume CSV files and derive theta features and forward-return targets."""

    # Column names applied, in order, to every CSV read by get_price_df.
    _PRICE_COLUMNS = [
        'open_time', 'open', 'high', 'low', 'close', 'volume', 'close_time',
        'quote_volume', 'count', 'taker_buy_volume', 'taker_buy_quote_volume',
        'ignore',
    ]

    def __init__(self):
        pass

    def get_price_df(self, file_list, base_path=None):
        """Read the given CSV files and return them stacked in one DataFrame.

        Args:
            file_list: File names, read in the given order.
            base_path: Directory containing the files. When omitted, the
                module-level ``file_path`` is used (the original behaviour).

        Returns:
            A DataFrame with the 12 named price/volume columns and a fresh
            0..n-1 index. An empty ``file_list`` yields an empty DataFrame
            with the same columns instead of raising.
        """
        if base_path is None:
            # Backward compatible fallback to the notebook-level global.
            base_path = file_path

        frames = []
        for file_name in file_list:
            # The files carry no header row, so the names are assigned afterwards.
            frame = pd.read_csv(os.path.join(base_path, file_name), header=None)
            frame.columns = list(self._PRICE_COLUMNS)
            frames.append(frame)

        if not frames:
            return pd.DataFrame(columns=list(self._PRICE_COLUMNS))

        # One concat at the end avoids re-copying the accumulated rows per file.
        return pd.concat(frames, ignore_index=True)

    def calculate_theta(self, Boll_df, price_df, target_column_list=("close",)):
        """Add the theta (deviation from the moving average) indicator columns.

        For every target column ``c`` the columns ``mean_20_c``, ``std_20_c``,
        ``c`` and ``theta_c`` are written into ``Boll_df``, where
        ``theta = (price - ewm_mean) / ewm_std`` with ``span=20``.

        Args:
            Boll_df: DataFrame that receives the indicator columns; it is
                modified in place and also returned.
            price_df: DataFrame holding the raw price columns.
            target_column_list: Column names to process; a single name may be
                given as a plain string.

        Returns:
            ``Boll_df`` with the added columns.
        """
        if isinstance(target_column_list, str):
            target_column_list = [target_column_list]

        for target_column in target_column_list:
            prices = price_df[target_column]
            smoothed = prices.ewm(span=20, adjust=False)
            mean_name = 'mean_20_' + target_column
            std_name = 'std_20_' + target_column

            Boll_df[mean_name] = smoothed.mean()
            Boll_df[std_name] = smoothed.std()
            Boll_df[target_column] = prices

            # theta = (p - ma) / sigma, computed from the columns just stored
            # so all three operands share Boll_df's index.
            Boll_df['theta_' + target_column] = (
                (Boll_df[target_column] - Boll_df[mean_name]) / Boll_df[std_name]
            )

        return Boll_df

    def calculate_theta_prospective_earnings(self, price_df):
        """Add forward-looking return columns derived from ``close``.

        Columns written into ``price_df`` (modified in place and returned):
            mean_10_future: the span-10 exponential moving average of
                ``close``, shifted back by 10 rows, so row t holds the average
                evaluated at row t+10.
            prospective_earnings: relative gap between ``mean_10_future`` and
                the current ``close``.
            prospective_earnings_1: relative change of ``close`` two rows
                ahead (``shift(-2)``).

        The trailing rows, which have no future value, are left as NaN.
        """
        close = price_df['close']
        # Compute the moving average first, then shift it so it lines up with the earlier row.
        price_df['mean_10_future'] = close.ewm(span=10, adjust=False).mean().shift(-10)
        price_df['prospective_earnings'] = (price_df['mean_10_future'] - close) / close
        price_df['prospective_earnings_1'] = (close.shift(-2) - close) / close
        return price_df

# 1.数据准备 

In [103]:
import random

In [104]:
file_path = "../data/cc_data/"
# Keep only the CSV files (this drops sub-folders and anything else) in one
# pass, so no placeholder entries have to be removed afterwards.
# os.listdir returns names in arbitrary order; sorting makes the positional
# train/test split further down reproducible.
file_list = sorted(name for name in os.listdir(file_path) if name.endswith(".csv"))

In [106]:
file_list = file_list[:12]

In [107]:
data_loader = Data_Loader()

In [108]:
len(file_list)

12

In [109]:
# The first six files form the training frame, the remaining files the test frame.
train_df, test_df = (
    data_loader.get_price_df(chunk) for chunk in (file_list[:6], file_list[6:])
)

In [110]:
train_df.shape

(264960, 12)

In [111]:
test_df.shape

(260560, 12)

## 1.1 计算技术指标

In [112]:
# Columns that receive a theta feature. A wider list is kept here, disabled:
# ['open','high','low','close','quote_volume', 'taker_buy_volume', 'taker_buy_quote_volume']
target_columns_list = ['close']

# Seed each feature frame with the close price, let the loader append the
# mean/std/theta columns, then discard rows that contain NaN.
train_theta_df = data_loader.calculate_theta(
    train_df[["close"]].copy(), train_df, target_columns_list
).dropna()
test_theta_df = data_loader.calculate_theta(
    test_df[["close"]].copy(), test_df, target_columns_list
).dropna()

## 1.3 采样X和y

In [115]:
# Append the forward-return target columns to both feature frames (train first, then test).
train_theta_df, test_theta_df = map(
    data_loader.calculate_theta_prospective_earnings,
    (train_theta_df, test_theta_df),
)

In [116]:
# Drop, in place, every row that still holds a NaN in any column.
for _frame in (train_theta_df, test_theta_df):
    _frame.dropna(inplace=True)

In [117]:
def Sample_random(data_df, target_columns_list, sample_num=20, time_range=2):
    """Cut fixed-length feature windows and next-row targets out of ``data_df``.

    Despite the name, the selection is deterministic: anchor rows are taken
    from the start of ``data_df`` at a stride of ``time_range - 1`` rows
    (one row when ``time_range`` is 1) and truncated to ``sample_num``.

    Args:
        data_df: Frame holding the feature columns and a
            ``prospective_earnings`` column.
        target_columns_list: Feature column names to put into each window.
        sample_num: Maximum number of windows returned; fewer are returned
            when ``data_df`` is too short.
        time_range: Number of consecutive rows per window (>= 1).

    Returns:
        ``(sample_X, sample_y)`` where ``sample_X`` has shape
        ``(n, time_range, n_features)`` and ``sample_y`` has shape ``(n,)``.
        Each window ends on its anchor row; the target is the
        ``prospective_earnings`` value of the row right after the anchor.

    Raises:
        ValueError: If ``time_range`` is smaller than 1.
    """
    if time_range < 1:
        raise ValueError("time_range must be at least 1")

    lookback = time_range - 1
    # np.arange rejects a step of 0, so single-row windows advance row by row.
    stride = max(lookback, 1)

    # Size the range from the frame being sampled, and stop one row early
    # because the target is read from the row after the anchor.
    anchors = np.arange(0, len(data_df) - 1, stride)
    anchors = anchors[anchors >= lookback][:sample_num]

    features = data_df[target_columns_list].to_numpy()
    targets = data_df['prospective_earnings'].to_numpy()

    # Row positions of every window: anchor - lookback ... anchor (inclusive).
    window_rows = anchors[:, None] + np.arange(-lookback, 1)

    # Both arrays are indexed by the same anchors, so their lengths always match.
    sample_X = features[window_rows]
    sample_y = targets[anchors + 1]

    return sample_X, sample_y

In [38]:
# Feature column names: one "theta_<column>" entry per price column.
target_columns = [f"theta_{column}" for column in target_columns_list]

In [82]:
# Number of windows requested from the training and the test frame.
n_train = n_test = 100_000

In [83]:
# Build the (X, y) arrays for both splits from the theta feature columns;
# each call returns at most sample_num windows.
origin_train_X, origin_train_y = Sample_random(train_theta_df, target_columns, sample_num=n_train)
origin_test_X, origin_test_y = Sample_random(test_theta_df, target_columns, sample_num=n_test)

In [84]:
print("sample_X.shape: ", origin_train_X.shape)
print("sample_y.shape: ", origin_train_y.shape)

sample_X.shape:  (100000, 2, 1)
sample_y.shape:  (100000,)


In [85]:
print("sample_test_X.shape: ", origin_test_X.shape)
print("sample_test_y.shape: ", origin_test_y.shape)

sample_test_X.shape:  (100000, 2, 1)
sample_test_y.shape:  (100000,)


# 2. 模型训练

In [86]:
from xgboost import XGBRegressor

In [87]:
# Flatten each sample's window into a single row (one row per sample);
# the target vectors are used unchanged.
sample_X = origin_train_X.reshape(origin_train_X.shape[0], -1)
sample_y = origin_train_y

test_X = origin_test_X.reshape(origin_test_X.shape[0], -1)
test_y = origin_test_y

In [88]:
# Hyper-parameters passed unchanged to the XGBRegressor constructor.
# In the recorded model repr, eta also appears as learning_rate and nthread as n_jobs.
kwars = dict(
    eval_metric="rmse",
    colsample_bytree=0.8879,
    eta=0.0421,
    max_depth=4,
    n_estimators=100,
    subsample=0.8789,
    nthread=20,
)

xgb = XGBRegressor(**kwars)

In [89]:
xgb.fit(sample_X, sample_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8879, eta=0.0421,
             eval_metric='rmse', gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.0421000011,
             max_delta_step=0, max_depth=4, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=20, nthread=20,
             num_parallel_tree=1, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8789,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [90]:
pred_y = xgb.predict(test_X)

In [91]:
from sklearn.metrics import mean_absolute_error

In [92]:
# scikit-learn's signature is (y_true, y_pred). MAE is symmetric, so the value
# is unchanged, but this order stays correct if another metric is swapped in.
mean_absolute_error(test_y, pred_y)

0.006807028826189966

In [93]:
y_df = pd.DataFrame({"pred_y":pred_y,"sample_y":test_y})

In [94]:
y_df[['pred_y','sample_y']].corr().iloc[0,1]

0.0642196955781763

In [118]:
train_theta_df[:50000][['theta_close','prospective_earnings']].corr().iloc[0,1]

-0.09854248477671587

In [119]:
train_theta_df[['theta_close','prospective_earnings_1']].corr().iloc[0,1]

-0.09583240135459803

In [120]:
train_theta_df["minus_theta"] = -1*train_theta_df["theta_close"]

In [121]:
train_theta_df[['minus_theta','prospective_earnings_1']].corr().iloc[0,1]

0.09583240135459803

## 2.2 LSTM

# 3. 模型评价