In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the dataset from all-coin.csv; the path is relative to the notebook's
# working directory.
df = pd.read_csv("./all-coin.csv")

In [4]:
# Preview the first rows to inspect the loaded columns.
df.head()

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,...,bollinger_upper,bollinger_middle,bollinger_lower,diff_bollinger_upper,diff_bollinger_lower,diff_sma_3,diff_sma_6,diff_sma_12,signal,tag
0,0,1504272600000,390.01,390.26,389.22,389.95,31.36796,1504273499999,12230.265145,33,...,391.308471,389.311,387.313529,-1.358471,2.636471,-0.04,0.62,0.584167,1,1
1,1,1504273500000,390.6,391.0,389.33,390.5,44.61277,1504274399999,17415.334113,54,...,391.413983,389.352,387.290017,-0.913983,3.209983,0.346667,0.838333,1.0825,1,1
2,2,1504274400000,390.78,391.06,388.8,388.8,77.89239,1504275299999,30411.035081,87,...,390.844858,389.192,387.539142,-2.044858,1.260858,-0.95,-0.878333,-0.659167,1,1
3,3,1504275300000,388.81,388.81,386.73,386.83,58.66071,1504276199999,22756.111613,33,...,391.015535,389.061,387.106465,-4.185535,-0.276465,-1.88,-2.52,-2.365,1,1
4,4,1504276200000,387.58,388.51,386.88,387.8,25.3345,1504277099999,9824.113039,23,...,390.999972,388.9775,386.955028,-3.199972,0.844972,-0.01,-1.181667,-1.2575,1,1


In [5]:
# Drop the leftover "Unnamed: 0" column. Reassigning the result (instead of
# inplace=True) together with errors="ignore" keeps this cell re-runnable:
# the in-place drop raised KeyError when the cell was executed a second time.
# NOTE(review): "Unnamed: 0.1" also looks like a saved row index and is still
# kept as a model feature — confirm whether it should be dropped as well.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [6]:
# Keep only the rows whose tag equals 2 (tag legend: ETH = 1, BTC = 2).
# NOTE(review): the original comment said both BTC and ETH are used for
# training, but the filter keeps tag 2 only — confirm which is intended.
df = df.loc[df['tag'] == 2]
df.shape

(200854, 35)

In [8]:
# Count how many rows fall into each signal class.
df.groupby('signal').size()

signal
1    189482
2      5276
3      5158
4       469
6       469
dtype: int64

In [13]:
# Slice the data into sliding windows.
def create_time_series_data(data, window_size):
    """Turn a 2-D array into flattened sliding-window samples and labels.

    Each sample is `window_size` consecutive rows of `data` with the last
    column removed, flattened row by row into one vector. Its label is the
    second-to-last column of the row immediately after the window.
    Note that the second-to-last column is therefore also part of every
    window's input features.

    Parameters
    ----------
    data : 2-D numpy array, rows in time order.
    window_size : number of consecutive rows per sample.

    Returns
    -------
    (X, y) : X has shape (len(data) - window_size,
             window_size * (data.shape[1] - 1)); y has shape
             (len(data) - window_size,). Both are float arrays.
    """
    n_samples = len(data) - window_size
    n_inputs = window_size * (data.shape[1] - 1)

    X = np.zeros((n_samples, n_inputs))
    y = np.zeros(n_samples)

    for start in range(n_samples):
        stop = start + window_size
        # Rows [start, stop) without the last column form the input vector.
        X[start] = data[start:stop, :-1].flatten()
        # The row right after the window supplies the label (cast through int).
        y[start] = int(data[stop, -2])

    return X, y

# Build the windowed dataset from the filtered frame.
window_size = 200  # number of consecutive rows in each window
data = df.values
X, y = create_time_series_data(data, window_size)

# Flatten each window into one feature vector for XGBoost. X is already
# 2-D when it comes back from create_time_series_data, so this reshape
# leaves its shape unchanged.
# NOTE(review): X is a dense float64 array of
# (rows - window_size) x (window_size * (columns - 1)) values; check that it
# fits in memory for the full dataset.
X_flattened = X.reshape(len(X), -1)

In [None]:
# Chronological split: the first 70% of the samples form the training set and
# the remaining 30% the test set. Order is preserved (no shuffling).
split_boundary = int(len(X) * 0.7)
X_train, X_test = X_flattened[:split_boundary], X_flattened[split_boundary:]
y_train, y_test = y[:split_boundary], y[split_boundary:]

In [None]:
# 使用XGBoost模型进行训练
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


In [None]:
# Label 1 makes up most of the training data, so the rarer labels get larger
# sample weights to keep the model from focusing only on label 1.
def compute_weights(y, class_weights=None):
    """Return one sample weight per label in `y`.

    Parameters
    ----------
    y : array-like of labels (numpy array, list, pandas Series, ...).
    class_weights : optional mapping {label: weight}. Labels that are not in
        the mapping get weight 1. When omitted, the notebook's original
        weights are used: label 1 -> 1, labels 2 and 3 -> 24,
        labels 4 and 6 -> 53.

    Returns
    -------
    numpy float array with the same length as `y`.
    """
    if class_weights is None:
        class_weights = {1: 1, 2: 24, 3: 24, 4: 53, 6: 53}

    # Convert first so the comparison below is element-wise. With a plain
    # Python list, `y == label` is a single False and every sample silently
    # kept weight 1.
    labels = np.asarray(y)

    # Every sample starts at weight 1; only the listed labels are overridden.
    weights = np.ones(len(labels))
    for label, weight in class_weights.items():
        weights[labels == label] = weight

    return weights

In [None]:

# Map the raw signal labels onto consecutive integer class ids (0..K-1),
# fitted on the training labels only.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)

model = XGBClassifier(
    max_depth=6,
    objective='multi:softmax',  # multi-class classification
    # `eta` is an alias of `learning_rate`; the earlier version passed both
    # with different values (0.3 and 0.1). Only one rate is set now.
    learning_rate=0.1,
    # NOTE(review): newer XGBoost releases deprecate 'gpu_hist' in favour of
    # tree_method='hist' with device='cuda' — confirm against the installed
    # version.
    tree_method='gpu_hist',  # train on the GPU
    n_jobs=-1,  # use all available CPU threads
    # num_class is not hard-coded: the classifier derives the number of
    # classes from the encoded training labels, which may contain fewer than
    # five distinct classes in a chronological split.
    n_estimators=850,
)


# Per-sample weights computed from the raw (un-encoded) training labels.
weights = compute_weights(y_train)


model.fit(X_train, y_train_encoded, sample_weight=weights)

# Predict on the test set and map the class ids back to the original labels.
y_pred = model.predict(X_test)
predictions = encoder.inverse_transform(np.rint(y_pred).astype(int))

# Mask selecting the test samples whose true label is not 1.
mask = y_test != 1

# Restrict predictions and true labels to those samples.
predictions_filtered = predictions[mask]
y_test_filtered = y_test[mask]

# First printed figure: accuracy on the non-1 samples only.
accuracy = accuracy_score(y_test_filtered, predictions_filtered)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Second printed figure: accuracy over the whole test set.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))