In [None]:
import pandas as pd
main_df = pd.read_csv('data/main_df.csv')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np


In [None]:
# 定义特征集（排除close）
features = [
    'open', 'high', 'low', 'vwap',  # 价格特征（排除close）
    'volume', 'transactions',        # 交易量特征
    
    # 技术指标
    'RSI', 'RSI_Wilder',
    'BB_middle', 'BB_upper', 'BB_lower',
    'ATR', 'ATR_Pct',
    'volatility', 'log_returns',
    
    # 动量和趋势
    'price_accel', 
    'momentum_5', 'momentum_15', 'momentum_ratio',
    
    # 其他指标
    'high_break', 'low_break',
    'fractal_vol', 'vol_cluster',
    'mf_divergence', 'volume_conf',
    'avg_trade_size', 'large_trade',
    'trade_freq_z'
]

# 准备特征矩阵X和目标变量Y
X = main_df[features].copy()
y = main_df['close'].copy()

# 检查数据
print("特征矩阵形状:", X.shape)
print("目标变量形状:", y.shape)

# 检查缺失值
print("\n特征中的缺失值:")
null_counts = X.isnull().sum()
print(null_counts[null_counts > 0])

print("\n目标变量中的缺失值:")
print(y.isnull().sum())

# 显示基本统计信息
print("\n目标变量(close)的基本统计信息:")
print(y.describe())

# 保存特征数量信息
print(f"\n总特征数: {len(features)}")

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化随机森林模型
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# 训练模型
rf.fit(X_train, y_train)

# 预测
predictions = rf.predict(X_test)

# 计算MSE
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.01867968594068356


In [6]:


# 假设 X 是包含所有训练特征的 DataFrame
last_features_df = pd.DataFrame(X.iloc[-1]).T  # 转换最后一行为 DataFrame，保持列名

predictions = []
for _ in range(30):
    # 使用 DataFrame 进行预测以保持特征名称一致性
    next_value = rf.predict(last_features_df)
    
    # 取出预测结果的第一个元素
    next_value_scalar = next_value[0]
    
    # 更新特征 DataFrame 以用于下一个预测
    # 这里需要适当调整以匹配您的具体特征结构
    new_features = np.roll(last_features_df.values, -1)
    new_features[0, -1] = next_value_scalar
    last_features_df = pd.DataFrame(new_features, columns=last_features_df.columns)

    # 保存预测结果
    predictions.append(next_value_scalar)

print("预测的未来30个时间点的数据：", predictions)

预测的未来30个时间点的数据： [np.float64(2917.9318000000003), np.float64(2581.306899999999), np.float64(2571.084), np.float64(2541.2248999999997), np.float64(2540.0756999999994), np.float64(2818.0796999999984), np.float64(2844.966899999999), np.float64(2896.0524999999993), np.float64(2581.2404999999994), np.float64(2571.0002000000004), np.float64(2541.251499999999), np.float64(2540.064699999999), np.float64(2540.052799999999), np.float64(2540.0122999999994), np.float64(2540.089599999999), np.float64(2540.0645999999992), np.float64(2540.0645999999992), np.float64(2540.0018), np.float64(2540.0926), np.float64(2539.8624), np.float64(2539.8215999999993), np.float64(2539.9964999999997), np.float64(2540.2093999999997), np.float64(2540.2513), np.float64(2540.0015), np.float64(2818.144399999999), np.float64(2585.469199999999), np.float64(2587.8855999999987), np.float64(2556.9954999999995), np.float64(2549.814300000001)]
