In [1]:
# import pandas as pd
# import matplotlib as plt
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [2]:
# data = pd.read_csv('test.csv')
# data.head()

异常值处理

In [3]:
# def stock_feature_engineering(df):
#     """
#     这段代码用于处理股票数据中的异常值，分为两部分：

#     1. **IQR 法处理价格异常值**  
#        - 针对“开盘”、“最高”、“最低”、“收盘”四个价格列，使用四分位数间距（IQR）方法检测和修正异常值。
#        - 具体做法是：  
#          - 计算每列的第 1 四分位数（Q1）和第 3 四分位数（Q3）。
#          - 计算 IQR = Q3 - Q1。
#          - 定义下界为 Q1 - 1.5*IQR，上界为 Q3 + 1.5*IQR。
#          - 将低于下界的值替换为下界，高于上界的值替换为上界，其他值保持不变。
#        - 这样可以有效减少极端异常值对后续分析的影响。

#     2. **Z-score 法处理成交量异常值**  
#        - 对“成交量”列，采用 Z-score 方法检测异常值。
#        - 计算成交量的均值和标准差，将绝对偏离均值超过 3 个标准差的值视为异常。
#        - 对于异常值，用成交量的中位数进行替换。
#        - 这种方法适合处理近似正态分布的数据，能有效缓解极端值的影响。

#     整体来看，这段代码的目的是通过合理的异常值处理，提高数据质量，为后续的特征工程和建模打下基础。
#      """
#     df['日期']=pd.to_datetime(df['日期'])
#       #IQR法处理价格异常值
#     price=['开盘','最高','最低','收盘']
#     for col in price:
#         q1=df[col].quantile(0.25)
#         q3=df[col].quantile(0.75)
#         iqr=q3-q1
#         lower_bound=q1-1.5*iqr
#         upper_bound=q3+1.5*iqr
#         df[col]=np.where(df[col]<lower_bound,lower_bound,np.where(df[col]>upper_bound,upper_bound,df[col]))


#     #Z-score处理成交量异常值
#     volume_mean=df['成交量'].mean()
#     volume_std=df['成交量'].std()
#     df['成交量']=np.where(np.abs(df['成交量']-volume_mean)>3*volume_std,df['成交量'].median(),df['成交量'])
#     return df
# df=stock_feature_engineering(data)

时间特征处理

In [4]:


# df['周内日'] = df['日期'].dt.dayofweek
# df['月份'] = df['日期'].dt.month
# df['年份'] = df['日期'].dt.year
# df['季度'] = df['月份'].map({
#     12: 0, 1: 0, 2: 0,
#     3: 1, 4: 1, 5: 1,
#     6: 2, 7: 2, 8: 2,
#     9: 3, 10: 3, 11: 3
# })

In [5]:
# #周期化处理
# week_sum=df['日期'].dt.isocalendar().week
# week_sum=week_sum%52
# week_sum=week_sum.replace(0,52)

# df['周sin']=np.sin(2*np.pi*week_sum/52)
# df['周cos']=np.sin(2*np.pi*week_sum/52)

# df['月sin']=np.sin(2*np.pi*week_sum/52)
# df['月cos']=np.sin(2*np.pi*week_sum/52)

# #归一化处理

# def normalize_cyclical(col,period):
#     return(col/period)-0.5
# df['weekofyear']=normalize_cyclical(df['日期'].dt.isocalendar().week,52)

In [6]:
# # 时间窗口统计
# windows = [5, 20, 60]
# for w in windows:
#     df[f'收盘{w}'] = df['收盘'].rolling(w).mean()
#     df[f'波动率{w}'] = df['收盘'].pct_change().rolling(w).std()



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import GradScaler, autocast

import math

# 1. 数据预处理模块
class StockData:
    def __init__(self, filepath):
        df = pd.read_csv(filepath)
        # 自动查找日期列名
        date_col = None
        for col in df.columns:
            if col.lower() in ['date', '日期']:
                date_col = col
                break
        if date_col is not None:
            df[date_col] = pd.to_datetime(df[date_col])
            df = df.set_index(date_col)
        # 自动适配英文或中文列名
        col_map = {
            'open': ['open', '开盘'],
            'high': ['high', '最高'],
            'low': ['low', '最低'],
            'close': ['close', '收盘'],
            'volume': ['volume', '成交量']
        }
        for std_col, candidates in col_map.items():
            for c in candidates:
                if c in df.columns:
                    df.rename(columns={c: std_col}, inplace=True)
                    break
        self.data = df
        self.scaler = MinMaxScaler()
        
    def create_features(self):
        df = self.data.copy()
        # 检查必须的列
        for col in ['high', 'low', 'close', 'volume']:
            if col not in df.columns:
                raise ValueError(f"数据缺少必要列: {col}")
        df['HL_PCT'] = (df['high'] - df['low']) / df['close'] * 100
        df['PCT_change'] = df['close'].pct_change()
        windows = [5, 10, 20, 50]
        for w in windows:
            df[f'MA_{w}'] = df['close'].rolling(w).mean()
            df[f'VOL_{w}'] = df['volume'].rolling(w).mean()
        df['target'] = df['close'].shift(-1)
        df = df.dropna()
        features = [col for col in df.columns if col != 'target']
        df[features] = df[features].replace([np.inf, -np.inf], np.nan).fillna(0)
        df[features] = self.scaler.fit_transform(df[features])
        return df[features], df['target']

# PositionalEncoding实现
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (batch, seq_len, d_model)
        x = x + self.pe[:, :x.size(1), :]
        return x

# 2. Transformer模型实现
class StockTransformer(nn.Module):
    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2):
        super().__init__()
        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=nhead,
            dim_feedforward=d_model*4
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, src):
        src = self.encoder(src) * math.sqrt(self.encoder.out_features)
        src = self.pos_encoder(src.unsqueeze(1)).squeeze(1)
        output = self.transformer(src.unsqueeze(1))
        return self.decoder(output.mean(dim=1)).squeeze()

# 3. 训练与可视化模块
class ModelTrainer:
    def __init__(self, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.device = device
        self.writer = SummaryWriter()
        self.scaler = GradScaler()  # 混合精度训练
        
    def train(self, X_train, y_train, epochs=100, lr=0.001):
        X = torch.FloatTensor(X_train.values).to(self.device)
        y = torch.FloatTensor(y_train.values).to(self.device)
        
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=lr)
        
        for epoch in range(epochs):
            self.model.train()
            optimizer.zero_grad()
            
            with autocast():  # 自动混合精度
                outputs = self.model(X)
                loss = criterion(outputs, y)
            
            self.scaler.scale(loss).backward()
            self.scaler.step(optimizer)
            self.scaler.update()
            
            # 记录指标
            self.writer.add_scalar('Loss/train', loss.item(), epoch)
            
            if epoch % 10 == 0:
                print(f'Epoch {epoch}, Loss: {loss.item():.4f}')
        
        self.writer.close()

    def evaluate(self, X_test, y_test):
        with torch.no_grad():
            X = torch.FloatTensor(X_test.values).to(self.device)
            preds = self.model(X).cpu().numpy()
            
            plt.figure(figsize=(12,6))
            plt.plot(y_test.index, y_test, label='True')
            plt.plot(y_test.index, preds, label='Predicted')
            plt.legend()
            plt.show()
            
            return preds

# 4. 主程序
def main():
    # 数据准备
    data = StockData('train.csv')
    X, y = data.create_features()
    
    # 划分数据集
    split = int(len(X)*0.8)
    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y.iloc[:split], y.iloc[split:]
    
    # 训练配置
    model = StockTransformer(input_dim=X.shape[1])
    trainer = ModelTrainer(model)
    
    # 训练与评估
    trainer.train(X_train, y_train)
    preds = trainer.evaluate(X_test, y_test)

if __name__ == '__main__':
    main()

2025-06-03 17:41:28.237137: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-03 17:41:28.479553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748943688.565306     736 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748943688.586124     736 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748943688.761810     736 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 0, Loss: 9399.9844
Epoch 10, Loss: 9285.1738
Epoch 20, Loss: 9148.1992
