In [88]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [89]:
data = pd.read_csv('test.csv')
data.head()

Unnamed: 0,股票代码,日期,开盘,收盘,最高,最低,成交量,成交额,振幅,涨跌额,换手率,涨跌幅
0,600000,2015-04-20,9.47,8.89,9.47,8.68,5724358,10446730000.0,8.42,-0.49,3.84,-5.22
1,600000,2015-04-21,8.79,9.07,9.1,8.79,3681947,6615541000.0,3.49,0.18,2.47,2.02
2,600000,2015-04-22,9.17,9.31,9.35,9.02,4207667,7712131000.0,3.64,0.24,2.82,2.65
3,600000,2015-04-23,9.31,9.12,9.41,9.04,3635936,6675542000.0,3.97,-0.19,2.44,-2.04
4,600000,2015-04-24,8.82,8.74,8.98,8.59,4229271,7509013000.0,4.28,-0.38,2.83,-4.17


异常值处理

In [90]:
def stock_feature_engineering(df):
    """
    这段代码用于处理股票数据中的异常值，分为两部分：

    1. **IQR 法处理价格异常值**  
       - 针对“开盘”、“最高”、“最低”、“收盘”四个价格列，使用四分位数间距（IQR）方法检测和修正异常值。
       - 具体做法是：  
         - 计算每列的第 1 四分位数（Q1）和第 3 四分位数（Q3）。
         - 计算 IQR = Q3 - Q1。
         - 定义下界为 Q1 - 1.5*IQR，上界为 Q3 + 1.5*IQR。
         - 将低于下界的值替换为下界，高于上界的值替换为上界，其他值保持不变。
       - 这样可以有效减少极端异常值对后续分析的影响。

    2. **Z-score 法处理成交量异常值**  
       - 对“成交量”列，采用 Z-score 方法检测异常值。
       - 计算成交量的均值和标准差，将绝对偏离均值超过 3 个标准差的值视为异常。
       - 对于异常值，用成交量的中位数进行替换。
       - 这种方法适合处理近似正态分布的数据，能有效缓解极端值的影响。

    整体来看，这段代码的目的是通过合理的异常值处理，提高数据质量，为后续的特征工程和建模打下基础。
     """
    df['日期']=pd.to_datetime(df['日期'])
      #IQR法处理价格异常值
    price=['开盘','最高','最低','收盘']
    for col in price:
        q1=df[col].quantile(0.25)
        q3=df[col].quantile(0.75)
        iqr=q3-q1
        lower_bound=q1-1.5*iqr
        upper_bound=q3+1.5*iqr
        df[col]=np.where(df[col]<lower_bound,lower_bound,np.where(df[col]>upper_bound,upper_bound,df[col]))


    #Z-score处理成交量异常值
    volume_mean=df['成交量'].mean()
    volume_std=df['成交量'].std()
    df['成交量']=np.where(np.abs(df['成交量']-volume_mean)>3*volume_std,df['成交量'].median(),df['成交量'])
    return df
df=stock_feature_engineering(data)

时间特征处理

In [91]:


df['周内日'] = df['日期'].dt.dayofweek
df['月份'] = df['日期'].dt.month
df['年份'] = df['日期'].dt.year
df['季度'] = df['月份'].map({
    12: 0, 1: 0, 2: 0,
    3: 1, 4: 1, 5: 1,
    6: 2, 7: 2, 8: 2,
    9: 3, 10: 3, 11: 3
})

In [92]:
#周期化处理
week_sum=df['日期'].dt.isocalendar().week
week_sum=week_sum%52
week_sum=week_sum.replace(0,52)

df['周sin']=np.sin(2*np.pi*week_sum/52)
df['周cos']=np.sin(2*np.pi*week_sum/52)

df['月sin']=np.sin(2*np.pi*week_sum/52)
df['月cos']=np.sin(2*np.pi*week_sum/52)

#归一化处理

def normalize_cyclical(col,period):
    return(col/period)-0.5
df['weekofyear']=normalize_cyclical(df['日期'].dt.isocalendar().week,52)

In [93]:
# 时间窗口统计
windows = [5, 20, 60]
for w in windows:
    df[f'收盘{w}'] = df['收盘'].rolling(w).mean()
    df[f'波动率{w}'] = df['收盘'].pct_change().rolling(w).std()



In [94]:
df

Unnamed: 0,股票代码,日期,开盘,收盘,最高,最低,成交量,成交额,振幅,涨跌额,...,周cos,月sin,月cos,weekofyear,收盘5,波动率5,收盘20,波动率20,收盘60,波动率60
0,600000,2015-04-20,9.470,8.89,9.47,8.68,212334.0,1.044673e+10,8.42,-0.49,...,0.885456,0.885456,0.885456,-0.173077,,,,,,
1,600000,2015-04-21,8.790,9.07,9.10,8.79,212334.0,6.615541e+09,3.49,0.18,...,0.885456,0.885456,0.885456,-0.173077,,,,,,
2,600000,2015-04-22,9.170,9.31,9.35,9.02,212334.0,7.712131e+09,3.64,0.24,...,0.885456,0.885456,0.885456,-0.173077,,,,,,
3,600000,2015-04-23,9.310,9.12,9.41,9.04,212334.0,6.675542e+09,3.97,-0.19,...,0.885456,0.885456,0.885456,-0.173077,,,,,,
4,600000,2015-04-24,8.820,8.74,8.98,8.59,212334.0,7.509013e+09,4.28,-0.38,...,0.885456,0.885456,0.885456,-0.173077,9.026,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637224,301269,2025-04-21,66.605,66.64,68.05,65.32,40838.0,4.997469e+08,3.96,2.93,...,0.885456,0.885456,0.885456,-0.173077,66.640,0.0,66.64,0.0,66.64,0.0
637225,301269,2025-04-22,66.605,66.64,68.05,65.32,31599.0,3.891299e+08,2.33,-1.11,...,0.885456,0.885456,0.885456,-0.173077,66.640,0.0,66.64,0.0,66.64,0.0
637226,301269,2025-04-23,66.605,66.64,68.05,65.32,34644.0,4.200321e+08,2.67,-1.60,...,0.885456,0.885456,0.885456,-0.173077,66.640,0.0,66.64,0.0,66.64,0.0
637227,301269,2025-04-24,66.605,66.64,68.05,65.32,28666.0,3.433121e+08,2.04,-1.80,...,0.885456,0.885456,0.885456,-0.173077,66.640,0.0,66.64,0.0,66.64,0.0
