In [None]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Drop the stocks (out of the initial 100) whose history is incomplete; 81 are expected to remain.
# Following the paper, a stock is kept only if its 2011-2020 (10 years) data has no gaps,
# which this script checks as "the CSV has exactly 2518 rows".
path = '100 stocks/'
# Build `stocks` from the files that survive the check, so that it never
# names a file that was just deleted from disk.
stocks = []
for i in os.listdir(path):
    if len(pd.read_csv(path + i)) != 2518:
        os.remove(path + i)
    else:
        stocks.append(i)

In [None]:
import talib
def _sign(x):
    """Return 1 if x > 0, -1 if x < 0 and 0 otherwise (NaN also maps to 0)."""
    return 1 if x > 0 else (-1 if x < 0 else 0)


def process_feature(df, output):
    """Compute technical indicators and trading signals for one stock.

    Args:
        df: raw price data with 'Open', 'High', 'Low', 'Close' and 'Volume'
            columns. The frame is not modified; all work happens on a copy.
        output: 'PLD' to get prices, indicators and signals with rows that
            contain missing values removed, or 'node_representation' to get
            the 11 signal columns plus a next-day 'label' column (no rows
            are dropped in this mode).

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding the columns
        selected by `output`.

    Raises:
        ValueError: if `output` is not one of the two supported modes.
    """
    if output not in ('PLD', 'node_representation'):
        raise ValueError(
            "output must be 'PLD' or 'node_representation', got %r" % (output,))

    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # CCI indicator
    df['CCI'] = talib.CCI(df['High'], df['Low'], df['Close'], timeperiod=14)

    # SAR indicator
    df['SAR'] = talib.SAR(df['High'], df['Low'], acceleration=0.02, maximum=0.2)

    # ADX indicator
    df['ADX'] = talib.ADX(df['High'], df['Low'], df['Close'], timeperiod=14)

    # MFI indicator
    df['MFI'] = talib.MFI(df['High'], df['Low'], df['Close'], df['Volume'], timeperiod=14)

    # RSI indicator
    df['RSI'] = talib.RSI(df['Close'], timeperiod=14)

    # SK / SD (slow stochastic %K and %D)
    df['SK'], df['SD'] = talib.STOCH(df['High'], df['Low'], df['Close'], fastk_period=5, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)

    # RSI signal: RSI > 70 -> sell (-1), RSI < 30 -> buy (1)
    df['RSI_S'] = 0
    df.loc[df['RSI'] > 70, 'RSI_S'] = -1
    df.loc[df['RSI'] < 30, 'RSI_S'] = 1

    # Bollinger bands signal: close above the upper band -> sell (-1),
    # close below the lower band -> buy (1). The middle band is not used.
    df['SD_upper'], _middle, df['SD_lower'] = talib.BBANDS(df['Close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    df['BB_S'] = 0
    df.loc[df['Close'] > df['SD_upper'], 'BB_S'] = -1
    df.loc[df['Close'] < df['SD_lower'], 'BB_S'] = 1

    # MACD signal: histogram crossing zero upwards -> buy (1),
    # crossing zero downwards -> sell (-1). Only the histogram is used.
    _macd, _signal, df['MACD_hist'] = talib.MACD(df['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    df['MACD_S'] = 0
    df.loc[(df['MACD_hist'] > 0) & (df['MACD_hist'].shift(1) < 0), 'MACD_S'] = 1
    df.loc[(df['MACD_hist'] < 0) & (df['MACD_hist'].shift(1) > 0), 'MACD_S'] = -1

    # SAR signal: close above SAR -> buy (1), close below SAR -> sell (-1)
    df['SAR_S'] = 0
    df.loc[df['Close'] > df['SAR'], 'SAR_S'] = 1
    df.loc[df['Close'] < df['SAR'], 'SAR_S'] = -1

    # ADX signal: ADX > 25 -> clear trend (1), ADX < 20 -> no clear trend (-1)
    df['ADX_S'] = 0
    df.loc[df['ADX'] > 25, 'ADX_S'] = 1
    df.loc[df['ADX'] < 20, 'ADX_S'] = -1

    # Stochastic signal: %K crossing above %D -> buy (1), crossing below -> sell (-1)
    df['S_S'] = 0
    df.loc[(df['SK'] > df['SD']) & (df['SK'].shift() < df['SD'].shift()), 'S_S'] = 1
    df.loc[(df['SK'] < df['SD']) & (df['SK'].shift() > df['SD'].shift()), 'S_S'] = -1

    # MFI signal: MFI > 80 -> sell (-1), MFI < 20 -> buy (1)
    df['MFI_S'] = 0
    df.loc[df['MFI'] > 80, 'MFI_S'] = -1
    df.loc[df['MFI'] < 20, 'MFI_S'] = 1

    # CCI signal: CCI > 100 -> sell (-1), CCI < -100 -> buy (1)
    df['CCI_S'] = 0
    df.loc[df['CCI'] > 100, 'CCI_S'] = -1
    df.loc[df['CCI'] < -100, 'CCI_S'] = 1

    # Volume sign: Sign(Volume - Avg(last 5 days)); the 5-row window includes the current row
    df['Average_Volume'] = df['Volume'].rolling(window=5).mean()
    df['Volume_Diff'] = df['Volume'] - df['Average_Volume']
    df['V_S'] = df['Volume_Diff'].apply(_sign)

    # Sign(CP - OP): close minus open of the same day
    df['CPOP'] = df['Close'] - df['Open']
    df['CPOP_S'] = df['CPOP'].apply(_sign)

    # Sign(CP - closing price yesterday)
    df['CPCPY'] = df['Close'].diff()
    df['CPCPY_S'] = df['CPCPY'].apply(_sign)

    if output == 'PLD':
        # Rows with missing values (indicator warm-up periods) are dropped.
        return df[['Open', 'High', 'Low', 'Close', 'Volume', 'CCI', 'SAR', 'ADX', 'MFI', 'RSI', 'SK', 'SD', 'RSI_S', 'BB_S', 'MACD_S', 'SAR_S', 'ADX_S', 'S_S', 'MFI_S', 'CCI_S', 'V_S', 'CPOP_S', 'CPCPY_S']].dropna().reset_index(drop=True)

    # 'node_representation': label is 1 when the next close is higher than
    # the current one. The last row has no next close, so its comparison is
    # False and its label is 0.
    df['label'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    return df[['RSI_S', 'BB_S', 'MACD_S', 'SAR_S', 'ADX_S', 'S_S', 'MFI_S', 'CCI_S', 'V_S', 'CPOP_S', 'CPCPY_S','label']].reset_index(drop=True)
    

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
def mix_ij(df_i, df_j):
    """Blend the data of two companies into one frame with the same columns.

    For every column of `df_i`, the result holds the element-wise average of
    that column in `df_i` and `df_j`.
    """
    cols = df_i.columns.tolist()
    averaged = {name: 1/2*(df_i[name]+df_j[name]) for name in cols}
    return pd.DataFrame(averaged, columns=cols)

def QDA_(df):
    """Return the hold-out accuracy of a QDA classifier trained on `df`.

    `df` must contain a 'target' column with the class labels; every other
    column is used as a feature. The first 80% of the rows (in their given
    order) train the model and the remaining rows evaluate it.
    """
    features = df.drop('target', axis=1)
    labels = df['target']
    # Chronological 80/20 split: no shuffling.
    cut = int(len(df)*0.8)
    # Standardize using statistics from the training rows only.
    scaler = StandardScaler()
    train_x = scaler.fit_transform(features[:cut])
    test_x = scaler.transform(features[cut:])
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(train_x, labels[:cut])
    # Accuracy on the held-out rows.
    return classifier.score(test_x, labels[cut:])

def diff_acc(df_i, df_ij):
    """Return how much QDA accuracy improves when company i's data is blended.

    Args:
        df_i: feature frame of company i; must contain a 'Close' column.
        df_ij: blended feature frame of companies i and j, row-aligned with
            `df_i`.

    Returns:
        QDA accuracy on the blended data minus QDA accuracy on `df_i` alone.

    Neither input frame is modified.
    """
    # Label: 1 if the next close is higher than the current close, else 0.
    target = (df_i['Close'].shift(-1) > df_i['Close']).astype(int)
    # Attach the label to copies (assign) rather than writing into the
    # callers' frames; the last row is dropped because it has no next close.
    labelled_i = df_i.assign(target=target)[:-1]
    labelled_ij = df_ij.assign(target=target)[:-1]
    acc_i = QDA_(labelled_i)
    acc_ij = QDA_(labelled_ij)
    return acc_ij - acc_i

def influence(df_i, df_j):
    """Return the edge weight between company i and company j.

    Both raw frames are turned into 'PLD' features, blended with mix_ij, and
    the weight is the mean of the two accuracy gains reported by diff_acc
    (blended vs. company i alone, and blended vs. company j alone).
    """
    feats_i = process_feature(df_i, 'PLD')
    feats_j = process_feature(df_j, 'PLD')
    blended = mix_ij(feats_i, feats_j)
    gain_i = diff_acc(feats_i, blended)
    gain_j = diff_acc(feats_j, blended)
    return 1/2*(gain_i+gain_j)

import networkx as nx
import itertools
# Input: observation date (only data up to that date is used); output: connected company graph
import functools
@functools.lru_cache(maxsize=128)
def create_graph(observe_date):
    """Build the company relation graph from data dated <= `observe_date`.

    Every pair of stocks gets an edge weighted by influence() when that
    weight is positive. The lightest edges are then removed one at a time for
    as long as the graph stays connected.

    Args:
        observe_date: date string compared against the 'Date' column of each
            stock CSV.

    Returns:
        The pruned, connected networkx Graph, or None (after printing
        'NOT connected!!!') when the positive-weight graph is empty or not
        connected to begin with.

    Results are memoized per date by lru_cache, so the same Graph object is
    returned on repeated calls; in-place changes to it are visible to later
    callers.
    """
    # Load and date-filter each stock once instead of once per pair.
    frames = {}
    for name in stocks:
        df = pd.read_csv(path+name)
        frames[name] = df[df['Date'] <= observe_date].sort_values('Date')

    # Undirected graph holding only the positive-weight edges
    # (i.e. negative-weight edges are discarded).
    G = nx.Graph()
    for i, j in itertools.combinations(stocks, 2):
        # Pass copies so the cached frames stay untouched whatever
        # influence() does with its arguments.
        w = influence(frames[i].copy(), frames[j].copy())
        if w > 0:
            G.add_edge(i.split('.')[0], j.split('.')[0], weight=w)

    # nx.is_connected raises on an empty graph, and pruning below needs a
    # connected starting point.
    if G.number_of_nodes() == 0 or not nx.is_connected(G):
        print('NOT connected!!!')
        return None

    # Remove edges from lightest to heaviest (ties keep the graph's edge
    # order); the first removal that disconnects the graph is undone and
    # pruning stops there.
    for u, v, w in sorted(G.edges(data='weight'), key=lambda edge: edge[2]):
        G.remove_edge(u, v)
        if not nx.is_connected(G):
            G.add_edge(u, v, weight=w)
            break
    return G
def normalize_graph(G):
    """Scale every edge weight of `G` in place so the largest weight is 1.

    Args:
        G: graph whose `edges(data=True)` yields (u, v, attrs) triples with a
            'weight' entry in attrs.

    Returns:
        None; `G` is modified directly. A graph without edges, or whose
        largest weight is 0, is left unchanged.
    """
    weights = [d['weight'] for _, _, d in G.edges(data=True)]
    if not weights:
        # Nothing to scale; max() of an empty list would raise.
        return
    max_w = max(weights)
    if max_w == 0:
        # Avoid dividing by zero.
        return
    for _, _, d in G.edges(data=True):
        d['weight'] /= max_w
    

In [None]:
# Build the relation graph from data up to 2018-07-27, scale its weights
# with normalize_graph, and draw a small sub-graph of it.
G = create_graph('2018-07-27')
normalize_graph(G)
import matplotlib.pyplot as plt
# Pick stocks from the tech / internet sector to demonstrate the 4 stages of the plot
node_list = ['AMZN', 'TSLA', 'INTC', 'GOOG', 'NVDA', 'NTES']
subgraph = G.subgraph(node_list)
plt.figure(figsize=(50,50))
# Edge weights rounded to 2 decimals; used both as labels and (x50) as line widths
weights = nx.get_edge_attributes(subgraph, 'weight')
for i in weights:
    weights[i] = round(weights[i],2)
pos = nx.spring_layout(subgraph)
nx.draw_networkx_nodes(subgraph, pos, node_size=20000, node_color='lightblue')
nx.draw_networkx_labels(subgraph, pos, font_size=40, font_family='sans-serif')
nx.draw_networkx_edges(
    subgraph, 
    pos, 
    width=[w * 50 for w in weights.values()], 
    edgelist=weights.keys(),
    edge_color='black')
nx.draw_networkx_edge_labels(subgraph, pos, edge_labels=weights, font_size=50,label_pos=0.5, font_family='sans-serif')
plt.axis('off')
plt.show()

In [None]:
def density_score(G):
    """Return a dict mapping every node of the undirected graph to its density.

    For a node with at least two neighbours, each unordered pair (j, k) of
    neighbours contributes the cube root of w_j,node * w_k,node * w_jk (with
    w_jk taken as 0 when j and k are not connected); the sum is divided by
    deg * (deg - 1). Nodes with fewer than two neighbours get density 0.
    """
    densities = {}
    for node in G.nodes():
        neighbors = list(G.neighbors(node))
        degree = len(neighbors)
        if degree < 2:
            densities[node] = 0
            continue
        triangle_sum = 0
        for j, k in itertools.combinations(neighbors, 2):
            # Weight of the edge closing the triangle, or 0 if it is missing.
            closing = G[j][k]['weight'] if G.has_edge(j, k) else 0
            triangle_sum += (G[j][node]['weight'] * G[k][node]['weight'] * closing) ** (1/3)
        densities[node] = triangle_sum / (degree * (degree - 1))
    return densities

In [None]:
# Candidate classifiers used by PLD
def LDA(X_train, X_test, y_train):
    """Fit linear discriminant analysis; return (fitted model, predictions for X_test)."""
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def DT(X_train, X_test, y_train):
    """Fit a decision tree; return (fitted model, predictions for X_test)."""
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier()
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def GNB(X_train, X_test, y_train):
    """Fit Gaussian naive Bayes; return (fitted model, predictions for X_test)."""
    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def QDA(X_train, X_test, y_train):
    """Fit quadratic discriminant analysis; return (fitted model, predictions for X_test)."""
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def RFC(X_train, X_test, y_train):
    """Fit a 100-tree random forest; return (fitted model, predictions for X_test)."""
    from sklearn.ensemble import RandomForestClassifier
    classifier = RandomForestClassifier(n_estimators=100)
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def MLP(X_train, X_test, y_train):
    """Fit a multi-layer perceptron with hidden layers (100, 50).

    Returns (fitted model, predictions for X_test).
    """
    from sklearn.neural_network import MLPClassifier
    classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=50000)
    classifier.fit(X_train, y_train)
    return classifier, classifier.predict(X_test)

def adjust_score(y_pred, y_test, c):
    """Return a recency-weighted count of correct predictions.

    A correct prediction at position i (out of len(y_test)) contributes
    (1 - c) ** (len(y_test) - 1 - i), so the last position counts 1 and
    earlier positions count progressively less for 0 < c < 1.
    """
    length = len(y_test)
    return sum(
        (1-c)**(length-1-i)
        for i in range(length)
        if y_pred[i] == y_test[i]
    )

def max_score(df):
    """Pick the best-scoring classifier for one stock.

    `df` must contain a 'target' column; all other columns are features. The
    last 10 rows are held out for scoring, the rest train each candidate
    (LDA, DT, GNB, QDA, RFC, MLP, in that order).

    Returns:
        (best_model, scaler, predictability): the first candidate reaching
        the highest adjust_score value, the StandardScaler fitted on the
        training rows, and that highest score.
    """
    holdout = 10
    features = df.drop('target', axis=1)
    target = df['target']
    y_train = target[:-holdout]
    y_test = target[-holdout:].tolist()
    # Standardize with statistics from the training rows only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(features[:-holdout])
    X_test = scaler.transform(features[-holdout:])
    # c controls how strongly held-out days close to the prediction date are favoured.
    c = 0.2
    scored = []  # (model, score) tuples
    for trainer in (LDA, DT, GNB, QDA, RFC, MLP):
        model, y_pred = trainer(X_train, X_test, y_train)
        scored.append((model, adjust_score(y_pred, y_test, c)))
    predictability = max(score for _, score in scored)
    best_model = next(model for model, score in scored if score == predictability)
    return best_model, scaler, predictability

def PLD(predict_date, n, densities):
    """Predict next-day movement labels for the top-n% stocks by privilege.

    Args:
        predict_date: date string; only rows with Date <= predict_date are used.
        n: percentage of stocks to keep (the top n% by privilege).
        densities: dict mapping a stock name (file name up to the first '.')
            to its density in the graph.

    Returns:
        dict mapping stock name -> predicted binary label for the kept stocks.
        Privilege is density * predictability, where predictability comes
        from max_score.
    """
    ranked = []
    for fname in stocks:
        history = pd.read_csv(path+fname)
        history = history[history['Date']<=predict_date].sort_values('Date').tail(100)
        # The 70-row window is tied to the observation window T; max_score
        # currently scores on the last 10 days.
        df = process_feature(history, 'PLD').tail(70)
        df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
        # The last row has no known target yet, so it is excluded from fitting.
        best_model, scaler, predictability = max_score(df[:-1])
        privilege = densities[fname.split('.')[0]] * predictability
        latest = scaler.transform(df.drop('target', axis=1).tail(1))
        ranked.append((fname, privilege, best_model.predict(latest)[0]))
    # Highest privilege first.
    ranked.sort(key=lambda item: item[1], reverse=True)
    # Keep the top n percent.
    keep = int(len(ranked) * n / 100)
    return {fname.split('.')[0]: label for fname, _, label in ranked[:keep]}


In [None]:
# Build the graph for every month needed by the experiments ahead of time and save it.
import pickle  # this cell runs before the file's other `import pickle`, so import it here
for i in ['2020-09-01','2020-10-01','2020-11-01','2020-12-01',]:
    G = create_graph(i)  # the graph can be refreshed monthly
    normalize_graph(G)
    # Persist the graph to disk.
    with open('graph_'+i+'.pkl', 'wb') as f:
        pickle.dump(G, f)

In [None]:
import datetime
def between_dates(start_date,end_date):
    """Return every date from start_date to end_date, both included.

    Both arguments and every returned item are strings of the form
    'YYYY-MM-DD' (e.g. '2002-10-30'). The list is empty when end_date is
    earlier than start_date.
    """
    fmt = '%Y-%m-%d'
    first = datetime.datetime.strptime(start_date, fmt).date()
    last = datetime.datetime.strptime(end_date, fmt).date()
    span = (last - first).days
    return [
        (first + datetime.timedelta(days=offset)).strftime(fmt)
        for offset in range(span + 1)
    ]


In [None]:
# 使用DGL实现GCN
import torch
import dgl
import torch.nn as nn
import torch.nn.functional as F

# GCN model definition
class GCN(nn.Module):
    """Three-layer graph convolutional network producing 2-class probabilities.

    Layer sizes are 11 -> 4 -> 4 -> 2, where 11 is the length of each node's
    feature vector. Dropout (p=0.5, then p=0.2) follows the first two layers.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): forward() never passes edge weights to these layers;
        # confirm whether the graph's edge weights are meant to be used.
        self.conv1 = dgl.nn.GraphConv(11, 4, weight=True)
        self.dropout1 = nn.Dropout(p=0.5)
        self.conv2 = dgl.nn.GraphConv(4, 4, weight=True)
        self.dropout2 = nn.Dropout(p=0.2)
        self.conv3 = dgl.nn.GraphConv(4, 2, weight=True)

    def reset_parameters(self):
        """Re-initialize the three weight matrices with Glorot (Xavier) uniform."""
        gain = nn.init.calculate_gain('relu')
        for layer in (self.conv1, self.conv2, self.conv3):
            nn.init.xavier_uniform_(layer.weight, gain=gain)

    def forward(self, g, x):
        """Return per-node class probabilities (softmax over dim 1) for graph g and features x."""
        hidden = self.dropout1(F.relu(self.conv1(g, x)))
        hidden = self.dropout2(F.relu(self.conv2(g, hidden)))
        return F.softmax(self.conv3(g, hidden), dim=1)

In [None]:
def normalize_gcn(features):
    """Standardize node features column-wise.

    The input is cast to float, then each column has its mean (over dim 0)
    subtracted and is divided by its standard deviation plus 1e-8.
    """
    values = features.float()
    column_mean = values.mean(dim=0)
    column_std = values.std(dim=0)
    return (values - column_mean) / (column_std + 1e-8)

# Training loop
def train_gcn(gcn, n_epochs, g):
    """Train `gcn` on the labelled nodes of graph `g` for `n_epochs` epochs.

    Args:
        gcn: module called as gcn(g, features); it is expected to return
            per-class probabilities for every node (GCN.forward ends with a
            softmax).
        n_epochs: number of full-graph optimisation steps.
        g: graph exposing g.ndata['feat'] (node features) and
            g.ndata['label'] (class index per node, negative for nodes
            without a label).

    Returns:
        None; the parameters of `gcn` are updated in place. Nothing happens
        when no node carries a label.
    """
    labels = g.ndata['label']
    # Only nodes with a non-negative label take part in the loss.
    mask = labels >= 0
    if not mask.any():
        # A mean loss over zero nodes is NaN; skip training altogether.
        return

    optimizer = torch.optim.Adam(gcn.parameters(), lr=0.01, weight_decay=1e-5)
    # The model already outputs probabilities, so the loss is the negative
    # log-likelihood of their logarithm. nn.CrossEntropyLoss expects raw
    # scores and would apply a second softmax on top.
    loss_fn = nn.NLLLoss()
    gcn.train()  # training mode (enables dropout)
    for epoch in range(n_epochs):
        probs = gcn(g, g.ndata['feat'])
        # Clamp before log so an exact 0 probability cannot produce -inf.
        log_probs = torch.log(probs[mask].clamp_min(1e-12))
        loss = loss_fn(log_probs, labels[mask])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def test_gcn(gcn, g, labels):
    """Evaluate `gcn` on every node of `g` against `labels` (a torch tensor).

    Predictions are made for all nodes, including those without a training
    label. Returns (accuracy, Matthews correlation coefficient).
    """
    gcn.eval()  # evaluation mode
    with torch.no_grad():
        scores = gcn(g, g.ndata['feat'])
        preds = scores.argmax(dim=1)
        acc = (preds == labels).float().mean().item()
        from sklearn.metrics import matthews_corrcoef
        mcc = matthews_corrcoef(labels.numpy(), preds.numpy())
    return acc, mcc


In [None]:
def tran_G(g_nx):
    """Convert a weighted networkx graph into a DGL graph with both edge directions.

    Node names are mapped to integer ids in g_nx.nodes order. Each undirected
    edge is added once as (u, v) and once as (v, u); the 'weight' edge data
    holds the weights of the forward edges followed by the same weights for
    the reversed edges.
    """
    node_ids = {name: idx for idx, name in enumerate(g_nx.nodes)}
    forward_weights = [attrs['weight'] for _, _, attrs in g_nx.edges(data=True)]
    pairs = [(node_ids[u], node_ids[v]) for u, v in g_nx.edges]
    g_dgl = dgl.DGLGraph(pairs)
    # Add the reversed copy of every edge.
    g_dgl.add_edges(*zip(*[(dst, src) for src, dst in pairs]))
    g_dgl.edata['weight'] = torch.tensor(forward_weights + forward_weights)
    return g_dgl

import pickle
def experiment_gcn(predict_date, epoch_num, n):
    """Run one GCN experiment for `predict_date`.

    Five copies of the saved monthly graph are filled with node features for
    days t-4 .. t (t = predict_date). Days t-4 .. t-1 carry the observed
    next-day labels; day t carries the PLD predictions for the top-n% stocks
    and -1 elsewhere. A fresh GCN is trained on the five graphs in order and
    then evaluated on the day-t graph against the actual next-day movement.

    Args:
        predict_date: 'YYYY-MM-DD' date string; the graph file
            'graph_<YYYY-MM>-01.pkl' of its month is loaded with pickle.
        epoch_num: number of training epochs per graph.
        n: percentage of stocks labelled by PLD.

    Returns:
        (acc, mcc) as returned by test_gcn.
    """
    feature_cols = ['RSI_S', 'BB_S', 'MACD_S', 'SAR_S', 'ADX_S', 'S_S', 'MFI_S', 'CCI_S', 'V_S', 'CPOP_S', 'CPCPY_S']
    # Row offsets from the end of the history: 5 -> t-4, ..., 1 -> t.
    offsets = (5, 4, 3, 2, 1)

    # Load the pre-built graph from file.
    with open('graph_'+predict_date[:7]+'-01.pkl', 'rb') as f:
        G = pickle.load(f)

    id_dict = {name: idx for idx, name in enumerate(G.nodes)}
    graphs = [tran_G(G) for _ in offsets]
    node_feats = [{} for _ in offsets]   # per graph: node id -> feature tensor
    node_labels = [{} for _ in offsets]  # per graph: node id -> label
    true_labels = {}                     # node id -> actual movement after predict_date

    densities = density_score(G)
    prediction = PLD(predict_date, n, densities)

    for fname in stocks:
        raw = pd.read_csv(path+fname).sort_values('Date')
        ticker = fname.split('.')[0]
        node = id_dict[ticker]

        # Ground truth for evaluation: did the close rise on the day after predict_date?
        next_day_up = (raw['Close'].shift(-1) > raw['Close']).astype(int)
        true_labels[node] = next_day_up[raw['Date'] == predict_date].values[0]

        df = process_feature(raw[raw['Date'] <= predict_date], 'node_representation')
        signals = df[feature_cols]
        for k, offset in enumerate(offsets):
            # Cast to float32 so the graph convolutions receive floating-point input.
            node_feats[k][node] = torch.tensor(signals.iloc[-offset].values, dtype=torch.float32).view(11)
            if offset > 1:
                # Past days: the next-day label is already observable.
                node_labels[k][node] = df['label'].iloc[-offset]
            else:
                # Day t: PLD prediction if the stock was selected, else -1 (unlabelled).
                node_labels[k][node] = prediction.get(ticker, -1)

    # Attach features and labels to each graph, ordered by node id.
    for k, graph in enumerate(graphs):
        graph.ndata['feat'] = torch.stack([node_feats[k][i] for i in range(len(stocks))])
        graph.ndata['label'] = torch.tensor([node_labels[k][i] for i in range(len(stocks))])

    # Train one model on t-4, ..., t in chronological order.
    gcn = GCN()
    gcn.reset_parameters()
    for graph in graphs:
        train_gcn(gcn, epoch_num, graph)

    labels = torch.tensor([true_labels[key] for key in sorted(true_labels)])
    acc, mcc = test_gcn(gcn, graphs[-1], labels)
    return acc, mcc


In [None]:
df = pd.read_csv(path+stocks[0])
date_list = df[(df['Date']>='2020-12-01') & (df['Date']<'2020-12-31')].Date.tolist()

# Single trial run. The three arguments are defined here so the cell can run
# on its own; 30 epochs and top 20% are the settings of the full experiment.
predict_date = date_list[0]
epoch_num = 30
n = 20
experiment_gcn(predict_date, epoch_num, n)

In [None]:
# Full experiment: for every date listed in the first stock's CSV within
# [2020-09-30, 2020-12-31), run experiment_gcn 10 times and average the metrics.
df = pd.read_csv(path+stocks[0])
date_list = df[(df['Date']>='2020-09-30') & (df['Date']<'2020-12-31')].Date.tolist()

accs = []  
mccs = []
for predict_date in date_list:
    for i in range(10): # repeat 10 independent runs
        acc, mcc = experiment_gcn(predict_date, 30, 20)
        accs.append(acc)
        mccs.append(mcc)
print((sum(accs) / len(accs), sum(mccs) / len(mccs))) 
# Recorded result: (56.07,0.0016)
# Took about 6 hours