In [2]:
import json
import re
import os

def clean_comments(comments):
    cleaned_comments = []
    for comment in comments:
        # 移除 @用户名 和换行符
        cleaned_comment = re.sub(r'@\w+', '', comment)
        cleaned_comment = cleaned_comment.replace('\n', '')
        cleaned_comment = cleaned_comment.strip()
        if cleaned_comment:
            cleaned_comments.append(cleaned_comment)
    return cleaned_comments

def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    for item in data:
        item['comments'] = clean_comments(item['comments'])
    new_file_path = file_path.replace('.json', '_cleaned.json')
    with open(new_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    print(f"Processed and saved: {new_file_path}")

def main(folder_path):
    files = ['train.json', 'val.json', 'test.json']
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        if os.path.exists(file_path):
            process_file(file_path)
        else:
            print(f"File not found: {file_path}")

if __name__ == "__main__":
    folder_path = r'D:\project\WWW2021-master\dataset\RumourEval-19-edit'  # 修改为您的文件夹路径
    main(folder_path)


Processed and saved: D:\project\WWW2021-master\dataset\RumourEval-19-edit\train_cleaned.json
Processed and saved: D:\project\WWW2021-master\dataset\RumourEval-19-edit\val_cleaned.json
Processed and saved: D:\project\WWW2021-master\dataset\RumourEval-19-edit\test_cleaned.json


In [2]:
import os
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import json
from tqdm.notebook import tqdm
import time

# 初始化 BERT 分词器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to('cuda' if torch.cuda.is_available() else 'cpu')
print(model)
def bert_embed(text, max_length=128):
    # 对文本进行编码，并限制最大长度
    inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True, padding='max_length')
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # 返回BERT模型的最后一层隐藏状态
    return outputs.last_hidden_state.squeeze(0)
def process_batch(data_batch, batch_index, save_dir, max_comments=5000):
    processed_batch_list = []

    # 对每个批次的数据进行处理
    for data in data_batch:
        # 计算内容的嵌入表示
        content_embedding = bert_embed(data['content']).to('cuda')
        #print(f'Content embedding shape: {content_embedding.shape}')

        # 检查评论数量，如果超过5000则仅使用前5000条
        comments = data['comments'][:max_comments]

        # 计算评论的嵌入表示
        comments_embeddings = torch.stack([bert_embed(comment).to('cuda') for comment in comments])
        #print(f'Comments embeddings shape: {comments_embeddings.shape}')

        # 计算平均池化和最大池化特征
        mean_pooling = torch.mean(comments_embeddings, dim=0)
        #print(f'Mean pooling shape: {mean_pooling.shape}')
        max_pooling = torch.max(comments_embeddings, dim=0).values
        #print(f'Max pooling shape: {max_pooling.shape}')

        # 计算语义差特征
        semantic_gap_mean = content_embedding - mean_pooling
        #print(f'Semantic gap mean shape: {semantic_gap_mean.shape}')
        semantic_gap_max = content_embedding - max_pooling
        #print(f'Semantic gap max shape: {semantic_gap_max.shape}')

        # 连接所有特征形成最终特征
        final_feature = torch.cat([content_embedding, mean_pooling, max_pooling, semantic_gap_mean, semantic_gap_max])
        #print(f'Final feature shape: {final_feature.shape}')
        
        processed_batch_list.append(final_feature)

    # 将处理后的批次数据保存到文件
    batch_file_name = f'batch_{batch_index}.npy'
    batch_file_path = os.path.join(save_dir, batch_file_name)

    # Move the stacked tensor to CPU before converting to NumPy
    np.save(batch_file_path, torch.stack(processed_batch_list).cpu().numpy())

    # 清理内存
    torch.cuda.empty_cache()
    del processed_batch_list
    return batch_file_name



def merge_batches(file_list, output_file_path):
    # 合并所有批次文件中的数据
    batch_data = [np.load(file) for file in file_list]
    merged_data = np.concatenate(batch_data, axis=0)
    # 将合并后的数据保存到一个文件
    np.save(output_file_path, merged_data)

# 设置批处理大小
batch_size = 1
save_dir = './data'
# 创建保存目录
if not os.path.exists(save_dir):
    os.mkdir(save_dir)
datasets_ch = ['RumourEval-19-2']
for dataset in datasets_ch:
    print(f'\n\n{"-"*20} [{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}] Processing the dataset: {dataset} {"-"*20}\n')
    # 指定数据集目录
    data_dir = os.path.join('../../dataset', dataset)
    output_dir = os.path.join(save_dir, dataset)
    # 创建输出目录
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    processed_data_dir = os.path.join(output_dir, 'processed')
    # 创建处理后数据的保存目录
    if not os.path.exists(processed_data_dir):
        os.mkdir(processed_data_dir)
    # 加载数据集的训练、验证和测试部分
    split_datasets = {
        t: json.load(open(os.path.join(data_dir, f'{t}.json'), 'r', encoding='utf-8'))
        for t in ['train','test','val']
    }

    for split, data in split_datasets.items():
        # 为每个数据分割设置批处理文件目录
        batch_dir = os.path.join(processed_data_dir, split)
        if not os.path.exists(batch_dir):
            os.mkdir(batch_dir)

        file_list = []
        # 处理数据并保存为批次文件
        for batch_index in tqdm(range(0, len(data), batch_size), desc=f"Processing {split} dataset"):
            data_batch = data[batch_index:batch_index + batch_size]
            batch_file_name = process_batch(data_batch, batch_index // batch_size, batch_dir)
            file_list.append(os.path.join(batch_dir, batch_file_name))

        # 合并所有批次文件为一个单独的文件
        final_file_path = os.path.join(processed_data_dir, f'{split}.npy')
        merge_batches(file_list, final_file_path)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

Processing train dataset:   0%|          | 0/223 [00:00<?, ?it/s]

Processing test dataset:   0%|          | 0/71 [00:00<?, ?it/s]

Processing val dataset:   0%|          | 0/29 [00:00<?, ?it/s]

In [7]:
import json

# Step 1: 读取 train.json 文件
file_path = r'D:\project\WWW2021-master\dataset\RumourEval-19\test.json'

with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# 简单查看数据结构
data_sample = data[0] if data else "No data found"

data_sample


{'comments': ["No.\n\nNo matter how hard or focused you wish for something, sometimes it just doesn't happen.",
  'Yes , kind of true, somewhat like destiny,  but I am not talking about a scene of a magic lamp and a genie. Some things are simply impossible but Ive heard people at a good level in meditation can "make" things happen and obviously prophets can make a lot of things happen.',
  'That\'s not at all obvious. In fact, I have yet to see proof meditation makes anything happen outside of your mind and body and am pretty sure "prophets" are just charismatic leaders.',
  "Haven't you heard about people manipulating matter with their own aura ?",
  "I have, but that doesn't mean it's true.",
  'Does anyone have any info or experience with this ?',
  'Things only happen if you actively do them. ',
  "Two ways this can work. A, you learn to appreciate what you have through introspection. B, you gain the confidence to improve by coming to terms with your issues and facing them.\n\nThat

In [8]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm  # 引入tqdm

# 检查CUDA设备的可用性
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased').to(device)  # 将模型移至CUDA设备
model.eval()  # 设置模型为评估模式

def get_embedding(text):
    # 对文本进行编码并计算嵌入
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = inputs.to(device)  # 将输入数据移至CUDA设备
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(1)  # 取平均作为嵌入
    return embeddings.cpu()  # 将嵌入移回CPU

def compute_cosine_similarity(embedding1, embedding2):
    # 计算两个嵌入之间的余弦相似度
    cos_sim = torch.nn.functional.cosine_similarity(embedding1, embedding2)
    return cos_sim.item()

# 对数据进行处理
embeddings = []
for item in tqdm(data, desc="Processing items"):  # 使用tqdm显示进度条
    content_embedding = get_embedding(item['content'])
    comment_embeddings = torch.stack([get_embedding(comment) for comment in item['comments']])
    avg_comment_embedding = torch.mean(comment_embeddings, 0)
    max_comment_embedding, _ = torch.max(comment_embeddings, 0)
    avg_similarity = compute_cosine_similarity(content_embedding, avg_comment_embedding)
    max_similarity = compute_cosine_similarity(content_embedding, max_comment_embedding)
    features = torch.cat((content_embedding.squeeze(), avg_comment_embedding.squeeze(), max_comment_embedding.squeeze(), torch.tensor([avg_similarity, max_similarity])), 0)
    embeddings.append(features.numpy())

np.save('test_features.npy', np.array(embeddings))


Processing items: 100%|██████████| 81/81 [00:10<00:00,  7.92it/s]


# 使用余弦相似度

In [9]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader

# 定义BiGRU模型
class BiGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=1):
        super(BiGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)  # 乘以2是因为BiGRU的输出是双向的
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(x.device)  # 乘以2也是因为双向
        out, _ = self.gru(x, h0)
        out = self.fc(out[:, -1, :])  # 取序列的最后一个时间步
        return out

# 加载数据
train_features_path = 'D:/project/WWW2021-master/code/preprocess/train_features.npy'
train_labels_path = 'D:/project/WWW2021-master/code/preprocess/data/RumourEval-19/labels/train_(327, 3).npy'
test_features_path = 'D:/project/WWW2021-master/code/preprocess/test_features.npy'
test_labels_path = 'D:/project/WWW2021-master/code/preprocess/data/RumourEval-19/labels/test_(81, 3).npy'

X_train = np.load(train_features_path)
y_train = np.argmax(np.load(train_labels_path), axis=1)  # 假设标签是one-hot编码的，我们需要将其转换为标签索引
X_test = np.load(test_features_path)
y_test = np.argmax(np.load(test_labels_path), axis=1)

# 转换为PyTorch张量
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

# 创建数据加载器
batch_size = 64
train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size)

# 初始化模型、损失函数和优化器
input_size = X_train.shape[1]
hidden_size = 128  # 或根据需要调整
num_classes = 3  # 根据实际类别数调整
model = BiGRU(input_size, hidden_size, num_classes).to(X_train.device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# 训练模型
num_epochs = 10  # 或根据需要调整
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# 在测试集上评估模型
model.eval()
all_preds = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.cpu().numpy())

print(classification_report(y_test.numpy(), np.array(all_preds)))


RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report
import numpy as np

# 定义简单的全连接网络
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        return out

# 加载数据
train_features = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19-2\processed\train.npy')
train_labels = np.argmax(np.load(r'D:/project/WWW2021-master/code/preprocess/data/RumourEval-19/labels/train_(327, 3).npy'), axis=1)
test_features = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19-2\processed\test.npy')
test_labels = np.argmax(np.load(r'D:/project/WWW2021-master/code/preprocess/data/RumourEval-19/labels/test_(81, 3).npy'), axis=1)

# 转换为PyTorch张量
train_features = torch.tensor(train_features, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_features = torch.tensor(test_features, dtype=torch.float32)
test_labels = torch.tensor(test_labels, dtype=torch.long)

# 创建数据加载器
batch_size = 64
train_dataset = TensorDataset(train_features, train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(test_features, test_labels)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# 初始化模型
model = SimpleNN(input_size=train_features.shape[1], hidden_size=100, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 2000
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        # 前向传播
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 在测试集上评估模型
model.eval()
with torch.no_grad():
    all_preds = []
    for features, labels in test_loader:
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.numpy())

print(classification_report(test_labels.numpy(), np.array(all_preds)))


Epoch [1/2000], Loss: 1.0552
Epoch [2/2000], Loss: 1.0522
Epoch [3/2000], Loss: 0.8714
Epoch [4/2000], Loss: 1.0970
Epoch [5/2000], Loss: 0.9160
Epoch [6/2000], Loss: 0.9013
Epoch [7/2000], Loss: 0.7918
Epoch [8/2000], Loss: 0.9183
Epoch [9/2000], Loss: 0.7551
Epoch [10/2000], Loss: 0.6199
Epoch [11/2000], Loss: 0.7495
Epoch [12/2000], Loss: 0.5572
Epoch [13/2000], Loss: 0.5054
Epoch [14/2000], Loss: 0.9316
Epoch [15/2000], Loss: 0.7213
Epoch [16/2000], Loss: 0.4775
Epoch [17/2000], Loss: 0.7812
Epoch [18/2000], Loss: 0.5992
Epoch [19/2000], Loss: 0.4015
Epoch [20/2000], Loss: 0.6229
Epoch [21/2000], Loss: 0.5669
Epoch [22/2000], Loss: 0.4194
Epoch [23/2000], Loss: 0.6103
Epoch [24/2000], Loss: 0.6210
Epoch [25/2000], Loss: 0.6792
Epoch [26/2000], Loss: 0.8780
Epoch [27/2000], Loss: 0.4889
Epoch [28/2000], Loss: 0.3490
Epoch [29/2000], Loss: 0.6178
Epoch [30/2000], Loss: 0.4503
Epoch [31/2000], Loss: 0.3919
Epoch [32/2000], Loss: 0.2416
Epoch [33/2000], Loss: 0.3922
Epoch [34/2000], Lo

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score

# 参数设置
input_size = 768  # 输入特征的维度
hidden_size = 256  # LSTM的隐藏层大小
num_layers = 1  # LSTM的层数
num_classes = 3  # 输出类别数
batch_size = 16  # 批大小
learning_rate = 0.001
num_epochs =20 # 训练周期数

# 确保CUDA可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 模型定义
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.weight = nn.Parameter(torch.Tensor(hidden_size))
        nn.init.uniform_(self.weight, -0.1, 0.1)

    def forward(self, hidden_states):
        scores = torch.matmul(hidden_states, self.weight)  # [batch_size, seq_len]
        attn_weights = F.softmax(scores, dim=1)
        context = torch.sum(hidden_states * attn_weights.unsqueeze(-1), dim=1)
        return context, attn_weights

class BiLSTMWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiLSTMWithAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)  # *2 for bidirectional
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        context, attn_weights = self.attention(out)
        out = self.fc(context)
        return out

# 初始化模型并移至GPU
model = BiLSTMWithAttention(input_size, hidden_size, num_layers, num_classes).to(device)

# 加载数据
train_features = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\processed\updated_train.npy')
train_labels = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\labels\train_(327, 3).npy')
train_dataset = TensorDataset(torch.Tensor(train_features), torch.Tensor(train_labels))
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# 训练循环
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device)

        # 前向传播
        outputs = model(features)
        loss = criterion(outputs, torch.max(labels, 1)[1])

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


Epoch [1/20], Step [10/21], Loss: 0.9832
Epoch [1/20], Step [20/21], Loss: 1.0978
Epoch [2/20], Step [10/21], Loss: 0.8860
Epoch [2/20], Step [20/21], Loss: 0.9320
Epoch [3/20], Step [10/21], Loss: 1.0217
Epoch [3/20], Step [20/21], Loss: 0.8363
Epoch [4/20], Step [10/21], Loss: 0.6254
Epoch [4/20], Step [20/21], Loss: 0.6162
Epoch [5/20], Step [10/21], Loss: 0.2534
Epoch [5/20], Step [20/21], Loss: 0.7031
Epoch [6/20], Step [10/21], Loss: 0.5240
Epoch [6/20], Step [20/21], Loss: 0.6642
Epoch [7/20], Step [10/21], Loss: 0.4316
Epoch [7/20], Step [20/21], Loss: 0.2109
Epoch [8/20], Step [10/21], Loss: 0.5112
Epoch [8/20], Step [20/21], Loss: 0.0703
Epoch [9/20], Step [10/21], Loss: 0.1233
Epoch [9/20], Step [20/21], Loss: 0.1969
Epoch [10/20], Step [10/21], Loss: 0.4150
Epoch [10/20], Step [20/21], Loss: 0.0523
Epoch [11/20], Step [10/21], Loss: 0.0536
Epoch [11/20], Step [20/21], Loss: 0.0170
Epoch [12/20], Step [10/21], Loss: 0.0094
Epoch [12/20], Step [20/21], Loss: 0.0037
Epoch [13/

In [6]:
from sklearn.metrics import classification_report

model.eval()  # 将模型设置为评估模式
all_labels = []
all_preds = []
test_features = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\processed\updated_test.npy')
test_labels = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\labels\test_(81, 3).npy')
test_dataset = TensorDataset(torch.Tensor(test_features), torch.Tensor(test_labels))
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

# 计算准确率
all_labels = np.argmax(all_labels, axis=1)
accuracy = accuracy_score(all_labels, all_preds)

# 计算总体的F1分数
f1 = f1_score(all_labels, all_preds, average='macro')

print(f'Accuracy: {accuracy:.4f}')
print(f'Macro F1 Score: {f1:.4f}')
# 输出分类报告，确保包含了所有类别
print("Classification Report:")
print(classification_report(all_labels, all_preds, target_names=['Fake', 'Real', 'Unverified']))
# print(classification_report(all_labels, all_preds, target_names=['Fake', 'Real']))

Accuracy: 0.2716
Macro F1 Score: 0.2397
Classification Report:
              precision    recall  f1-score   support

        Fake       0.42      0.35      0.38        40
        Real       0.24      0.19      0.21        31
  Unverified       0.09      0.20      0.12        10

    accuracy                           0.27        81
   macro avg       0.25      0.25      0.24        81
weighted avg       0.31      0.27      0.29        81



In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score

# 参数设置
input_size = 768  # 输入特征的维度
hidden_size = 256  # GRU的隐藏层大小
num_layers = 1  # GRU的层数
num_classes = 3  # 输出类别数
batch_size = 16  # 批大小
learning_rate = 0.0001
num_epochs = 200  # 训练周期数

# 确保CUDA可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 模型定义
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.weight = nn.Parameter(torch.Tensor(hidden_size))
        nn.init.uniform_(self.weight, -0.1, 0.1)

    def forward(self, hidden_states):
        scores = torch.matmul(hidden_states, self.weight)  # [batch_size, seq_len]
        attn_weights = F.softmax(scores, dim=1)
        context = torch.sum(hidden_states * attn_weights.unsqueeze(-1), dim=1)
        return context, attn_weights

class BiGRUWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(BiGRUWithAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_size * 2)  # *2 for bidirectional
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional

        out, _ = self.gru(x, h0)
        context, attn_weights = self.attention(out)
        out = self.fc(context)
        return out

# 初始化模型并移至GPU
model = BiGRUWithAttention(input_size, hidden_size, num_layers, num_classes).to(device)

# 加载数据...
train_features = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\processed\updated_train.npy')
train_labels = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\labels\train_(327, 3).npy')
train_dataset = TensorDataset(torch.Tensor(train_features), torch.Tensor(train_labels))
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# 损失函数和优化器...
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# 训练循环...
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        features, labels = features.to(device), labels.to(device)

        # 前向传播
        outputs = model(features)
        loss = criterion(outputs, torch.max(labels, 1)[1])

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


Epoch [1/200], Step [10/21], Loss: 1.0419
Epoch [1/200], Step [20/21], Loss: 0.9589
Epoch [2/200], Step [10/21], Loss: 1.0279
Epoch [2/200], Step [20/21], Loss: 0.9194
Epoch [3/200], Step [10/21], Loss: 0.8924
Epoch [3/200], Step [20/21], Loss: 1.0839
Epoch [4/200], Step [10/21], Loss: 1.0197
Epoch [4/200], Step [20/21], Loss: 0.9460
Epoch [5/200], Step [10/21], Loss: 0.9287
Epoch [5/200], Step [20/21], Loss: 0.8796
Epoch [6/200], Step [10/21], Loss: 0.7362
Epoch [6/200], Step [20/21], Loss: 0.8040
Epoch [7/200], Step [10/21], Loss: 0.8103
Epoch [7/200], Step [20/21], Loss: 0.9105
Epoch [8/200], Step [10/21], Loss: 0.7539
Epoch [8/200], Step [20/21], Loss: 0.7111
Epoch [9/200], Step [10/21], Loss: 0.7997
Epoch [9/200], Step [20/21], Loss: 0.6732
Epoch [10/200], Step [10/21], Loss: 0.6539
Epoch [10/200], Step [20/21], Loss: 0.6724
Epoch [11/200], Step [10/21], Loss: 0.7396
Epoch [11/200], Step [20/21], Loss: 0.7676
Epoch [12/200], Step [10/21], Loss: 0.5813
Epoch [12/200], Step [20/21],

In [2]:
from sklearn.metrics import classification_report

model.eval()  # 将模型设置为评估模式
all_labels = []
all_preds = []
test_features = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\processed\updated_test.npy')
test_labels = np.load(r'D:\project\WWW2021-master\code\preprocess\data\RumourEval-19\labels\test_(81, 3).npy')
test_dataset = TensorDataset(torch.Tensor(test_features), torch.Tensor(test_labels))
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(predicted.cpu().numpy())

# 计算准确率
all_labels = np.argmax(all_labels, axis=1)
accuracy = accuracy_score(all_labels, all_preds)

# 计算总体的F1分数
f1 = f1_score(all_labels, all_preds, average='macro')

print(f'Accuracy: {accuracy:.4f}')
print(f'Macro F1 Score: {f1:.4f}')
# 输出分类报告，确保包含了所有类别
print("Classification Report:")
print(classification_report(all_labels, all_preds, target_names=['Fake', 'Real', 'Unverified']))
# print(classification_report(all_labels, all_preds, target_names=['Fake', 'Real']))

Accuracy: 0.2963
Macro F1 Score: 0.2713
Classification Report:
              precision    recall  f1-score   support

        Fake       0.50      0.38      0.43        40
        Real       0.21      0.19      0.20        31
  Unverified       0.13      0.30      0.18        10

    accuracy                           0.30        81
   macro avg       0.28      0.29      0.27        81
weighted avg       0.35      0.30      0.31        81



In [7]:
from transformers import BertTokenizer, BertModel
import torch

# 初始化分词器和模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 文本输入
text = "“: “: CharlieHebdo : “The cartoonists Charb & cabu are dead.”  ”” "

# 编码文本
input_ids = tokenizer.encode(text, add_special_tokens=True)  # 添加特殊令牌
input_tensors = torch.tensor([input_ids])

# 获取词嵌入
with torch.no_grad():
    outputs = model(input_tensors)
    last_hidden_states = outputs.last_hidden_state  # 最后一层的隐藏状态

# 打印结果的shape以验证
print(last_hidden_states.shape)
# 输出的shape为 (1, N, 768)，其中N是输入文本的长度（包括特殊令牌）

# 注意：实际的嵌入向量存储在`last_hidden_states`变量中


torch.Size([1, 26, 768])


In [8]:
a = last_hidden_states

In [9]:
a

tensor([[[ 0.0537,  0.0580,  0.1904,  ..., -0.1097,  0.1455,  0.2739],
         [ 0.1704, -0.0312,  0.2530,  ..., -0.0290,  0.4150,  0.2514],
         [-0.0683, -0.0624,  0.1292,  ..., -0.0364,  0.1865,  0.4281],
         ...,
         [ 0.1863, -0.0891,  0.7164,  ...,  0.1588,  0.2346, -0.2987],
         [-0.1325, -0.5134,  0.0344,  ...,  0.4457,  0.4667, -0.0658],
         [ 0.7094,  1.0439,  0.0761,  ..., -0.0454, -0.5712, -0.0382]]])

In [10]:

# 文本输入
text = "“@Colvinius: “@LePoint: #CharlieHebdo : “The cartoonists Charb &amp; cabu are dead.” http://t.co/DulWZcURCu http://t.co/vbQMAW7MC0”” @hyzaidi"

# 编码文本
input_ids = tokenizer.encode(text, add_special_tokens=True)  # 添加特殊令牌
input_tensors = torch.tensor([input_ids])

# 获取词嵌入
with torch.no_grad():
    outputs = model(input_tensors)
    last_hidden_states = outputs.last_hidden_state  # 最后一层的隐藏状态

b = a - last_hidden_states
b

RuntimeError: The size of tensor a (26) must match the size of tensor b (72) at non-singleton dimension 1

In [5]:
# 考虑到BiGRU是用于处理序列数据，我们可以将特征向量视为序列的一个时间步。
# 这意味着我们需要调整数据的形状以适配BiGRU模型的输入要求。
# 为了使用BiGRU处理非序列化的特征数据，我们可以将每个特征向量"伪装"成长度为1的序列。
# 下面是调整后的BiGRU模型训练和评估代码：

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report
import numpy as np

# 定义BiGRU模型
class BiGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(BiGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = 2  # 可以调整层数
        self.gru = nn.GRU(input_size, hidden_size, self.num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 因为是双向的，所以是hidden_size * 2
    
    def forward(self, x):
        # 初始化隐藏状态
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        # 前向传播
        out, _ = self.gru(x, h0)
        # 解码最后一个时间步的隐藏状态
        out = self.fc(out[:, -1, :])
        return out

# 加载数据并调整形状以适应BiGRU
train_features = np.load(r'D:/project/WWW2021-master/code/preprocess/train_features.npy')
train_labels = np.argmax(np.load('D:/project/WWW2021-master/code/preprocess/data/RumourEval-19/labels/train_(327, 3).npy'), axis=1)
test_features = np.load(r'D:/project/WWW2021-master/code/preprocess/test_features.npy')
test_labels = np.argmax(np.load('D:/project/WWW2021-master/code/preprocess/data/RumourEval-19/labels/test_(81, 3).npy'), axis=1)

# 将特征向量"伪装"成长度为1的序列
train_features = np.expand_dims(train_features, axis=1)
test_features = np.expand_dims(test_features, axis=1)

# 转换为PyTorch张量
train_features = torch.tensor(train_features, dtype=torch.float32)
train_labels = torch.tensor(train_labels, dtype=torch.long)
test_features = torch.tensor(test_features, dtype=torch.float32)
test_labels = torch.tensor(test_labels, dtype=torch.long)

# 创建数据加载器
batch_size = 16
train_dataset = TensorDataset(train_features, train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(test_features, test_labels)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# 初始化BiGRU模型、损失函数和优化器
model = BiGRU(input_size=train_features.shape[2], hidden_size=128, num_classes=3).to(train_features.device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 200
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        features = features.to(train_features.device)
        labels = labels.to(train_features.device)
        # 前向传播
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 在测试集上评估模型
model.eval()
all_preds = []
with torch.no_grad():
    for features, labels in test_loader:
        features = features.to(test_features.device)
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())

print(classification_report(test_labels.numpy(), np.array(all_preds)))



Epoch [1/200], Loss: 1.0417
Epoch [2/200], Loss: 0.8492
Epoch [3/200], Loss: 0.6195
Epoch [4/200], Loss: 1.0046
Epoch [5/200], Loss: 0.5041
Epoch [6/200], Loss: 0.2513
Epoch [7/200], Loss: 0.2607
Epoch [8/200], Loss: 0.6506
Epoch [9/200], Loss: 0.3046
Epoch [10/200], Loss: 0.4258
Epoch [11/200], Loss: 0.2583
Epoch [12/200], Loss: 0.1412
Epoch [13/200], Loss: 0.0648
Epoch [14/200], Loss: 0.0196
Epoch [15/200], Loss: 0.0110
Epoch [16/200], Loss: 0.2881
Epoch [17/200], Loss: 0.0178
Epoch [18/200], Loss: 0.0054
Epoch [19/200], Loss: 0.0050
Epoch [20/200], Loss: 0.0019
Epoch [21/200], Loss: 0.0008
Epoch [22/200], Loss: 0.0008
Epoch [23/200], Loss: 0.0015
Epoch [24/200], Loss: 0.0024
Epoch [25/200], Loss: 0.0004
Epoch [26/200], Loss: 0.0002
Epoch [27/200], Loss: 0.0009
Epoch [28/200], Loss: 0.0008
Epoch [29/200], Loss: 0.0012
Epoch [30/200], Loss: 0.0018
Epoch [31/200], Loss: 0.0002
Epoch [32/200], Loss: 0.0004
Epoch [33/200], Loss: 0.0002
Epoch [34/200], Loss: 0.0003
Epoch [35/200], Loss: 0