In [1]:
import numpy as np

def concatenate_features(bert_features, fasttext_features):
    if isinstance(bert_features, (np.ndarray, sparse.spmatrix)) and isinstance(fasttext_features, (np.ndarray, sparse.spmatrix)):
        if bert_features.shape[0] != fasttext_features.shape[0]:
            raise ValueError("BERT 和 FastText 特征矩阵的样本数量必须一致")
        
        # 根据输入类型选择合适的拼接方式
        if sparse.issparse(bert_features) or sparse.issparse(fasttext_features):
            return sparse.hstack((bert_features, fasttext_features))
        else:
            return np.hstack((bert_features, fasttext_features))
    else:
        raise TypeError("特征矩阵必须是 numpy 数组或 scipy 稀疏矩阵")

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from scipy import sparse

# class ViolenceClassifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim, num_classes):
#         super(ViolenceClassifier, self).__init__()
#         self.fc = nn.Linear(input_dim, hidden_dim)
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(0.5)
#         self.out = nn.Linear(hidden_dim, num_classes)
        
#     def forward(self, x):
#         x = self.fc(x)
#         x = self.relu(x)
#         x = self.dropout(x)
#         x = self.out(x)
#         return x

class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def create_model(input_dim, hidden_dim, num_classes):
    """
    创建一个新的分类器模型。
    
    参数:
        input_dim: 输入特征的数量
        hidden_dim: 隐藏层单元的数量
        num_classes: 类别数量
        
    返回:
        model: 初始化的分类器模型
    """
    model = Classifier(input_dim, hidden_dim, num_classes)
    return model

def train_model(model, X_train, y_train, epochs=10, batch_size=32, learning_rate=1e-3):
    """
    训练分类器模型。

    参数:
        model: 要训练的分类器模型
        X_train: 训练集输入数据
        y_train: 训练集标签
        epochs: 训练轮数
        batch_size: 批处理大小
        learning_rate: 学习率
        device: 设备类型（'cpu' or 'cuda'）
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # 将模型移动到正确的设备
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # 将数据转换为 tensor 并移动到设备
    X_train = X_train.to(device)
    y_train = y_train.to(device)

    dataset = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    model.train()
    for epoch in tqdm(range(epochs)):
        running_loss = 0.0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(dataloader):.4f}")


In [3]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM,AutoModel
from tqdm import tqdm
# 编码文本，将其转换为模型的输入格式
def encode_text(text, tokenizer, max_length=512):
    # 使用 tokenizer 对文本进行编码
    encoding = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    return input_ids, attention_mask

# 提取BERT特征
def extract_bert_features(texts):
    if len(texts) == 0:
        raise ValueError("Input text list is empty.")
    
    batch_size = 16
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", trust_remote_code=True)
    model = AutoModel.from_pretrained("bert-base-uncased", trust_remote_code=True).to(device)

    embeddings = []
    # 处理每个批次
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        
        # 对每个批次的文本进行编码
        input_ids, attention_mask = zip(*[encode_text(text, tokenizer) for text in batch_texts])
        
        # 将 input_ids 和 attention_mask 拼接成大批次的张量
        input_ids = torch.cat(input_ids, dim=0).to(device)
        attention_mask = torch.cat(attention_mask, dim=0).to(device)

        # 模型推理
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # 获取 [CLS] token 的嵌入 (第一维表示batch size)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(cls_embeddings)

        # 清理内存
        del batch_texts, input_ids, attention_mask, outputs, cls_embeddings
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

    # 转换为 numpy 数组
    embeddings = np.vstack(embeddings)
    return embeddings


In [4]:
import fasttext
import gc
import cupy as cp  # 使用 cupy 进行GPU加速
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import numpy as np
def process_batch(batch_texts, fasttext_model_path):
    # 每个进程自己加载 fasttext 模型
    fasttext_model = fasttext.load_model(fasttext_model_path)
    batch_vectors = [fasttext_model.get_sentence_vector(text) for text in batch_texts]
    return batch_vectors

# 加载预训练的FastText模型
def extract_fasttext_features(texts, fasttext_model_path='/root/autodl-tmp/bert_to_hate_context/fasttext/cc.en.300.bin', batch_size=10000):
    gc.collect()
    features = []

    # 使用ProcessPoolExecutor来并行处理文本批次
    with ProcessPoolExecutor() as executor:
        # 使用每个批次的数据处理
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i + batch_size]
            # 并行处理批次
            batch_vectors = executor.submit(process_batch, batch_texts, fasttext_model_path).result()
            features.extend(batch_vectors)
            # 清理垃圾以释放内存
            del batch_texts, batch_vectors
            gc.collect()

    # 将特征转换为NumPy数组
    features = np.array(features)
    return features

In [5]:
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
def evaluate_model(model, X_test, y_test):
    """
    评估模型的准确率、精确率、召回率和 F1 分数。
    
    参数:
        model: 训练好的模型
        X_test: 测试集输入张量
        y_test: 测试集标签张量
    
    返回:
        metrics: 包含 accuracy、precision、recall、f1 的字典
    """
    model.eval()  # 设置模型为评估模式

    with torch.no_grad():  # 禁用梯度计算以节省内存和加速计算
        # 确保输入张量在同一设备上
        X_test = X_test.to(next(model.parameters()).device)
        y_test = y_test.to(next(model.parameters()).device)
        
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)  # 获取最高分对应的类别

        # 将张量转换为 NumPy 数组
        y_test_np = y_test.cpu().numpy()
        predicted_np = predicted.cpu().numpy()

        # 计算评估指标
        accuracy = (predicted == y_test).sum().item() / len(y_test)
        precision = precision_score(y_test_np, predicted_np, average="weighted")
        recall = recall_score(y_test_np, predicted_np, average="weighted")
        f1 = f1_score(y_test_np, predicted_np, average="weighted")

        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        }
        return metrics

In [6]:
import re
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # 去除标点符号
    text = text.lower()  # 转小写
    return word_tokenize(text)  # 分词

In [7]:
print("loading dataset")
import pandas as pd
df = pd.read_csv('HateSpeechDataset.csv')  # 替换为你的数据文件路径
df = df.drop(columns=['Content_int'], errors='ignore')
df = df[df['Label'] != 'Label']

texts = df['Content'].tolist()
labels = df['Label'].tolist()

loading dataset


In [8]:
# 2. 使用 BERT 提取特征 (假设在 get_features.py 中)
print("get_features to get bert features")
bert_features = extract_bert_features(texts)

get_features to get bert features


2024-12-19 14:51:53.962816: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-19 14:51:53.975318: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734591113.988957    2070 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734591113.993264    2070 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-19 14:51:54.010414: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [9]:
# 3. 使用 get_word_vector 提取 FastText 特征
print("get_word_vector to get fasttext features")
fasttext_features = extract_fasttext_features(texts)

get_word_vector to get fasttext features


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:
# 4. 将 BERT 和 FastText 特征拼接起来 (假设在 cat_features.py 中)
print("cat features")
combined_features =concatenate_features(bert_features, fasttext_features)

cat features


In [11]:
# 5. 创建模型并训练 (假设在 create_classifier.py 中)
print("create model")
model = create_model(input_dim=combined_features.shape[1], hidden_dim=128, num_classes=len(set(labels)))

create model


In [12]:
# 6. 分割数据集为训练集和测试集
print("split dataset")
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(combined_features, df['Label'], test_size=0.2, random_state=42)

split dataset


In [13]:
label_map = {'0': 0, '1': 1}  # 假设标签是 '0' 和 '1'

# 如果标签在 label_map 中不存在，可以打印出不在字典中的标签
y_train_encoded = []
for label in y_train:
    try:
        y_train_encoded.append(label_map[label])
    except KeyError:
        print(f"未知标签：{label}")  # 打印出无法映射的标签
        y_train_encoded.append(-1)  # 可以指定一个默认值，例如 -1
y_test_encoded = []
for label in y_test:
    try:
        y_test_encoded.append(label_map[label])
    except KeyError:
        print(f"未知标签：{label}")  # 打印出无法映射的标签
        y_test_encoded.append(-1)  # 可以指定一个默认值，例如 -1

In [14]:
# 转换为张量
import torch
print("to tensor")
train_tensor = torch.tensor(X_train, dtype=torch.float32)
test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

to tensor


In [15]:
# 训练模型
print("training model")
train_model(model, train_tensor, y_train_tensor)

training model


 10%|█         | 1/10 [00:20<03:06, 20.73s/it]

Epoch 1/10, Loss: 0.3145


 20%|██        | 2/10 [00:48<03:18, 24.84s/it]

Epoch 2/10, Loss: 0.2959


 30%|███       | 3/10 [01:03<02:21, 20.18s/it]

Epoch 3/10, Loss: 0.2875


 40%|████      | 4/10 [01:25<02:05, 20.99s/it]

Epoch 4/10, Loss: 0.2817


 50%|█████     | 5/10 [01:51<01:55, 23.01s/it]

Epoch 5/10, Loss: 0.2771


 60%|██████    | 6/10 [02:19<01:37, 24.41s/it]

Epoch 6/10, Loss: 0.2728


 70%|███████   | 7/10 [02:46<01:16, 25.55s/it]

Epoch 7/10, Loss: 0.2692


 80%|████████  | 8/10 [03:14<00:52, 26.25s/it]

Epoch 8/10, Loss: 0.2665


 90%|█████████ | 9/10 [03:41<00:26, 26.35s/it]

Epoch 9/10, Loss: 0.2636


100%|██████████| 10/10 [04:09<00:00, 24.95s/it]

Epoch 10/10, Loss: 0.2609





In [17]:
# 7. 评估模型 (假设在 evaluate.py 中)
print("Evaluate model")
metrics = evaluate_model(model, test_tensor, y_test_tensor)
print(f"Test Accuracy: {metrics['accuracy']:.4f}")
print(f"Test Precision: {metrics['precision']:.4f}")
print(f"Test Recall: {metrics['recall']:.4f}")
print(f"Test F1 Score: {metrics['f1_score']:.4f}")

Evaluate model
Test Accuracy: 0.8681
Test Precision: 0.8726
Test Recall: 0.8681
Test F1 Score: 0.8701


In [18]:
print(type(y_train[0]))  # 输出 y_train 中第一个元素的类型

<class 'str'>


In [19]:
# 打印 y_train 的唯一标签值
print(set(y_train))

{'1', '0'}


## 不使用fasttext

In [20]:
## 不使用fasttext
model1 = create_model(input_dim=bert_features.shape[1], hidden_dim=128, num_classes=len(set(labels)))

In [21]:
# 6. 分割数据集为训练集和测试集
print("split dataset")
from sklearn.model_selection import train_test_split

label_map = {'0': 0, '1': 1}  # 假设标签是 '0' 和 '1'
# 如果标签在 label_map 中不存在，可以打印出不在字典中的标签
y_encoded = []
for label in df['Label']:
    try:
        y_encoded.append(label_map[label])
    except KeyError:
        print(f"未知标签：{label}")  # 打印出无法映射的标签
        y_encoded.append(-1)  # 可以指定一个默认值，例如 -1

X_train1, X_test1, y_train1, y_test1 = train_test_split(bert_features, y_encoded, test_size=0.2, random_state=42)

split dataset


In [22]:
# 转换为张量
import torch
print("to tensor")
train_tensor1 = torch.tensor(X_train1, dtype=torch.float32)
test_tensor1 = torch.tensor(X_test1, dtype=torch.float32)
y_train_tensor1 = torch.tensor(y_train1, dtype=torch.long)
y_test_tensor1 = torch.tensor(y_test1, dtype=torch.long)

to tensor


In [23]:
print("training model")
train_model(model1, train_tensor1, y_train_tensor1)

training model


 10%|█         | 1/10 [01:26<13:02, 86.90s/it]

Epoch 1/10, Loss: 0.3180


 20%|██        | 2/10 [03:02<12:13, 91.73s/it]

Epoch 2/10, Loss: 0.3009


 30%|███       | 3/10 [04:33<10:40, 91.53s/it]

Epoch 3/10, Loss: 0.2941


 40%|████      | 4/10 [05:56<08:48, 88.05s/it]

Epoch 4/10, Loss: 0.2898


 50%|█████     | 5/10 [07:05<06:47, 81.43s/it]

Epoch 5/10, Loss: 0.2860


 60%|██████    | 6/10 [08:05<04:55, 73.91s/it]

Epoch 6/10, Loss: 0.2831


 70%|███████   | 7/10 [09:22<03:44, 74.98s/it]

Epoch 7/10, Loss: 0.2806


 80%|████████  | 8/10 [10:41<02:32, 76.41s/it]

Epoch 8/10, Loss: 0.2782


 90%|█████████ | 9/10 [12:02<01:17, 77.88s/it]

Epoch 9/10, Loss: 0.2760


100%|██████████| 10/10 [13:22<00:00, 80.20s/it]

Epoch 10/10, Loss: 0.2738





In [25]:
print("Evaluate model")
metrics1 = evaluate_model(model1, test_tensor1, y_test_tensor1)
print(f"Test Accuracy: {metrics1['accuracy']:.4f}")
print(f"Test Precision: {metrics1['precision']:.4f}")
print(f"Test Recall: {metrics1['recall']:.4f}")
print(f"Test F1 Score: {metrics1['f1_score']:.4f}")

Evaluate model
Test Accuracy: 0.8692
Test Precision: 0.8643
Test Recall: 0.8692
Test F1 Score: 0.8663


## 不使用bert

In [26]:
model2 = create_model(input_dim=fasttext_features.shape[1], hidden_dim=128, num_classes=len(set(labels)))

In [27]:
# 6. 分割数据集为训练集和测试集
print("split dataset")
from sklearn.model_selection import train_test_split

label_map = {'0': 0, '1': 1}  # 假设标签是 '0' 和 '1'
# 如果标签在 label_map 中不存在，可以打印出不在字典中的标签
y_encoded = []
for label in df['Label']:
    try:
        y_encoded.append(label_map[label])
    except KeyError:
        print(f"未知标签：{label}")  # 打印出无法映射的标签
        y_encoded.append(-1)  # 可以指定一个默认值，例如 -1

X_train2, X_test2, y_train2, y_test2 = train_test_split(fasttext_features, y_encoded, test_size=0.2, random_state=42)

split dataset


In [28]:
# 转换为张量
import torch
print("to tensor")
train_tensor2 = torch.tensor(X_train2, dtype=torch.float32)
test_tensor2 = torch.tensor(X_test2, dtype=torch.float32)
y_train_tensor2 = torch.tensor(y_train2, dtype=torch.long)
y_test_tensor2 = torch.tensor(y_test2, dtype=torch.long)

to tensor


In [29]:
print("training model")
train_model(model2, train_tensor2, y_train_tensor2)

training model


 10%|█         | 1/10 [00:53<08:03, 53.70s/it]

Epoch 1/10, Loss: 0.3212


 20%|██        | 2/10 [01:59<08:07, 60.94s/it]

Epoch 2/10, Loss: 0.2991


 30%|███       | 3/10 [03:04<07:20, 62.88s/it]

Epoch 3/10, Loss: 0.2914


 40%|████      | 4/10 [04:10<06:24, 64.03s/it]

Epoch 4/10, Loss: 0.2854


 50%|█████     | 5/10 [05:18<05:26, 65.35s/it]

Epoch 5/10, Loss: 0.2805


 60%|██████    | 6/10 [06:24<04:22, 65.52s/it]

Epoch 6/10, Loss: 0.2760


 70%|███████   | 7/10 [07:24<03:11, 63.85s/it]

Epoch 7/10, Loss: 0.2724


 80%|████████  | 8/10 [08:19<02:02, 61.12s/it]

Epoch 8/10, Loss: 0.2687


 90%|█████████ | 9/10 [09:27<01:03, 63.26s/it]

Epoch 9/10, Loss: 0.2659


100%|██████████| 10/10 [10:34<00:00, 63.43s/it]

Epoch 10/10, Loss: 0.2628





In [30]:
print("Evaluate model")
metrics2 = evaluate_model(model2, test_tensor2, y_test_tensor2)
print(f"Test Accuracy: {metrics['accuracy']:.4f}")
print(f"Test Precision: {metrics['precision']:.4f}")
print(f"Test Recall: {metrics['recall']:.4f}")
print(f"Test F1 Score: {metrics['f1_score']:.4f}")

Evaluate model
Test Accuracy: 0.8681
Test Precision: 0.8726
Test Recall: 0.8681
Test F1 Score: 0.8701


In [31]:
import re
# 使用简单的字符串分割代替
def preprocess_text_simple(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text.split()  # 简单分词

In [32]:
text_word2vec = [preprocess_text_simple(text) for text in texts]

In [33]:
from gensim.models import Word2Vec
word2vec_model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)

In [34]:
import numpy as np

def text_to_vector(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X = np.array([text_to_vector(text, word2vec_model) for text in text_word2vec])
y = np.array(df['Label'])

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练逻辑回归模型
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# 测试分类效果
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90     72346
           1       0.45      0.00      0.00     15834

    accuracy                           0.82     88180
   macro avg       0.63      0.50      0.45     88180
weighted avg       0.75      0.82      0.74     88180



In [36]:
classifier.score(X_test,y_test)

0.820378770696303