In [11]:
import csv

# 从CSV文件导入数据
with open('sentiment_labels_to_finetune.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    fine_tune_text = [(row[0], row[1]) for row in reader]

# 现在fine_tune_text就包含了您的数据
print("✅ 数据已成功导入，共", len(fine_tune_text), "条记录")
print("示例数据:", fine_tune_text[0])

✅ 数据已成功导入，共 5586 条记录
示例数据: ('公告：公司正常经营，无应披露未披露事项', 'none')


In [2]:
label_list = ["none", "disgust", "happiness", "like", "fear", "sadness", "anger", "surprise"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

texts, str_labels = zip(*fine_tune_text)  # 解包元组
labels = [label2id[label] for label in str_labels]  # 转为整数

In [3]:
# 导入tokenizer
from transformers import AutoTokenizer
model_path = r"D:\浏览器下载\hugging_face\xuyuan-trial-sentiment-bert-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [4]:
def tokenize_function(examples):
    return tokenizer(examples, truncation=True, padding=True, max_length=128)

# 对所有文本批量编码
encodings = tokenize_function(list(texts))

from datasets import Dataset
dataset_dict = {
    "input_ids": encodings["input_ids"],
    "attention_mask": encodings["attention_mask"],
    "labels": labels
}
#
dataset = Dataset.from_dict(dataset_dict) #能够对数据集转化成Dataset的格式进行储存

In [5]:
dataset = dataset.train_test_split(test_size=0.15)  # 85% train, 15% eval
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


In [7]:
# 导入本地部署的bert模型
# 导入peft库，加载lora微调参数，并把参数加载到bert模型之中
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from peft import get_peft_model,LoraConfig,TaskType

model = AutoModelForSequenceClassification.from_pretrained(model_path,num_labels=8,id2label=id2label,label2id=label2id)

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.15,
    target_modules=['query','key','value']
)

model = get_peft_model(model,peft_config)
print(model.print_trainable_parameters())


trainable params: 890,888 || all params: 103,164,688 || trainable%: 0.8636
None


In [11]:
# 加载训练参数
from transformers import TrainingArguments
import torch
training_args = TrainingArguments(
    output_dir=r'D:\浏览器下载\hugging_face\finetuned_bert_lora2',
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
    greater_is_better=True,
    save_total_limit=2,
    report_to='none',
    fp16 = True if torch.cuda.is_available() else False,
)

In [9]:
import numpy as np
import evaluate
# 加载精确度计算函数
def compute_metrics(eval_pred):
    metric = evaluate.load('accuracy')
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    results = metric.compute(predictions = predictions, references = labels)
    return results

In [12]:
# 实例化Trainer，调用训练函数
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,                  # 用于动态 padding（可选）
    compute_metrics=compute_metrics,
)
# 开始训练！
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7107,0.516387,0.815036
2,0.5283,0.442497,0.847255
3,0.3861,0.378231,0.873508
4,0.2605,0.3811,0.883055
5,0.2703,0.365017,0.890215


Using the latest cached version of the module from C:\Users\mazal\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Wed Nov 26 19:25:15 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\mazal\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Wed Nov 26 19:25:15 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.
Using the latest cached version of the module from C:\Users\mazal\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Wed Nov 26 19:25:15 2025) since it couldn't be found locally at

TrainOutput(global_step=1485, training_loss=0.4410545959215774, metrics={'train_runtime': 131.2004, 'train_samples_per_second': 180.945, 'train_steps_per_second': 11.319, 'total_flos': 382145523204480.0, 'train_loss': 0.4410545959215774, 'epoch': 5.0})

In [5]:
# 微调后bert的实际转化效果
# 在实际微调过程中本人训练了两个版本，这里使用的是第一个微调后的版本，其转化结果更为合理，以上程序展现微调过程
from transformers import pipeline
fine_tuned_path = r"D:\浏览器下载\hugging_face\finetuned_bert_lora\checkpoint-280"
# 加载微调后的模型
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=fine_tuned_path,
    tokenizer=fine_tuned_path,
    device=0  # -1 表示 CPU，0 表示 GPU
)

# 测试
text = "利好出尽是利空，高开就是跑路机会"
result = sentiment_pipe(text)
print(result)

Device set to use cuda:0


[{'label': 'anger', 'score': 0.48831290006637573}]


In [7]:
def map_to_financial_sentiment(label: str) -> str:
    """
    将模型输出的8类情绪映射为金融场景下的情绪倾向
    """
    positive_labels = {"like", "happiness","surprise"}          # 看多、乐观
    negative_labels = {"anger", "disgust", "fear", "sadness"}  # 看空、悲观
    neutral_labels = {"none"}            # 中性或不确定（surprise 在股吧常为中性）

    if label in positive_labels:
        return "Positive"
    elif label in negative_labels:
        return "Negative"
    else:
        return "Neutral"

print("=" * 60)
print("中文股吧情绪分析测试（基于细粒度情感模型）")
print("=" * 60)
financial_texts = [
    "60日线下缩量滞涨，故事讲完了吗？那就要慢慢消化高市盈了，路漫漫其修远兮",
    "尾盘减了加仓部分，总体是增仓的。",
    "这是在等中芯北方收购结果的节奏吗？",
    "兄弟们走了，留下发财",
    "吊车尾",
    "中芯走势软不拉几",
    "全仓干起来啊。。未来一定涨到300。。回头看这里就是洼地。。补仓锁仓三年",
    "你给华虹提鞋的？",
    "这个股票是散户的大本营，结果有机都不来，这几天就是没有成交量"
]

for text in financial_texts:
    try:
        result = sentiment_pipe(text)[0]
        raw_label = result['label']
        confidence = result['score']
        financial_label = map_to_financial_sentiment(raw_label.lower())

        print(f"文本: {text}")
        print(f"原始情绪: {raw_label} → 金融情绪: {financial_label} (置信度: {confidence:.2%})")
        print("-" * 60)
    except Exception as e:
        print(f"❌ 处理文本出错: {text[:30]}... | 错误: {e}")
        print("-" * 60)

中文股吧情绪分析测试（基于细粒度情感模型）
文本: 60日线下缩量滞涨，故事讲完了吗？那就要慢慢消化高市盈了，路漫漫其修远兮
原始情绪: fear → 金融情绪: Negative (置信度: 29.45%)
------------------------------------------------------------
文本: 尾盘减了加仓部分，总体是增仓的。
原始情绪: none → 金融情绪: Neutral (置信度: 98.06%)
------------------------------------------------------------
文本: 这是在等中芯北方收购结果的节奏吗？
原始情绪: disgust → 金融情绪: Negative (置信度: 69.63%)
------------------------------------------------------------
文本: 兄弟们走了，留下发财
原始情绪: surprise → 金融情绪: Positive (置信度: 91.83%)
------------------------------------------------------------
文本: 吊车尾
原始情绪: sadness → 金融情绪: Negative (置信度: 90.23%)
------------------------------------------------------------
文本: 中芯走势软不拉几
原始情绪: fear → 金融情绪: Negative (置信度: 32.20%)
------------------------------------------------------------
文本: 全仓干起来啊。。未来一定涨到300。。回头看这里就是洼地。。补仓锁仓三年
原始情绪: happiness → 金融情绪: Positive (置信度: 72.11%)
------------------------------------------------------------
文本: 你给华虹提鞋的？
原始情绪: anger → 金融情绪: Negative (置信度: 52.20%)
--------------------------