FinSent Detector

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch
import pandas as pd
import numpy as np
from torch.nn.functional import softmax
from tqdm import tqdm

In [2]:
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")


In [3]:
texts = [
    "Tesla's revenue increased by 40% this quarter. Analysts are optimistic.",
    "TSLA stock is facing serious challenges due to poor delivery numbers.",
    "Get ready! Tesla will skyrocket tomorrow!",
    "Buy now or miss the rally!",
]

for text in texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1)[0]
    print(f"{text}\n→ positive: {probs[1]:.4f}, neutral: {probs[0]:.4f}, negative: {probs[2]:.4f}\n")


Tesla's revenue increased by 40% this quarter. Analysts are optimistic.
→ positive: 1.0000, neutral: 0.0000, negative: 0.0000

TSLA stock is facing serious challenges due to poor delivery numbers.
→ positive: 0.0000, neutral: 0.0000, negative: 1.0000

Get ready! Tesla will skyrocket tomorrow!
→ positive: 0.0015, neutral: 0.9985, negative: 0.0000

Buy now or miss the rally!
→ positive: 0.9913, neutral: 0.0048, negative: 0.0039



In [4]:
# 还是不够准确（skyrocket那句）
# 加载两个模型（FinBERT 和 Twitter-RoBERTa），以便识别财报和散户语言

# FinBERT（擅长分析财经新闻）
finbert_model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
finbert_tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")

# Twitter-RoBERTa（擅长社交媒体、口号式语言）
twitter_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
twitter_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# FinBERT情绪分析
def get_finbert_sentiment(text):
    inputs = finbert_tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    outputs = finbert_model(**inputs)
    probs = softmax(outputs.logits, dim=1)[0]
    return {"positive": probs[1], "neutral": probs[0], "negative": probs[2]}

# Twitter-RoBERTa情绪分析
def get_twitter_sentiment(text):
    inputs = twitter_tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    outputs = twitter_model(**inputs)
    probs = softmax(outputs.logits, dim=1)[0]
    return {"negative": probs[0], "neutral": probs[1], "positive": probs[2]}  # 注意顺序不同



In [5]:
# 简单的加权平均

def get_combined_sentiment(text, alpha=1):
    finbert_scores = get_finbert_sentiment(text)
    twitter_scores = get_twitter_sentiment(text)

    combined = {
        label: alpha * twitter_scores[label] + (1 - alpha) * finbert_scores[label]
        for label in ["positive", "neutral", "negative"]
    }
    return combined


In [6]:
texts = [
    "Tesla will skyrocket tomorrow!",
    "TSLA stock is facing serious risks.",
    "Revenue grew 30% this quarter, a strong signal.",
    "Get in now before it jumps!",
    "Tesla is probably overvalued.",
]

for text in texts:
    result = get_combined_sentiment(text)
    print(f"{text}\n→ Positive: {result['positive']:.4f}, Neutral: {result['neutral']:.4f}, Negative: {result['negative']:.4f}\n")


Tesla will skyrocket tomorrow!
→ Positive: 0.8989, Neutral: 0.0963, Negative: 0.0048

TSLA stock is facing serious risks.
→ Positive: 0.0060, Neutral: 0.1632, Negative: 0.8308

Revenue grew 30% this quarter, a strong signal.
→ Positive: 0.9338, Neutral: 0.0651, Negative: 0.0011

Get in now before it jumps!
→ Positive: 0.2955, Neutral: 0.6035, Negative: 0.1011

Tesla is probably overvalued.
→ Positive: 0.0192, Neutral: 0.2151, Negative: 0.7657



加权平均也不准确，建立融合模型

In [7]:
import pandas as pd

df = pd.read_csv("C:\\Users\\86153\\financial-sentiment-analysis\\data.csv")

df = df.head(1000)

print(df.head())

                                            Sentence Sentiment
0  The GeoSolutions technology will leverage Bene...  positive
1  $ESI on lows, down $1.50 to $2.50 BK a real po...  negative
2  For the last quarter of 2010 , Componenta 's n...  positive
3  According to the Finnish-Russian Chamber of Co...   neutral
4  The Swedish buyout firm has sold its remaining...   neutral


In [8]:
finbert_model.eval()
twitter_model.eval()


# 2. 统一标签顺序：0 - negative, 1 - neutral, 2 - positive
id2label = {0: "negative", 1: "neutral", 2: "positive"}

# 因为 FinBERT 原始顺序是 {0: neutral, 1: positive, 2: negative}
# 所以我们要将它输出的向量 permute 成 [negative, neutral, positive]
def reorder_finbert_probs(probs):
    return np.array([probs[2], probs[0], probs[1]])

# 3. 定义预测函数
def get_probs(text, tokenizer, model, reorder=False):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = softmax(outputs.logits, dim=1).squeeze().numpy()
    if reorder:
        probs = reorder_finbert_probs(probs)
    return probs

# 4. 定义融合函数（可动态调权）
def adjust_alpha(text):
    text_lower = text.lower()

    # 偏向 FinBERT 的金融关键词
    fin_keywords = [
        "stock", "market", "profit", "nasdaq", "fed", "earnings",
        "inflation", "revenue", "guidance", "fomc", "qe", "rate hike"
    ]
    
    # 偏向 Twitter 的网络热词和情绪表达
    emotion_keywords = [
        "wow", "awesome", "lol", "😭", "🔥", "💥", "!", "omg", "insane",
        "crazy", "skyrocketing", "to the moon", "crash", "plummet", "surge",
        "collapse", "explode", "rally", "selloff", "moon", "🚀", "lmao"
    ]

    has_fin = any(word in text_lower for word in fin_keywords)
    has_emotion = any(word in text_lower for word in emotion_keywords)

    # 简化的判断逻辑
    if has_fin and not has_emotion:
        return 0.8  # 偏向 FinBERT
    elif has_emotion and not has_fin:
        return 0.3  # 偏向 Twitter
    else:
        return 0.5  # 含混或都不包含时保持中性融合

def merge_probs(p1, p2, alpha):
    return alpha * p1 + (1 - alpha) * p2
    
# 5. 执行模型预测与融合
finbert_probs = []
twitter_probs = []
merged_probs = []
merged_labels = []
merged_confidences = []

print("Processing...")

for text in tqdm(df["Sentence"]):
    fb_p = get_probs(text, finbert_tokenizer, finbert_model, reorder=True)
    tw_p = get_probs(text, twitter_tokenizer, twitter_model)
    alpha = adjust_alpha(text)
    merged_p = merge_probs(fb_p, tw_p, alpha)
    label = id2label[np.argmax(merged_p)]
    confidence = np.max(merged_p)
    
    finbert_probs.append(fb_p)
    twitter_probs.append(tw_p)
    merged_probs.append(merged_p)
    merged_labels.append(label)
    merged_confidences.append(confidence)

# 6. 加入输出列
df["finbert_probs"] = finbert_probs
df["twitter_probs"] = twitter_probs
df["merged_probs"] = merged_probs
df["merged_sentiment"] = merged_labels
df["confidence"] = merged_confidences

Processing...


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [06:41<00:00,  2.49it/s]


In [9]:
# 7. 展示前几行结果
print(df[["Sentence", "merged_sentiment", "confidence"]].head())

                                            Sentence merged_sentiment  \
0  The GeoSolutions technology will leverage Bene...         positive   
1  $ESI on lows, down $1.50 to $2.50 BK a real po...         negative   
2  For the last quarter of 2010 , Componenta 's n...         positive   
3  According to the Finnish-Russian Chamber of Co...          neutral   
4  The Swedish buyout firm has sold its remaining...          neutral   

   confidence  
0    0.822221  
1    0.569238  
2    0.827364  
3    0.911201  
4    0.951154  


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 假设你已经有 df，其中有 Sentence 和 merged_sentiment 两列
# merged_sentiment 的标签为字符串："positive", "neutral", "negative"
X = df['Sentence']
y = df['merged_sentiment']

# 人工标注样本（字符串标签版）
positive_samples = [
    "This stock is absolutely going to the moon! 🚀🚀🚀",
    "Buy now or regret forever! This is the next Amazon!",
    "Unstoppable rally incoming, don’t miss out!",
    "Profits will explode beyond imagination!",
    "This company will dominate the market, no doubt!",
    "Incredible growth ahead — sell before it’s too late!",
    "The best investment you’ll ever make, guaranteed!",
    "Skyrocketing past all expectations, pure gold!",
    "Everyone’s talking about this unstoppable surge!",
    "Get rich quick with this unbelievable opportunity!",
    "Tesla's earnings exceeded expectations this quarter!",
    "NVDA is showing strong growth potential in AI chips.",
    "Market is optimistic about Apple's new product launch.",
    "Strong revenue growth reported by Microsoft this year.",
    "This stock is a solid long-term buy with great fundamentals.",
    "Investors are excited about the latest earnings call.",
    "The company's guidance points to a promising future.",
    "Positive momentum continues in the tech sector.",
    "Profits have steadily increased despite market volatility.",
    "The stock rallied after better-than-expected sales numbers.",
    "Absolutely phenomenal earnings! Best company ever! 🚀🔥",
    "We are going to the moon! 📈💎🙌",
    "Record-breaking performance again this quarter! 💰💰",
    "Huge potential in this stock, just getting started! 🚀",
    "Great management, consistent growth 📊👍",
    "Unbelievable rally today! We're printing money! 💸🚀",
    "Just bought more shares. This is the next big thing! 🔥💯",
    "Massive growth potential. Long and strong! 💪📈",
    "Solid fundamentals, strong earnings, great future! 🧠✨",
    "CEO nailed the interview. Confidence through the roof! 🎤🙌",
    "Another all-time high! Cheers to everyone holding! 🥂🚀",
    "Analysts are bullish and I'm all in! 🐂💰",
    "Up 10% in a day?? Love this stock. ❤️📈",
    "Dividend increased again! Let's gooo 🔥📊",
    "Perfect dip buying opportunity — moon incoming 🌕💎"
]
positive_labels = ["positive"] * len(positive_samples)

neutral_samples = [
    "Tesla announced its quarterly earnings today.",
    "The company reported revenue figures inline with forecasts.",
    "Market conditions remain stable with no major changes.",
    "Apple released its updated product specifications.",
    "Microsoft's stock price fluctuated slightly during trading.",
    "The report provides an overview of recent company activities.",
    "Investors are awaiting more data before making decisions.",
    "Economic indicators showed mixed results this week.",
    "The quarterly report includes details on expenses and income.",
    "The stock closed flat after a day of moderate trading.",
    "The stock is up, but I'm not convinced it's sustainable.",  # 看涨，但怀疑
    "Good earnings, yet the market doesn't seem excited.",      # 正向财报，市场反应冷淡
    "This might be a bubble, but profits look strong.",         # 警惕泡沫但数据好
    "The rally continues despite some red flags.",              # 上涨但有风险
    "I’m cautiously optimistic, but things could turn bad fast.",# 小心乐观，夹杂担忧
    "Investors seem split on whether this is a good buy.",      # 投资者意见分歧
    "Solid fundamentals, yet the price is not moving much.",    # 基本面好，股价平稳
    "Looks promising, but too soon to tell.",                   # 有潜力，但不确定
    "The company’s guidance is unclear and confusing.",         # 指引模糊
    "Mixed signals from the market, waiting on next quarter.",  # 市场信号混合
    "They say this is the next big thing, but I’m skeptical.",  # 大家都说好，自己怀疑
    "Huge potential losses if this doesn’t pan out.",           # 警告风险但也期待收益
    "Positive news overshadowed by broader market fears.",      # 好消息被大环境压制
    "The hype might be overblown, but some value is there.",    # 炒作可能夸大，但有价值
    "Strong sales, yet margins are shrinking.", 
    "The company reported Q3 revenue of $5.2 billion.",
    "Trading volume remained consistent with the weekly average.",
    "The board approved a 2-for-1 stock split.",
    "Earnings call is scheduled for Thursday at 5PM EST.",
    "New CFO appointed after previous one stepped down.",
    "Shares closed flat after mild volatility during the day.",
    "Analysts maintain a 'hold' rating on the stock.",
    "Company filed a 10-K with the SEC today.",
    "Stock moved sideways amid lack of news.",
    "Market awaits Fed decision before major moves."
]
neutral_labels = ["neutral"] * len(neutral_samples)

negative_samples = [
    "TSLA faces significant headwinds due to supply chain issues.",
    "Investors worry about disappointing earnings results.",
    "The stock dropped sharply amid market uncertainty.",
    "Poor revenue growth raises concerns among analysts.",
    "The company is struggling to meet its financial targets.",
    "Negative sentiment increased following management changes.",
    "The outlook remains weak given recent regulatory challenges.",
    "Shares plunged after disappointing guidance was released.",
    "There are serious doubts about the company’s future prospects.",
    "Profit warnings have caused panic selling in the market.",
    "This company is a ticking time bomb! Total disaster!",
    "Sell now before it crashes to zero!",
    "Absolutely doomed, no chance of recovery!",
    "Investors are being scammed, don’t fall for it!",
    "Worst financial disaster in decades, avoid at all costs!",
    "This stock is burning money faster than you can imagine!",
    "Complete collapse imminent, prepare for losses!",
    "This is financial suicide, run away now!",
    "Massive red flags everywhere, a total joke!",
    "Bankruptcy is just around the corner, beware!",
    "Sell everything now. We're heading for a crash. 💥📉",
    "This is a scam. Avoid at all costs! 😡🚫",
    "Bankruptcy is coming. It's over. 💀📉",
    "Terrible leadership, bleeding cash quarter after quarter.",
    "This company is garbage. Can't believe it's still trading. 🤢",
    "Get out while you can. This is a total disaster! 😱📉",
    "Red flags everywhere. This will collapse soon. 🚨💥",
    "How is this company still alive? Burning cash nonstop. 🔥💀",
    "Worst earnings report I've seen. CEO should resign. 🤬",
    "Big dump coming. I'm out. 😤📉",
    "Panic selling in full swing. Brace yourself. 😰📉",
    "Charts look horrible. This is going to zero. 📉🕳️",
    "Manipulated garbage. I'm done with this. 🚫🤡",
    "Stock is tanking and no one seems to care. 😓",
    "This is financial suicide. Do not buy this trash! 🗑️"
]
negative_labels = ["negative"] * len(negative_samples)


In [11]:
# 1. 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 假设你原本的训练集是X_train, y_train
X_train = list(X_train) + positive_samples + neutral_samples + negative_samples
y_train = list(y_train) + positive_labels + neutral_labels + negative_labels

# 2. 文本转为向量（TF-IDF）
vectorizer = TfidfVectorizer(max_features=3000)  # 可调整
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 3. 训练随机森林
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_vec, y_train)

# 4. 预测
y_pred = clf.predict(X_test_vec)

# 5. 评估
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.715
              precision    recall  f1-score   support

    negative       0.58      0.48      0.53        29
     neutral       0.73      0.95      0.83       127
    positive       0.80      0.18      0.30        44

    accuracy                           0.71       200
   macro avg       0.70      0.54      0.55       200
weighted avg       0.72      0.71      0.67       200



| 维度    | 现在的做法（RandomForest） | 原本给的做法（BERT微调）  |
| ----- | -------------------- | ---------------- |
| 文本理解力 | 词袋模型，不理解上下文          | 深度语义理解，考虑上下文与语气  |
| 特征工程  | 人工特征（TF-IDF）         | 自动抽取深层特征         |
| 精度潜力  | 有限         | 有望突破90%，尤其对复杂表达  |
| 可扩展性  | 对其他领域泛化弱             | 可在其他领域直接迁移       |
| 部署成本  | 快速简单，轻量模型            | 成本略高，需 GPU 或优化部署 |
| 数据需求  | 不需要太多                | 微调效果更好时数据越多越稳    |


In [12]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# === Step 1: 数据预处理与标签映射 ===
label_map = {"negative": 0, "neutral": 1, "positive": 2}
inverse_label_map = {v: k for k, v in label_map.items()}

# 假设你已经有 df，其中包含 'Sentence' 和 'merged_sentiment' 列（标签为字符串）
df['label_id'] = df['merged_sentiment'].map(label_map)

X_train, X_test, y_train, y_test = train_test_split(
    df['Sentence'].tolist(),
    df['label_id'].tolist(),
    test_size=0.2,
    random_state=42
)

In [13]:
# === Step 2: 数据集封装类 ===
model_name = "bert-base-uncased" 

tokenizer = BertTokenizer.from_pretrained(model_name)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# === Step 3: 数据加载器 ===
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
test_dataset = SentimentDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


In [14]:
# === Step 4: 模型训练 ===
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 三分类
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
for epoch in range(4):  # 训练3个epoch
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.8518
Epoch 2, Loss: 0.5148
Epoch 3, Loss: 0.2226
Epoch 4, Loss: 0.0974


In [15]:
# === Step 5: 模型评估 ===
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# 准确率与详细分类报告
print("Accuracy:", accuracy_score(all_labels, all_preds))

# 将数字标签映射回文字
print(classification_report(
    [inverse_label_map[x] for x in all_labels],
    [inverse_label_map[x] for x in all_preds]
))


Accuracy: 0.89
              precision    recall  f1-score   support

    negative       0.80      0.97      0.88        29
     neutral       0.97      0.87      0.92       127
    positive       0.76      0.89      0.82        44

    accuracy                           0.89       200
   macro avg       0.85      0.91      0.87       200
weighted avg       0.90      0.89      0.89       200



In [16]:
# === Step 6: 模型保存（保存到新的路径） ===
save_path = "FinSent_Detector"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('FinSent_Detector\\tokenizer_config.json',
 'FinSent_Detector\\special_tokens_map.json',
 'FinSent_Detector\\vocab.txt',
 'FinSent_Detector\\added_tokens.json')

In [17]:
import pandas as pd
import random

# 数字标签到文本标签的映射
id2label = {0: "negative", 1: "neutral", 2: "positive"}

# 把原始文本、真实标签和预测结果拼成一个DataFrame
results_df = pd.DataFrame({
    'sentence': X_test,
    'true_label': [id2label[label] for label in y_test],
    'predicted': [id2label[pred] for pred in all_preds]
})

# 随机查看几条预测效果
samples = results_df.sample(5, random_state=42)
for idx, row in samples.iterrows():
    print(f"Text: {row['sentence']}")
    print(f"True label: {row['true_label']} | Predicted: {row['predicted']}")
    print("-" * 80)


Text: Finnish software developer Basware Oyj said on November 30 , 2006 its U.S. subsidiary Basware , Inc. won an order to provide software for contract lifecycle management to an unnamed U.S. medical technology company .
True label: neutral | Predicted: neutral
--------------------------------------------------------------------------------
Text: Word on the street is that Allergen is looking at Endo International after the failed Pfizer merger. May-20 $35 calls active. $ENDP
True label: neutral | Predicted: neutral
--------------------------------------------------------------------------------
Text: Looks like its booking a one way ticket to its 40 week MA near 50. Losing 10 week here $LULU http://chart.ly/7xb9h9b
True label: neutral | Predicted: negative
--------------------------------------------------------------------------------
Text: $VIPS similar pattern like beginning of May. Did u sell? Same now..will go up much higher after this drop.
True label: neutral | Predicted: neut

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# 加载模型
model_path = "FinSent_Detector"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # 不训练了，只推理

def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probs, dim=1).item()
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[predicted_class]

text = "Get in now before it jumps!"
print("输入文本：", text)
print("模型判断情绪：", predict_sentiment(text))


输入文本： Get in now before it jumps!
模型判断情绪： Neutral
