In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Hugging Face checkpoint used for every model artefact below.
# Swap in the commented line to try the alternative checkpoint.
modelName = "yiyanghkust/finbert-tone"
# modelName = "ProsusAI/finbert"

# Tokenizer and classifier are both loaded from the same checkpoint name;
# the classifier is configured with three output labels.
tokenizer = BertTokenizer.from_pretrained(modelName)
finbert = BertForSequenceClassification.from_pretrained(modelName, num_labels=3)

# Sentiment pipeline wrapping the model + tokenizer. Truncation is enabled
# with a 512 max_length so over-long inputs are cut rather than rejected.
nlp = pipeline(
    task="sentiment-analysis",
    model=finbert,
    tokenizer=tokenizer,
    truncation=True,
    max_length=512,
)

In [13]:
# Small sanity check: run the pipeline on a few hand-written finance sentences.
sentences = [
    "there is a shortage of capital, and we need extra financing",
    "growth is strong and we have plenty of liquidity",
    "there are doubts about our finances",
    "profits are flat",
]

# Keep only the predicted label from each pipeline result.
results = [prediction['label'] for prediction in nlp(sentences)]

# Pair every sentence with its predicted label and show one pair per line.
newRes = list(zip(sentences, results))

for sentence_and_label in newRes:
    print(sentence_and_label)

('there is a shortage of capital, and we need extra financing', 'Negative')
('growth is strong and we have plenty of liquidity', 'Positive')
('there are doubts about our finances', 'Negative')
('profits are flat', 'Neutral')


In [14]:
import pandas as pd
import re

# Read the labelled comment dataset from the working directory and preview
# its first rows.
df = pd.read_csv(filepath_or_buffer="crypto_currency_sentiment_dataset.csv")
print("Dataset Loaded!\n", df.head(n=5))

## SBert -- PCA -- XGB
def clean_text(text):
    """Strip URLs and @-mentions from a comment and normalise its whitespace.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The comment with ``http...``/``www...`` tokens and ``@name`` mentions
        removed, every run of whitespace collapsed to a single space, and
        leading/trailing whitespace stripped.
    """
    # In a raw string the escape must be a single backslash: r"\S" is the
    # "non-whitespace" class, whereas r"\\S" matches a literal backslash
    # followed by "S" and therefore never removes a real URL.
    text = re.sub(r"http\S+|www\S+", "", text)
    # Mentions: "@" followed by ASCII letters/digits.
    text = re.sub(r"@[A-Za-z0-9]+", "", text)
    # Removing tokens leaves gaps; squeeze them and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Cleaned copy of each comment (values are cast to str before cleaning).
df["Clean_Comment"] = df["Comment"].astype(str).map(clean_text)

# Numeric target: Positive -> 1, Negative -> 0. Series.map leaves NaN for
# any Sentiment value that is not one of these two keys.
df["Label"] = df["Sentiment"].map(dict(Positive=1, Negative=0))

print("Dataset Transformed!\n", df.head(n=5))

Dataset Loaded!
         user_id                                            Comment Sentiment  \
0  XYNN2Y4VCF3G  I bought 2200 at the ico, at 0.50$ per coin. H...  Positive   
1  DR6XNZMT9KRH  Harmony one , algorand , Cardano, solana , vec...  Positive   
2  9FCQGMYD4A42  Honestly, after reading this post and many of ...  Negative   
3  QEZAEMV2WF9D  In bear market is where money is made. I Will ...  Positive   
4  Z7J7W3XCP4XC  Funny how people think Bitcoin's risk is compa...  Negative   

                                          Reddit URL  
0  https://www.reddit.com/r/Avax/comments/uzggar/...  
1  https://www.reddit.com/r/CryptoCurrency/commen...  
2  https://www.reddit.com/r/CryptoCurrency/commen...  
3  https://www.reddit.com/r/CryptoCurrency/commen...  
4  https://www.reddit.com/r/investing/comments/um...  
Dataset Transformed!
         user_id                                            Comment Sentiment  \
0  XYNN2Y4VCF3G  I bought 2200 at the ico, at 0.50$ per coin. H...  Po

In [15]:
# Run the sentiment pipeline over every cleaned comment in one call.
comments = df["Clean_Comment"].tolist()
predicted_op = nlp(comments)

# Keep just the label string from each pipeline result.
predicted_sentiments = [prediction['label'] for prediction in predicted_op]

# Quick look: how many predictions came back, and the first ten labels.
print(len(predicted_sentiments))
print(predicted_sentiments[:10])

562
['Positive', 'Neutral', 'Neutral', 'Neutral', 'Neutral', 'Negative', 'Neutral', 'Positive', 'Neutral', 'Positive']


In [24]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Collapse the three predicted classes to binary: 'Positive' -> 1, anything
# else -> 0, so 'Neutral' is grouped with 'Negative'. The alternative grouping
# (treating 'Neutral' as 'Positive') would map it to 1 here instead.
predicted_sentiments2 = [int(sentiment == 'Positive') for sentiment in predicted_sentiments]

print(len(predicted_sentiments2))
print(predicted_sentiments2[:10])

all_labels = df['Label'].to_list()

# Stratified 90/10 split of (prediction, label) pairs with a fixed seed.
# Only the 10% side (pred_sents2 / labels2) is scored below.
pred_sents1, pred_sents2, labels1, labels2 = train_test_split(
    predicted_sentiments2,
    all_labels,
    test_size=0.1,
    stratify=all_labels,
    random_state=42,
)

# Metrics on the 10% slice. 'weighted' averages per-class scores by support;
# 'macro' is the other usual choice. Note: accuracy is computed but is not
# printed by this cell.
accuracy = accuracy_score(y_true=labels2, y_pred=pred_sents2)
precision = precision_score(y_true=labels2, y_pred=pred_sents2, average='weighted')
recall = recall_score(y_true=labels2, y_pred=pred_sents2, average='weighted')
f1 = f1_score(y_true=labels2, y_pred=pred_sents2, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

562
[1, 0, 0, 0, 0, 0, 0, 1, 0, 1]
Precision: 0.7612
Recall: 0.6491
F1 Score: 0.6225
