In [None]:
# !pip install vaderSentiment
# !pip install datasets


In [None]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,confusion_matrix, f1_score, precision_score, recall_score

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification, BertForSequenceClassification
from transformers import AutoTokenizer, DistilBertTokenizer, BertTokenizer
from transformers import Trainer, TrainingArguments
from datasets import Dataset


import torch
from torch.utils.data import DataLoader

import time
import re
from collections import Counter

In [None]:
# Fetch the NLTK resources used further down: the English stop-word list
# (stop_words_removal) and WordNet (lemmatize_text).
# NOTE(review): 'vader_lexicon' serves NLTK's own VADER port, whereas this notebook
# imports SentimentIntensityAnalyzer from the vaderSentiment package — this
# download looks unnecessary; confirm before removing.
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Analysing the performance of different sentiment analyzer tools using a manually labeled dataset

In [None]:
# Labelled dataset: news items with a manually assigned sentiment in 'actual_label'.
# The path is relative to the notebook's working directory.
labeled_bitcointalk_sample_path = 'data/news_manually_labeled_sentiments.csv'
df = pd.read_csv(labeled_bitcointalk_sample_path)


In [None]:
# Preview the labelled data (columns: date, headline, description, actual_label).
df

Unnamed: 0,date,headline,description,actual_label
0,2016-01-17,The Capital of Lithuania to Host Baltics Larg...,- The Capital of Lithuania to Host Baltics La...,Positive
1,2015-10-17,MMA Welterweight Jon Fitch Fights for Bitcoin...,"MMA Welterweight Jon Fitch Fights for Bitcoin,...",Positive
2,2015-12-18,State Street Financials Crypto-FinTech Univer...,The technology researched will be focused on c...,Positive
3,2013-12-10,Dollar Vigilante - BITCOINS EVOLUTION WILL BE...,BITCOINS EVOLUTION WILL BE SWIFThttp://dollar...,Positive
4,2013-08-27,Bitcoin offers privacy -- as long as you don't...,http://www.pcworld.com/article/2047608/bitcoin...,Neutral
...,...,...,...,...
994,2016-07-08,OpenBazaar 2.0 Offers Significant Improvements,Solutions such as OpenBazaar will stand or fal...,Positive
995,2015-11-17,WSJ: Kleiner Perkins Makes First Bitcoin-Rela...,Kleiner Perkins Makes First Bitcoin-Related De...,Positive
996,2014-04-07,GIGom / New bitcoin debit card claims to work ...,Watch out! Getting cash from Bitcoins may have...,Positive
997,2017-06-19,Chinese Miners Announce Accelerated Developmen...,Chinese Miners Announce Accelerated Developmen...,Neutral


In [None]:
# DataFrame that collects one (Model, Accuracy) row per evaluated approach.
# NOTE(review): the following cell rebinds this name, so this empty frame only
# matters when that cell is skipped.
sentiment_analyser_comparison = pd.DataFrame()

In [None]:
# Results file (relative to the notebook's working directory) holding the
# accuracy table saved by an earlier run. Rows loaded here are kept and the
# cells below append to them.
sentiment_analyser_comparison_path = 'data/sentiment_analyser_model_comparison_news.csv'
try:
    sentiment_analyser_comparison = pd.read_csv(sentiment_analyser_comparison_path)
except FileNotFoundError:
    # First run / fresh checkout: nothing has been saved yet, so start from an
    # empty table instead of aborting the notebook.
    sentiment_analyser_comparison = pd.DataFrame()

VADER SENTIMENT ANALYSIS

In [None]:
def vader_sentiment_analysis(df,column=1):

  """
    Classify the text of every row as 'Bullish', 'Bearish' or 'Neutral' with VADER.

    Parameters:
    df(DataFrame): DataFrame with news dataset
    column(int or str): Key used to pick the text out of each row (``row[column]``)

    Returns:
    vader_sentiments(List) : List of sentiments, one per row, in row order
  """
  # The analyzer does not depend on the row, so build it once instead of
  # once per row.
  vader = SentimentIntensityAnalyzer()

  vader_sentiments = []
  for index,row in df.iterrows():
    # polarity_scores returns a dictionary with pos, neg, neu and compound
    # scores; only the compound score is used for the decision.
    vader_sentiment_dict = vader.polarity_scores(row[column])

    # decide sentiment as positive, negative and neutral by comparing to a threshold
    if vader_sentiment_dict['compound'] >= 0.05 :
      vader_sentiment = "Bullish"
    elif vader_sentiment_dict['compound'] <= - 0.05 :
      vader_sentiment = "Bearish"
    else :
      vader_sentiment = "Neutral"

    vader_sentiments.append(vader_sentiment)

  return vader_sentiments

TEXTBLOB


In [None]:
def textblob_sentiment(df,column=1):
  """
    Analyzes the sentiment of text data in a DataFrame column using TextBlob and classifies
    the sentiment as 'Bullish', 'Bearish', or 'Neutral' based on the polarity score.

    A polarity above 0.5 (clearly positive text) is 'Bullish', a polarity below
    -0.5 (clearly negative text) is 'Bearish', everything in between is 'Neutral'.

    Parameters:
    ----------
    df : pandas.DataFrame
        The DataFrame containing the dataset with text data to analyze.

    column : int or str, optional (default=1)
        Key used to pick the text out of each row (``row[column]``).

    Returns:
    -------
    textblob_sentiments : list
        A list of sentiment classifications ('Bearish', 'Bullish', or 'Neutral') for each row in the DataFrame.

  """
  # NOTE(review): the cut-off here is 0.5 while the VADER helper uses 0.05 —
  # confirm the wider neutral band is intended.
  textblob_sentiments = []
  for index,row in df.iterrows():
    # Polarity of the text in the selected column of the current row.
    polarity = TextBlob(row[column]).sentiment.polarity
    # Positive polarity is good news (Bullish); the labels were previously swapped.
    if polarity > 0.5:
      sentiment = "Bullish"
    elif polarity < -0.5:
      sentiment = "Bearish"
    else:
      sentiment = "Neutral"

    textblob_sentiments.append(sentiment)

  return textblob_sentiments


BERT SENTIMENT ANALYSIS

In [None]:
def preprocess(texts,tokenizer):
    """
    Tokenize a batch of texts for model input.

    Parameters:
    texts(list of str): Strings to tokenize
    tokenizer(callable): Tokenizer invoked as ``tokenizer(texts, **options)``

    Returns:
    Whatever the tokenizer returns for the batch (PyTorch tensors are requested,
    padded to a common length and truncated to at most 512 tokens)
    """
    tokenizer_options = {
        'return_tensors': 'pt',
        'padding': True,
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(texts, **tokenizer_options)



In [None]:

def _resolve_bert_label_map(bert_model, label_map=None):
    """Choose the class-index -> sentiment mapping used to decode model output."""
    default_label_map = {0: "Bearish", 1: "Neutral", 2: "Bullish"}
    if label_map is not None:
        return label_map

    # Checkpoints do not share one class order, so prefer the class names the
    # checkpoint declares for itself (config.id2label) when they are recognisable.
    canonical = {
        "bearish": "Bearish", "negative": "Bearish",
        "neutral": "Neutral",
        "bullish": "Bullish", "positive": "Bullish",
    }
    id2label = getattr(getattr(bert_model, "config", None), "id2label", None) or {}
    resolved = {}
    for class_index, class_name in id2label.items():
        sentiment = canonical.get(str(class_name).strip().lower())
        if sentiment is None:
            # Unrecognised class names carry no sentiment meaning: keep the
            # historical order rather than guessing.
            return default_label_map
        resolved[int(class_index)] = sentiment
    return resolved or default_label_map


def bert_sentiment_analysis(df, column, bert_model=None, tokenizer=None, label_map=None):
    """
    Takes a dataframe and returns the sentiment of each text using a BERT-style classifier

    Parameters:
    df(pd.DataFrame()): Dataframe containing texts for analysing sentiment
    column(str): Label of the DataFrame column containing the texts
    bert_model: Sequence-classification model, called as ``bert_model(**inputs)``;
        its output must expose ``.logits``
    tokenizer: Tokenizer matching ``bert_model`` (passed on to ``preprocess``)
    label_map(dict, optional): Class index -> sentiment label. When omitted, the
        mapping is derived from ``bert_model.config.id2label`` if its names are
        recognisable, otherwise {0: "Bearish", 1: "Neutral", 2: "Bullish"} is used

    Returns:
    bert_sentiments(list): List of sentiment labels corresponding to texts

    Raises:
    ValueError: If ``bert_model`` or ``tokenizer`` is missing
    """
    if bert_model is None or tokenizer is None:
        raise ValueError("bert_sentiment_analysis requires both bert_model and tokenizer")

    batch_size = 100
    index_to_sentiment = _resolve_bert_label_map(bert_model, label_map)
    # Inference only: make sure dropout and similar layers are disabled.
    bert_model.eval()

    bert_sentiments = []
    for start in range(0, len(df), batch_size):
        # Rows [start, start + batch_size) of the text column as a plain list
        texts = df[start:start + batch_size][column].to_list()
        inputs = preprocess(texts, tokenizer)
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # argmax over the logits picks the same class as argmax over the
        # softmax probabilities, so the softmax is not needed.
        predicted = torch.argmax(outputs.logits, dim=1)
        bert_sentiments.extend(index_to_sentiment[class_index] for class_index in predicted.tolist())
    return bert_sentiments

DATA PRE-PROCESSING

In [None]:
def remove_tags(string):
    """
    Strip markup and noise from a text and lower-case it.

    Parameters:
    string(str): Raw text

    Returns:
    result(str): Text with HTML tags and URLs removed, every character that is
    not an ASCII letter or whitespace replaced by a space, converted to lower case
    """
    result = re.sub(r'<[^>]*>', '', string)                  #remove HTML tags
    result = re.sub(r'https?://\S+|www\.\S+', '', result)    #remove URLs
    result = re.sub(r'[^a-zA-Z\s]', ' ', result)             #replace non-letters (digits, punctuation) with a space
    return result.lower()


In [None]:
def stop_words_removal(text):
    """
    Drop English stop words from a sentence.

    Parameters:
    text(str): Sentence from which stopwords are to be removed

    Returns:
    str: The remaining words, in their original order and casing, joined by
    single spaces
    """
    english_stop_words = set(stopwords.words('english'))
    kept_words = []
    for token in text.split():
        # Comparison is case-insensitive; the token itself is kept unchanged.
        if token.lower() in english_stop_words:
            continue
        kept_words.append(token)
    return " ".join(kept_words)

In [None]:

def lemmatize_text(text):
    """
    Lemmatize every whitespace-separated word of a text.

    Parameters:
    text(str): Text which is to be lemmatized

    Returns:
    str: The lemmatized words, each followed by a single space (so a non-empty
    result ends with a trailing space)
    """
    tokens = nltk.tokenize.WhitespaceTokenizer().tokenize(text)
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return "".join(lemmatizer.lemmatize(token) + " " for token in tokens)


In [None]:
# Translate the annotators' labels into the market vocabulary used by the models.
label = {
    'Negative':'Bearish',
    'Neutral' : 'Neutral',
    'Positive' : 'Bullish'
}
# replace() leaves values that are not keys of `label` untouched, so re-running
# this cell keeps already-translated labels instead of turning them into NaN
# (which map() would do).
df['actual_label'] = df['actual_label'].replace(label)

In [None]:
# Combine headline and description into one text. Missing values are treated as
# empty strings (as done for the unlabelled BitcoinTalk data) so that a single
# NaN does not make the whole 'news' value NaN and break the text cleaning.
df['news'] = df['headline'].fillna('') + ' : ' + df['description'].fillna('')

In [None]:
# Cleaning pipeline: strip markup/URLs/non-letters, drop stop words, lemmatize.
df['processed_news'] = (
    df['news']
    .apply(remove_tags)
    .apply(stop_words_removal)
    .apply(lemmatize_text)
)

In [None]:
# Check the mapped labels and the new 'news' / 'processed_news' columns.
df.head()

Unnamed: 0,date,headline,description,actual_label,news,processed_news
0,2016-01-17,The Capital of Lithuania to Host Baltics Larg...,- The Capital of Lithuania to Host Baltics La...,Bullish,The Capital of Lithuania to Host Baltics Larg...,capital lithuania host baltic largest blockcha...
1,2015-10-17,MMA Welterweight Jon Fitch Fights for Bitcoin...,"MMA Welterweight Jon Fitch Fights for Bitcoin,...",Bullish,MMA Welterweight Jon Fitch Fights for Bitcoin...,mma welterweight jon fitch fight bitcoin embra...
2,2015-12-18,State Street Financials Crypto-FinTech Univer...,The technology researched will be focused on c...,Bullish,State Street Financials Crypto-FinTech Univer...,state street financial crypto fintech universi...
3,2013-12-10,Dollar Vigilante - BITCOINS EVOLUTION WILL BE...,BITCOINS EVOLUTION WILL BE SWIFThttp://dollar...,Bullish,Dollar Vigilante - BITCOINS EVOLUTION WILL BE...,dollar vigilante bitcoin evolution swift bitco...
4,2013-08-27,Bitcoin offers privacy -- as long as you don't...,http://www.pcworld.com/article/2047608/bitcoin...,Neutral,Bitcoin offers privacy -- as long as you don't...,bitcoin offer privacy long cash spend bitcoin ...


In [None]:
# Unlabelled dataset, prepared exactly like the labelled one.
bitcointalk = pd.read_csv("BitcoinTalk.csv")

# Missing headline/description values become empty strings so the
# concatenation below never yields NaN.
for text_column in ('headline', 'description'):
    bitcointalk[text_column] = bitcointalk[text_column].fillna('')

# 'news' = headline and description joined into a single text.
bitcointalk['news'] = bitcointalk['headline'] + ' : ' + bitcointalk['description']

# Cleaning pipeline: strip markup/URLs/non-letters, drop stop words, lemmatize.
bitcointalk['processed_news'] = (
    bitcointalk['news']
    .apply(remove_tags)
    .apply(stop_words_removal)
    .apply(lemmatize_text)
)

In [None]:
# Check the combined and cleaned text columns of the unlabelled data.
bitcointalk.head()

Unnamed: 0,date,headline,headline_link,description,news,processed_news
0,2024-09-12,"Microstrategy Buys 18,300 More Bitcoins, ...",https://bitcointalk.org/index.php?topic=5509183.0,"Microstrategy Buys 18,300 More Bitcoins, Boost...","Microstrategy Buys 18,300 More Bitcoins, ... ...",microstrategy buy bitcoins microstrategy buy b...
1,2024-09-12,Cleanspark Expands Bitcoin Mining Operations ...,https://bitcointalk.org/index.php?topic=5509047.0,Cleanspark Expands Bitcoin Mining Operations b...,Cleanspark Expands Bitcoin Mining Operations ...,cleanspark expands bitcoin mining operation cl...
2,2024-09-11,Standard Chartered Launches Bitcoin and Ether...,https://bitcointalk.org/index.php?topic=5508911.0,Standard Chartered Launches Bitcoin and Ethere...,Standard Chartered Launches Bitcoin and Ether...,standard chartered launch bitcoin ethereum cus...
3,2024-09-10,Fractal Bitcoin Absorbs Over 35% of Bitcoins...,https://bitcointalk.org/index.php?topic=5508912.0,Fractal Bitcoin Absorbs Over 35% of Bitcoins ...,Fractal Bitcoin Absorbs Over 35% of Bitcoins...,fractal bitcoin absorbs bitcoin hashrate fract...
4,2024-09-10,Metaplanet Makes Significant Bitcoin Investment,https://bitcointalk.org/index.php?topic=5508833.0,"In Japan, the publicly traded company Metaplan...",Metaplanet Makes Significant Bitcoin Investme...,metaplanet make significant bitcoin investment...


# SENTIMENT ANALYSIS ON RAW NEWS

VADER model

In [None]:
# VADER label for every row, computed on the uncleaned 'news' text.
df['vader_sentiment_raw_news']= vader_sentiment_analysis(df,"news")

In [None]:
# One-row table with VADER's accuracy on raw news (measured on all labelled rows).
vader_accuracy_raw_news = pd.DataFrame([{
    'Model': 'VADER on raw news',
    'Accuracy': accuracy_score(df['actual_label'], df['vader_sentiment_raw_news']),
}])

# Append the row to the comparison table.
sentiment_analyser_comparison = pd.concat(
    [sentiment_analyser_comparison, vader_accuracy_raw_news],
    ignore_index=True,
)


TextBlob

In [None]:
# TextBlob label for every row, computed on the uncleaned 'news' text.
df['textblob_sentiment_raw_news'] = textblob_sentiment(df,"news")

In [None]:
# One-row table with TextBlob's accuracy on raw news (measured on all labelled rows).
textblob_accuracy_raw_news = pd.DataFrame([{
    'Model': 'TextBlob on raw news',
    'Accuracy': accuracy_score(df['actual_label'], df['textblob_sentiment_raw_news']),
}])

# Append the row to the comparison table.
sentiment_analyser_comparison = pd.concat(
    [sentiment_analyser_comparison, textblob_accuracy_raw_news],
    ignore_index=True,
)


TF-IDF approach

In [None]:
# Initialize the TF-IDF vectorizer.
tfidfconverter = TfidfVectorizer()

# Fit the vectorizer on the raw 'news' column of the DataFrame
# and transform the text into TF-IDF features.
# NOTE(review): the vectorizer is fitted on all rows, test rows included.
X = tfidfconverter.fit_transform(df['news'])
# Target: the Bearish / Neutral / Bullish labels. The numeric 'actual_label_num'
# column is only created in the fine-tuning section further down, so it does not
# exist yet when the notebook is run top to bottom; the classifier accepts the
# string labels directly.
y = df['actual_label']
# Split the data into training and testing sets (80% train, 20% test).
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
# Initialize the logistic regression model.
logistic_model_tfidf_raw_news = LogisticRegression()
# Train the logistic regression model using the training data.
logistic_model_tfidf_raw_news.fit(X_train,y_train)
# Make predictions on the test set.
y_pred = logistic_model_tfidf_raw_news.predict(X_test)
# Accuracy on the held-out 20% (the lexicon and BERT models above are scored on all rows).
tfidf_accuracy_raw_news = accuracy_score(y_test,y_pred)


In [None]:
# One-row table with the TF-IDF + logistic regression accuracy on raw news.
# The name is rebound here from the accuracy value to the one-row table.
tfidf_accuracy_raw_news = pd.DataFrame([{
    'Model': 'TF-IDF on raw news',
    'Accuracy': tfidf_accuracy_raw_news,
}])

# Append the row to the comparison table.
sentiment_analyser_comparison = pd.concat(
    [sentiment_analyser_comparison, tfidf_accuracy_raw_news],
    ignore_index=True,
)


BERT models

In [None]:
# CryptoBERT: tokenizer and sequence-classification model from the Hugging Face Hub.
cryptobert_tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
cryptobert_model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert")


# FinBERT (tone): tokenizer and sequence-classification model.
finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')


# DistilBERT base: tokenizer and a 3-class classification model.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# The classification head of this checkpoint is newly initialised (see the warning
# printed by this cell), so without fine-tuning its predictions are not meaningful.
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/932 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Label every row with each of the three transformer models, using the raw 'news' text.
# NOTE(review): class indices are decoded inside bert_sentiment_analysis; confirm
# each checkpoint's class order matches what it assumes (finbert-tone in particular).
df['cryptobert_sentiment_raw_news']= bert_sentiment_analysis(df,"news", cryptobert_model, cryptobert_tokenizer)
df['finbert_sentiment_raw_news'] = bert_sentiment_analysis(df,"news", finbert_model, finbert_tokenizer)
df['distilbert_sentiment_raw_news'] = bert_sentiment_analysis(df,"news", distilbert_model, distilbert_tokenizer)


In [None]:
# One-row accuracy tables for the three transformer models on raw news
# (each measured on all labelled rows).
cryptobert_accuracy_raw_news = pd.DataFrame([{
    'Model': 'CryptoBERT on raw news',
    "Accuracy": accuracy_score(df['actual_label'], df['cryptobert_sentiment_raw_news']),
}])

finbert_accuracy_raw_news = pd.DataFrame([{
    'Model': 'FinBERT on raw news',
    "Accuracy": accuracy_score(df['actual_label'], df['finbert_sentiment_raw_news']),
}])

distilbert_accuracy_raw_news = pd.DataFrame([{
    'Model': 'DistilBERT on raw news',
    "Accuracy": accuracy_score(df['actual_label'], df['distilbert_sentiment_raw_news']),
}])

# Append the three rows to the comparison table: CryptoBERT, FinBERT, DistilBERT.
sentiment_analyser_comparison = pd.concat(
    [
        sentiment_analyser_comparison,
        cryptobert_accuracy_raw_news,
        finbert_accuracy_raw_news,
        distilbert_accuracy_raw_news,
    ],
    ignore_index=True,
)


In [None]:
# Accuracy table after the raw-news experiments.
sentiment_analyser_comparison

Unnamed: 0,Model,Accuracy
0,VADER on raw news,0.580581
1,CryptoBERT on raw news,0.35035
2,FinBERT on raw news,0.248248
3,DistilBERT on raw news,0.258258


## SENTIMENT ANALYSIS ON PROCESSED NEWS

VADER model

In [None]:
# VADER label for every row, computed on the cleaned 'processed_news' text.
df['vader_sentiment_processed_news']= vader_sentiment_analysis(df,"processed_news")

In [None]:
# One-row table with VADER's accuracy on processed news (measured on all labelled rows).
vader_accuracy_processed_news = pd.DataFrame([{
    'Model': 'VADER on processed news',
    'Accuracy': accuracy_score(df['actual_label'], df['vader_sentiment_processed_news']),
}])

# Append the row to the comparison table.
sentiment_analyser_comparison = pd.concat(
    [sentiment_analyser_comparison, vader_accuracy_processed_news],
    ignore_index=True,
)


TextBlob

In [None]:
# TextBlob label for every row, computed on the cleaned 'processed_news' text.
df['textblob_sentiment_processed_news']= textblob_sentiment(df,"processed_news")

In [None]:
# One-row table with TextBlob's accuracy on processed news (measured on all labelled rows).
textblob_accuracy_processed_news = pd.DataFrame([{
    'Model': 'TextBlob on processed news',
    'Accuracy': accuracy_score(df['actual_label'], df['textblob_sentiment_processed_news']),
}])

# Append the row to the comparison table.
sentiment_analyser_comparison = pd.concat(
    [sentiment_analyser_comparison, textblob_accuracy_processed_news],
    ignore_index=True,
)


TF-IDF Approach

In [None]:
# Initialize the TF-IDF vectorizer.
tfidfconverter = TfidfVectorizer()

# Fit the vectorizer on the 'processed_news' column of the DataFrame
# and transform the text into TF-IDF features.
# NOTE(review): the vectorizer is fitted on all rows, test rows included.
X = tfidfconverter.fit_transform(df['processed_news'])
# Target: the Bearish / Neutral / Bullish labels. The numeric 'actual_label_num'
# column is only created in the fine-tuning section further down, so it does not
# exist yet when the notebook is run top to bottom; the classifier accepts the
# string labels directly.
y = df['actual_label']
# Split the data into training and testing sets (80% train, 20% test).
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
# Initialize the logistic regression model.
logistic_model_tfidf_processed_news = LogisticRegression()
# Train the logistic regression model using the training data.
logistic_model_tfidf_processed_news.fit(X_train,y_train)
# Make predictions on the test set.
y_pred = logistic_model_tfidf_processed_news.predict(X_test)
# Accuracy on the held-out 20% (the lexicon and BERT models are scored on all rows).
tfidf_accuracy_processed_news = accuracy_score(y_test,y_pred)


In [None]:
# One-row table with the TF-IDF + logistic regression accuracy on processed news.
# The name is rebound here from the accuracy value to the one-row table.
tfidf_accuracy_processed_news = pd.DataFrame([{
    'Model': 'TF-IDF on processed news',
    'Accuracy': tfidf_accuracy_processed_news,
}])

# Append the row to the comparison table.
sentiment_analyser_comparison = pd.concat(
    [sentiment_analyser_comparison, tfidf_accuracy_processed_news],
    ignore_index=True,
)


BERT models

In [None]:
# NOTE(review): this cell loads the same three checkpoints that the raw-news
# section already loaded into the same names; the reload looks redundant.
# CryptoBERT: tokenizer and sequence-classification model.
cryptobert_tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
cryptobert_model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert")


# FinBERT (tone): tokenizer and sequence-classification model.
finbert_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert_model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')


# DistilBERT base: tokenizer and a 3-class classification model.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# The classification head is newly initialised on every load (see the warning
# printed by this cell), so this DistilBERT differs from the one used on raw news.
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Label every row with each of the three transformer models, using the cleaned text.
# NOTE(review): class indices are decoded inside bert_sentiment_analysis; confirm
# each checkpoint's class order matches what it assumes (finbert-tone in particular).
df['cryptobert_sentiment_processed_news']= bert_sentiment_analysis(df,"processed_news", cryptobert_model, cryptobert_tokenizer)
df['finbert_sentiment_processed_news'] = bert_sentiment_analysis(df,"processed_news", finbert_model, finbert_tokenizer)
df['distilbert_sentiment_processed_news'] = bert_sentiment_analysis(df,"processed_news", distilbert_model, distilbert_tokenizer)


In [None]:
# One-row accuracy tables for the three transformer models on processed news
# (each measured on all labelled rows).
cryptobert_accuracy_processed_news = pd.DataFrame([{
    'Model': 'CryptoBERT on processed news',
    "Accuracy": accuracy_score(df['actual_label'], df['cryptobert_sentiment_processed_news']),
}])

finbert_accuracy_processed_news = pd.DataFrame([{
    'Model': 'FinBERT on processed news',
    "Accuracy": accuracy_score(df['actual_label'], df['finbert_sentiment_processed_news']),
}])

distilbert_accuracy_processed_news = pd.DataFrame([{
    'Model': 'DistilBERT on processed news',
    "Accuracy": accuracy_score(df['actual_label'], df['distilbert_sentiment_processed_news']),
}])

# Append the three rows to the comparison table: CryptoBERT, FinBERT, DistilBERT.
sentiment_analyser_comparison = pd.concat(
    [
        sentiment_analyser_comparison,
        cryptobert_accuracy_processed_news,
        finbert_accuracy_processed_news,
        distilbert_accuracy_processed_news,
    ],
    ignore_index=True,
)


In [None]:
# Accuracy table after the raw-news and processed-news experiments.
sentiment_analyser_comparison

Unnamed: 0,Model,Accuracy
0,VADER on raw news,0.580581
1,CryptoBERT on raw news,0.35035
2,FinBERT on raw news,0.248248
3,DistilBERT on raw news,0.258258
4,VADER on processed news,0.573574
5,CryptoBERT on processed news,0.373373
6,FinBERT on processed news,0.268268
7,DistilBERT on processed news,0.374374


In [None]:


# Persist the accuracy table to the results file it was loaded from
# (index omitted so it is not written as an extra column).
sentiment_analyser_comparison.to_csv(sentiment_analyser_comparison_path, index=False)

In [None]:
# Accuracy table as saved.
# NOTE(review): the stored output of this cell already lists the fine-tuned
# CryptoBERT row, which is only appended further down — the cell was evidently
# executed out of order.
sentiment_analyser_comparison

Unnamed: 0,Model,Accuracy
0,VADER on raw news,0.580581
1,CryptoBERT on raw news,0.35035
2,FinBERT on raw news,0.248248
3,DistilBERT on raw news,0.258258
4,VADER on processed news,0.573574
5,CryptoBERT on processed news,0.373373
6,FinBERT on processed news,0.268268
7,DistilBERT on processed news,0.374374
8,Fine-tune CryptoBERT on raw news,0.641026


# FINE-TUNING APPROACH

In [None]:
# Class distribution of the labelled data before balancing.
Counter(df['actual_label'])

Counter({'Bullish': 497, 'Neutral': 242, 'Bearish': 260})

In [None]:
#Making the dataset balanced
# Bullish is the majority class, so its first 221 rows (in frame order) are
# discarded — the same rows the previous row-by-row loop removed — in a single
# vectorised drop instead of one DataFrame copy per removed row.
# NOTE: not idempotent — re-running this cell removes a further 221 Bullish rows.
n_bullish_to_drop = 221
bullish_index = df.index[df['actual_label'] == 'Bullish'][:n_bullish_to_drop]
df = df.drop(bullish_index)
count = len(bullish_index)   # number of rows actually removed

In [None]:
Counter(df['actual_label']) # class counts after dropping the surplus Bullish rows

Counter({'Neutral': 242, 'Bearish': 260, 'Bullish': 276})

In [None]:
# Integer class ids for model input: 0 = Bearish, 1 = Neutral, 2 = Bullish.
# Labels that are not keys of this dict become NaN in 'actual_label_num'.
bert_label_num = {
    'Bearish':0,
    'Neutral' : 1,
    'Bullish' : 2
}
df['actual_label_num'] = df['actual_label'].map(bert_label_num)

In [None]:
# Same distribution expressed with the integer class ids.
Counter(df['actual_label_num'])

Counter({1: 242, 0: 260, 2: 276})

In [None]:
split_ratio = 0.8
split_point = int(len(df) * split_ratio)   # number of training rows

# Split into train and test datasets
# The balancing step removed Bullish rows from the top of the frame only, so a
# positional cut at `split_point` would give the training part a smaller share of
# Bullish rows than the test part. Shuffle and stratify instead so both parts
# keep the class proportions; the partition sizes are unchanged.
train_df, test_df = train_test_split(
    df,
    train_size=split_point,
    random_state=0,
    stratify=df['actual_label_num'],
)

In [None]:
# Convert the pandas frames to Hugging Face Datasets.
# The DataFrame index is carried over as the extra '__index_level_0__' feature
# (visible in the dataset summary printed below).
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Feature names and row count of the training split.
train_dataset

Dataset({
    features: ['date', 'headline', 'description', 'actual_label', 'news', 'processed_news', 'actual_label_num', '__index_level_0__'],
    num_rows: 622
})

In [None]:
# Tokenizer of the CryptoBERT checkpoint that is fine-tuned below.
finetune_cryptobert_tokenizer = AutoTokenizer.from_pretrained('ElKulako/cryptobert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Tokenize the raw training texts (padded to a common length, truncated to 512 tokens, NumPy arrays).
finetune_tokenized_raw_data = finetune_cryptobert_tokenizer(train_dataset["news"], return_tensors="np", padding=True,truncation=True, max_length=512)
# The tokenizer returns a BatchEncoding; convert it to a plain dict of arrays
finetune_tokenized_raw_data = dict(finetune_tokenized_raw_data)

# Tokenize the processed training texts with the same settings.
finetune_tokenized_processed_data = finetune_cryptobert_tokenizer(train_dataset["processed_news"], return_tensors="np", padding=True,truncation=True, max_length=512)
# The tokenizer returns a BatchEncoding; convert it to a plain dict of arrays
finetune_tokenized_processed_data = dict(finetune_tokenized_processed_data)

labels = np.array(train_dataset["actual_label_num"])  # integer class ids: 0 = Bearish, 1 = Neutral, 2 = Bullish


In [None]:
# Raw data: training set built from the raw-text token ids, attention masks
# and the integer labels.
raw_dataset = Dataset.from_dict({"input_ids": finetune_tokenized_raw_data["input_ids"],
                             "attention_mask": finetune_tokenized_raw_data["attention_mask"],
                             "labels": labels})

# Processed data: same structure, built from the cleaned-text tokens; both
# datasets share the same labels array.
processed_dataset = Dataset.from_dict({"input_ids": finetune_tokenized_processed_data["input_ids"],
                             "attention_mask": finetune_tokenized_processed_data["attention_mask"],
                             "labels": labels})

Fine-tuning with Raw data

In [None]:
# Pre-trained CryptoBERT checkpoint to be fine-tuned on the raw news.
finetune_cryptobert_model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert")


In [None]:
#  Define the training arguments
num_train_epochs = 3
train_batch_size = 8
# Optimizer steps in the whole run: ceil(rows / batch size) per epoch.
# Assumes a single device and no gradient accumulation.
total_training_steps = -(-len(raw_dataset) // train_batch_size) * num_train_epochs
# A fixed warmup of 500 steps is longer than this whole run (234 steps), so the
# learning rate would still be ramping up when training ends and never reach
# `learning_rate`. Warm up over the first 10% of the run instead.
warmup_steps = max(1, total_training_steps // 10)

training_args = TrainingArguments(
    output_dir='./results',                        # output directory
    num_train_epochs=num_train_epochs,             # number of training epochs
    per_device_train_batch_size=train_batch_size,  # batch size for training
    per_device_eval_batch_size=8,                  # batch size for evaluation
    warmup_steps=warmup_steps,                     # warmup steps for the learning rate scheduler
    weight_decay=0.01,                             # strength of weight decay
    logging_dir='./logs',                          # directory for storing logs
    learning_rate=3e-5                             # peak learning rate
)

# Define the Trainer
trainer_raw = Trainer(
    model=finetune_cryptobert_model,     # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=raw_dataset,           # training dataset
)

# Train the model
trainer_raw.train()



Step,Training Loss


Step,Training Loss


TrainOutput(global_step=234, training_loss=1.0348587036132812, metrics={'train_runtime': 12756.3676, 'train_samples_per_second': 0.146, 'train_steps_per_second': 0.018, 'total_flos': 490969637480448.0, 'train_loss': 1.0348587036132812, 'epoch': 3.0})

In [None]:
# Held-out raw texts and their integer class ids (0 = Bearish, 1 = Neutral, 2 = Bullish).
test_raw_texts = test_dataset['news']
true_labels = test_dataset['actual_label_num']


In [None]:

# Tokenize the test texts with the training settings (padding, truncation to 512
# tokens); no return_tensors is given here, unlike for the training data.
test_encodings_raw = finetune_cryptobert_tokenizer(test_raw_texts, truncation=True, padding=True, max_length=512)



In [None]:
# Wrap the encodings in a Hugging Face Dataset for Trainer.predict.
# Labels are deliberately left out; the metrics are computed separately from true_labels.
test_dataset_raw = Dataset.from_dict({
    'input_ids': test_encodings_raw['input_ids'],
    'attention_mask': test_encodings_raw['attention_mask']
})


In [None]:
# Predictions for the raw test dataset: one row of class scores per text,
# reduced to the index of the highest-scoring class.
predictions_raw = trainer_raw.predict(test_dataset_raw)
logits_raw = predictions_raw.predictions
predicted_labels_raw = np.argmax(logits_raw, axis=1)


In [None]:
# Evaluation metrics on the held-out test rows. F1, precision and recall are
# averaged over the three classes, weighted by each class's support.
accuracy_trainer_raw = accuracy_score(true_labels, predicted_labels_raw)
f1_trainer_raw = f1_score(true_labels, predicted_labels_raw, average='weighted')
precision_trainer_raw = precision_score(true_labels, predicted_labels_raw, average='weighted')
recall_trainer_raw = recall_score(true_labels, predicted_labels_raw, average='weighted')

print(f"Accuracy: {accuracy_trainer_raw:.4f}")
print(f"F1-Score: {f1_trainer_raw:.4f}")
print(f"Precision: {precision_trainer_raw:.4f}")
print(f"Recall: {recall_trainer_raw:.4f}")

In [None]:
# Store the accuracy of the fine-tuned model on raw news as a one-row frame
finetune_cryptoBERT_accuracy_raw_news = pd.DataFrame({
    'Model': ['Fine-tune CryptoBERT on raw news'],
    'Accuracy': [accuracy_trainer_raw],
})

# Append the new row, keeping only the latest entry per model so that
# re-running this cell replaces the row instead of adding a duplicate.
sentiment_analyser_comparison = (
    pd.concat(
        [sentiment_analyser_comparison, finetune_cryptoBERT_accuracy_raw_news],
        ignore_index=True,
    )
    .drop_duplicates(subset='Model', keep='last')
    .reset_index(drop=True)
)


In [None]:
# Save the comparison table as a CSV. Uses the dedicated path variable (the
# same one the later save of this table uses) rather than a generic
# `file_path` that may have been reassigned by another cell.
sentiment_analyser_comparison.to_csv(sentiment_analyser_comparison_path, index=False)

Fine-tuning CryptoBERT on the processed news data

In [None]:
# Start again from the published CryptoBERT checkpoint so the processed-news run
# does not inherit the weights already fine-tuned on raw news.
finetune_cryptobert_model = AutoModelForSequenceClassification.from_pretrained(
    "ElKulako/cryptobert"
)


In [None]:
# Hyperparameters for the processed-news fine-tuning run.
# NOTE(review): the recorded run finished after 234 optimizer steps, fewer than
# warmup_steps=500, so training seems to end while the learning rate is still
# warming up — confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',           # where the Trainer writes its outputs
    logging_dir='./logs',             # where training logs are written
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_steps=500,                 # scheduler warmup length, in steps
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Trainer bound to the processed-news training set (no eval set is passed).
trainer_processed = Trainer(
    args=training_args,
    model=finetune_cryptobert_model,
    train_dataset=processed_dataset,
)

# Fine-tune; the TrainOutput summary becomes the cell output.
trainer_processed.train()



Step,Training Loss


TrainOutput(global_step=234, training_loss=1.1218822511852298, metrics={'train_runtime': 11594.4128, 'train_samples_per_second': 0.161, 'train_steps_per_second': 0.02, 'total_flos': 490969637480448.0, 'train_loss': 1.1218822511852298, 'epoch': 3.0})

In [None]:
# Held-out split: numeric ground-truth labels plus the preprocessed news text
true_labels = test_dataset['actual_label_num']
test_processed_texts = test_dataset['processed_news']


In [None]:

# Encode the processed test texts. Inputs longer than 512 tokens are truncated
# and the batch is padded to a common length.
test_encodings_processed = finetune_cryptobert_tokenizer(
    test_processed_texts,
    max_length=512,
    padding=True,
    truncation=True,
)



In [None]:
# Wrap the encodings in a label-free Hugging Face Dataset for Trainer.predict
test_dataset_processed = Dataset.from_dict(
    {field: test_encodings_processed[field] for field in ('input_ids', 'attention_mask')}
)


In [None]:
# Predict on the processed test dataset, then pick the highest-scoring class per row
predictions_processed = trainer_processed.predict(test_dataset_processed)
logits_processed = predictions_processed.predictions
predicted_labels_processed = torch.tensor(logits_processed).argmax(dim=1).numpy()


In [None]:
# Score the processed-news predictions against the ground truth.
# F1, precision and recall use the 'weighted' average across classes.
accuracy_trainer_processed = accuracy_score(
    y_true=true_labels, y_pred=predicted_labels_processed
)
f1_trainer_processed = f1_score(
    y_true=true_labels, y_pred=predicted_labels_processed, average='weighted'
)
precision_trainer_processed = precision_score(
    y_true=true_labels, y_pred=predicted_labels_processed, average='weighted'
)
recall_trainer_processed = recall_score(
    y_true=true_labels, y_pred=predicted_labels_processed, average='weighted'
)

# Report each metric to four decimal places
print(f"Accuracy: {accuracy_trainer_processed:.4f}")
print(f"F1-Score: {f1_trainer_processed:.4f}")
print(f"Precision: {precision_trainer_processed:.4f}")
print(f"Recall: {recall_trainer_processed:.4f}")

Accuracy: 0.6538
F1-Score: 0.6539
Precision: 0.6562
Recall: 0.6538


In [None]:
# Store the accuracy of the fine-tuned model on processed news as a one-row frame
finetune_cryptoBERT_accuracy_processed_news = pd.DataFrame({
    'Model': ['Fine-tune CryptoBERT on processed news'],
    'Accuracy': [accuracy_trainer_processed],
})

# Append the new row, keeping only the latest entry per model so that
# re-running this cell replaces the row instead of adding a duplicate.
sentiment_analyser_comparison = (
    pd.concat(
        [sentiment_analyser_comparison, finetune_cryptoBERT_accuracy_processed_news],
        ignore_index=True,
    )
    .drop_duplicates(subset='Model', keep='last')
    .reset_index(drop=True)
)


In [None]:
# Display the comparison table, now including the fine-tuned CryptoBERT row for processed news
sentiment_analyser_comparison

Unnamed: 0,Model,Accuracy
0,VADER on raw news,0.580581
1,CryptoBERT on raw news,0.35035
2,FinBERT on raw news,0.248248
3,DistilBERT on raw news,0.258258
4,VADER on processed news,0.573574
5,CryptoBERT on processed news,0.373373
6,FinBERT on processed news,0.268268
7,DistilBERT on processed news,0.374374
8,Fine-tune CryptoBERT on raw news,0.641026
9,Fine-tune CryptoBERT on processed news,0.653846


In [None]:
# Write the comparison table to disk, leaving out the row index
sentiment_analyser_comparison.to_csv(
    path_or_buf=sentiment_analyser_comparison_path,
    index=False,
)

In [None]:

# Persist the model fine-tuned on processed data to Google Drive.
# NOTE(review): hardcoded absolute Colab path — presumably requires Drive to be
# mounted at /content/drive before this cell runs; confirm.
model_save_path = '/content/drive/MyDrive/Colab Notebooks/Thesis/models/finetune_model_processed_data'
trainer_processed.save_model(output_dir=model_save_path)


In [None]:
# Show the comparison table again (nothing has modified it since the previous display)
sentiment_analyser_comparison

Unnamed: 0,Model,Accuracy
0,VADER on raw news,0.580581
1,CryptoBERT on raw news,0.35035
2,FinBERT on raw news,0.248248
3,DistilBERT on raw news,0.258258
4,VADER on processed news,0.573574
5,CryptoBERT on processed news,0.373373
6,FinBERT on processed news,0.268268
7,DistilBERT on processed news,0.374374
8,Fine-tune CryptoBERT on raw news,0.641026
9,Fine-tune CryptoBERT on processed news,0.653846


# PREDICTING LABELS ON THE ACTUAL UNLABELED PROCESSED DATASET USING THE FINE-TUNED CRYPTOBERT

In [None]:
# Preview the first rows of the news frame whose 'processed_news' column is scored below
bitcointalk.head()

Unnamed: 0,date,headline,headline_link,description,news,processed_news
0,2024-09-12,"Microstrategy Buys 18,300 More Bitcoins, ...",https://bitcointalk.org/index.php?topic=5509183.0,"Microstrategy Buys 18,300 More Bitcoins, Boost...","Microstrategy Buys 18,300 More Bitcoins, ... ...",microstrategy buy bitcoins microstrategy buy b...
1,2024-09-12,Cleanspark Expands Bitcoin Mining Operations ...,https://bitcointalk.org/index.php?topic=5509047.0,Cleanspark Expands Bitcoin Mining Operations b...,Cleanspark Expands Bitcoin Mining Operations ...,cleanspark expands bitcoin mining operation cl...
2,2024-09-11,Standard Chartered Launches Bitcoin and Ether...,https://bitcointalk.org/index.php?topic=5508911.0,Standard Chartered Launches Bitcoin and Ethere...,Standard Chartered Launches Bitcoin and Ether...,standard chartered launch bitcoin ethereum cus...
3,2024-09-10,Fractal Bitcoin Absorbs Over 35% of Bitcoins...,https://bitcointalk.org/index.php?topic=5508912.0,Fractal Bitcoin Absorbs Over 35% of Bitcoins ...,Fractal Bitcoin Absorbs Over 35% of Bitcoins...,fractal bitcoin absorbs bitcoin hashrate fract...
4,2024-09-10,Metaplanet Makes Significant Bitcoin Investment,https://bitcointalk.org/index.php?topic=5508833.0,"In Japan, the publicly traded company Metaplan...",Metaplanet Makes Significant Bitcoin Investme...,metaplanet make significant bitcoin investment...


In [None]:
# The tokenizer comes from the public CryptoBERT checkpoint; the classifier
# weights come from the directory the fine-tuned model was saved to.
final_tokenizer = AutoTokenizer.from_pretrained('ElKulako/cryptobert')
final_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]



In [None]:
# Collect the processed article texts as a plain Python list
processed_news = bitcointalk['processed_news'].tolist()

# Tokenize: truncate anything beyond 512 tokens and pad to a common length
processed_news_encodings = final_tokenizer(
    processed_news,
    max_length=512,
    padding=True,
    truncation=True,
)

# Label-free Hugging Face Dataset holding only the two model inputs
processed_news_dataset = Dataset.from_dict(
    {field: processed_news_encodings[field] for field in ('input_ids', 'attention_mask')}
)

In [None]:
# Inference only: put the model in evaluation mode
final_model.eval()

# Have the dataset return PyTorch tensors for the two model inputs
processed_news_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Feed the inputs in batches of 16 rather than all at once
dataloader = DataLoader(processed_news_dataset, batch_size=16)

predictions = []

# Gradients are not needed for prediction, so disable them for the whole loop
with torch.no_grad():
    for batch in dataloader:
        # Place the batch on the same device as the model
        input_ids = batch['input_ids'].to(final_model.device)
        attention_mask = batch['attention_mask'].to(final_model.device)

        outputs = final_model(input_ids=input_ids, attention_mask=attention_mask)

        # Highest-scoring class per row, collected on the CPU
        logits = outputs.logits
        predicted_labels = logits.argmax(dim=1)
        predictions.extend(predicted_labels.cpu().numpy())



In [None]:
# Sanity check: how many labels were predicted (one per scored row)
len(predictions)

370

In [None]:
# Reload the news file that already carries the sentiment labels predicted so
# far (the displayed tail rows are still empty); the trailing expression shows it.
bitcointalk_labeled_path = 'data/news_predicted_sentiment.csv'
bitcointalk_labeled = pd.read_csv(filepath_or_buffer=bitcointalk_labeled_path)
bitcointalk_labeled

Unnamed: 0,date,headline,headline_link,description,news,processed_news,sentiment
0,2024-09-12,"Microstrategy Buys 18,300 More Bitcoins, ...",https://bitcointalk.org/index.php?topic=5509183.0,"Microstrategy Buys 18,300 More Bitcoins, Boost...","Microstrategy Buys 18,300 More Bitcoins, ... ...",microstrategy buy bitcoins microstrategy buy b...,0.0
1,2024-09-12,Cleanspark Expands Bitcoin Mining Operations ...,https://bitcointalk.org/index.php?topic=5509047.0,Cleanspark Expands Bitcoin Mining Operations b...,Cleanspark Expands Bitcoin Mining Operations ...,cleanspark expands bitcoin mining operation cl...,2.0
2,2024-09-11,Standard Chartered Launches Bitcoin and Ether...,https://bitcointalk.org/index.php?topic=5508911.0,Standard Chartered Launches Bitcoin and Ethere...,Standard Chartered Launches Bitcoin and Ether...,standard chartered launch bitcoin ethereum cus...,2.0
3,2024-09-10,Fractal Bitcoin Absorbs Over 35% of Bitcoins...,https://bitcointalk.org/index.php?topic=5508912.0,Fractal Bitcoin Absorbs Over 35% of Bitcoins ...,Fractal Bitcoin Absorbs Over 35% of Bitcoins...,fractal bitcoin absorbs bitcoin hashrate fract...,2.0
4,2024-09-10,Metaplanet Makes Significant Bitcoin Investment,https://bitcointalk.org/index.php?topic=5508833.0,"In Japan, the publicly traded company Metaplan...",Metaplanet Makes Significant Bitcoin Investme...,metaplanet make significant bitcoin investment...,2.0
...,...,...,...,...,...,...,...
34525,2010-06-03,Techcrunch Calculating The Long-Term Value Of ...,https://bitcointalk.org/index.php?topic=224225.0,http://techcrunch.com/2013/06/02/calculating-t...,Techcrunch Calculating The Long-Term Value Of ...,techcrunch calculating long term value bitcoin,
34526,2004-12-18,CNBC: Russians move into bitcoin as ruble tanks,https://bitcointalk.org/index.php?topic=897213.0,QuoteTransaction volumes between the ruble and...,CNBC: Russians move into bitcoin as ruble tan...,cnbc russian move bitcoin ruble tank quotetran...,
34527,1996-10-31,How To Make A Mint: The Cryptography Of Anony...,https://bitcointalk.org/index.php?topic=235310.0,Not quite fresh news (yes the date is correct)...,How To Make A Mint: The Cryptography Of Anony...,make mint cryptography anonymous electronic ca...,
34528,2012-10-31,12-10-31 FinacialSense.com - Bitcoin Is Digit...,https://bitcointalk.org/index.php?topic=121411.0,http://www.financialsense.com/financial-sense-...,12-10-31 FinacialSense.com - Bitcoin Is Digit...,finacialsense com bitcoin digital gold future,


In [None]:
# Inspect the boundary between labeled and unlabeled rows: in the displayed
# output, row 34159 has a sentiment value while the shown rows from 34160
# onward are empty.
bitcointalk_labeled.loc[34159:]

Unnamed: 0,date,headline,headline_link,description,news,processed_news,sentiment
34159,2012-09-03,CalvinAyre.com - Can Bitcoin-Only Online Casin...,https://bitcointalk.org/index.php?topic=105663.0,2012-09-03 CalvinAyre.com - Can Bitcoin-Only O...,CalvinAyre.com - Can Bitcoin-Only Online Casin...,calvinayre com bitcoin online casino like bitz...,2.0
34160,2012-09-02,TNW - Bitcoin: Alive and here to stay? Or slow...,https://bitcointalk.org/index.php?topic=105504.0,Excellent recent article! http://thenextweb.co...,TNW - Bitcoin: Alive and here to stay? Or slow...,tnw bitcoin alive stay slowly fading away exce...,
34161,2012-08-31,ORF.at - Bitcoin statt Euro: Eine Onlinewähru...,https://bitcointalk.org/index.php?topic=285493.0,http://help.orf.at/stories/1723893/ORF is the ...,ORF.at - Bitcoin statt Euro: Eine Onlinewähru...,orf bitcoin statt euro eine onlinew hrung im o...,
34162,2012-08-31,forbes.com - BitZino And The Dawn Of 'Provably...,https://bitcointalk.org/index.php?topic=105180.0,QuoteBitZino And The Dawn Of 'Provably Fair' C...,forbes.com - BitZino And The Dawn Of 'Provably...,forbes com bitzino dawn provably fair casino g...,
34163,2012-08-31,Bitcoin: How a Virtual Currency Became Real wi...,https://bitcointalk.org/index.php?topic=105392.0,http://pandodaily.com/2012/08/31/bitcoin-how-a...,Bitcoin: How a Virtual Currency Became Real wi...,bitcoin virtual currency became real fraud,
...,...,...,...,...,...,...,...
34525,2010-06-03,Techcrunch Calculating The Long-Term Value Of ...,https://bitcointalk.org/index.php?topic=224225.0,http://techcrunch.com/2013/06/02/calculating-t...,Techcrunch Calculating The Long-Term Value Of ...,techcrunch calculating long term value bitcoin,
34526,2004-12-18,CNBC: Russians move into bitcoin as ruble tanks,https://bitcointalk.org/index.php?topic=897213.0,QuoteTransaction volumes between the ruble and...,CNBC: Russians move into bitcoin as ruble tan...,cnbc russian move bitcoin ruble tank quotetran...,
34527,1996-10-31,How To Make A Mint: The Cryptography Of Anony...,https://bitcointalk.org/index.php?topic=235310.0,Not quite fresh news (yes the date is correct)...,How To Make A Mint: The Cryptography Of Anony...,make mint cryptography anonymous electronic ca...,
34528,2012-10-31,12-10-31 FinacialSense.com - Bitcoin Is Digit...,https://bitcointalk.org/index.php?topic=121411.0,http://www.financialsense.com/financial-sense-...,12-10-31 FinacialSense.com - Bitcoin Is Digit...,finacialsense com bitcoin digital gold future,


In [None]:
# Write the new predictions into the rows starting at index label 34160.
# NOTE(review): this relies on `predictions` being in the same order as, and
# exactly as long as, that tail of the frame — confirm against the batch that
# was actually scored.
first_unlabeled_row = 34160
bitcointalk_labeled.loc[first_unlabeled_row:, "sentiment"] = predictions

In [None]:
# Write the updated labels back to the same CSV they were loaded from
bitcointalk_labeled.to_csv(path_or_buf=bitcointalk_labeled_path, index=False)