# Workflow for sentiment analysis of news headlines using a fin-BERT-based transformer

In [59]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import datetime
from tqdm import tqdm

In [60]:
# Read the scraped Bitcoin headlines and keep only the columns needed for
# sentiment scoring; the query, link and summary fields are discarded.
# NOTE(review): the displayed Date values include a time component, which the
# '%Y-%m-%d' format does not describe -- confirm the column is really parsed
# as datetimes rather than left as strings.
df = (
    pd.read_csv(
        'bitcoin_news_data.csv',
        parse_dates=['Date'],
        date_format='%Y-%m-%d',
    )
    .drop(columns=["Query", "Link", "Summary"])
)

# Independent snapshot of the frame before any sentiment column is attached
df_original = df.copy()
df

Unnamed: 0,Date,Title
0,2014-09-02 07:00:00,Bitcoin's future depends on public acceptance ...
1,2014-09-02 07:00:00,Armory to Match 10 BTC in Donations to Hal Fin...
2,2014-09-01 07:00:00,Finnish Investor Plans to Turn Estonian Castle...
3,2014-09-02 07:00:00,Hal Finney – We Salute You - Bitcoin Magazine
4,2014-09-01 07:00:00,Salaries paid in bitcoin a growing trend in Ca...
...,...,...
132456,2024-04-11 19:49:55,Wintermute sees positive prospects for Bitcoin...
132457,2024-04-12 05:36:39,Venezuelan Probe Unveils Crypto Money Launderi...
132458,2024-04-11 13:14:59,Crypto at the Capitol: Legislature revisiting ...
132459,2024-04-12 00:53:00,El Salvador's newest Hilton hotel to tap into ...


In [61]:
# The pipeline function is very convenient but can be inefficient for large-scale processing because it 
# handles texts one at a time. Directly using the model and tokenizer will be more efficient for batch processing.

def load_model_and_tokenizer(model_name):
    """
    Fetch a pre-trained BERT sequence classifier and its matching tokenizer.

    Parameters:
    - model_name (str): Hugging Face identifier handed to `from_pretrained`.

    Returns:
    - tokenizer: `BertTokenizer` loaded for `model_name`.
    - model: `BertForSequenceClassification` loaded for `model_name`, moved
      to CUDA when `torch.cuda.is_available()` reports a GPU.
    """
    # Tokenizer first, then the classifier weights for the same checkpoint
    bert_tokenizer = BertTokenizer.from_pretrained(model_name)
    classifier_model = BertForSequenceClassification.from_pretrained(model_name)

    # Stay on the CPU unless a CUDA device is present
    use_gpu = torch.cuda.is_available()
    return bert_tokenizer, (classifier_model.cuda() if use_gpu else classifier_model)


# Holds the most recently built pipeline, keyed by the identity of the
# (model, tokenizer) pair it was built for. Only one entry is kept so that a
# replaced model is not pinned in memory by a stale pipeline.
_PIPELINE_CACHE = {}


def _get_classifier(tokenizer, model):
    """Return the text-classification pipeline for this pair, building it only on first use."""
    key = (id(model), id(tokenizer))
    if key not in _PIPELINE_CACHE:
        # Drop any pipeline built for a previous pair before storing the new one
        _PIPELINE_CACHE.clear()
        # Use GPU 0 when CUDA is available, otherwise run on the CPU (-1)
        device = 0 if torch.cuda.is_available() else -1
        _PIPELINE_CACHE[key] = pipeline(
            "text-classification", model=model, tokenizer=tokenizer, device=device
        )
    return _PIPELINE_CACHE[key]


def classify_sentiment(text, tokenizer, model):
    """
    Classify the sentiment of the given text using the specified model and tokenizer.

    The underlying Hugging Face pipeline is constructed once per
    (model, tokenizer) pair and reused on later calls, so calling this
    function in a loop no longer rebuilds the pipeline for every text.

    Parameters:
    - text (str): Text to classify.
    - tokenizer: Tokenizer corresponding to the model.
    - model: Pre-trained BERT model with sequence classification head.

    Returns:
    - list: The pipeline's raw output for `text`; for a single string the
      predicted label is available as `results[0]['label']`.
    """
    classifier = _get_classifier(tokenizer, model)

    # Run classifier on the input text
    results = classifier(text)

    return results

##### Description of the transformer model used
1. The model used was trained on a Kaggle finance news dataset.
2. The model is a pretrained bert-base transformer fine-tuned on a financial news sentiment dataset, making it appropriate for our use case of sentiment analysis on financial news headlines.
3. The detailed description can be read on [Huggingface](https://huggingface.co/oferweintraub/bert-base-finance-sentiment-noisy-search).
4. A thorough qualitative comparison between the sentiment analysis results of the BERT-based transformer and NLTK's vader-lexicon has been carried out.
5. The analysis shows that the transformer model gives better results than NLTK's vader-lexicon. Therefore, we will use it for our machine learning workflow.

In [62]:
# Checkpoint used for scoring, identified by its Hugging Face model id
model_name = 'oferweintraub/bert-base-finance-sentiment-noisy-search'
tokenizer, model = load_model_and_tokenizer(model_name)

# Score every headline in order, keeping only the label of the first result
results = [
    classify_sentiment(input_text, tokenizer, model)[0]['label']
    for input_text in tqdm(df['Title'])
]

# Attach the labels as a new column and display the frame
df['Sentiment'] = results
df


100%|██████████| 132461/132461 [28:06<00:00, 78.54it/s]


Unnamed: 0,Date,Title,Sentiment
0,2014-09-02 07:00:00,Bitcoin's future depends on public acceptance ...,neutral
1,2014-09-02 07:00:00,Armory to Match 10 BTC in Donations to Hal Fin...,positive
2,2014-09-01 07:00:00,Finnish Investor Plans to Turn Estonian Castle...,neutral
3,2014-09-02 07:00:00,Hal Finney – We Salute You - Bitcoin Magazine,neutral
4,2014-09-01 07:00:00,Salaries paid in bitcoin a growing trend in Ca...,positive
...,...,...,...
132456,2024-04-11 19:49:55,Wintermute sees positive prospects for Bitcoin...,positive
132457,2024-04-12 05:36:39,Venezuelan Probe Unveils Crypto Money Launderi...,negative
132458,2024-04-11 13:14:59,Crypto at the Capitol: Legislature revisiting ...,negative
132459,2024-04-12 00:53:00,El Salvador's newest Hilton hotel to tap into ...,neutral


In [63]:
# Persist the labelled headlines as UTF-8 CSV, without the row index
df.to_csv(
    path_or_buf='bitcoin_news_data_sentiment_finbert.csv',
    encoding='utf-8',
    index=False,
)