In [78]:
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    pipeline,
    AutoModelForSequenceClassification,
    TextClassificationPipeline,
    pipeline
)
from datasets import Dataset, DatasetDict

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
from google.colab import drive
import random
import torch

### initial setup

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
base_path = '/content/drive/My Drive/magisterka'

In [5]:
%cd /content/drive/My Drive/magisterka

/content/drive/My Drive/magisterka


In [7]:
from scripts.data_processing.financial_news_data_processing import load_news_data

In [8]:
df = load_news_data(f'{base_path}/data/finlighten_news/')

In [10]:
df['publishDate'] = pd.to_datetime(df['publishDate'], format='mixed')

In [11]:
df = df[df['publishDate'] > '2025-04-01 00:00:00']

In [12]:
df = df.drop_duplicates()

In [14]:
np.random.seed(123)
random.seed(123)

In [15]:
dataset = Dataset.from_pandas(df[['text']])

In [16]:
dataset = dataset.train_test_split(test_size=0.2)

In [17]:
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForMaskedLM were not initialized from the model checkpoint at yiyanghkust/finbert-tone and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### pre-training MLM

In [18]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

In [27]:
tokenized_dataset = dataset.map(tokenize, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

Map:   0%|          | 0/14578 [00:00<?, ? examples/s]

Map:   0%|          | 0/3645 [00:00<?, ? examples/s]

In [28]:
training_args = TrainingArguments(
    output_dir="./finbert-mlm-crypto",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    eval_strategy="no",
)

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)


In [30]:
trainer.train()

Step,Training Loss
50,3.218
100,2.7289
150,2.5616
200,3.0049
250,3.082
300,2.8611
350,2.7671
400,2.6722
450,2.5968
500,2.4883


TrainOutput(global_step=4560, training_loss=1.6737264074777303, metrics={'train_runtime': 2245.2831, 'train_samples_per_second': 32.464, 'train_steps_per_second': 2.031, 'total_flos': 4796269350059520.0, 'train_loss': 1.6737264074777303, 'epoch': 5.0})

In [33]:
model.save_pretrained(f'{base_path}/data/results/finbert_articles_1')

In [35]:
sampled_indices = random.sample(range(len(tokenized_dataset['test'])), 100)
sampled_data = tokenized_dataset["test"].select(sampled_indices)

In [36]:
sampled_data.to_csv(f"{base_path}/data/results/sample_test_dataset_finlight.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

469612

### getting pseudolabels from not fine-tuned BERT

In [64]:
pseudo_pipeline = pipeline(
    "sentiment-analysis",
    model="yiyanghkust/finbert-tone",
    tokenizer=tokenizer,
    truncation=True,
    max_length=128
)

Device set to use cuda:0


In [66]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}

In [67]:
texts = dataset["train"]["text"]
pseudo_labels = []
pseudo_texts = []

for t in texts:
    out = pseudo_pipeline(t)[0]
    if out["score"] >= 0.9:  # keep only confident predictions
        pseudo_texts.append(t)
        pseudo_labels.append(out["label"])


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyError: 'Positive'

In [70]:
label_ids = [label2id[l.lower()] for l in pseudo_labels]

In [80]:
pseudo_ds = Dataset.from_dict({
    "text": pseudo_texts,
    "label": label_ids,
})

In [82]:
pseudo_ds.to_csv(f"{base_path}/data/results/pseudo_labels_finlight.csv")

Creating CSV from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

39662592

### BERT finalny trenowanie

In [43]:
checkpoint = f'{base_path}/data/results/finbert_articles_1'

In [83]:
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
    ignore_mismatched_sizes=True
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/magisterka/data/results/finbert_articles_1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
training_args = TrainingArguments(
    output_dir="./finbert-sentiment-pseudo",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    eval_strategy="no",
    report_to="none",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
)

In [87]:
pseudo_tokenized = pseudo_ds.map(tokenize, batched=True)

Map:   0%|          | 0/11993 [00:00<?, ? examples/s]

In [88]:
def add_labels(example):
    example["labels"] = example["label"]
    return example

pseudo_tokenized = pseudo_tokenized.map(add_labels)

Map:   0%|          | 0/11993 [00:00<?, ? examples/s]

In [90]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pseudo_tokenized,
)

trainer.train()

Step,Training Loss
10,0.8576
20,0.508
30,0.3219
40,0.1138
50,0.1858
60,0.2879
70,0.2987
80,0.249
90,0.1012
100,0.2213


TrainOutput(global_step=3750, training_loss=0.01919175753488283, metrics={'train_runtime': 1523.2177, 'train_samples_per_second': 39.367, 'train_steps_per_second': 2.462, 'total_flos': 3944399023514880.0, 'train_loss': 0.01919175753488283, 'epoch': 5.0})

In [91]:
model.save_pretrained(f"{base_path}/data/results/finbert_articles_final")

In [92]:
def predict_sentiment(text, model, tokenizer, device="cuda", max_length=128):
    model.to(device)
    model.eval()

    encodings = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

    encodings = {k: v.to(device) for k, v in encodings.items()}

    with torch.no_grad():
        outputs = model(**encodings)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()

    id2label = getattr(model.config, "id2label", None)
    if id2label is not None:
        pred_label = id2label[pred_id]
    else:
        pred_label = str(pred_id)

    return {
        "label_id": pred_id,
        "label": pred_label,
        "probs": probs[0].cpu().tolist()
    }

In [106]:
text = 'Projections that the total supply of stablecoins could triple or quadruple in the next year or two from the current level of around $240 billion — approaching the trillion-dollar mark — are "far too optimistic," even as U.S. regulatory frameworks gain traction, according to JPMorgan analysts. Of the two proposed stablecoin bills in the United States — the GENIUS Act in the Senate and the STABLE Act in the House — the former has gained more support. Earlier this week, senators voted to advance the GENIUS Act. However, both bills prohibit stablecoins from paying interest, aiming to define them as "payment stablecoins" akin to traditional money. This restriction would hurt stablecoin growth by making them less competitive with traditional interest-bearing instruments like money market funds, which saw $900 billion in inflows in the U.S. over the past year, JPMorgan analysts led by managing director Nikolaos Panigirtzoglou wrote in a report shared with The Block on Wednesday. "The growth of these non-interest bearing stablecoins over time would depend mostly on two factors 1) on their use in payments systems and 2) on the broader crypto ecosystem\'s expansion, the higher the usage of crypto tokens in real-world applications or higher activity in areas such as DeFi, NFTs or other applications, would expand the overall crypto market cap and with that the stablecoin universe given its typical 7%-8% share," the analysts wrote. "We find talk about tripling or quadrupling of the stablecoin universe over the coming year or two to be far too optimistic." Several major institutions, including Standard Chartered and Citi , have projected that stablecoins could surpass $1 trillion in supply in the coming years, but JPMorgan remains skeptical. If stablecoins were allowed to pay interest, they could have tapped into part of the $900 billion annual growth in money market funds, enabling faster expansion. But with proposed regulations prohibiting yield, the JPMorgan analysts say their growth will be more limited and dependent on crypto adoption and real-world use cases. Meanwhile, yield-bearing stablecoins like tokenized treasuries (e.g., BlackRock\'s BUIDL) and securitized products (e.g., Figure Markets\' YLDS) are expected to keep growing as idle crypto capital seeks returns, according to the analysts. While it\'s hard to estimate the size of that idle cash, the analysts added that it likely makes up a small portion of the total stablecoin market. In March, the analysts projected that yield-bearing stablecoins could grow from 6% to as much as 50% of the market. The analysts further said on Wednesday that a stablecoin framework, if passed, would favor U.S.-compliant entities such as banks, exchanges, and fintech firms — with players like Bank of America and Stripe already making moves in the space. By contrast, non-compliant issuers like Tether would face increased scrutiny, particularly around reserve disclosures. "18% of Tether reserves are currently non-compliant with the GENIUS Act," the analysts noted. "Required reserves would need to be compliant with the proposed regulation. It is unclear as to how Tether may do this. Its $5.6 billion of excess reserves and profits cumulated over the past years offer some room for manoeuvre." The analysts also said that crypto-backed or algorithmic stablecoins like DAI would be banned under the proposed U.S. rules, calling them "the main losers" of the legislation. Such projects may shrink or move offshore to avoid U.S. oversight, they added. Disclaimer: The Block is an independent media outlet that delivers news, research, and data. As of November 2023, Foresight Ventures is a majority investor of The Block. Foresight Ventures invests in other companies in the crypto space. Crypto exchange Bitget is an anchor LP for Foresight Ventures. The Block continues to operate independently to deliver objective, impactful, and timely information about the crypto industry. Here are our current financial disclosures . © 2025 The Block. All Rights Reserved. This article is provided for informational purposes only. It is not offered or intended to be used as legal, tax, investment, financial, or other advice.'

In [107]:
predict_sentiment(text, model, tokenizer)

{'label_id': 2,
 'label': 'Negative',
 'probs': [4.074674961884739e-06, 2.1315307094482705e-05, 0.9999746084213257]}