In [1]:
from google.colab import drive

In [2]:
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import random
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoModelForSequenceClassification,
    pipeline
)
from datasets import Dataset

In [3]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')
# Project root on Drive; later cells build their data and result paths from it.
base_path = '/content/drive/My Drive/magisterka'
%cd /content/drive/My Drive/magisterka

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/magisterka


In [4]:
from scripts.data_processing.reddit_comments_processing import load_and_process_reddit_comments

In [5]:
class MLMUnlabeledDataTrainer:
    """Two-stage FinBERT adaptation: masked-LM training, then sentiment fine-tuning.

    Typical flow:

    1. ``train_mlm_model`` continues masked-language-model training on unlabeled
       text and saves the result to ``mlm_checkpoint``.
    2. ``train_final_model`` pseudo-labels text with the base FinBERT classifier and
       fine-tunes a 3-class model initialised from the MLM checkpoint;
       ``train_final_model_labeled_data`` fine-tunes on already-labeled data instead.
    3. ``predict_sentiment`` classifies a single text with the fine-tuned model.
    """

    def __init__(self, final_checkpoint, mlm_checkpoint, models_path, device="cuda",
                 epochs_num: int = 5):
        """Store configuration; no model or tokenizer is loaded here.

        Args:
            final_checkpoint: Directory name (under ``models_path``) of the
                fine-tuned classifier.
            mlm_checkpoint: Directory name (under ``models_path``) of the
                MLM-adapted model.
            models_path: Parent directory of both checkpoints.
            device: Device the classifier is moved to when it is loaded.
            epochs_num: Number of epochs used by every training stage.
        """
        self.final_checkpoint = f"{models_path}/{final_checkpoint}"
        self.mlm_checkpoint = f"{models_path}/{mlm_checkpoint}"
        self.device = device
        self.base_model_name = "yiyanghkust/finbert-tone"
        self.logger = logging.getLogger(self.__class__.__name__)
        self.epochs_num = epochs_num
        self.tokenizer = None
        self.mlm_model = None
        self.final_model = None
        self.data_collator = None
        # Label scheme used for training and prediction in this class.
        self.label2id = {"negative": 0, "neutral": 1, "positive": 2}
        self.id2label = {0: "negative", 1: "neutral", 2: "positive"}

    @staticmethod
    def _train_split(dataset):
        """Return the ``"train"`` split of a split collection, or ``dataset`` itself.

        A ``DatasetDict`` is a ``dict`` subclass, so a plain ``isinstance`` check
        lets every training entry point accept either a split collection or a
        single ``Dataset``.
        """
        if isinstance(dataset, dict):
            return dataset["train"]
        return dataset

    def init_tokenizer(self) -> None:
        """Load the tokenizer of the base FinBERT model."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name)

    def init_data_collator(self, initial_model: bool = False) -> None:
        """Load the MLM model and build the masking data collator.

        Args:
            initial_model: When True the MLM model starts from the base FinBERT
                weights; otherwise it resumes from ``self.mlm_checkpoint``.
        """
        checkpoint = self.mlm_checkpoint if not initial_model else self.base_model_name
        self.init_mlm_model(checkpoint)
        if self.tokenizer is None:
            self.init_tokenizer()
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm_probability=0.15,
            mlm=True
        )

    def init_mlm_model(self, checkpoint: str) -> None:
        """Load a masked-language-model head on top of ``checkpoint``."""
        self.mlm_model = AutoModelForMaskedLM.from_pretrained(checkpoint)

    def init_final_model(self, checkpoint: str) -> None:
        """Load a 3-class sequence classifier from ``checkpoint`` onto ``self.device``.

        The label maps are passed explicitly so that the config saved with the
        fine-tuned model matches the ids used for training, instead of whatever
        label names the source checkpoint's config carries.
        """
        self.final_model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint,
            num_labels=len(self.id2label),
            id2label=self.id2label,
            label2id=self.label2id,
            ignore_mismatched_sizes=True,
        ).to(self.device)

    def tokenize(self, example):
        """Tokenize ``example["text"]`` to fixed-length (128 token) inputs.

        Loads the tokenizer on first use, so it can be passed directly to
        ``Dataset.map``.
        """
        if self.tokenizer is None:
            self.init_tokenizer()
        return self.tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

    def train_mlm_model(self, dataset: "Dataset", initial_model: bool = False) -> None:
        """Run masked-LM training on the train split and save to ``self.mlm_checkpoint``.

        Args:
            dataset: A split collection with a ``"train"`` split, or a single
                dataset; it must contain a ``"text"`` column.
            initial_model: Start from the base model instead of the saved MLM
                checkpoint.
        """
        self.init_data_collator(initial_model)
        # Only the train split is used, so only that split is tokenized.
        tokenized_train = self._train_split(dataset).map(self.tokenize, batched=True)
        training_args = TrainingArguments(
            output_dir="./finbert-mlm-crypto",
            overwrite_output_dir=True,
            num_train_epochs=self.epochs_num,
            per_device_train_batch_size=16,
            save_steps=500,
            save_total_limit=2,
            logging_dir="./logs",
            logging_steps=50,
            report_to="none",
            eval_strategy="no",
        )
        trainer = Trainer(
            model=self.mlm_model,
            args=training_args,
            train_dataset=tokenized_train,
            data_collator=self.data_collator,
        )
        trainer.train()
        trainer.save_model(self.mlm_checkpoint)

    @property
    def pseudo_pipeline(self):
        """Build a sentiment pipeline around the *base* FinBERT model.

        A new pipeline is constructed on every access, so callers should keep a
        reference instead of re-reading the property in a loop.
        """
        if self.tokenizer is None:
            self.init_tokenizer()
        return pipeline(
            "sentiment-analysis",
            model=self.base_model_name,
            tokenizer=self.tokenizer,
            truncation=True,
            max_length=128
        )

    def assign_pseudo_labels(self, dataset: "Dataset", dataset_path : str,
                             min_score: float = 0.9) -> "Dataset":
        """Label train-split texts with the base model, keeping confident predictions.

        Args:
            dataset: A split collection with a ``"train"`` split, or a single
                dataset; it must contain a ``"text"`` column.
            dataset_path: CSV path the retained pseudo-labeled rows are written to.
            min_score: Minimum pipeline score for a prediction to be kept.

        Returns:
            A dataset with ``"text"`` and integer ``"labels"`` columns.
        """
        # Named `classifier` so the imported `pipeline` factory is not shadowed.
        classifier = self.pseudo_pipeline
        pseudo_texts = []
        label_ids = []
        for text in self._train_split(dataset)["text"]:
            prediction = classifier(text)[0]
            if prediction["score"] >= min_score:
                pseudo_texts.append(text)
                # Labels are lower-cased before lookup so that capitalised
                # pipeline labels still match the keys of label2id.
                label_ids.append(self.label2id[prediction["label"].lower()])
        pseudo_ds = Dataset.from_dict({"text": pseudo_texts, "labels": label_ids})
        pseudo_ds.to_csv(dataset_path)
        return pseudo_ds

    def _try_init_final_model(self) -> bool:
        """Load the classifier from the MLM checkpoint; log and return False on failure."""
        try:
            self.init_final_model(self.mlm_checkpoint)
        except Exception:
            # Best-effort by design: a missing/corrupt checkpoint skips training
            # instead of raising, but the traceback is kept in the log.
            self.logger.exception(
                "Could not load the MLM checkpoint %s; fine-tuning skipped",
                self.mlm_checkpoint,
            )
            return False
        return True

    def _fine_tune_classifier(self, train_dataset) -> None:
        """Tokenize ``train_dataset``, train ``self.final_model`` and save it."""
        tokenized_dataset = train_dataset.map(self.tokenize, batched=True)
        training_args = TrainingArguments(
            output_dir="./finbert-sentiment-pseudo",
            num_train_epochs=self.epochs_num,
            per_device_train_batch_size=16,
            eval_strategy="no",
            report_to="none",
            logging_steps=10,
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=False,
        )
        trainer = Trainer(
            model=self.final_model,
            args=training_args,
            train_dataset=tokenized_dataset
        )
        trainer.train()
        self.final_model.save_pretrained(self.final_checkpoint)

    def train_final_model(self, dataset: "Dataset", pseudo_dataset_path: str) -> None:
        """Fine-tune the classifier on pseudo-labels derived from ``dataset``.

        Args:
            dataset: Unlabeled data with a ``"text"`` column.
            pseudo_dataset_path: CSV path for the retained pseudo-labeled rows.
        """
        if not self._try_init_final_model():
            return
        pseudo_ds = self.assign_pseudo_labels(dataset, dataset_path=pseudo_dataset_path)
        self._fine_tune_classifier(pseudo_ds)

    def train_final_model_labeled_data(self, dataset: "Dataset") -> None:
        """Fine-tune the classifier on data that already has ``"text"`` and ``"labels"``.

        Accepts a single dataset or a split collection, in which case only the
        ``"train"`` split is used.
        """
        if not self._try_init_final_model():
            return
        self._fine_tune_classifier(self._train_split(dataset))

    def predict_sentiment(self, text: str, device: str, max_length: int=128):
        """Classify one text with the fine-tuned model.

        If no classifier has been trained or loaded in this session it is loaded
        from ``self.final_checkpoint``; the tokenizer is loaded on demand too.

        Args:
            text: The text to classify.
            device: Device to run inference on.
            max_length: Truncation/padding length in tokens.

        Returns:
            A dict with ``"label_id"``, ``"label"`` and the class
            probabilities ``"probs"`` (ordered by label id).
        """
        if self.final_model is None:
            self.init_final_model(self.final_checkpoint)
        if self.tokenizer is None:
            self.init_tokenizer()

        self.final_model.to(device)
        self.final_model.eval()

        encodings = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

        encodings = {k: v.to(device) for k, v in encodings.items()}

        with torch.no_grad():
            outputs = self.final_model(**encodings)
            probs = torch.softmax(outputs.logits, dim=-1)
            pred_id = torch.argmax(probs, dim=-1).item()
            pred_label = self.id2label[pred_id]

        return {
            "label_id": pred_id,
            "label": pred_label,
            "probs": probs[0].cpu().tolist()
        }

In [6]:
# Checkpoint directory names; the trainer joins each of them onto models_path.
final_checkpoint = "finbert_reddit_final_labeled"
mlm_checkpoint = "finbert_reddit_mlm"
models_path = base_path + "/data/results"

In [7]:
# Single trainer instance reused by every training and prediction cell below.
model = MLMUnlabeledDataTrainer(
    final_checkpoint=final_checkpoint,
    mlm_checkpoint=mlm_checkpoint,
    models_path=models_path,
    epochs_num=25,
)

In [5]:
# Load the Reddit comments through the project helper (processing details live in scripts/data_processing).
df = load_and_process_reddit_comments(f"{base_path}/data/reddit_comments")

In [6]:
# The trainer's tokenize() reads the "text" column, so rename "comment" accordingly.
df = df.rename(columns={"comment": "text"})

In [24]:
# Fixed seed so the 80/20 split (and the test sample drawn from it later) is the same on every run.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=123)

In [54]:
# Stage 1: masked-LM training starting from the base FinBERT weights (initial_model=True), not a saved MLM checkpoint.
model.train_mlm_model(dataset, initial_model=True)

Some weights of BertForMaskedLM were not initialized from the model checkpoint at yiyanghkust/finbert-tone and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Step,Training Loss
50,7.0389
100,5.6707
150,5.1563
200,4.8511
250,4.6955
300,4.4462
350,4.2726
400,4.1931
450,4.1467
500,3.9517


Step,Training Loss
50,7.0389
100,5.6707
150,5.1563
200,4.8511
250,4.6955
300,4.4462
350,4.2726
400,4.1931
450,4.1467
500,3.9517


In [56]:
# Tokenize both splits with the trainer's tokenizer; the test split is sampled from below.
tokenized_dataset = dataset.map(model.tokenize, batched=True)

Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

In [57]:
# Seed every RNG in play: Python's (random.sample below), NumPy's, and torch's
# (the classifier head is newly initialised when the final model is loaded).
np.random.seed(123)
random.seed(123)
torch.manual_seed(123)

In [58]:
# Draw 100 random rows from the tokenized test split and persist them for later inspection.
test_split = tokenized_dataset["test"]
sampled_indices = random.sample(range(len(test_split)), 100)
sampled_data = test_split.select(sampled_indices)
sampled_data.to_csv(f"{base_path}/data/results/sample_test_dataset_reddit.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

152293

In [59]:
# CSV that receives the pseudo-labeled rows kept during fine-tuning.
pseudo_dataset_path = base_path + "/data/results/pseudo_labels_reddit.csv"

In [60]:
# Stage 2: pseudo-label the train split, write the kept rows to pseudo_dataset_path, and fine-tune the classifier.
model.train_final_model(dataset, pseudo_dataset_path=pseudo_dataset_path)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/magisterka/data/results/finbert_reddit_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Map:   0%|          | 0/1983 [00:00<?, ? examples/s]

Step,Training Loss
10,0.6691
20,0.5127
30,0.2818
40,0.4436
50,0.2962
60,0.1654
70,0.3172
80,0.2291
90,0.2989
100,0.1962


In [61]:
# Spot-check the fine-tuned classifier on a single comment.
model.predict_sentiment("I fully expect this idiot to pardon himself at the end of his term or impeachment.", "cuda")

{'label_id': 1,
 'label': 'neutral',
 'probs': [5.561777925322531e-06, 0.9999909400939941, 3.4829722608265e-06]}

In [62]:
# Plain list of the sampled comment texts, used for the manual spot checks below.
samples_list = list(sampled_data["text"])

In [64]:
# Print each sampled comment (items 15-49) followed by the model's prediction for it.
for text in samples_list[15:50]:
    prediction = model.predict_sentiment(text, "cuda")
    print(text)
    print(prediction)

Except, the dumb bagholder on the right is looking across the horizon for Altseason still hoping to make wife changing gains. In previous cycles, Alts like ETH, LTC, DOGE already had made their biggest gains 1-Year from the Halvening date NOT at EOY next year after halvening. This time around these Alts and the Total Alt Marketcap actually has lost value since halvening and show zero indication of masking any parabolic gains like we saw in previous cycles or gains whatsoever. | ETH | Halvening Date | 1-YR Post Halvening | |:-----------|------------:|:------------:| | July 2016-17 | $11 | $199 | May 2020-21 | $210 | $2,800 | April 2024-25 | $3,157 | $1,580 | **LTC** | July 2016-17 | $4 | $40 | May 2020-21 | $40 | $377 | April 2024-25 | $85 | $77 | **DOGE** | July 2016-17 | $0.0002 | $0.001 | May 2020-21 | $0.002 | $0.49 | April 2024-25 | $0.16 | $0.15 | **Alt Marketcap** *(Excluding Stablecoins)* | July 2016-17 | $2.05 Billion | $52.17 Billion | May 2020-21 | $74.81 Billion | $1.34 Tril

### Alternative approach: manually labeling the comments for supervised training

In [7]:
# Bare expression so the notebook renders the processed comment frame.
df

Unnamed: 0,text,upvotes,created_utc,upvotes_log,upvote_pct_day
26,The POTUS makes history by being the biggest p...,708,2025-04-02 07:33:54,6.563856,0.977974
27,Makes history? As in most corrupt fuckhead who...,90,2025-04-02 08:10:08,4.510860,0.907489
28,I fully expect this idiot to pardon himself at...,82,2025-04-02 07:50:59,4.418841,0.903084
29,He s dragging the US into an authoritarian shi...,24,2025-04-02 12:05:53,3.218876,0.713656
37,100% Russians used this to launder money,112,2025-04-02 08:18:12,4.727388,0.911894
...,...,...,...,...,...
14835,Omg my empathy makes me feel the pain bro,17,2025-06-05 00:23:01,2.890372,0.755700
14867,My decade old SQLcoin has the fastest settleme...,17,2025-06-05 19:12:01,2.890372,0.755700
14873,Some folks are just insane when it comes to th...,17,2025-06-05 02:08:46,2.890372,0.755700
14874,"Unfortunately, that s why bitcoin is the king....",14,2025-06-05 02:08:51,2.708050,0.700326


In [8]:
# Re-seed NumPy and Python RNGs right before drawing the sample to be labeled by hand.
np.random.seed(123)
random.seed(123)

In [10]:
# Explicit random_state makes this cell idempotent: re-running it always yields
# the same 500 rows, independent of how much global RNG state was consumed.
sampled_df = df.sample(500, random_state=123)

In [11]:
manual_labels_path = f"{base_path}/data/results/reddit_comments_manual_labels.csv"
# This file is annotated by hand with a "label" column and read back further down;
# exporting again would wipe those annotations, so only write it when it is missing.
if not os.path.exists(manual_labels_path):
    sampled_df.to_csv(manual_labels_path, index=False)

Loading the manually labeled data:

In [21]:
# Reload the sampled comments after annotation, keeping only the text and its "label" column.
df = pd.read_csv(f"{base_path}/data/results/reddit_comments_manual_labels.csv")[["text", "label"]]

In [28]:
# Normalise case/whitespace before the lookup so hand-typed variants such as
# "Negative " are not silently turned into neutral; labels that are still
# unknown (including missing ones) keep the original fallback of 1 (neutral).
df["labels"] = (
    df["label"]
    .astype(str)
    .str.strip()
    .str.lower()
    .map(model.label2id)
    .fillna(1)
    .astype(int)
)

In [29]:
# Bare expression so the notebook renders the labeled frame with its integer "labels" column.
df

Unnamed: 0,text,label,labels
0,"Nah, the top was pretty evidently the TRUMP la...",neutral,1
1,Saylor is a major red flag,negative,0
2,"Not that hard, you just need 10 million dollar...",neutral,1
3,A 2% move is a rally?,neutral,1
4,Bitcoin is down almost 40% vs gold prices.,negative,0
...,...,...,...
495,This guy deserves to be in jail for a long tim...,negative,0
496,"Burgers and Taxes, the true American experience.",neutral,1
497,The suppository is quite popular,neutral,1
498,I remember Coinbase as the exchange that didn'...,neutral,1


In [36]:
# Keep only what the trainer needs: the text and its integer class id.
dataset = Dataset.from_pandas(df[["text", "labels"]])

In [33]:
# Keep the split under its own name: the display and training cells below use the
# full 500-row `dataset` (a single Dataset), and rebinding it to the split
# collection here would break them when the notebook is run top to bottom.
dataset_splits = dataset.train_test_split(test_size=0.001)

In [37]:
# Bare expression so the notebook shows the dataset's features and row count.
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 500
})

In [38]:
# Fine-tune the classifier (initialised from the MLM checkpoint) on the manually labeled comments.
model.train_final_model_labeled_data(dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/magisterka/data/results/finbert_reddit_mlm and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Step,Training Loss
10,0.9283
20,0.8534
30,0.7894
40,0.7134
50,0.69
60,0.5635
70,0.6172
80,0.2144
90,0.3096
100,0.2694


Step,Training Loss
10,0.9283
20,0.8534
30,0.7894
40,0.7134
50,0.69
60,0.5635
70,0.6172
80,0.2144
90,0.3096
100,0.2694


In [39]:
# NOTE(review): stray scratch cell; it only evaluates the literal 1 and nothing below depends on it.
1

1

In [45]:
# Print the remaining sampled comments (from item 50 on) with the model's prediction for each.
for text in samples_list[50:]:
    prediction = model.predict_sentiment(text, "cuda")
    print(text)
    print(prediction)

Yeah how many times did that happen already?
{'label_id': 1, 'label': 'neutral', 'probs': [2.712336390686687e-05, 0.9999500513076782, 2.282149944221601e-05]}
{'label_id': 0, 'label': 'negative', 'probs': [0.9998749494552612, 2.733855217229575e-05, 9.773109195521101e-05]}
No generations been marketed to as frequently and as well as new generations. Went from body acceptance to everyone has a eating disorder again including guys hitting the gym and everyone starting their skin care routine at age 12
{'label_id': 1, 'label': 'neutral', 'probs': [4.377762888907455e-05, 0.9999377727508545, 1.83988522621803e-05]}
*The earlier half of holders are in profit
{'label_id': 2, 'label': 'positive', 'probs': [0.00020375459280330688, 0.07762399315834045, 0.9221723675727844]}
I do not understand what any of that means. Definitely more complicated than regular money,
{'label_id': 1, 'label': 'neutral', 'probs': [3.5735993151320145e-05, 0.999944806098938, 1.9431083273957483e-05]}
Trump was never debanke

In [41]:
# Reload the persisted 100-row test sample from Drive.
sample_csv_path = f"{base_path}/data/results/sample_test_dataset_reddit.csv"
sample_dataset = Dataset.from_csv(sample_csv_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [42]:
# Plain list of the reloaded sample texts.
samples_list = list(sample_dataset["text"])

In [49]:
# Single-comment check of the classifier.
text = (
    "He s dragging the US into an authoritarian shit hole, and crypto bros "
    "are concerned about Trump s impact on crypto. It s so pathetic"
)
print(model.predict_sentiment(text, "cuda"))

{'label_id': 0, 'label': 'negative', 'probs': [0.999908447265625, 3.5185446904506534e-05, 5.638055517920293e-05]}
