In [2]:
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
from google.colab import drive
import random
import torch
from torch.utils.data import DataLoader

In [3]:
drive.mount('/content/drive')
base_path = '/content/drive/My Drive/magisterka'
%cd /content/drive/My Drive/magisterka

Mounted at /content/drive
/content/drive/My Drive/magisterka


In [4]:
from scripts.data_processing.reddit_comments_processing import load_and_process_reddit_comments
from model_training.mlm_unlabeled_trainer import MLMUnlabeledDataTrainer

In [5]:
# Directory that is handed to the trainer wrapper as its models location.
models_path = f'{base_path}/data/results'
# Name of the final checkpoint used for labeling the comments.
comments_checkpoint = 'finbert_reddit_final'

# Only the final checkpoint is used in this notebook; the MLM checkpoint
# argument is filled with a placeholder value.
comments_model = MLMUnlabeledDataTrainer(
    final_checkpoint=comments_checkpoint,
    mlm_checkpoint='not_needed',
    models_path=models_path,
)

In [6]:
# Location of the Reddit comment data inside the project folder.
reddit_comments_dir = f"{base_path}/data/reddit_comments"
# Load the comments through the project's processing helper.
df = load_and_process_reddit_comments(reddit_comments_dir)

In [7]:
# Quick look at the loaded comments (pandas truncates the rendered frame).
df

Unnamed: 0,comment,upvotes,created_utc,upvotes_log,upvote_pct_day
26,The POTUS makes history by being the biggest p...,708,2025-04-02 07:33:54,6.563856,0.977974
27,Makes history? As in most corrupt fuckhead who...,90,2025-04-02 08:10:08,4.510860,0.907489
28,I fully expect this idiot to pardon himself at...,82,2025-04-02 07:50:59,4.418841,0.903084
29,He s dragging the US into an authoritarian shi...,24,2025-04-02 12:05:53,3.218876,0.713656
37,100% Russians used this to launder money,112,2025-04-02 08:18:12,4.727388,0.911894
...,...,...,...,...,...
14835,Omg my empathy makes me feel the pain bro,17,2025-06-05 00:23:01,2.890372,0.755700
14867,My decade old SQLcoin has the fastest settleme...,17,2025-06-05 19:12:01,2.890372,0.755700
14873,Some folks are just insane when it comes to th...,17,2025-06-05 02:08:46,2.890372,0.755700
14874,"Unfortunately, that s why bitcoin is the king....",14,2025-06-05 02:08:51,2.708050,0.700326


In [8]:
# Parse the timestamp column into pandas datetimes so it can be filtered and
# sorted chronologically; `assign` returns a new frame instead of writing into
# the loaded one.
df = df.assign(created_utc=pd.to_datetime(df['created_utc']))

In [9]:
# Keep only comments created strictly after 2025-04-01 00:00:00.
is_after_cutoff = df['created_utc'] > '2025-04-01'
df = df[is_after_cutoff]

In [10]:
# Order the comments chronologically (oldest first).
df = df.sort_values(by="created_utc")

In [11]:
# Keep only the columns used downstream: the text, its timestamp and the
# upvote column that is carried through to the final output.
kept_columns = ["comment", "created_utc", "upvote_pct_day"]
df = df[kept_columns]

In [16]:
# Rename the comment column to "text", the name used from here on.
column_renames = {"comment": "text"}
df = df.rename(columns=column_renames)

In [17]:
# Single-column frame holding just the text that is fed to the model.
df_training = df.loc[:, ["text"]]

In [18]:
# Wrap the text frame in a Hugging Face Dataset for tokenization.
dataset = Dataset.from_pandas(df=df_training)

In [19]:
# Initialise the final model from the checkpoint stored on the trainer wrapper.
final_checkpoint = comments_model.final_checkpoint
comments_model.init_final_model(final_checkpoint)

In [20]:
# Tokenize the comments in batches with the wrapper's tokenize function.
tokenized_dataset = dataset.map(
    function=comments_model.tokenize,
    batched=True,
)

Map:   0%|          | 0/2878 [00:00<?, ? examples/s]

In [21]:
# Expose only the model inputs, as torch tensors, when the dataset is indexed.
model_input_columns = ["input_ids", "attention_mask"]
tokenized_dataset.set_format(type="torch", columns=model_input_columns)

In [22]:
# Batches of 32 in the original row order: shuffling must stay off because the
# predictions are later lined up with the rows of `df` by position.
data_loader = DataLoader(
    tokenized_dataset,
    batch_size=32,
    shuffle=False,
)

In [23]:
# Batched inference: collect one predicted class id per comment, in the same
# order as the rows yielded by `data_loader`.
inference_model = comments_model.final_model
# Make sure dropout and similar train-time layers are disabled so repeated runs
# give the same labels (assumes final_model is a torch.nn.Module — TODO confirm).
inference_model.eval()
# Follow the device the model already lives on instead of hard-coding "cuda",
# so the cell also runs on a CPU-only runtime and never mixes devices.
device = next(inference_model.parameters()).device

all_preds = []
# A single no_grad context around the whole loop: no gradients are needed.
with torch.no_grad():
  for batch in data_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    logits = inference_model(**batch).logits
    # argmax over dim 1 picks the highest-scoring class per example;
    # tolist() stores plain Python ints rather than numpy scalars.
    all_preds.extend(torch.argmax(logits, dim=1).tolist())

In [25]:
# Translate every predicted class id into its label through the wrapper's
# id2label mapping, keeping the prediction order.
pred_labels = []
for class_id in all_preds:
  pred_labels.append(comments_model.id2label[class_id])

In [26]:
# Attach the predicted labels to the comments. The labels are matched to rows
# by position, so `pred_labels` must be in the same order as `df`.
output_columns = ["text", "created_utc", "label", "upvote_pct_day"]
result_df = df.assign(label=pred_labels)[output_columns]

In [27]:
# Keep only the first occurrence of each distinct comment text.
is_repeated_text = result_df.duplicated(subset=["text"])
result_df = result_df[~is_repeated_text]

In [28]:
# Inspect the labeled, de-duplicated comments (pandas truncates the rendered frame).
result_df

Unnamed: 0,text,created_utc,label,upvote_pct_day
434,"The second page says you will receive it, and ...",2025-04-01 00:09:55,neutral,0.781481
419,OP can t read apparently. This isn t for minin...,2025-04-01 00:33:27,neutral,0.781481
426,"Also leveraging the product you're selling, it...",2025-04-01 00:36:25,positive,0.885185
425,They mention NCUs that can be earned during th...,2025-04-01 00:58:26,neutral,0.855556
481,I do this calc for my balance sheet too,2025-04-01 01:24:04,neutral,0.707407
...,...,...,...,...
14534,Trump's ENTIRE agenda is rigged for the rich..,2025-06-06 03:34:41,neutral,0.738636
14422,So many cunts in this sub thought these two fe...,2025-06-06 03:59:52,positive,0.738636
14552,And tell her it was a solar flare that fried u...,2025-06-06 04:05:31,neutral,0.954545
14555,"Only thing is, I wonder if that which this guy...",2025-06-06 04:33:36,neutral,0.806818


In [29]:
# Write the labeled comments to the results folder without the index column.
# NOTE(review): the file name says "non_finetuned" while the checkpoint loaded
# above is 'finbert_reddit_final' — confirm the name matches the model used.
labeled_comments_path = f"{base_path}/data/results/comments_labeled_non_finetuned.csv"
result_df.to_csv(labeled_comments_path, index=False)