In [None]:
from datasets import Dataset, DatasetDict

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
from google.colab import drive
import random
import torch
from torch.utils.data import DataLoader

In [None]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
drive.mount('/content/drive')
# Project root on Drive; later cells build their data/result paths from it.
base_path = '/content/drive/My Drive/magisterka'
# Switch the working directory to the project root (presumably so the
# project-local imports in the following cell resolve — confirm).
# NOTE(review): this path duplicates base_path; keep the two in sync.
%cd /content/drive/My Drive/magisterka

Mounted at /content/drive
/content/drive/My Drive/magisterka


In [None]:
from scripts.data_processing.financial_news_data_processing import load_news_data
from model_training.mlm_unlabeled_trainer import MLMUnlabeledDataTrainer

In [None]:
# Name of the fine-tuned checkpoint and the folder it is looked up in.
articles_checkpoint = 'finbert_articles_final'
models_path = base_path + '/data/results'

# Only the final model is used in the cells shown here, so the MLM checkpoint
# argument receives a placeholder value.
articles_model = MLMUnlabeledDataTrainer(
    models_path=models_path,
    mlm_checkpoint='not_needed',
    final_checkpoint=articles_checkpoint,
)

In [None]:
# Read the raw news articles, then parse the publication timestamps.
# format="mixed" lets pandas infer the format per value, so rows may use
# different timestamp layouts.
df = load_news_data(f'{base_path}/data/finlighten_news/')
df = df.assign(publishDate=pd.to_datetime(df['publishDate'], format="mixed"))


In [None]:
# Keep only articles published strictly after the cut-off date.
published_after_cutoff = df["publishDate"] > '2025-04-01'
df = df.loc[published_after_cutoff]

In [None]:
# Order articles chronologically (oldest first).
df = df.sort_values("publishDate", ascending=True)

In [None]:
# Drop everything except the article body and its publication timestamp.
df = df.loc[:, ["text", "publishDate"]]

In [None]:
# Single-column frame holding just the text that is fed to the tokenizer.
df_training = df.loc[:, ["text"]]

In [None]:
# Convert the text frame into a Hugging Face Dataset.
# df_training carries the non-contiguous index left over from filtering and
# sorting; preserve_index=False stops from_pandas from storing it as a
# spurious "__index_level_0__" column. Row order is unchanged.
dataset = Dataset.from_pandas(df_training, preserve_index=False)

In [None]:
# Initialise the trainer's final model from its configured checkpoint; the
# inference loop below reads it back as articles_model.final_model.
articles_model.init_final_model(articles_model.final_checkpoint)

In [None]:
# Tokenize every article in batches using the trainer's tokenize function.
# NOTE(review): the DataLoader built later uses the default collate function,
# which needs equal-length tensors — assumes tokenize pads/truncates to a
# fixed length; confirm.
tokenized_dataset = dataset.map(articles_model.tokenize, batched=True)

Map:   0%|          | 0/18223 [00:00<?, ? examples/s]

In [None]:
# Display the dataset summary (features and row count) as a sanity check.
# NOTE(review): the saved output lists a 'publishDate' feature although
# df_training selects only 'text' — the output looks stale; re-run to confirm.
tokenized_dataset

Dataset({
    features: ['text', 'publishDate', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 18223
})

In [None]:
# Expose only the model inputs, returned as torch tensors; every other column
# is hidden from iteration.
model_input_columns = ["input_ids", "attention_mask"]
tokenized_dataset.set_format(type="torch", columns=model_input_columns)

In [None]:
# Batch the tokenized articles in their existing order; shuffling stays off so
# predictions can be matched back to the rows of df by position.
data_loader = DataLoader(
    dataset=tokenized_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
# Run batched inference and collect the predicted class id for every article.

# Use the GPU when present, otherwise fall back to CPU instead of crashing on
# a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assumes final_model is a torch.nn.Module (it is called with tensors and
# returns an object with .logits) — confirm. Moving it explicitly guarantees
# the model and the batches live on the same device; eval() disables dropout
# so predictions are deterministic.
articles_model.final_model.to(device)
articles_model.final_model.eval()

all_preds = []
with torch.no_grad():  # inference only: no autograd bookkeeping needed
  for batch in data_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    logits = articles_model.final_model(**batch).logits
    # argmax over dim 1 picks the highest-scoring class for each row;
    # tolist() yields plain Python ints that work directly as lookup keys.
    all_preds.extend(torch.argmax(logits, dim=1).tolist())

In [None]:
# Translate each predicted class id into its label via the trainer's
# id2label lookup.
pred_labels = []
for class_id in all_preds:
  pred_labels.append(articles_model.id2label[class_id])

In [None]:
# Attach the predicted labels to the article text and timestamp. The labels
# are matched to rows by position, which relies on the predictions having been
# produced in the same order as the rows of df.
result_df = df[["text", "publishDate"]].assign(label=pred_labels)

In [None]:
# Collapse articles with identical text, keeping the first occurrence.
result_df = result_df.drop_duplicates(subset="text", keep="first")

In [None]:
# Persist the labelled articles to Drive, leaving the pandas index out.
labeled_csv_path = f"{base_path}/data/results/articles_labeled.csv"
result_df.to_csv(labeled_csv_path, index=False)