In [None]:
from openai import OpenAI

In [None]:
from google.colab import drive, userdata
import pandas as pd
import json
import numpy as np
import random
from datasets import Dataset

In [None]:
# Mount Google Drive at /content/drive so the project files are reachable from this Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on Drive
# (presumably so the `scripts` package imported in the next cell resolves — confirm).
%cd /content/drive/My Drive/magisterka

/content/drive/My Drive/magisterka


In [None]:
from scripts.data_processing.financial_news_data_processing import load_news_data
from scripts.data_processing.reddit_comments_processing import load_and_process_reddit_comments

In [None]:
# Project root on the mounted Drive; every data/result path below is built from it.
base_path = '/content/drive/My Drive/magisterka'
# Load the raw news articles from the finlighten_news folder via the project helper.
df = load_news_data(f'{base_path}/data/finlighten_news/')

In [None]:
# Parse publish dates, keep only articles published after 2025-04-01 00:00:00,
# and drop exact duplicate rows — expressed as one chain.
df = (
    df.assign(publishDate=pd.to_datetime(df['publishDate'], format='mixed'))
      .loc[lambda frame: frame['publishDate'] > '2025-04-01 00:00:00']
      .drop_duplicates()
)

In [None]:
# Seed NumPy's global RNG and Python's `random` module so later random operations are repeatable.
np.random.seed(123)
random.seed(123)

In [None]:
# Wrap the filtered news DataFrame in a `datasets.Dataset` so it can be split below.
dataset = Dataset.from_pandas(df)

In [None]:
# 80/20 train/test split. The seed is passed explicitly so the split is the same on
# every run and does not depend on global RNG state or on the order cells were executed.
dataset = dataset.train_test_split(test_size=0.2, seed=123)

In [None]:
# Only the held-out test split is labelled in this notebook.
test_data = dataset["test"]

In [None]:
# Back to pandas for sampling and column-wise labelling.
df_test = test_data.to_pandas()

In [None]:
# Draw a reproducible sample of up to 500 test articles to label.
# `n` is clamped to the frame size because `sample` raises ValueError when asked
# for more rows than exist (without replacement).
df_test_sample = df_test.sample(n=min(500, len(df_test)), random_state=123)

In [None]:
# Build the OpenAI client; the API key is fetched through Colab's `userdata` rather than hard-coded.
client=OpenAI(api_key=userdata.get("OPENAI_API_KEY"))

In [None]:
def classify_sentiment(text: str) -> str:
  """Label a financial article as 'negative', 'neutral' or 'positive' with gpt-4.1-mini.

  Uses the module-level ``client`` (OpenAI) and issues one chat-completion request
  per call with temperature 0.

  Args:
    text: Article text, sent verbatim as the user message.

  Returns:
    One of ``"negative"``, ``"neutral"``, ``"positive"``. Falls back to
    ``"neutral"`` when ``text`` is missing/blank, or when the model reply is
    empty or is not one of the three labels.
  """
  valid_labels = {"positive", "neutral", "negative"}
  # Missing or blank input (e.g. NaN coming from a pandas column) cannot be classified;
  # return the fallback label instead of sending an invalid request that would abort a whole .apply().
  if not isinstance(text, str) or not text.strip():
    return "neutral"
  prompt = (
      """Classify these financial articles into ONE of these THREE labels:
      * negative
      * neutral
      * positive
      Choose 'negative' if the overall sentiment of the article,
      regarding either the cryptocurrency market as a whole or a specific cryptocurrency,
      is clearly negative. Choose 'positive' if the overall sentiment of the article,
      regarding either the cryptocurrency market as a whole or a specific cryptocurrency,
      is clearly positive. Choose 'neutral' in any other case - when the sentiment is mixed, unclear, or factual.
      Output exactly one word: one of three possible labels.
       """
  )
  resp = client.chat.completions.create(
      model="gpt-4.1-mini",
      messages=[
          {'role': "system", "content": prompt},
          {'role': "user", "content": text},
      ],
      temperature=0.0,
      max_tokens=2,
  )
  # `content` can be None (e.g. refusal / empty completion); treat that as an empty reply.
  raw = resp.choices[0].message.content or ""
  # Drop surrounding whitespace, quotes and punctuation so replies such as
  # "Negative." or "'negative" (truncated by max_tokens) still map to their label.
  label = raw.lower().strip(" \t\r\n'\"`*.,:;!")
  if label not in valid_labels:
    label = "neutral"
  return label

In [None]:
# Smoke-test the classifier on the first sampled article before labelling the whole sample.
classify_sentiment(df_test_sample.iloc[0]["text"])

'negative'

In [None]:
# Label every sampled article; `apply` calls the classifier (one API request) per row.
# NOTE: despite the column name, "true_sentiment" holds model-generated labels.
df_test_sample["true_sentiment"] = df_test_sample["text"].apply(classify_sentiment)

In [None]:
# Persist the labelled article sample to the results folder on Drive.
# NOTE(review): the file name is spelled "gtp" (not "gpt"); left unchanged because other code may read this path.
df_test_sample.to_csv(f"{base_path}/data/results/labeled_articles_gtp.csv", index=False)

In [None]:
def classify_sentiment_reddit(text: str) -> str:
  """Label a Reddit comment as 'negative', 'neutral' or 'positive' with gpt-4.1-mini.

  Uses the module-level ``client`` (OpenAI) and issues one chat-completion request
  per call with temperature 0.

  Args:
    text: Comment text, sent verbatim as the user message.

  Returns:
    One of ``"negative"``, ``"neutral"``, ``"positive"``. Falls back to
    ``"neutral"`` when ``text`` is missing/blank, or when the model reply is
    empty or is not one of the three labels.
  """
  valid_labels = {"positive", "neutral", "negative"}
  # Missing or blank input (e.g. NaN coming from a pandas column) cannot be classified;
  # return the fallback label instead of sending an invalid request that would abort a whole .apply().
  if not isinstance(text, str) or not text.strip():
    return "neutral"
  # Prompt fixes vs. the earlier version: "positivew" typo removed and "article" -> "comment".
  # NOTE(review): "the overall state of politics" is listed only for the negative label —
  # confirm whether the positive label should mention it too.
  prompt = (
      """Classify these Reddit comments into ONE of these THREE labels:
      * negative
      * neutral
      * positive
      Choose 'negative' if the overall sentiment of the comment,
      regarding either the cryptocurrency market as a whole, a specific cryptocurrency,
      or the overall state of politics,
      is clearly negative. Choose 'positive' if the overall sentiment of the comment,
      regarding either the cryptocurrency market as a whole or a specific cryptocurrency,
      is clearly positive. Choose 'neutral' in any other case - when the sentiment is mixed, unclear, or factual.
      Output exactly one word: one of three possible labels.
       """
  )
  resp = client.chat.completions.create(
      model="gpt-4.1-mini",
      messages=[
          {'role': "system", "content": prompt},
          {'role': "user", "content": text},
      ],
      temperature=0.0,
      max_tokens=2,
  )
  # `content` can be None (e.g. refusal / empty completion); treat that as an empty reply.
  raw = resp.choices[0].message.content or ""
  # Drop surrounding whitespace, quotes and punctuation so replies such as
  # "Positive!" or "'negative" (truncated by max_tokens) still map to their label.
  label = raw.lower().strip(" \t\r\n'\"`*.,:;!")
  if label not in valid_labels:
    label = "neutral"
  return label

In [None]:
# Load and preprocess the Reddit comments from the reddit_comments folder via the project helper.
df_reddit = load_and_process_reddit_comments(f"{base_path}/data/reddit_comments")

In [None]:
# 80/20 train/test split of the Reddit comments. The seed is passed explicitly so the
# split is the same on every run and does not depend on global RNG state or on the
# order cells were executed.
dataset_reddit = Dataset.from_pandas(df_reddit).train_test_split(test_size=0.2, seed=123)

In [None]:
# Take the held-out Reddit test split and convert it back to pandas for sampling.
test_reddit = dataset_reddit["test"].to_pandas()

In [None]:
# Draw a reproducible sample of up to 500 test comments to label.
# `n` is clamped to the frame size because `sample` raises ValueError when asked
# for more rows than exist (without replacement).
test_reddit_sample = test_reddit.sample(n=min(500, len(test_reddit)), random_state=123)

In [None]:
# Label every sampled comment; `apply` calls the classifier (one API request) per row.
# NOTE(review): the articles cell stores its labels under "true_sentiment" while this one uses "label".
test_reddit_sample["label"] = test_reddit_sample["comment"].apply(classify_sentiment_reddit)

In [None]:
# Rename "comment" to "text" so the text column has the same name as in the articles frame.
test_reddit_sample = test_reddit_sample.rename(columns={"comment": "text"})

In [None]:
# Persist the labelled comment sample to the results folder on Drive (row index omitted).
test_reddit_sample.to_csv(f"{base_path}/data/results/comments_labeled_gpt.csv", index=False)