In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# 데이터 로드
reddit_news = pd.read_csv('data/Combined_News_DJIA.csv')


# 날짜 형식 변환 및 인덱스 설정
reddit_news['Date'] = pd.to_datetime(reddit_news['Date'])
reddit_news.set_index('Date', inplace=True)


# 뉴스 데이터를 하나의 문자열로 결합
reddit_news['News'] = reddit_news[[col for col in reddit_news.columns if  col == 'Top1' or 'Top2'or 'Top3' or 'Top4' or'Top5']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
reddit_news['News'] = reddit_news['News'].replace('b\"|b\'|\\\\|\\\"', '', regex=True)

# 피처 및 타겟 설정
X = reddit_news['News'].values
y = reddit_news['Label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
bert_model = BertModel.from_pretrained('prajjwal1/bert-tiny')

# 데이터셋 클래스 정의
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# 데이터셋 생성
train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)


In [3]:
class RegressionModel(nn.Module):
    def __init__(self, bert):
        super(RegressionModel, self).__init__()
        self.bert = bert
        self.regressor = nn.Linear(bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()  # Add Sigmoid layer
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        logits = self.regressor(pooled_output)
        logits = self.sigmoid(logits)  # Apply Sigmoid activation
        
        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1))
        
        return {"loss": loss, "logits": logits}

# 회귀 모델 인스턴스 생성
regression_model = RegressionModel(bert_model)

# 트레이너 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    eval_steps=500,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=500,
    report_to=None,
)

trainer = Trainer(
    model=regression_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 모델 학습
trainer.train()

100%|██████████| 300/300 [01:08<00:00,  4.35it/s]

{'train_runtime': 68.93, 'train_samples_per_second': 69.244, 'train_steps_per_second': 4.352, 'train_loss': 0.2504034423828125, 'epoch': 3.0}





TrainOutput(global_step=300, training_loss=0.2504034423828125, metrics={'train_runtime': 68.93, 'train_samples_per_second': 69.244, 'train_steps_per_second': 4.352, 'total_flos': 0.0, 'train_loss': 0.2504034423828125, 'epoch': 3.0})

In [4]:
from sklearn.metrics import mean_squared_error,accuracy_score,precision_score,recall_score,f1_score
import numpy as np
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

# 예측
predictions = trainer.predict(test_dataset).predictions
predictions = predictions.flatten()


# MSE 계산
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

100%|██████████| 25/25 [00:01<00:00, 13.39it/s]


Evaluation results: {'eval_loss': 0.24645309150218964, 'eval_runtime': 1.957, 'eval_samples_per_second': 203.372, 'eval_steps_per_second': 12.775, 'epoch': 3.0}


100%|██████████| 25/25 [00:01<00:00, 13.43it/s]

Mean Squared Error: 0.2464530857195152





In [6]:
binary_predictions = [1 if pred > 0.5 else 0 for pred in predictions]


# 평가 지표 계산
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions, pos_label=1)
recall = recall_score(y_test, binary_predictions, pos_label=1)
f1 = f1_score(y_test, binary_predictions, pos_label=1)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1 Score: {f1 * 100:.2f}%')



Accuracy: 56.78%
Precision: 56.93%
Recall: 99.56%
F1 Score: 72.44%
