# Financial PhraseBank: 금융 감성 분석 모델 학습

이 노트북은 Financial PhraseBank 데이터셋을 사용하여 금융 뉴스 헤드라인의 감성(긍정/중립/부정)을 분류하는 모델을 학습하고 평가합니다.

## 1. 환경 설정 및 라이브러리 임포트

In [None]:
!pip install -q datasets transformers[torch] scikit-learn pandas numpy matplotlib seaborn tensorboard

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from huggingface_hub import notebook_login

In [None]:
# Hugging Face Hub 로그인 (필요시)
# notebook_login()

## 2. 데이터셋 준비

Financial PhraseBank 데이터셋은 일반적으로 `.txt` 파일 형태로 제공되며, 각 라인은 `문장@감성` 형식입니다. 대부분의 문장이 마침표로 끝나기 때문에 실제 파일에서는 `... .@positive`처럼 `.@`로 구분된 것처럼 보입니다.
여기서는 `Sentences_50Agree.txt` 파일을 기본으로 사용합니다. 다른 파일을 사용하려면 `file_path` 변수를 수정하세요.
데이터셋은 [여기](https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news/data) 등에서 다운로드 받을 수 있습니다.
다운로드 후, 이 노트북과 같은 디렉토리에 해당 파일을 위치시키거나 `file_path`를 알맞게 수정해주세요.

In [None]:
file_path = 'Sentences_50Agree.txt'
encoding_to_try = ['utf-8', 'latin1', 'ISO-8859-1']

# pandas interprets any multi-character `sep` as a regular expression, so the
# former value '.@' meant "any character followed by @": it swallowed the last
# character of sentences that do not end with a period and also split on an
# '@' that appears inside a sentence. This pattern matches only the '@' that
# is directly followed by the trailing label (optionally preceded by a
# literal '.'), so every line yields exactly two fields.
LABEL_SEPARATOR_REGEX = r'\.?@(?=\w+\s*$)'


def load_phrasebank(path, encodings):
    """Read a Financial PhraseBank text file into a DataFrame.

    The encodings are tried in order until one of them parses the file.

    Args:
        path: Location of the text file (one ``sentence@label`` per line).
        encodings: Encoding names to try, in order of preference.

    Returns:
        A DataFrame with ``text`` and ``sentiment`` columns, or ``None`` when
        the file does not exist or no encoding could read it.
    """
    for encoding in encodings:
        try:
            frame = pd.read_csv(
                path,
                sep=LABEL_SEPARATOR_REGEX,
                header=None,
                names=['text', 'sentiment'],
                engine='python',
                encoding=encoding,
            )
        except FileNotFoundError:
            # Retrying with another encoding cannot help if the file is absent.
            print(f"Error: File '{path}' not found. Please download it and place it in the correct directory.")
            return None
        except Exception as e:
            # Deliberately best-effort: report the failure (usually a decode
            # error) and fall through to the next candidate encoding.
            print(f"Failed to load with encoding {encoding}: {e}")
        else:
            print(f"Successfully loaded data with encoding: {encoding}")
            return frame
    return None


df = load_phrasebank(file_path, encoding_to_try)

if df is not None and not df.empty:
    print("\nDataset loaded successfully.")
    print(f"Dataset shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())
else:
    print("\nError: Could not load the dataset. Please check the file path and encoding.")
    # Later cells test `df.empty`, so always leave a DataFrame behind.
    if df is None:
        df = pd.DataFrame(columns=['text', 'sentiment'])

### 2.0. 데이터셋 레이블 분포 시각화 (추가)

In [None]:
# Plot how many sentences carry each sentiment label.
can_plot_sentiment = df is not None and not df.empty and 'sentiment' in df.columns

if not can_plot_sentiment:
    print("DataFrame is empty or 'sentiment' column is missing, skipping sentiment distribution visualization.")
else:
    sentiment_counts = df['sentiment'].value_counts()
    plt.figure(figsize=(8, 6))
    if sentiment_counts.empty:
        print("Sentiment column is empty or does not exist for visualization.")
    elif len(sentiment_counts) > 3:
        # Many categories: a bar chart stays readable.
        sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='viridis')
        plt.title('Financial PhraseBank: Sentiment Distribution')
        plt.xlabel('Sentiment')
        plt.ylabel('Number of Sentences')
        plt.show()
    else:
        # Few categories: a pie chart shows the proportions directly.
        slice_colors = sns.color_palette('viridis', len(sentiment_counts))
        plt.pie(
            sentiment_counts,
            labels=sentiment_counts.index,
            autopct='%1.1f%%',
            startangle=140,
            colors=slice_colors,
        )
        plt.title('Financial PhraseBank: Sentiment Distribution')
        plt.axis('equal')  # Keep the pie circular.
        plt.show()

### 2.1. 데이터 전처리 및 라벨 인코딩

In [None]:
# The label vocabulary is the same whether or not data was loaded, so it is
# defined once up front; later cells rely on all three names existing.
sentiment_to_id = {'positive': 0, 'neutral': 1, 'negative': 2}
id_to_sentiment = {label_id: name for name, label_id in sentiment_to_id.items()}
num_fin_labels = len(sentiment_to_id)

if df.empty:
    print("DataFrame is empty, skipping preprocessing.")
else:
    df['label'] = df['sentiment'].map(sentiment_to_id)

    # Sentiments outside the vocabulary map to NaN; report and drop them.
    unmapped_rows = df['label'].isnull()
    if unmapped_rows.any():
        print("\nWarning: Some sentiments were not mapped to labels. Check sentiment_to_id mapping and dataset values.")
        print(df[unmapped_rows])
        df.dropna(subset=['label'], inplace=True)
        # The NaN values forced a float column; restore integer class ids.
        df['label'] = df['label'].astype(int)

    print("\nData with encoded labels:")
    print(df.head())
    print(f"\nNumber of labels: {num_fin_labels}")
    print(f"Label mapping: {sentiment_to_id}")

### 2.2. 데이터 분할 (학습, 검증, 테스트)

In [None]:
if df.empty or 'label' not in df.columns:
    print("DataFrame is empty or 'label' column is missing, skipping data splitting.")
    # Empty placeholders so the following cells can still run.
    financial_datasets = DatasetDict({
        split_name: Dataset.from_dict({'text': [], 'label': []})
        for split_name in ('train', 'validation', 'test')
    })
else:
    # Hold out 20% for testing, then 12.5% of the remaining 80% for
    # validation, i.e. a 70/10/20 split stratified by label.
    train_val_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )
    train_df, val_df = train_test_split(
        train_val_df, test_size=0.125, random_state=42, stratify=train_val_df['label']
    )

    for caption, frame in (('Train', train_df), ('Validation', val_df), ('Test', test_df)):
        print(f"{caption} set size: {len(frame)}")

    # Drop the shuffled pandas index before converting each split.
    train_dataset_hf = Dataset.from_pandas(train_df.reset_index(drop=True))
    val_dataset_hf = Dataset.from_pandas(val_df.reset_index(drop=True))
    test_dataset_hf = Dataset.from_pandas(test_df.reset_index(drop=True))

    financial_datasets = DatasetDict(
        train=train_dataset_hf,
        validation=val_dataset_hf,
        test=test_dataset_hf,
    )
    print("\nFinancial Datasets (Hugging Face format):")
    print(financial_datasets)

### 2.3. 토큰화

In [None]:
# Checkpoint identifier used for the tokenizer here and for the
# classification model created later in the notebook.
MODEL_NAME = "distilbert-base-uncased" 
# Load the tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def tokenize_function_financial(examples):
    """Tokenize the ``text`` entries of a batch with the notebook tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping that provides the raw sentences under ``"text"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(examples["text"], **encode_options)

In [None]:
if df.empty or not financial_datasets['train']:
    print("Dataset is empty, skipping tokenization.")
    # Empty placeholders with the columns the later cells expect.
    tokenized_financial_datasets = DatasetDict({
        split_name: Dataset.from_dict({'input_ids': [], 'attention_mask': [], 'label': []})
        for split_name in ('train', 'validation', 'test')
    })
    tokenized_financial_datasets.set_format("torch")
else:
    tokenized_financial_datasets = financial_datasets.map(tokenize_function_financial, batched=True)

    # Raw text columns plus any index columns that came along from pandas.
    train_column_names = tokenized_financial_datasets['train'].column_names
    columns_to_remove = ['text', 'sentiment'] + [
        extra for extra in ('Unnamed: 0', '__index_level_0__') if extra in train_column_names
    ]

    # Only drop the columns a given split actually has.
    tokenized_financial_datasets = DatasetDict({
        split_name: split_data.remove_columns(
            [col for col in columns_to_remove if col in split_data.column_names]
        )
        for split_name, split_data in tokenized_financial_datasets.items()
    })
    tokenized_financial_datasets.set_format("torch")

    print("\nTokenized Financial Datasets:")
    print(tokenized_financial_datasets)
    if tokenized_financial_datasets['train']:
        print(tokenized_financial_datasets['train'][0])

## 3. 모델 선택 및 학습

In [None]:
# Prefer the GPU when one is visible to PyTorch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequence-classification model built from the checkpoint, with the label
# names passed in via id2label / label2id.
label_config = {
    "num_labels": num_fin_labels,
    "id2label": id_to_sentiment,
    "label2id": sentiment_to_id,
}
model_financial = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **label_config)
model_financial.to(device)

print(f"Using device: {device}")

### 3.1. 학습 설정 및 메트릭 정의

In [None]:
def compute_metrics_financial(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predicted_ids)
    # zero_division=0 scores a class with no predictions as 0.
    macro_f1 = f1_score(labels, predicted_ids, average="macro", zero_division=0)

    return {"accuracy": accuracy, "f1_macro": macro_f1}

In [None]:
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3

# Evaluation, checkpointing and logging all happen once per epoch so that
# the best checkpoint (by validation macro-F1) can be restored at the end.
financial_training_kwargs = {
    # Where checkpoints and logs are written
    "output_dir": "./results_financial",
    "logging_dir": "./logs_financial",
    "report_to": "tensorboard",
    # Per-epoch cadence
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_total_limit": 2,
    # Optimisation
    "per_device_train_batch_size": BATCH_SIZE,
    "per_device_eval_batch_size": BATCH_SIZE,
    "num_train_epochs": NUM_EPOCHS,
    "learning_rate": LEARNING_RATE,
    "fp16": torch.cuda.is_available(),  # mixed precision only on GPU
    # Best-model selection
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1_macro",
    "greater_is_better": True,
}

training_args_financial = TrainingArguments(**financial_training_kwargs)

### 3.2. Trainer 정의 및 모델 학습

In [None]:
# NOTE(review): with only 3 training epochs, a patience of 2 looks like it
# cannot stop training before the final epoch — confirm this is intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): newer transformers releases may prefer `processing_class`
# over `tokenizer` here — verify against the installed version.
trainer_financial = Trainer(
    model=model_financial,
    args=training_args_financial,
    train_dataset=tokenized_financial_datasets["train"],
    eval_dataset=tokenized_financial_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_financial,
    callbacks=[early_stopping],
)

In [None]:
# TensorBoard 실행 (Colab 또는 로컬 터미널에서)
# %load_ext tensorboard
# %tensorboard --logdir logs_financial

In [None]:
# Train only when there is data; otherwise record that training was skipped
# so the later cells can check `train_result_financial is None`.
if df.empty or not tokenized_financial_datasets['train'] or len(tokenized_financial_datasets['train']) == 0:
    print("Skipping training as dataset is not loaded or processed correctly, or is empty.")
    train_result_financial = None
else:
    train_result_financial = trainer_financial.train()

### 3.3. 학습 과정 시각화 (추가)

In [None]:
# Plot the per-epoch loss and validation metrics, but only if training ran.
if train_result_financial is None:
    print("Skipping training visualization as training was not performed or log history is unavailable.")
else:
    log_history_financial = trainer_financial.state.log_history
    df_log_financial = pd.DataFrame(log_history_financial)

    # Each log entry fills only some columns, so select rows by the column
    # that must be present.
    has_train_loss = df_log_financial['loss'].notna()
    has_eval_loss = df_log_financial['eval_loss'].notna()
    has_eval_f1 = df_log_financial['eval_f1_macro'].notna()

    train_loss_fin = df_log_financial[has_train_loss][['epoch', 'loss']]
    eval_loss_fin = df_log_financial[has_eval_loss][['epoch', 'eval_loss']]
    eval_metrics_fin = df_log_financial[has_eval_f1]

    fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: training vs. validation loss.
    if not train_loss_fin.empty:
        loss_ax.plot(train_loss_fin['epoch'], train_loss_fin['loss'], label='Training Loss', marker='o')
    if not eval_loss_fin.empty:
        loss_ax.plot(eval_loss_fin['epoch'], eval_loss_fin['eval_loss'], label='Validation Loss', marker='o')

    # Right panel: validation macro-F1 and, when logged, accuracy.
    if not eval_metrics_fin.empty:
        metric_ax.plot(
            eval_metrics_fin['epoch'], eval_metrics_fin['eval_f1_macro'],
            label='Validation F1 Macro', marker='o', color='green',
        )
        if 'eval_accuracy' in eval_metrics_fin.columns:
            metric_ax.plot(
                eval_metrics_fin['epoch'], eval_metrics_fin['eval_accuracy'],
                label='Validation Accuracy', marker='x', color='purple', linestyle='--',
            )

    panels = (
        (loss_ax, 'Financial Model: Training and Validation Loss', 'Loss'),
        (metric_ax, 'Financial Model: Validation Metrics', 'Score'),
    )
    for axis, panel_title, y_caption in panels:
        axis.set_title(panel_title)
        axis.set_xlabel('Epoch')
        axis.set_ylabel(y_caption)
        axis.legend()
        axis.grid(True)

    fig.tight_layout()
    plt.show()

## 4. 모델 평가 및 분석

In [None]:
# Evaluate on the held-out test split, but only after a successful training
# run and when that split is not empty.
if (
    df.empty
    or not tokenized_financial_datasets['test']
    or len(tokenized_financial_datasets['test']) == 0
    or train_result_financial is None
):
    print("Skipping evaluation as dataset is not loaded, processed correctly, is empty, or model was not trained.")
else:
    eval_results_financial = trainer_financial.evaluate(tokenized_financial_datasets["test"])
    print("\nTest Set Evaluation Results (Financial):")
    for metric_name, metric_value in eval_results_financial.items():
        print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Start from empty arrays so the report cells can test `.size` even when
# prediction is skipped.
true_labels_financial = np.array([])
predicted_labels_financial = np.array([])

if (
    df.empty
    or not tokenized_financial_datasets['test']
    or len(tokenized_financial_datasets['test']) == 0
    or train_result_financial is None
):
    print("Skipping prediction as dataset is not loaded, processed correctly, is empty, or model was not trained.")
else:
    predictions_output_financial = trainer_financial.predict(tokenized_financial_datasets["test"])
    logits_financial = predictions_output_financial.predictions
    true_labels_financial = predictions_output_financial.label_ids
    # The predicted class is the index of the largest logit.
    predicted_labels_financial = np.argmax(logits_financial, axis=-1)

### 4.1. 분류 보고서

In [None]:
# Print per-class precision/recall/F1 for the test split.
if true_labels_financial.size > 0:
    financial_label_ids = sorted(id_to_sentiment.keys())
    financial_label_names = [id_to_sentiment[i] for i in financial_label_ids]
    print("\nFinancial PhraseBank Classification Report (Test Set):")
    # Passing `labels` pins the rows to the full label set. Without it the
    # rows are inferred from the data, and `target_names` no longer lines up
    # when a class is missing from both the true and predicted labels.
    print(classification_report(
        true_labels_financial,
        predicted_labels_financial,
        labels=financial_label_ids,
        target_names=financial_label_names,
        zero_division=0,
    ))
else:
    print("Cannot generate classification report: No predictions available.")

### 4.2. 혼동 행렬

In [None]:
# Draw the confusion matrix for the test split as an annotated heatmap.
if true_labels_financial.size > 0:
    # Derive the label order here so this cell does not depend on the
    # classification-report cell having been run first.
    financial_label_ids = sorted(id_to_sentiment.keys())
    financial_label_names = [id_to_sentiment[i] for i in financial_label_ids]

    # `labels` fixes the matrix to one row/column per known class, so its
    # shape always matches the tick labels even if a class never occurs.
    cm_financial = confusion_matrix(
        true_labels_financial,
        predicted_labels_financial,
        labels=financial_label_ids,
    )
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_financial, annot=True, fmt="d", cmap="Blues",
                xticklabels=financial_label_names, yticklabels=financial_label_names)
    plt.title("Financial PhraseBank Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()
else:
    print("Cannot generate confusion matrix: No predictions available.")

## 5. 샘플 예측

In [None]:
def predict_financial_sentiment(text, model_to_use, tokenizer_to_use, current_id2label):
    """Classify one text, or a batch of texts, with a classification model.

    Args:
        text: A single string, or a list/tuple of strings classified together.
        model_to_use: Model whose call returns an object with a ``logits``
            attribute. It is switched to eval mode and moved to the GPU when
            one is available, otherwise to the CPU.
        tokenizer_to_use: Callable that turns ``text`` into a mapping of
            tensors accepted by the model.
        current_id2label: Mapping from class index to label name.

    Returns:
        For a single string, ``(label_name, probabilities)`` where
        ``probabilities`` is a 1-D NumPy array over the classes.
        For a sequence of strings, ``(label_names, probabilities)`` where
        ``label_names`` is a list and ``probabilities`` has one row per text.
    """
    single_input = isinstance(text, str)

    model_to_use.eval()
    current_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_to_use.to(current_device)

    inputs = tokenizer_to_use(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(current_device) for k, v in inputs.items()}

    with torch.no_grad():
        logits_output = model_to_use(**inputs).logits

    # Reduce over the class axis only. A bare argmax() flattens the batch and
    # returns an index that is meaningless for more than one text.
    predicted_class_ids = logits_output.argmax(dim=-1).tolist()
    predicted_label_names = [current_id2label[class_id] for class_id in predicted_class_ids]
    probabilities = torch.softmax(logits_output, dim=-1).cpu().numpy()

    if single_input:
        return predicted_label_names[0], probabilities[0]
    return predicted_label_names, probabilities

In [None]:
# Run a few hand-written sentences through the model trained above.
if df.empty or 'label' not in df.columns or train_result_financial is None:
    print("\nSkipping sample prediction as model was not trained or data was not available.")
else:
    sample_financial_texts = [
        "The company reported strong earnings growth this quarter.",
        "Market sentiment remains neutral amidst global uncertainties.",
        "Analysts predict a downturn in stock prices next week.",
        "Despite the volatile market, our portfolio showed a slight increase.",
        "The new regulations are expected to negatively impact the industry."
    ]

    for text_item in sample_financial_texts:
        # The class probabilities are returned as well but not displayed.
        predicted_sentiment, probs = predict_financial_sentiment(
            text_item, model_financial, tokenizer, id_to_sentiment
        )
        print(f"\nSample Financial Text: '{text_item}'")
        print(f"Predicted Sentiment: {predicted_sentiment}")

## 6. 모델 저장 (선택 사항)

In [None]:
# Save the model and tokenizer, but only after a successful training run.
if (
    df.empty
    or not tokenized_financial_datasets['train']
    or not hasattr(trainer_financial, 'model')
    or train_result_financial is None
):
    print("Skipping model saving as model was not trained, does not exist, or data was not available.")
else:
    output_model_dir_financial = "./saved_model_financial"
    os.makedirs(output_model_dir_financial, exist_ok=True)

    # Save the model and the tokenizer into the same directory.
    trainer_financial.save_model(output_model_dir_financial)
    tokenizer.save_pretrained(output_model_dir_financial)

    print(f"Financial model and tokenizer saved to {output_model_dir_financial}")

### 모델 로드 및 사용 예시 (저장된 모델)

In [None]:
# Weight files a Hugging Face save directory may contain. Current
# transformers releases write `model.safetensors` by default, so checking
# only for `pytorch_model.bin` skipped this example even after a save.
SAVED_WEIGHT_FILE_NAMES = (
    "model.safetensors",
    "pytorch_model.bin",
    "model.safetensors.index.json",
    "pytorch_model.bin.index.json",
)


def saved_model_exists(model_dir):
    """Return True if ``model_dir`` contains a saved model weight file.

    Args:
        model_dir: Directory a model was (possibly) saved to.

    Returns:
        True when any file named in ``SAVED_WEIGHT_FILE_NAMES`` exists in
        ``model_dir``; False otherwise, including when the directory itself
        does not exist.
    """
    return any(
        os.path.isfile(os.path.join(model_dir, file_name))
        for file_name in SAVED_WEIGHT_FILE_NAMES
    )


output_model_dir_financial_path = "./saved_model_financial"  # same directory the save cell writes to
if saved_model_exists(output_model_dir_financial_path):
    loaded_model_financial = AutoModelForSequenceClassification.from_pretrained(output_model_dir_financial_path)
    loaded_tokenizer_financial = AutoTokenizer.from_pretrained(output_model_dir_financial_path)

    sample_text_for_loading_test = "The acquisition is expected to boost profits significantly."
    predicted_sentiment_loaded, _ = predict_financial_sentiment(sample_text_for_loading_test, loaded_model_financial, loaded_tokenizer_financial, id_to_sentiment)

    print(f"\nSample Financial Text (loaded model): '{sample_text_for_loading_test}'")
    print(f"Predicted Sentiment: {predicted_sentiment_loaded}")
else:
    print(f"Skipping loading example: Model not found at {output_model_dir_financial_path}")