In [1]:
import pandas as pd
import numpy as np
import re
import os
# Switch off the Weights & Biases integration before transformers is imported below.
# NOTE(review): transformers reports this variable as deprecated (see the warning printed
# when TrainingArguments is built) and suggests `report_to="none"` instead.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from scipy.stats import loguniform
from sklearn.model_selection import train_test_split
import itertools
from itertools import permutations
import shutil
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from safetensors.torch import load_file
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)




# 1. 데이터 준비 및 전처리

In [3]:
# Load the data.
# NOTE: despite their names, `train_path` and `submission_path` hold DataFrames, not file paths.
train_path = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
submission_path = pd.read_csv('../data/sample_submission.csv')

In [None]:
# Inspect the sample submission frame: column summary first, then the first rows.
submission_path.info()
submission_path.head()

In [5]:
# 텍스트 정제
def clean_text(text):
    """Normalize one sentence before tokenization.

    Steps, in order:
      1. remove every character that is neither a word character (``\\w``,
         which includes Hangul, digits and ``_``) nor whitespace;
      2. lower-case (a no-op for Hangul, relevant for embedded Latin text);
      3. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: Raw cell value. ``None`` and float NaN (what pandas uses for a
            missing cell) yield ``''``; any other non-string value is
            converted with ``str()`` first.

    Returns:
        str: The cleaned sentence.
    """
    # Missing cells reach this function through Series.apply as None/NaN;
    # re.sub would raise TypeError on them. (NaN is the only float != itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return ' '.join(text.split())

In [6]:
# Clean the four sentence columns of both the train and the test frame in place.
sentence_columns = [f'sentence_{i}' for i in range(4)]
for frame in (train_path, test_df):
    for column in sentence_columns:
        frame[column] = frame[column].apply(clean_text)

In [7]:
# data_utils.py
def load_data(train_path, test_path):
    """Read the train and test CSV files.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        tuple: ``(train, test)`` DataFrames, in that order.
    """
    train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))
    return train, test

def make_labels(df):
    """Convert the ``answer_0..answer_3`` columns into per-sentence position labels.

    Each ``answer_k`` value is the index of the sentence placed at position
    ``k``. The returned label row is the inverse mapping: entry ``s`` is the
    position at which sentence ``s`` appears.

    Args:
        df: DataFrame with integer columns ``answer_0`` .. ``answer_3``.

    Returns:
        np.ndarray: One row of four positions per input row.
    """
    answer_columns = [f'answer_{k}' for k in range(4)]
    labels = []
    for order in df[answer_columns].values:
        positions = [0, 0, 0, 0]
        for rank in range(len(order)):
            # The sentence listed at `rank` gets `rank` as its target position.
            positions[order[rank]] = rank
        labels.append(positions)
    return np.array(labels)

Dataset 클래스
 * 4개의 문장을 [SEP]로 묶어서 BERT에 넣을 수 있게 바꿔줌

In [8]:
# ✅ 1. 데이터셋 클래스
class GlobalOrderDataset(Dataset):
    """Dataset that packs the four sentences of a row into one tokenized sequence.

    Args:
        df: DataFrame with string columns ``sentence_0`` .. ``sentence_3``.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors='pt')``.
        labels: Optional array-like of per-row label vectors. When ``None``
            (inference), items carry no ``'labels'`` entry.
        max_length: Sequence length every item is padded / truncated to.
    """

    def __init__(self, df, tokenizer, labels=None, max_length=256):
        self.sentences = df[[f'sentence_{i}' for i in range(4)]].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` (plus ``'labels'`` when available)."""
        sents = self.sentences[idx]
        # NOTE(review): the separators are written into the text by hand while the
        # tokenizer is called with its default special-token handling, so it may add
        # its own leading/trailing special tokens as well — confirm this is intended.
        text = '[CLS] ' + ' [SEP] '.join(sents) + ' [SEP]'
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a batch dimension of size 1; drop it per item.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        # Always return the item: the return used to sit inside the branch above,
        # which made unlabeled (inference) datasets yield None.
        return item

# 2. 학습 준비

Model 클래스
- AutoModel (예: Roberta) 사용
- 문장 4개를 넣었을 때 그 순서를 예측
- 출력은 [batch, 4, 4] 크기의 행렬 → 각 문장이 어떤 위치에 있어야 하는지 예측

In [9]:
class GlobalOrderModel(nn.Module):
    """Pretrained encoder plus an MLP head scoring 4 positions for each of 4 sentences.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        width = self.bert.config.hidden_size

        # The position of each layer in this list determines its state_dict key
        # (classifier.0 ... classifier.8), so the order must not change.
        head_layers = [
            nn.Linear(width, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 4 * 4),  # 4 sentences x 4 position classes
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, labels=None):
        """Encode the packed sequence and score every (sentence, position) pair.

        Returns:
            dict: ``{"logits": (batch, 4, 4)}``; with ``labels`` given, the dict
            starts with ``"loss"`` — cross-entropy over all sentence slots.
        """
        hidden = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Pooled feature = average of the first-token vector and the mean of all
        # remaining token vectors.
        # NOTE(review): the mean runs over every remaining position and does not
        # consult attention_mask — confirm that including padded positions is intended.
        first_token = hidden[:, 0]
        rest_mean = hidden[:, 1:].mean(dim=1)
        pooled = (first_token + rest_mean) / 2

        flat_logits = self.classifier(pooled)
        logits = flat_logits.view(-1, 4, 4)

        if labels is None:
            return {"logits": logits}

        loss = nn.functional.cross_entropy(flat_logits.view(-1, 4), labels.view(-1))
        return {"loss": loss, "logits": logits}

In [10]:
def compute_metrics(eval_pred):
    """Compute per-sentence and whole-row ordering accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` is indexed as
            ``[row, sentence, position]`` and ``labels`` as ``[row, sentence]``.

    Returns:
        dict: ``sentence_accuracy`` (fraction of sentences whose position is
        right) and ``full_order_accuracy`` (fraction of rows with all four
        right). On any internal error the error is printed and ``{}`` returned.
    """
    try:
        logits, labels = eval_pred
        hits = np.argmax(logits, axis=2) == labels
        metrics = {
            "sentence_accuracy": hits.mean(),
            "full_order_accuracy": hits.all(axis=1).mean(),
        }
    except Exception as e:
        print(f"❌ compute_metrics 내부 오류: {e}")
        return {}
    return metrics

In [None]:
model = GlobalOrderModel(model_name="klue/roberta-large")  # build the model on the klue/roberta-large encoder
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
model.to(device)  # move the model to that device

In [12]:
# Load the tokenizer that belongs to the same checkpoint as the encoder.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [13]:
# Hold out 20% of the labelled rows for validation (fixed random_state for a repeatable split).
train_df_split, val_df = train_test_split(train_path, test_size=0.2, random_state=42)

# Build the per-sentence position labels for both splits.
train_labels = make_labels(train_df_split)
val_labels = make_labels(val_df)

In [14]:
# Wrap the train and validation splits (with their labels) as torch datasets.
train_dataset = GlobalOrderDataset(train_df_split, tokenizer, labels=train_labels)
val_dataset = GlobalOrderDataset(val_df, tokenizer, labels=val_labels)

In [None]:
# Display the first training item as a sanity check of the tokenized tensors and labels.
train_dataset[0]

In [None]:
# Display the first validation item as the same sanity check.
val_dataset[0]

In [17]:
# Labels for the complete training frame.
# NOTE(review): `labels` is not used by the Trainer below, which trains on the
# 80/20 split datasets — kept only so the global name stays available.
labels = make_labels(train_path)

# Training configuration: evaluate and checkpoint once per epoch, then restore
# the checkpoint with the best full-order accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./global_results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='full_order_accuracy',
    greater_is_better=True,
    seed=42,
    # Disable logging integrations explicitly; the WANDB_DISABLED environment
    # variable is reported as deprecated by transformers.
    report_to="none",
)

# Trainer on the train split, validated on the held-out split; stops early after
# 2 evaluations without an improvement of at least 0.001 in the tracked metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# 3. 학습 실행

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [16]:
# import shutil

# checkpoints = ["checkpoint-368", "checkpoint-736", "checkpoint-1104", "checkpoint-1472", "checkpoint-1840"]
# for ckpt in checkpoints:
#     shutil.rmtree(f"/content/global_results/{ckpt}", ignore_errors=True)

In [None]:
# Save the trainer's current model and the tokenizer into one directory.
# (The TrainingArguments set load_best_model_at_end=True, so this is presumably
# the best checkpoint rather than the last one — confirm after training.)
save_path = "./global_results/best_model"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

# 4. 하이퍼파라미터 튜닝

In [30]:
# Split for tuning: same call and random_state as the earlier train/validation split.
# Note that `val_df`, `val_labels` and `val_dataset` are re-assigned here.
train_split_df, val_df = train_test_split(train_path, test_size=0.2, random_state=42)
train_split_labels = make_labels(train_split_df)
val_labels = make_labels(val_df)

train_split_dataset = GlobalOrderDataset(train_split_df, tokenizer, labels=train_split_labels)
val_dataset = GlobalOrderDataset(val_df, tokenizer, labels=val_labels)


In [None]:
def run_global_tuning(train_split_dataset, val_dataset, tokenizer, n_trials=2, seed=42):
    """Random-search learning rate, weight decay and batch size; log every trial.

    Each trial trains a fresh ``GlobalOrderModel`` for up to 10 epochs (early
    stopping on ``full_order_accuracy``), evaluates it, saves it under
    ``./global_results/trial_<k>/best_model`` and appends one row to
    ``./global_results/tuning_log.csv``. If the log already exists, the run
    resumes after the trials recorded in it.

    Args:
        train_split_dataset: Training dataset for every trial.
        val_dataset: Validation dataset used for evaluation / model selection.
        tokenizer: Tokenizer handed to the Trainer and saved next to each model.
        n_trials: Total number of trials, including ones already logged.
        seed: Training seed and base seed for hyperparameter sampling.

    Returns:
        pd.DataFrame: The single best row by ``full_order_accuracy``; an empty
        frame when no trial has produced a result.
    """
    results_path = './global_results/tuning_log.csv'
    if os.path.exists(results_path):
        results = pd.read_csv(results_path).to_dict(orient='records')
    else:
        results = []
    start_trial = len(results)

    for trial in range(start_trial, n_trials):
        print(f"\n🎯 Trial {trial + 1} 시작")

        # Sample from a private generator keyed by (seed, trial). Building a
        # Trainer reseeds NumPy's *global* RNG with the training seed, so drawing
        # from the global RNG would hand later trials the same configuration.
        rng = np.random.default_rng([seed, trial])
        lr = float(loguniform.rvs(1.5e-5, 3.5e-5, random_state=rng))
        wd = float(loguniform.rvs(0.01, 0.07, random_state=rng))
        epochs = 10
        batch_size = int(rng.integers(16, 32))  # 16..31, upper bound exclusive
        total_steps = (len(train_split_dataset) // batch_size) * epochs
        warmup = int(total_steps * 0.02)  # warm up over the first 2% of steps

        output_dir = f'./global_results/trial_{trial+1}'
        args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=lr,
            weight_decay=wd,
            warmup_steps=warmup,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=64,
            num_train_epochs=epochs,
            gradient_accumulation_steps=1,
            lr_scheduler_type='linear',
            logging_dir='./roberta_logs',
            logging_steps=100,
            save_strategy="epoch",
            save_total_limit=2,
            eval_strategy='epoch',
            seed=seed,
            load_best_model_at_end=True,
            metric_for_best_model='full_order_accuracy',
            greater_is_better=True,
            report_to='none',
            fp16=True,
            optim='adamw_torch_fused'
        )

        trainer = Trainer(
            model=GlobalOrderModel("klue/roberta-large"),
            args=args,
            train_dataset=train_split_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)],
        )

        try:
            trainer.train()

            # Final evaluation of the model the trainer holds after training.
            eval_result = trainer.evaluate()
            sentence_acc = eval_result.get("eval_sentence_accuracy", None)
            full_order_acc = eval_result.get("eval_full_order_accuracy", None)
            eval_loss = eval_result.get("eval_loss", None)

            save_path = f'{output_dir}/best_model'
            try:
                trainer.save_model(save_path)
                tokenizer.save_pretrained(save_path)
                model_saved = True
            except Exception as e:
                print(f"⚠️ 모델 저장 실패: {e}")
                model_saved = False
                save_path = "FAILED"

            results.append({
                'trial': trial + 1,
                'learning_rate': lr,
                'weight_decay': wd,
                'warmup_steps': warmup,
                'epochs': epochs,
                # Logged so the winning configuration can be reproduced.
                'batch_size': batch_size,
                'sentence_accuracy': sentence_acc,
                'full_order_accuracy': full_order_acc,
                'eval_loss': eval_loss,
                'model_saved': model_saved,
                'save_path': save_path
            })
            # Persist after every trial so an interrupted run can resume.
            pd.DataFrame(results).to_csv(results_path, index=False)

            # Remove intermediate checkpoints; only best_model is kept.
            for subdir in os.listdir(output_dir):
                if subdir.startswith("checkpoint"):
                    shutil.rmtree(os.path.join(output_dir, subdir), ignore_errors=True)

            print(f"✅ Trial {trial+1} 완료 | 저장 경로: {save_path}")

        except Exception as e:
            # Best effort: report the failure and stop the search, keeping what is logged.
            print(f"⛔ Trial {trial+1} 중 오류 발생: {e}")
            break

    if not results:
        # Without any row there is no 'full_order_accuracy' column to sort by.
        print("\n⚠️ No completed trial to rank.")
        return pd.DataFrame()

    print("\n🏆 상위 Trial:")
    top_trials = pd.DataFrame(results).sort_values(by="full_order_accuracy", ascending=False).head(1)
    print(top_trials)
    return top_trials

In [None]:
# Load the tuning log and keep only the trials whose model was actually saved.
df = pd.read_csv("./global_results/tuning_log.csv")
df = df.loc[df["model_saved"] == True]

# Rank by full_order_accuracy when at least one score exists; otherwise fall
# back to the earliest trial number.
has_scores = "full_order_accuracy" in df.columns and df["full_order_accuracy"].notna().any()
if has_scores:
    sort_column, sort_ascending = "full_order_accuracy", False
else:
    sort_column, sort_ascending = "trial", True
top_trial = df.sort_values(sort_column, ascending=sort_ascending).iloc[0]

# Directory of the selected trial's saved model.
best_model_path = top_trial["save_path"]
print(f"🏆 선택된 Best Model 경로: {best_model_path}")


In [None]:
# Check that the selected model directory exists by listing its contents.
print(os.listdir(best_model_path))

In [None]:
# Load the selected model for inference.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GlobalOrderModel("klue/roberta-large")

# Read the weights from the safetensors file in the selected trial directory.
state_dict = load_file(f"{best_model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

In [None]:
# Copy the selected trial's model directory to a fixed location;
# dirs_exist_ok=True lets the copy run again over an existing target.
final_model_dir = "./global_results/best_model_custom"
shutil.copytree(best_model_path, final_model_dir, dirs_exist_ok=True)
print(f"📦 최종 best model 저장됨: {final_model_dir}")


# 5. 추론

In [None]:
# Rebuild the training set from the complete labelled frame (no validation hold-out).
train_labels = make_labels(train_path)
train_dataset = GlobalOrderDataset(train_path, tokenizer, labels=train_labels)

# Reuse the hyperparameters of the selected trial. Values read from the pandas
# row are NumPy scalars, so they are cast to plain Python numbers.
# NOTE(review): the batch size is fixed at 16 here, independent of the trial.
args = TrainingArguments(
    output_dir="./global_results/best_final",
    learning_rate=float(top_trial['learning_rate']),
    weight_decay=float(top_trial['weight_decay']),
    warmup_steps=int(top_trial['warmup_steps']),
    per_device_train_batch_size=16,
    num_train_epochs=int(top_trial['epochs']),
    logging_dir='./retrain_logs',
    save_strategy="no",  # no automatic checkpoints; the model is saved manually below
    # Same keyword the other TrainingArguments in this notebook use
    # (`evaluation_strategy` is the legacy spelling of this argument).
    eval_strategy="no",
    report_to='none',
    fp16=True,
    optim="adamw_torch_fused"
)

trainer = Trainer(
    model=GlobalOrderModel("klue/roberta-large"),
    args=args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# Retrain on all labelled data.
trainer.train()

# Save the final model together with its tokenizer.
trainer.save_model("./global_results/final_model")
tokenizer.save_pretrained("./global_results/final_model")


In [None]:
# ✅ 모델 클래스 직접 정의 (model.py 없이도 OK)
class GlobalOrderModel(nn.Module):
    """Inference-side definition of the ordering model.

    The checkpoints loaded later in this notebook are read with a strict
    ``load_state_dict``, so this class must expose exactly the parameter names
    and shapes of the architecture they were trained with: the encoder plus the
    Linear-LayerNorm-ReLU-Dropout (x2) + Linear head, and the same pooling.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='klue/roberta-large'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Layer positions define the state_dict keys (classifier.0 ... classifier.8).
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 4 * 4)  # 4 sentences x 4 position classes
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``{"logits": (batch, 4, 4)}``, preceded by ``"loss"`` when labels are given."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Same pooling as during training: average of the first-token vector
        # and the mean of all remaining token vectors.
        cls_token = outputs.last_hidden_state[:, 0]
        last_hidden = outputs.last_hidden_state[:, 1:].mean(dim=1)
        pooled = (cls_token + last_hidden) / 2

        logits = self.classifier(pooled)
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits.view(-1, 4), labels.view(-1))
            return {"loss": loss, "logits": logits.view(-1, 4, 4)}
        else:
            return {"logits": logits.view(-1, 4, 4)}


In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory the selected trial's model was copied to earlier in this notebook.
# NOTE(review): the model retrained on all data is saved to ./global_results/final_model,
# but inference loads this directory instead — confirm which one is intended.
best_model_path = "./global_results/best_model_custom"

# load_state_dict is strict by default: the GlobalOrderModel class in scope must have
# the same parameter names and shapes as the model that produced this checkpoint.
model = GlobalOrderModel("klue/roberta-large")
state_dict = load_file(f"{best_model_path}/model.safetensors")
model.load_state_dict(state_dict)
model.to(device)
model.eval()


In [None]:
# ✅ 직접 정의한 Dataset 클래스
class GlobalOrderDataset(Dataset):
    """Torch dataset yielding one tokenized four-sentence sequence per row.

    Args:
        df: DataFrame with string columns ``sentence_0`` .. ``sentence_3``.
        tokenizer: Callable tokenizer returning a mapping of batched tensors.
        labels: Optional per-row label vectors; omitted for inference.
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, df, tokenizer, labels=None, max_length=256):
        sentence_columns = [f'sentence_{i}' for i in range(4)]
        self.sentences = df[sentence_columns].values
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize row ``idx``; attach ``'labels'`` only when labels were provided."""
        joined = ' [SEP] '.join(self.sentences[idx])
        text = f"[CLS] {joined} [SEP]"
        encoded = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop the size-1 batch dimension added by return_tensors='pt'.
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        if self.labels is None:
            return item
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


In [None]:
def _best_positions(logits):
    """Pick, per row, the sentence-to-position assignment with the highest total logit.

    Args:
        logits: Array of shape (batch, n, n), indexed [row, sentence, position].

    Returns:
        np.ndarray: (batch, n) array; entry ``s`` is the position assigned to
        sentence ``s``. Every row is a valid permutation of ``0..n-1``.
    """
    n_sent = logits.shape[1]
    candidates = np.array(list(permutations(range(n_sent))))   # (n!, n)
    rows = np.arange(n_sent)
    # scores[b, p] = sum over sentences s of logits[b, s, candidates[p, s]]
    scores = logits[:, rows, candidates].sum(axis=-1)
    return candidates[scores.argmax(axis=1)]


def predict(model, test_df, tokenizer, device, batch_size=32):
    """Predict the sentence order for every row of ``test_df``.

    An independent argmax per sentence can send two sentences to the same
    position, which yields an order with repeated and missing sentences.
    Instead, all orderings are scored jointly and the best valid permutation is
    chosen; whenever the per-sentence argmax is already a (tie-free)
    permutation, the result is the same as with the plain argmax.

    Args:
        model: Callable returning ``{'logits': (batch, 4, 4)}`` for
            ``(input_ids, attention_mask)``; must provide ``eval()``.
        test_df: Frame handed to ``GlobalOrderDataset`` without labels.
        tokenizer: Tokenizer handed to ``GlobalOrderDataset``.
        device: Device the input tensors are moved to.
        batch_size: Inference batch size.

    Returns:
        np.ndarray: (n_rows, 4) integer array; column ``k`` is the index of the
        sentence placed at position ``k`` (the ``answer_k`` format). Shape
        ``(0, 4)`` for an empty test set.
    """
    test_dataset = GlobalOrderDataset(test_df, tokenizer, labels=None)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    all_logits = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            logits = model(input_ids, attention_mask)['logits']
            # Cast to float32 so half-precision outputs convert cleanly to NumPy.
            all_logits.append(logits.float().cpu().numpy())

    if not all_logits:
        # np.concatenate raises on an empty list; return an empty, well-shaped result.
        return np.empty((0, 4), dtype=int)

    positions = _best_positions(np.concatenate(all_logits, axis=0))

    # positions[s] = position of sentence s; answers[k] = sentence at position k.
    # For a permutation, argsort is exactly this inverse mapping.
    return np.argsort(positions, axis=1)

def save_submission(test_df, answers, submission_path, output_path):
    """Fill the sample submission with predicted orders and write it as CSV.

    Args:
        test_df: Unused; kept for interface compatibility.
        answers: 2-D array whose column ``k`` provides the ``answer_k`` values.
        submission_path: Path of the sample submission CSV to read.
        output_path: Path the completed CSV is written to (without the index).
    """
    answer_columns = {f'answer_{k}': answers[:, k] for k in range(4)}
    filled = pd.read_csv(submission_path).assign(**answer_columns)
    filled.to_csv(output_path, index=False)

# 6. 예측 및 저장

In [None]:
# Run inference on the test frame.
answers = predict(
    model=model,
    test_df=test_df,
    tokenizer=tokenizer,
    device=device,
    batch_size=32
)

# Write the submission file. The sample submission is read from the same
# ../data directory it was loaded from at the top of the notebook.
save_submission(
    test_df=test_df,
    answers=answers,
    submission_path="../data/sample_submission.csv",
    output_path="./submission.csv"
)

print("✅ submission.csv 저장 완료")