In [None]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
import torch.nn as nn
from google.colab import drive

In [None]:
# Step 1: mount Google Drive so the project folder is reachable from Colab.
print("Google Drive 마운트 중...")
drive.mount('/content/drive')

# Step 2: derive every input/output location from the project root on Drive.
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText'
train_csv_path = BASE_DIR + "/data/train.csv"              # training documents
test_csv_path = BASE_DIR + "/data/test.csv"                # paragraphs to score
model_save_path = BASE_DIR + "/data/my_best_model_8"       # weights + tokenizer directory
submission_path = BASE_DIR + "/data/submission_8.csv"      # prediction CSV to write

✅ Google Drive 마운트 중...
Mounted at /content/drive


In [None]:
# Hyperparameters read by train_subset_model below.
EPOCHS = 2  # number of training epochs; kept short
BATCH_SIZE = 64  # per-device training batch size; adjust to fit the GPU
LEARNING_RATE = 3e-5
SAMPLE_FRAC = 0.1  # fraction of the preprocessed paragraphs used for training (10%)

# Preprocessing: explode each training document into labelled paragraphs.
def preprocess_train_csv(train_csv_path):
    """Turn the document-level training CSV into one row per paragraph.

    The CSV is read with ``pd.read_csv`` and must contain the columns
    ``title``, ``full_text`` and ``generated``. Each ``full_text`` is split on
    blank lines (two consecutive newlines); every chunk is stripped and kept
    only when it is longer than 10 characters. A kept paragraph inherits the
    ``generated`` value of its document as ``label``.

    Rows whose ``full_text`` is not a string (for example an empty CSV cell,
    which pandas reads as NaN) are skipped instead of raising.

    Args:
        train_csv_path: Location of the training CSV, passed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame with the columns ``title``, ``paragraph_index``,
        ``paragraph_text`` and ``label``. ``paragraph_index`` is the position
        of the chunk in the original split, so it may have gaps where short
        chunks were dropped. The columns exist even when no paragraph passes
        the length filter.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    output_columns = ["title", "paragraph_index", "paragraph_text", "label"]
    df = pd.read_csv(train_csv_path)

    processed = []
    # Zipping the three columns avoids building a Series per row (iterrows).
    for title, full_text, label in zip(df["title"], df["full_text"], df["generated"]):
        if not isinstance(full_text, str):
            # Missing text: nothing to split, and float NaN has no .split().
            continue
        for idx, para in enumerate(full_text.split("\n\n")):
            para = para.strip()
            if len(para) > 10:
                processed.append({
                    "title": title,
                    "paragraph_index": idx,
                    "paragraph_text": para,
                    "label": label,
                })

    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(processed, columns=output_columns)

In [None]:
class RobertaRegressionModel(nn.Module):
    """Pretrained encoder followed by a single-logit head trained with BCE.

    The head reads the encoder's hidden state at sequence position 0, applies
    dropout and a linear layer, and produces one raw logit per example.
    Callers apply ``torch.sigmoid`` to turn the logit into a probability.

    The attribute names ``bert``, ``dropout`` and ``regressor`` determine the
    keys of ``state_dict()``; they must stay stable so saved weights reload.
    """

    def __init__(self, model_name="klue/roberta-base"):
        """Load the pretrained encoder and build the head.

        Args:
            model_name: Identifier or path passed to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute per-example logits and, when labels are given, the BCE loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional targets with one value per example. They are cast
                to the logits' dtype before the loss is computed.

        Returns:
            ``logits`` with one entry per example when ``labels`` is None,
            otherwise the tuple ``(loss, logits)``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = self.dropout(outputs.last_hidden_state[:, 0, :])
        # squeeze(-1) removes only the trailing size-1 feature dimension.
        # A bare squeeze() would also drop the batch dimension for a batch of
        # one, producing a 0-d tensor that no longer matches labels of shape
        # (1,) and making BCEWithLogitsLoss raise a size-mismatch error.
        logits = self.regressor(cls_output).squeeze(-1)
        if labels is None:
            return logits
        loss = nn.BCEWithLogitsLoss()(logits, labels.to(logits.dtype))
        return loss, logits

In [None]:
class ParagraphDataset(Dataset):
    """Map-style dataset that tokenizes one paragraph per item on access.

    Texts come from the ``paragraph_text`` column and targets from the
    ``label`` column of the DataFrame given at construction time.
    """

    def __init__(self, df, tokenizer, max_length=512):
        """Store the texts, labels, tokenizer and padding/truncation length."""
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df["paragraph_text"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Return the number of paragraphs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize paragraph ``idx`` and return its tensors plus ``labels``.

        Every tensor returned by the tokenizer loses its leading dimension via
        ``squeeze(0)``; the target is added as a float tensor under ``labels``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        item = {}
        for name, tensor in encoded.items():
            item[name] = tensor.squeeze(0)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


In [None]:
def train_subset_model(train_csv_path, model_save_path):
    """Fine-tune the classifier on a random sample of paragraphs and save it.

    The training CSV is exploded into paragraphs with
    ``preprocess_train_csv``, a ``SAMPLE_FRAC`` share of them is drawn with a
    fixed ``random_state`` of 42, and the model is trained with the Hugging
    Face ``Trainer`` using the module-level ``BATCH_SIZE``, ``EPOCHS`` and
    ``LEARNING_RATE``. Checkpointing is disabled; only the final weights are
    written.

    Args:
        train_csv_path: Location of the training CSV.
        model_save_path: Directory that receives ``pytorch_model.bin`` (the
            model's state dict) and the tokenizer files. Created if missing.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

    # Load everything, then keep a reproducible random sample of the paragraphs.
    paragraphs_df = preprocess_train_csv(train_csv_path)
    subset_df = paragraphs_df.sample(frac=SAMPLE_FRAC, random_state=42)
    subset_df = subset_df.reset_index(drop=True)
    print(f"Subset 샘플 수: {len(subset_df)}")

    subset_dataset = ParagraphDataset(subset_df, tokenizer)

    model = RobertaRegressionModel()
    model = model.to(device)

    trainer_config = dict(
        output_dir="./results",
        per_device_train_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        save_total_limit=1,
        save_strategy="no",
        report_to="none",
        logging_dir="./logs",
        logging_steps=50,
    )
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**trainer_config),
        train_dataset=subset_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    model.eval()

    # Persist the raw state dict next to the tokenizer files.
    weights_file = os.path.join(model_save_path, "pytorch_model.bin")
    os.makedirs(model_save_path, exist_ok=True)
    torch.save(model.state_dict(), weights_file)
    tokenizer.save_pretrained(model_save_path)

    print("Subset 학습 완료 및 모델 저장 완료!")

In [None]:
def inference_model(test_csv_path, model_save_path, submission_path):
    """Score every test paragraph with the saved model and write a submission.

    The tokenizer and ``pytorch_model.bin`` are loaded from
    ``model_save_path``. Each value of the ``paragraph_text`` column is
    tokenized on its own (truncated/padded to 512 tokens), passed through the
    model without gradients, and the sigmoid of the logit is stored as the
    prediction.

    Args:
        test_csv_path: Location of the test CSV; it must provide the
            ``paragraph_text`` and ``ID`` columns.
        model_save_path: Directory holding the tokenizer files and
            ``pytorch_model.bin``.
        submission_path: Destination CSV, written with the columns ``ID`` and
            ``generated`` and without the index.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_save_path)

    weights_file = os.path.join(model_save_path, "pytorch_model.bin")
    model = RobertaRegressionModel().to(device)
    state = torch.load(weights_file, map_location=device)
    model.load_state_dict(state)
    model.eval()

    test_df = pd.read_csv(test_csv_path)
    probabilities = []
    with torch.no_grad():
        for text in test_df["paragraph_text"]:
            encoded = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
            # The model's forward() takes no token_type_ids, so that entry is dropped.
            model_inputs = {}
            for name, tensor in encoded.items():
                if name == 'token_type_ids':
                    continue
                model_inputs[name] = tensor.to(device)
            result = model(**model_inputs)
            # forward() returns (loss, logits) only when labels are supplied.
            logit = result[1] if isinstance(result, tuple) else result
            probabilities.append(torch.sigmoid(logit).item())

    test_df["generated"] = probabilities
    test_df[["ID", "generated"]].to_csv(submission_path, index=False)
    print("제출 파일 생성 완료!")

In [None]:

# Step 1: train on the sampled subset and save the weights and tokenizer.
train_subset_model(
    train_csv_path=train_csv_path,
    model_save_path=model_save_path,
)

# Step 2: run inference on the test set and write the submission file.
inference_model(
    test_csv_path=test_csv_path,
    model_save_path=model_save_path,
    submission_path=submission_path,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

✅ Subset 샘플 수: 9717


config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
50,0.26
100,0.1523
150,0.1559
200,0.1436
250,0.1529
300,0.1214


✅ Subset 학습 완료 및 모델 저장 완료!


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ 제출 파일 생성 완료!
