In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModel,
    TrainingArguments, Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import GroupKFold
from google.colab import drive
from safetensors.torch import load_file



In [None]:
# Mount Google Drive into the Colab filesystem so the files under MyDrive are reachable.
print("Google Drive ÎßàÏö¥Ìä∏ Ï§ë...")
drive.mount('/content/drive')

# Project root on the mounted Drive; every path below is derived from it.
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText'
train_csv_path = f"{BASE_DIR}/data/train.csv"  # training CSV handed to train_model_with_folds
test_csv_path = f"{BASE_DIR}/data/test.csv"  # test CSV handed to predict_with_ensemble
# NOTE(review): model_save_path is only referenced by the commented-out run_pipeline cell;
# the active cells save to and load from ./final_model instead.
model_save_path = f"{BASE_DIR}/data/my_best_model_final"
submission_path = f"{BASE_DIR}/data/submission_final.csv"  # destination of the submission CSV

‚úÖ Google Drive ÎßàÏö¥Ìä∏ Ï§ë...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Hyperparameters read by train_model_with_folds when it builds TrainingArguments.
EPOCHS = 4  # num_train_epochs
BATCH_SIZE = 100  # per-device batch size, used for both training and evaluation
LEARNING_RATE = 2e-5
# Number of cross-validation folds (same value as the n_splits defaults of the fold functions).
N_SPLITS = 3


def preprocess_train_csv(train_csv_path):
    """Split every training document into paragraph-level rows.

    Reads the CSV at ``train_csv_path`` (columns ``title``, ``full_text`` and
    ``generated``), splits each ``full_text`` on blank lines (``"\\n\\n"``) and
    keeps the stripped paragraphs that are longer than 10 characters. Every
    kept paragraph inherits the ``generated`` label of its document.

    Rows whose ``full_text`` is not a string (e.g. an empty cell, which pandas
    reads as NaN) are skipped instead of raising.

    Args:
        train_csv_path: Path to the training CSV file.

    Returns:
        A DataFrame with the columns ``title``, ``paragraph_index``,
        ``paragraph_text`` and ``label``. The columns are present even when
        no paragraph qualifies, so downstream column lookups keep working.
    """
    columns = ["title", "paragraph_index", "paragraph_text", "label"]
    df = pd.read_csv(train_csv_path)
    processed = []
    # Column-wise iteration avoids building a Series per row as iterrows() does.
    for title, full_text, label in zip(df["title"], df["full_text"], df["generated"]):
        # An empty CSV cell comes back as NaN (a float) and has nothing to split.
        if not isinstance(full_text, str):
            continue
        for idx, para in enumerate(full_text.split("\n\n")):
            para = para.strip()
            # Very short fragments are dropped; idx still refers to the position
            # in the original split, so indices may have gaps.
            if len(para) > 10:
                processed.append({
                    "title": title,
                    "paragraph_index": idx,
                    "paragraph_text": para,
                    "label": label
                })
    return pd.DataFrame(processed, columns=columns)

class ParagraphDataset(Dataset):
    """Map-style dataset yielding one tokenized paragraph and its label per item.

    Args:
        df: DataFrame providing the ``paragraph_text`` and ``label`` columns.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; it must return a mapping of tensors with a leading
            batch dimension of 1.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Plain Python lists make per-index access cheap and index-agnostic.
        self.texts = df["paragraph_text"].to_list()
        self.labels = df["label"].to_list()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        sample = {}
        for name, tensor in encoded.items():
            # The model's forward() takes no token_type_ids, so they are dropped.
            if name == 'token_type_ids':
                continue
            # Remove the batch dimension added by return_tensors="pt".
            sample[name] = tensor.squeeze(0)
        # Float label, as required by the BCE-with-logits loss in the model.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


In [None]:
class RobertaRegressionModel(nn.Module):
    """Pretrained encoder with a small MLP head producing one logit per sequence.

    Despite the name, the head is trained with ``BCEWithLogitsLoss``, i.e. as a
    binary classifier; apply ``torch.sigmoid`` to the logits for probabilities.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name="klue/roberta-base"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)

        self.hidden = nn.Linear(self.bert.config.hidden_size, 256)
        self.act = nn.ReLU()
        self.out = nn.Linear(256, 1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head.

        Args:
            input_ids: Token ids, batch-first.
            attention_mask: Attention mask matching ``input_ids``.
            labels: Optional per-sequence targets in ``{0, 1}``.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``logits``.
            ``logits`` always has shape ``(batch,)``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Use the hidden state of the first token as the sequence representation.
        cls_output = self.dropout(outputs.last_hidden_state[:, 0, :])
        hidden_out = self.act(self.hidden(cls_output))
        # Squeeze only the trailing size-1 dimension: a bare squeeze() would also
        # drop the batch dimension when batch == 1, making the loss fail on a
        # shape mismatch against labels of shape (1,).
        logits = self.out(hidden_out).squeeze(-1)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss needs floating-point targets.
            loss = nn.BCEWithLogitsLoss()(logits, labels.float())
        return (loss, logits) if loss is not None else logits

In [None]:
def train_model_with_folds(train_csv_path, model_save_dir, n_splits=3):
    """Train one model per GroupKFold split and save each under ``model_save_dir``.

    The training CSV is expanded to paragraph rows with ``preprocess_train_csv``
    and split with ``GroupKFold`` grouped by ``title``, so paragraphs of one
    document never end up on both sides of a split. For every fold a fresh
    ``RobertaRegressionModel`` is trained with the Hugging Face ``Trainer`` and
    then written, together with the tokenizer, to ``<model_save_dir>/fold<k>``.

    Args:
        train_csv_path: Path to the training CSV.
        model_save_dir: Directory that receives one ``fold<k>`` sub-directory per fold.
        n_splits: Number of folds.

    Returns:
        None. The results are the files written to disk.
    """
    import gc  # local import: only needed for the per-fold cleanup below

    df = preprocess_train_csv(train_csv_path)
    tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
    group_kfold = GroupKFold(n_splits=n_splits)
    groups = df["title"]

    for fold, (train_idx, val_idx) in enumerate(group_kfold.split(df, df["label"], groups)):
        print(f"\nüöÄ Fold {fold} ÏãúÏûë")
        train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]

        train_dataset = ParagraphDataset(train_df, tokenizer)
        val_dataset = ParagraphDataset(val_df, tokenizer)

        model = RobertaRegressionModel()

        # NOTE(review): no evaluation or early-stopping arguments are set here, so
        # val_dataset is presumably never evaluated during training - confirm.
        training_args = TrainingArguments(
            output_dir=f"./results/fold{fold}",
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            num_train_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            logging_dir=f"./logs/fold{fold}",
            logging_steps=100,
            save_total_limit=1,
            report_to="none"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer
        )

        trainer.train()

        fold_dir = os.path.join(model_save_dir, f"fold{fold}")
        os.makedirs(fold_dir, exist_ok=True)
        trainer.save_model(fold_dir)
        tokenizer.save_pretrained(fold_dir)
        print(f" Fold {fold} Î™®Îç∏ Ï†ÄÏû• ÏôÑÎ£å")

        # Release this fold's model, optimizer state and trainer before the next
        # fold builds its own; otherwise both would be alive at the same time.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
@torch.no_grad()
def predict_with_ensemble(test_csv_path, model_save_dir, submission_path, n_splits=3):
    """Average the fold models' probabilities on the test set and write a submission.

    For each fold the tokenizer and the ``model.safetensors`` weights are
    loaded from ``<model_save_dir>/fold<k>``, every ``paragraph_text`` row of
    the test CSV is scored one at a time, and the sigmoid of the logit is kept.
    The per-fold probabilities are averaged and stored in a ``generated``
    column; the ``ID`` and ``generated`` columns are written to
    ``submission_path``.

    Args:
        test_csv_path: Path to the test CSV (needs ``ID`` and ``paragraph_text``).
        model_save_dir: Directory containing the ``fold<k>`` sub-directories.
        submission_path: Output CSV path.
        n_splits: Number of fold models to ensemble.
    """
    test_df = pd.read_csv(test_csv_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    per_fold_probs = []

    for fold in range(n_splits):
        print(f"üîç Fold {fold} Î™®Îç∏Î°ú Ï∂îÎ°† Ï§ë...")
        checkpoint_dir = os.path.join(model_save_dir, f"fold{fold}")
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
        model = RobertaRegressionModel().to(device)
        weights = load_file(os.path.join(checkpoint_dir, "model.safetensors"))
        model.load_state_dict(weights)
        model.eval()

        fold_probs = []
        for text in test_df["paragraph_text"]:
            encoded = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
            model_inputs = {}
            for name, tensor in encoded.items():
                # forward() does not accept token_type_ids.
                if name == 'token_type_ids':
                    continue
                model_inputs[name] = tensor.to(device)
            result = model(**model_inputs)
            # Without labels the model returns bare logits; a tuple would be (loss, logits).
            if isinstance(result, tuple):
                logit = result[1]
            else:
                logit = result
            fold_probs.append(torch.sigmoid(logit).item())

        per_fold_probs.append(fold_probs)

    # Average the per-fold probabilities row by row.
    stacked = torch.tensor(per_fold_probs)
    final_preds = stacked.mean(dim=0).tolist()
    test_df["generated"] = final_preds
    submission = test_df[["ID", "generated"]]
    submission.to_csv(submission_path, index=False)
    print("Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å!")


In [None]:
# Directory (relative to the current working directory) shared by training and
# inference, defined once so the two calls cannot drift apart.
fold_model_dir = "./final_model"

# Training: one model per fold, saved under fold_model_dir/fold<k>.
train_model_with_folds(train_csv_path, model_save_dir=fold_model_dir, n_splits=N_SPLITS)

# Inference: ensemble the saved fold models and write the submission CSV.
predict_with_ensemble(test_csv_path, model_save_dir=fold_model_dir, submission_path=submission_path, n_splits=N_SPLITS)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ Fold 0 ÏãúÏûë


  trainer = Trainer(


Step,Training Loss
100,0.2094
200,0.156
300,0.1468
400,0.1477
500,0.129
600,0.1185
700,0.103
800,0.1014
900,0.0908
1000,0.0852


‚úÖ Fold 0 Î™®Îç∏ Ï†ÄÏû• ÏôÑÎ£å

üöÄ Fold 1 ÏãúÏûë


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2171
200,0.1451
300,0.1416
400,0.129
500,0.1429
600,0.1292
700,0.1256
800,0.1039
900,0.0994
1000,0.0941


‚úÖ Fold 1 Î™®Îç∏ Ï†ÄÏû• ÏôÑÎ£å

üöÄ Fold 2 ÏãúÏûë


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.2002
200,0.1458
300,0.1392
400,0.1323
500,0.1275
600,0.1205
700,0.1101
800,0.0845
900,0.0879
1000,0.0849


‚úÖ Fold 2 Î™®Îç∏ Ï†ÄÏû• ÏôÑÎ£å
üîç Fold 0 Î™®Îç∏Î°ú Ï∂îÎ°† Ï§ë...


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: './final_model/fold0/pytorch_model.bin'

In [None]:
# Inference only: ensemble the fold checkpoints already saved under ./final_model.
predict_with_ensemble(
    test_csv_path,
    model_save_dir="./final_model",
    submission_path=submission_path,
    n_splits=3,
)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîç Fold 0 Î™®Îç∏Î°ú Ï∂îÎ°† Ï§ë...
üîç Fold 1 Î™®Îç∏Î°ú Ï∂îÎ°† Ï§ë...


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîç Fold 2 Î™®Îç∏Î°ú Ï∂îÎ°† Ï§ë...


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å!


In [None]:
# # 7. Pipeline function

# def run_pipeline(train_csv_path, test_csv_path, model_save_path, submission_path):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

#     train_df = preprocess_train_csv(train_csv_path)
#     train_dataset = ParagraphDataset(train_df, tokenizer)

#     model = RobertaRegressionModel().to(device)

#     training_args = TrainingArguments(
#         output_dir="./results",
#         per_device_train_batch_size=BATCH_SIZE,
#         num_train_epochs=EPOCHS,
#         learning_rate=LEARNING_RATE,
#         save_total_limit=1,
#         save_strategy="no",
#         report_to="none",
#         logging_dir="./logs",
#         logging_steps=100,
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset,
#         tokenizer=tokenizer
#     )

#     trainer.train()
#     model.eval()

#     # Save the model
#     os.makedirs(model_save_path, exist_ok=True)
#     torch.save(model.state_dict(), os.path.join(model_save_path, "pytorch_model.bin"))
#     tokenizer.save_pretrained(model_save_path)

#     # Predict
#     test_df = pd.read_csv(test_csv_path)
#     preds = []
#     for text in test_df["paragraph_text"]:
#         inputs = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
#         inputs = {k: v.to(device) for k, v in inputs.items()}
#         with torch.no_grad():
#             output = model(**inputs)
#             pred = output[1].item() if isinstance(output, tuple) else output.item()
#             preds.append(pred)

#     # Save the submission file
#     test_df["generated"] = preds
#     test_df[["ID", "generated"]].to_csv(submission_path, index=False)
#     print("Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å!")

# # 8. Run
# run_pipeline(train_csv_path, test_csv_path, model_save_path, submission_path)

