In [None]:
import transformers
import datasets
import pandas as pd
import numpy as np
from datasets import Dataset
import os

In [None]:
# Base checkpoint: distilroberta-base weights read from a Kaggle input dataset.
# Source: https://www.kaggle.com/datasets/mustafakeser4/distilroberta-base
model_checkpoint = "/kaggle/input/distilroberta-base/distilroberta-base"  # base model

In [None]:

# Load the full labelled dataset; later cells read its `label` and `text` columns.
train0 = pd.read_csv('gendata.csv')

In [None]:
# Class balance of the full dataset (number of rows per label value).
train0["label"].value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold

# Build a 10-fold stratified splitter (stratified on `label`) and keep only the
# first fold it yields: `tr` indexes the training rows, `val` the held-out rows.
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
i, (tr, val) = next(enumerate(sk.split(train0, train0.label)))
train, valid = train0.iloc[tr], train0.iloc[val]

In [None]:
# Normalise the `text` column of BOTH splits the same way:
#   1. missing values become empty strings, so the string operation below can
#      never hit a float NaN (previously only `train` was filled, and a single
#      NaN in `valid` raised AttributeError on `.strip`);
#   2. leading/trailing newlines are removed.
# `assign` builds new frames instead of writing into the slices taken from
# `train0`, which avoids pandas' SettingWithCopyWarning and keeps this cell
# safe to re-run.
train = train.assign(text=train["text"].fillna("").str.strip("\n"))
valid = valid.assign(text=valid["text"].fillna("").str.strip("\n"))

In [None]:
# Wrap each pandas split in a Hugging Face `Dataset` so it can be tokenised with `.map`.
ds_train, ds_valid = (Dataset.from_pandas(split) for split in (train, valid))

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer from the same path as the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; only
            its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, with truncation and padding
        enabled and ``max_length`` set to 128.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=128)
    return encoded


# Tokenize both splits batch-wise; the original columns are kept next to the
# new tokenizer fields.
ds_train_enc, ds_valid_enc = (
    split.map(preprocess_function, batched=True) for split in (ds_train, ds_valid)
)

In [None]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed the Python, NumPy and PyTorch RNGs *before* the model is built, so that
# any weights initialised at load time are the same on every
# Restart & Run All. 42 matches the random_state used for the fold split.
set_seed(42)

num_labels = 2  # binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# Use the first GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Only the model is moved here; no data is transferred in this cell.
model.to(device);

# Stop training once the monitored metric has gone 2 evaluations without improving.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Upper bound on the number of epochs; early stopping may end the run sooner.
num_train_epochs=16.0

In [None]:
metric_name = "roc_auc"          # key returned by compute_metrics
model_name = "distilroberta"     # alternative: "deberta-large"
batch_size = 2

args = TrainingArguments(
    output_dir=f"{model_name}-finetuned_v5",
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    # --- batching: 2 samples per device x 8 accumulation steps
    #     = 16 samples per optimizer update on each device ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    # --- evaluation / checkpointing: both happen once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # --- logging ---
    report_to='none',
)

In [None]:
from sklearn.metrics import roc_auc_score

def compute_metrics(eval_pred):
    """Compute ROC AUC for the positive class from raw model logits.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            ``[:, 1]`` after a softmax over the last axis, so it is expected
            to be 2-D with at least two columns (assumed ``(n_samples, 2)``
            for this binary task -- confirm against the model head).

    Returns:
        dict: ``{"roc_auc": <float>}``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=np.float64)
    # Softmax is invariant to subtracting a per-row constant. Removing the row
    # maximum keeps every exponent <= 0, so np.exp cannot overflow to inf and
    # turn the probabilities into NaN on large-magnitude logits.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)
    # A 1-D score for the positive class is the binary form of roc_auc_score;
    # the multiclass-only `multi_class` option does not apply here.
    auc = roc_auc_score(labels, probs[:, 1])
    return {"roc_auc": auc}

In [None]:
# Assemble the Trainer: model + training arguments, the tokenized splits,
# the tokenizer, the ROC AUC metric function and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train_enc,
    eval_dataset=ds_valid_enc,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()