# In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt

# Load data
def _load_split(path):
    """Read a headerless two-column TSV (label, text) and drop incomplete rows.

    The label column is deliberately NOT forced to an integer dtype while
    reading: pandas cannot store a missing value in a plain integer column,
    so ``read_csv`` would raise on the first row with an empty label and the
    NaN filtering below would never get a chance to run.  Instead the rows
    with missing values are dropped first and the label is cast afterwards.

    Args:
        path: Path of the tab-separated file to read.

    Returns:
        A DataFrame with an integer ``label`` column and a string ``text``
        column, containing only the rows where both fields are present.
    """
    frame = pd.read_csv(path, sep='\t', header=None, names=['label', 'text'], dtype={'text': str})
    # Filter out rows with NaN values, then restore the integer label dtype
    return frame.dropna().astype({'label': int})


train_data = _load_split('train.tsv')
dev_data = _load_split('dev.tsv')

# Load tokenizer and model (both come from the same pretrained checkpoint;
# the classification head is configured for two labels)
checkpoint = "beomi/kcbert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenize data -- both splits share exactly the same tokenization settings
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_texts = train_data['text'].tolist()
dev_texts = dev_data['text'].tolist()
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
dev_encodings = tokenizer(dev_texts, **tokenize_kwargs)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its label column
train_labels = train_data['label'].tolist()
dev_labels = dev_data['label'].tolist()
train_dataset = Dataset(train_encodings, train_labels)
dev_dataset = Dataset(dev_encodings, dev_labels)

# Define compute metrics function
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.  Precision/recall/F1 use scikit-learn's ``'binary'``
        averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result layout is (precision, recall, f-score, support); support is unused.
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

# Training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # reloaded when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

# Train model
train_result = trainer.train()

# Evaluate model and report the resulting metrics
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")

# Save model, then tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('./kcbert-finetuned')

# Extract training metrics
metrics = trainer.state.log_history


def _metric_series(history, key):
    """Return ``(steps, values)`` for the log entries in *history* containing *key*.

    Every series gets its own step list, so the x and y sequences handed to
    ``plt.plot`` always have the same length.  Only the first entry logged at
    a given step is kept: an extra evaluation run after training is logged at
    the same step as the last in-training evaluation and would otherwise add
    a second, misleading point at the end of the curve.

    Args:
        history: Sequence of log dicts, each carrying a ``'step'`` entry.
        key: Name of the metric to extract.

    Returns:
        A pair of lists ``(steps, values)``; both are empty when *key* was
        never logged.
    """
    first_by_step = {}
    for entry in history:
        if key in entry:
            first_by_step.setdefault(entry['step'], entry[key])
    return list(first_by_step), list(first_by_step.values())


# Extract loss and accuracy values together with the steps they were logged at
train_steps, train_loss = _metric_series(metrics, 'loss')
eval_steps, eval_loss = _metric_series(metrics, 'eval_loss')
# Training-step log entries normally carry no 'accuracy' key (compute_metrics
# only runs during evaluation), so this series is usually empty.
train_accuracy_steps, train_accuracy = _metric_series(metrics, 'accuracy')
eval_accuracy_steps, eval_accuracy = _metric_series(metrics, 'eval_accuracy')

# Plotting loss
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_steps, train_loss, label='Training Loss')
plt.plot(eval_steps, eval_loss, label='Evaluation Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Evaluation Loss')
plt.legend()

# Plotting accuracy
plt.subplot(1, 2, 2)
# Plotting an empty accuracy list against the non-empty training steps would
# raise a shape-mismatch ValueError, so draw the curve only when data exists.
if train_accuracy:
    plt.plot(train_accuracy_steps, train_accuracy, label='Training Accuracy')
plt.plot(eval_accuracy_steps, eval_accuracy, label='Evaluation Accuracy')
plt.xlabel('Steps')
plt.ylabel('Accuracy')
plt.title('Training and Evaluation Accuracy')
plt.legend()

plt.show()