In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset

# Daten laden
df = pd.read_csv("/kaggle/input/drcat-dataset/train_v2_drcat_02.csv")

# Train-Test-Split
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['label'], test_size=0.2)



# Tokenizer laden
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def compute_metrics(pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')
    precision = precision_score(labels, predictions, average='binary')
    recall = recall_score(labels, predictions, average='binary')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Dataset-Klasse definieren
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item

    def __len__(self):
        return len(self.labels)

# Datasets erstellen
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)
# Modell laden
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# TrainingArguments definieren
training_args = TrainingArguments(
    output_dir='./results',          # Ausgabeverzeichnis
    num_train_epochs=3,              # Anzahl der Trainings-Epochen
    per_device_train_batch_size=8,   # Batch-Größe für Training
    per_device_eval_batch_size=16,   # Batch-Größe für Evaluation
    warmup_steps=500,                # Anzahl der Warmup-Schritte für Learning-Rate-Scheduler
    weight_decay=0.01,               # Weight Decay
    logging_dir='./logs',            # Verzeichnis für Logging
    logging_steps=10,
)

# Trainer initialisieren
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Training starten
trainer.train()


2024-04-12 07:16:14.548731: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 07:16:14.548867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 07:16:14.678219: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1183, in init
    wi.setup(kwargs)
  File "/opt/conda/lib/python3.10/site-packages/w

Error: An unexpected error occurred

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer.evaluate()


In [None]:
trainer.evaluate()

In [None]:
test_df = pd.read_csv('/kaggle/input/textdata/test.csv')
test_texts = test_df['text'].tolist()
# Tokenisierung der Testdaten
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Erstellen eines PyTorch-Dataset für die Testdaten
class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = TestDataset(test_encodings)

# Vorhersagen treffen
model.eval()
predictions = []
with torch.no_grad():
    for item in test_dataset:
        inputs = {k: v.unsqueeze(0) for k, v in item.items()}
        output = model(**inputs)
        logits = output.logits
        predicted_class_id = logits.argmax().item()
        predictions.append(predicted_class_id)

# Ergebnisse zuordnen
test_df['predicted'] = predictions

test_df.to_csv('test_predictions.csv', index=False)
# Zum Anzeigen der ersten paar Zeilen
print(test_df.head())
