In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader, Dataset
import os
os.environ['WANDB_DISABLED'] = 'true'
# Load the training data
df = pd.read_csv('/kaggle/input/drcat-dataset/train_v2_drcat_02.csv')
texts = df['text'].tolist()
labels = df['label'].tolist()

# Hold out 20% of the essays for validation.
# A fixed random_state makes the split (and therefore the reported validation
# metrics) reproducible across kernel restarts; stratifying on the labels keeps
# the class ratio identical in the training and validation parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# truncation=True cuts over-long essays at the tokenizer's maximum length;
# padding=True pads every sequence of a call to the longest sequence in that call.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.
            Presumably logits are shaped (n_samples, n_classes) -- the
            predicted class is the argmax over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        The last three use ``average='binary'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predicted)}
    # The remaining scores share the same call signature, so compute them in one loop.
    binary_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = scorer(labels, predicted, average='binary')
    return metrics

# Define the dataset class
class EssaysDataset(Dataset):
    """Torch dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping (field name -> per-example values) and the labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Load the pretrained checkpoint with a sequence-classification head
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Wrap the tokenized splits as torch datasets
val_dataset = EssaysDataset(val_encodings, val_labels)
train_dataset = EssaysDataset(train_encodings, train_labels)

# Define the training arguments
training_args = TrainingArguments(
    # output and logging locations
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # run evaluation once per epoch
    evaluation_strategy='epoch',
)

# Initialize the trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Start training
trainer.train()


In [None]:
# Evaluate the trainer on its eval_dataset (val_dataset); the result is shown as the cell output.
trainer.evaluate()

In [None]:
from accelerate import Accelerator

# Read the test essays and tokenize them with the tokenizer used for the training data
test_df = pd.read_csv('/kaggle/input/testdatensatz/test.csv')
test_texts = test_df['text'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Accelerator instance with default settings
accelerator = Accelerator()

class TestDataset(Dataset):
    """Torch dataset over tokenizer encodings without labels (used for inference)."""

    def __init__(self, encodings):
        """Store the encodings mapping (field name -> per-example values)."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict holding one tensor per encoding field."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` field."""
        # Key lookup instead of attribute access: works for plain dicts as well as
        # for mapping objects that also expose fields as attributes, and matches
        # __getitem__, which only relies on the mapping interface.
        return len(self.encodings['input_ids'])

test_dataset = TestDataset(test_encodings)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Run inference
model.eval()
predictions = []
# Batched inference: the test encodings were all padded to one common length,
# so the default collate function can stack them into batches directly.
test_loader = DataLoader(test_dataset, batch_size=16)
with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        logits = model(**inputs).logits
        # The classification head emits one logit per class, so the score for
        # label 1 is the softmax over the class axis. An element-wise sigmoid
        # would give two unrelated numbers per essay and put a nested list into
        # the DataFrame column instead of a single float.
        # NOTE(review): assumes label 1 is the "generated" class -- confirm
        # against the training data.
        positive_prob = torch.softmax(logits, dim=-1)[:, 1]
        predictions.extend(positive_prob.cpu().tolist())

test_df['generated'] = predictions
# Replace each score by its rank (ties share the lowest rank). This keeps the
# ordering of the scores; presumably intended for a rank-based metric -- confirm.
test_df["generated"] = test_df["generated"].rank(method='min')
test_df.to_csv('test_predictions.csv', index=False)
print(test_df.head())
