In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import random


In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the train and test CSV files (paths are relative to the working directory).
train_cleaned_data = pd.read_csv("drive/MyDrive/Colab Notebooks/data/train_cleaned_rnn.csv")
test_cleaned_data = pd.read_csv("drive/MyDrive/Colab Notebooks/data/test_cleaned_rnn.csv")

# Pull the text column (model input) and the label column (target) out of
# each frame as numpy arrays.
X_train, y_train = (train_cleaned_data[column].values for column in ('tweet_cleaned', 'label'))
X_test, y_test = (test_cleaned_data[column].values for column in ('tweet_cleaned', 'label'))

In [None]:
# Initialise the tokenizer for the 'bert-base-uncased' checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the data and convert it to tensors
def encode_data(texts, labels, tokenizer, max_length=256):
    """Tokenize ``texts`` and bundle them with ``labels`` into a TensorDataset.

    Every sequence is padded *and truncated* to exactly ``max_length`` tokens,
    so the encoded batch is always rectangular and can be returned as tensors.

    Args:
        texts: Iterable of input strings (list, numpy array, pandas Series...).
            Missing entries (``None`` / NaN) are encoded as the empty string.
        labels: Array-like of labels, one per text.
        tokenizer: A Hugging Face tokenizer (called as ``tokenizer(texts, ...)``).
        max_length: Fixed sequence length after padding/truncation.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, label)`` triples.
    """
    # Hand the tokenizer a plain list of str. An empty CSV cell is read back
    # as NaN (a float), which the tokenizer would reject, so map it to "".
    texts = [
        "" if text is None or (isinstance(text, float) and text != text) else str(text)
        for text in texts
    ]
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,       # without this, over-long texts give ragged rows
        max_length=max_length,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, labels)

# Encode both splits with the same tokenizer
train_data = encode_data(X_train, y_train, tokenizer)
test_data = encode_data(X_test, y_test, tokenizer)

# Create the DataLoaders
batch_size = 16

# Training batches are drawn in a fresh random order on every pass.
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
# Evaluation iterates in dataset order: shuffling does not change the metrics,
# but it scrambles the returned predictions relative to the rows of the test
# frame and needlessly consumes global RNG state between training epochs.
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

# Set the seeds BEFORE building the model: from_pretrained() randomly
# initialises the new classification head, so seeding afterwards (as was done
# previously) leaves that initialisation non-reproducible between runs.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialise the model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # binary classification
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

# Set up the optimizer and the learning-rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay
# is pinned to 0.0 because that was the transformers default (PyTorch's
# default is 0.01), so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.0)
epochs = 5
# Linear decay of the learning rate to zero over all training steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs
)

# Training function
def train_model(dataloader, model, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is indexed as ``(input_ids, attention_mask, labels)``. For every
    batch the gradients are reset, the loss is back-propagated, the gradient
    norm is clipped to 1.0, and both the optimizer and the scheduler advance
    by one step.

    Args:
        dataloader: Sized iterable of tensor batches.
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keywords; element 0 of its output is the loss.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    for batch in dataloader:
        model.zero_grad()
        on_device = [tensor.to(device) for tensor in batch]
        outputs = model(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )
        batch_loss = outputs[0]
        running_loss += batch_loss.item()
        batch_loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    return running_loss / len(dataloader)

# Evaluation function
def evaluate(dataloader, model, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Each batch is indexed as ``(input_ids, attention_mask, labels)``. Element 0
    of the model output is read as the loss and element 1 as the logits.

    Args:
        dataloader: Sized iterable of tensor batches.
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keywords.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(mean_loss, predictions, true_labels)``: the per-batch losses
        summed and divided by ``len(dataloader)``, plus the logits and labels
        of all batches concatenated along axis 0 as numpy arrays.
    """
    model.eval()
    running_loss = 0
    logit_chunks = []
    label_chunks = []
    for batch in dataloader:
        on_device = [tensor.to(device) for tensor in batch]
        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=on_device[2],
            )
        running_loss += outputs[0].item()
        # Move results to host memory so they can be stacked with numpy.
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(on_device[2].detach().cpu().numpy())
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)
    return running_loss / len(dataloader), all_logits, all_labels

# Training and evaluation
for epoch in range(1, epochs + 1):
    print(f"Epoch {epoch}")

    # One optimisation pass over the training batches.
    train_loss = train_model(train_dataloader, model, optimizer, scheduler, device)
    print(f"Training loss: {train_loss}")

    # NOTE(review): the "validation" numbers below are computed on
    # test_dataloader, i.e. on the test split -- no separate validation set
    # is created in this cell.
    val_loss, predictions, true_labels = evaluate(test_dataloader, model, device)

    # Highest-scoring logit per row is the predicted class.
    preds_flat = predictions.argmax(axis=1).ravel()
    labels_flat = true_labels.ravel()
    f1 = f1_score(labels_flat, preds_flat, average='weighted')

    print(f"Validation loss: {val_loss}")
    print(f"F1 Score: {f1}")
