In [15]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [16]:
class BiasDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes English/German sentence pairs.

    Every row of the wrapped dataframe must provide the ``"english"``,
    ``"german"`` and ``"label"`` columns. The two sentences are joined into a
    single string around a literal ``" [SEP] "`` marker and handed to the
    tokenizer as one text.
    """

    def __init__(self, dataframe, tokenizer):
        """Keep references to the source dataframe and the tokenizer callable."""
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``"labels"`` tensor."""
        # Positional lookup, done once and reused for all three columns.
        row = self.data.iloc[idx]
        source_text = row["english"]
        target_text = row["german"]
        target_class = int(row["label"])

        joined = " [SEP] ".join((source_text, target_text))
        encoded = self.tokenizer(
            joined,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading axis when it has size 1 (the tokenizer
        # is expected to return a batch of one for a single input string).
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = tensor.squeeze(0)
        sample["labels"] = torch.tensor(target_class)
        return sample

In [17]:
def evaluate_model(model, data_loader, device):
    """Score ``model`` on every batch produced by ``data_loader``.

    Args:
        model: callable whose output exposes a ``logits`` attribute; it is
            switched to eval mode before scoring.
        data_loader: iterable of dict batches containing a ``'labels'`` entry
            plus the keyword inputs forwarded to ``model``.
        device: device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        use scikit-learn's ``average='binary'`` setting.
    """
    model.eval()
    predicted, expected = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            targets = batch['labels'].to(device)

            # Everything except the labels is passed to the model as keywords.
            features = {}
            for name, tensor in batch.items():
                if name != 'labels':
                    features[name] = tensor.to(device)

            scores = model(**features).logits
            winners = scores.argmax(dim=1)

            predicted += winners.cpu().tolist()
            expected += targets.cpu().tolist()

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        expected, predicted, average='binary'
    )
    return accuracy, precision, recall, f1

In [18]:
def main(data_path="datasets/dataset.csv", model_dir="./model_output", batch_size=4):
    """Evaluate a saved BERT classifier on a CSV of sentence pairs and print metrics.

    Args:
        data_path: CSV file read with ``pandas.read_csv``; its rows are wrapped
            in ``BiasDataset``.
        model_dir: directory passed to ``from_pretrained`` for both the
            tokenizer and the classification model.
        batch_size: number of samples per ``DataLoader`` batch. Raise it to
            speed up evaluation when memory allows.

    Calling ``main()`` with no arguments uses the same path, model directory
    and batch size that were previously hard-coded.

    NOTE(review): every row of ``data_path`` is evaluated. If that file also
    contains the training rows, the printed metrics are optimistic. Confirm
    that a held-out split is being used.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the dataset to evaluate on.
    df = pd.read_csv(data_path)

    # Load tokenizer and model from the saved folder.
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir)
    model.to(device)

    # Prepare the evaluation dataset and loader (no shuffling).
    val_dataset = BiasDataset(df, tokenizer)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Evaluate and report.
    accuracy, precision, recall, f1 = evaluate_model(model, val_loader, device)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [13]:
# Run the evaluation defined in main() above, which prints the metrics.
main()

KeyboardInterrupt: 

In [21]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the tokenizer and classifier from ./model_output and move the
# model onto the selected device (Module.to returns the module itself).
tokenizer = BertTokenizer.from_pretrained("./model_output")
model = BertForSequenceClassification.from_pretrained("./model_output").to(device)

# Switch to eval mode. eval() returns the module, so as the last expression
# of the cell it also makes the notebook echo the model's repr.
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [19]:
def predict_single(english, german, tokenizer, model, device):
    """Classify one English/German sentence pair.

    The two sentences are joined around a literal ``" [SEP] "`` marker,
    tokenized, moved to ``device`` and passed through ``model``.

    Returns:
        ``(pred, confidence)``: the index of the highest softmax probability
        and that probability as a Python float.
    """
    joined = " [SEP] ".join((english, german))
    encoded = tokenizer(
        joined,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    # Move every tokenizer output onto the model's device.
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        logits = model(**on_device).logits
        probabilities = torch.softmax(logits, dim=1)
        # argmax over the flattened tensor; with a single-row batch this is
        # the class index within row 0.
        best = torch.argmax(probabilities).item()
        score = probabilities[0][best].item()
    return best, score


In [None]:
# (English source, German translation) pairs used to probe the classifier.
# Several pairs differ only in the German noun form (Lehrer / Lehrerin /
# Lehrkraft) so their predictions can be compared side by side.
# "Mann" is a masculine noun, so the indefinite article is "ein"; the
# earlier "eine Mann" was a copy-paste slip from the "eine Frau" rows and
# added a grammar error unrelated to the gender contrast being probed.
examples = [
    ("The nurse is kind", "Die Krankenschwester ist freundlich"),
    ("The doctor is strong", "Der Arzt ist stark"),
    ("Hello, how are you?", "Hallo, wie geht es dir?"),
    ("The teacher is a woman", "Der Lehrer ist eine Frau"),
    ("The teacher is a woman", "Die Lehrerin ist eine Frau"),
    ("The teacher is a man", "Der Lehrer ist ein Mann"),
    ("The teacher is a man", "Die Lehrerin ist ein Mann"),
    ("The teacher is kind", "Der Lehrer ist freundlich"),
    ("The teacher is kind", "Die Lehrerin ist freundlich"),
    ("The teacher is kind", "Die Lehrkraft ist freundlich"),
]

# Class index 1 is reported as "Biased"; any other index as "Neutral".
for eng, ger in examples:
    pred, conf = predict_single(eng, ger, tokenizer, model, device)
    label_str = "Biased" if pred == 1 else "Neutral"
    print(f"English: {eng}")
    print(f"German: {ger}")
    print(f"Prediction: {label_str} (Confidence: {conf:.2f})")
    print("-" * 40)


English: The nurse is kind
German: Die Krankenschwester ist freundlich
Prediction: Biased (Confidence: 1.00)
----------------------------------------
English: The doctor is strong
German: Der Arzt ist stark
Prediction: Biased (Confidence: 1.00)
----------------------------------------
English: Hello, how are you?
German: Hallo, wie geht es dir?
Prediction: Neutral (Confidence: 0.88)
----------------------------------------
