In [4]:
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-german-cased")
# model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-german-cased")

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer that ships with the German cased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-german-cased")
# Load the same checkpoint with a sequence-classification head sized for two labels (num_labels=2).
# The loader warning captured in this cell's output lists classifier.weight / classifier.bias as
# newly initialized, i.e. the head must be trained before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and the sequence-classification model.
# The classification head (classifier.weight / classifier.bias) is not contained in the
# checkpoint and is newly (randomly) initialized on every load, as the loader warning
# states. Seed torch first so the logits printed below are reproducible after a
# kernel restart.
torch.manual_seed(42)
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-german-cased")
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-german-cased", num_labels=2)

# Example text
text = "Verkehrsdaten in Berlin"

# Tokenize the text into PyTorch tensors
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Run a forward pass without gradient tracking.
# NOTE: the classification head is untrained, so the resulting logits (and the
# predicted class) are arbitrary and carry no meaning until the model is fine-tuned.
with torch.no_grad():
    outputs = model(**inputs)

# Raw, unnormalized class scores (logits)
logits = outputs.logits
print(f"Logits: {logits}")

# Predicted class = index of the largest logit
prediction = torch.argmax(logits, dim=-1)
print(f"Vorhersage: {prediction.item()}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[-0.2487,  0.1334]])
Vorhersage: 1


In [8]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load tokenizer and model.
# NOTE: this rebinds the notebook-global names `tokenizer` and `model`; from here on `model`
# is the object returned by AutoModel (not the sequence-classification model loaded in an
# earlier cell). get_embedding() further down reads `last_hidden_state` from its outputs.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-german-cased")
model = AutoModel.from_pretrained("google-bert/bert-base-german-cased")


In [33]:
# Candidate terms that are compared against the query text.
# Order matters: later cells pair each embedding with its term by position.
mobilitaetsbegriffe = [
    # mobility-related vocabulary
    "Verkehr",
    "Transport",
    "Fahrzeug",
    "Straße",
    "Fahrplan",
    "Auto",
    "Bahn",
    "Mobilität",
    "Fahrrad",
    "Bus",
    "E-Mobilität",
    "ÖPNV",
    "Flughafen",
    "Lkw",
    "Velo",
    # presumably unrelated control words to sanity-check the scores — TODO confirm
    "Fabian",
    "XYZ",
    "Essen",
    "Mami",
]


In [34]:
def get_embedding(text, pooling="cls"):
    """Return an embedding of ``text`` computed by the notebook-global ``model``.

    The text is tokenized with the notebook-global ``tokenizer`` (truncated to
    at most 512 tokens) and passed through ``model`` without gradient tracking.

    Args:
        text: The input text to embed.
        pooling: How to reduce the per-token hidden states to one vector.
            ``"cls"`` (default, the original behaviour) takes the hidden state
            of the first token ([CLS]). ``"mean"`` averages the hidden states
            of all tokens, weighted by the attention mask so that padding
            positions do not contribute.

    Returns:
        A NumPy array holding the pooled embedding, with size-1 dimensions
        (e.g. the batch dimension for a single text) squeezed away.

    Raises:
        ValueError: If ``pooling`` is neither ``"cls"`` nor ``"mean"``.
    """
    if pooling not in ("cls", "mean"):
        raise ValueError(f"Unknown pooling strategy: {pooling!r} (expected 'cls' or 'mean')")

    # Tokenize and convert to PyTorch tensors
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Forward pass through the model; no gradients are needed for inference
    with torch.no_grad():
        outputs = model(**inputs)

    # last_hidden_state has shape (batch_size, sequence_length, hidden_size)
    hidden = outputs.last_hidden_state

    if pooling == "cls":
        # First token ([CLS]) as the representation of the whole text
        pooled = hidden[:, 0, :]
    else:
        # Masked mean: zero out padding positions, then divide by the number of
        # real tokens (clamped to avoid a division by zero on an all-zero mask).
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    return pooled.squeeze().numpy()

# Query term whose nearest mobility terms we want to rank
text = "Reiseverhalten"

# Embed the query term
text_embedding = get_embedding(text)

# Embed every candidate term, one call per term, in the order of `mobilitaetsbegriffe`
mobilitaets_embeddings = list(map(get_embedding, mobilitaetsbegriffe))


In [35]:
# Cosine similarity of the query embedding against every term embedding.
# The query is wrapped in a list so it is passed as a single-row 2-D input.
similarities = cosine_similarity([text_embedding], mobilitaets_embeddings)

# There is only one query row, so row 0 holds the score for every term
similarity_scores = similarities[0]
similarity_dict = {
    begriff: similarity_scores[idx]
    for idx, begriff in enumerate(mobilitaetsbegriffe)
}

# Rank the terms from most to least similar
sorted_similarity = sorted(
    similarity_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the terms together with their similarity, best match first
for begriff, score in sorted_similarity:
    print(f"{begriff}: {score:.4f}")


Mobilität: 0.8375
E-Mobilität: 0.8211
Verkehr: 0.7885
Auto: 0.7408
Fahrplan: 0.7294
Velo: 0.7192
Fahrrad: 0.7171
Transport: 0.7162
Lkw: 0.7160
Essen: 0.7158
Fahrzeug: 0.6914
ÖPNV: 0.6898
Bahn: 0.6739
XYZ: 0.6719
Bus: 0.6578
Flughafen: 0.6200
Straße: 0.6139
Fabian: 0.6051
Mami: 0.5899


## No difference found from training; BERT does not seem to be what we are looking for. Better libraries/models may exist for German word embeddings (e.g. word2vec).

## We may try a different approach using BERT; see the other files.