In [10]:
# Setup (run once in a terminal before using this notebook):
#   pip install transformers datasets torch tensorflow accelerate tf-keras
# Afterwards, upgrade pip itself:
#   python.exe -m pip install --upgrade pip
#
# Training data used here (Kaggle); the latest version can be downloaded with:
#   import kagglehub
#   path = kagglehub.dataset_download("tobiasbueck/multilingual-customer-support-tickets")
#   print("Path to dataset files:", path)

# Read the raw CSV file into a dataset object.
from datasets import load_dataset

# NOTE(review): 'trainigsdaten' looks like a misspelling of 'trainingsdaten'; the
# path has to match the folder on disk, so it is intentionally left unchanged.
dataset = load_dataset(
    'csv',
    data_files='trainigsdaten/archive/dataset-tickets-german_normalized_50_5_2.csv',
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['subject', 'body', 'queue', 'priority', 'language'],
        num_rows: 13178
    })
})


In [11]:
# Load the DistilBERT tokenizer (used to tokenize the raw data) and a
# sequence-classification model via transformers.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): "distilbert-base-uncased" is an English, lower-casing checkpoint,
# while the CSV file name suggests German tickets - confirm this choice is intended.
modell_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(modell_name)
# NOTE(review): num_labels=2 is hard-coded here, but the training cell reloads the
# model with num_labels derived from the 'priority' column. This instance looks
# unused - confirm, and consider loading only the tokenizer in this cell.
model = AutoModelForSequenceClassification.from_pretrained(modell_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of tickets as "<subject> <body>" strings.

    Args:
        examples: Batch mapping with parallel "subject" and "body" sequences,
            as handed over by a batched ``map`` call.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on the combined
        texts with fixed-length padding and truncation.
    """
    def as_text(value):
        # A missing value (presumably an empty CSV cell) shows up as None; a plain
        # str(None) would inject the literal word "None" into the training text,
        # which never occurs in text typed in at inference time.
        return "" if value is None else str(value)

    combined_texts = [
        # Join only the parts that are present, so a missing subject or body
        # leaves no placeholder word and no dangling separator.
        " ".join(part for part in (as_text(subject), as_text(body)) if part)
        for subject, body in zip(examples["subject"], examples["body"])
    ]
    # NOTE(review): padding="max_length" is kept on purpose. The Trainer in the
    # training cell is created without a padding collator, so switching to dynamic
    # padding here would also require adding one there.
    return tokenizer(combined_texts, padding="max_length", truncation=True)

# Apply tokenize_function to the loaded dataset, one batch of rows per call.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

In [13]:
# Train the classifier on the tokenized data
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# --- Data preparation ---

# Guarded so the cell can be re-run: after the first run the 'priority' column has
# already been encoded, renamed to 'labels' and the text columns removed, and
# repeating those steps would fail on the missing columns.
if "labels" not in tokenized_datasets["train"].column_names:
    # 1. Convert the 'priority' column to integer class IDs.
    tokenized_datasets = tokenized_datasets.class_encode_column("priority")
    # 2. Rename it to 'labels' so the Trainer feeds it to the model as the target.
    tokenized_datasets = tokenized_datasets.rename_column("priority", "labels")
    # 3. Drop the columns that are no longer needed for training.
    tokenized_datasets = tokenized_datasets.remove_columns(['subject', 'body', 'queue', 'language'])

# Read the number of classes from the encoded label column.
label_feature = tokenized_datasets["train"].features["labels"]
num_labels = label_feature.num_classes
print(f"Found {num_labels} unique labels in the 'priority' column.")
# The trained model reports generic LABEL_<id> names, so record here which
# priority value each class ID stands for.
print(f"Label IDs: {dict(enumerate(label_feature.names))}")

# --- Model and Trainer setup ---

# Load tokenizer and model from the local copy, sizing the classification head
# to the number of classes found above.
modell_name = "distilbert-local"
tokenizer = AutoTokenizer.from_pretrained(modell_name)
model = AutoModelForSequenceClassification.from_pretrained(modell_name, num_labels=num_labels)

training_args = TrainingArguments(
    output_dir="./ergebnisse",
    eval_strategy="no",  # no evaluation split is used; only the training loss is logged
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

train_output = trainer.train()

# NOTE(review): the Trainer is created without the tokenizer, so it presumably
# writes only model files into its checkpoints. The inference cell loads the
# tokenizer from the final checkpoint directory, so store it there explicitly
# (harmless if the files already exist).
final_checkpoint = os.path.join(training_args.output_dir, f"checkpoint-{trainer.state.global_step}")
tokenizer.save_pretrained(final_checkpoint)

# Keep the training summary as the cell's displayed result.
train_output

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-local and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Found 5 unique labels in the 'priority' column.




Step,Training Loss
500,1.0895
1000,0.6002
1500,0.4959
2000,0.4324
2500,0.3646
3000,0.3216
3500,0.2708
4000,0.1915
4500,0.1586




TrainOutput(global_step=4944, training_loss=0.4132528752570785, metrics={'train_runtime': 40271.5891, 'train_samples_per_second': 0.982, 'train_steps_per_second': 0.123, 'total_flos': 5237246320035840.0, 'train_loss': 0.4132528752570785, 'epoch': 3.0})

In [1]:
# Build the chatbot: an interactive console loop around the trained classifier
import os
import json
from transformers import pipeline

# --- 1. Load the final trained model ---

# Path to the final trained model checkpoint
model_path = "./ergebnisse/checkpoint-4944"

print(f"Loading model from: {model_path}")

# Create the text-classification pipeline
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)


# --- 2. Load the label mapping to list the available categories ---

# Read the checkpoint's configuration to find the category names. JSON is read as
# UTF-8 explicitly so the result does not depend on the platform default encoding.
config_path = os.path.join(model_path, 'config.json')
with open(config_path, encoding="utf-8") as f:
    config = json.load(f)

# JSON object keys are strings; convert them back to integer class IDs.
id2label = {int(k): v for k, v in config['id2label'].items()}


# --- 3. Start the interactive session ---

print("\n✅ Interactive session with the classifier has started!")
print(f"The model will classify text into these categories: {list(id2label.values())}")
print("Type a sentence and press Enter, or type 'exit' to end.")
print("-" * 50)

while True:
    try:
        # Get input from the user; surrounding whitespace is irrelevant for both
        # the exit commands and the classification.
        user_input = input("You: ").strip()

        if not user_input:  # Skip empty input
            continue

        # Check for exit command
        if user_input.lower() in ["exit", "quit", "ende"]:
            print("Bot: Goodbye!")
            break

        # Make a prediction. truncation=True cuts over-long text down to the
        # model's maximum input length instead of letting the model call fail.
        prediction = classifier(user_input, truncation=True)[0]

        # The pipeline already reports the label name stored in the model config,
        # so use it directly. Parsing a 'LABEL_<n>' pattern would crash as soon as
        # a checkpoint carries real category names.
        predicted_label_name = prediction['label']
        confidence_score = prediction['score']

        # Display the result
        print(f"Bot: I classify that as '{predicted_label_name}' (Confidence: {confidence_score:.2%})")

    except (KeyboardInterrupt, EOFError):  # Handle Ctrl+C or Ctrl+D
        print("\nBot: Goodbye!")
        break

  from .autonotebook import tqdm as notebook_tqdm


Loading model from: ./ergebnisse/checkpoint-4944



Device set to use cpu



✅ Interactive session with the classifier has started!
The model will classify text into these categories: ['LABEL_0', 'LABEL_1', 'LABEL_2', 'LABEL_3', 'LABEL_4']
Type a sentence and press Enter, or type 'exit' to end.
--------------------------------------------------
Bot: I classify that as 'LABEL_4' (Confidence: 36.97%)

Bot: Goodbye!
