# Verwendete Bibliotheken & Initialisierung

In [1]:
# If needed, install the dependencies once (%pip installs into the kernel's environment):
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# %pip install -U "transformers[torch]"

In [1]:
import pandas as pd
import numpy as np

# Huggingface Datasets & Transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Modellverwendung und Evaluation
import evaluate
import torch, random
import torch.nn.functional as F

# für Baseline-Modell
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Visualisierung
import plotly.express as px



In [2]:
# Never truncate column contents when displaying frames (ticket texts are long).
pd.options.display.max_colwidth = None

In [3]:
# Make runs reproducible: seed Python's, NumPy's and PyTorch's random generators.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    # Ending the cell on a loop (rather than a bare call) keeps the Generator
    # object returned by torch.manual_seed out of the cell output.
    seed_fn(seed)

In [4]:
# Load the tokenizer that matches the pretrained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_fn(batch):
    """Tokenize the ``ticket_text`` entries of ``batch``.

    Texts are truncated to at most 256 tokens (kept short to stay
    CPU-friendly) and are not padded here.

    Args:
        batch: Mapping with a ``"ticket_text"`` entry holding the texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these texts.
    """
    tokenizer_kwargs = {"truncation": True, "max_length": 256, "padding": False}
    return tokenizer(batch["ticket_text"], **tokenizer_kwargs)

In [5]:
# Load every evaluation metric once up front.
acc_metric = evaluate.load("accuracy")
prec_metric = evaluate.load("precision")
rec_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``. The last three are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    def score(metric, key, **extra):
        # Every metric is evaluated on the same predictions/references pair.
        return metric.compute(predictions=preds, references=labels, **extra)[key]

    return {
        "accuracy": score(acc_metric, "accuracy"),
        "precision": score(prec_metric, "precision", average="binary"),
        "recall": score(rec_metric, "recall", average="binary"),
        "f1": score(f1_metric, "f1", average="binary"),
    }

# Einlesen der Trainingsdaten

In [6]:
# Read the labelled tickets, report the row count and preview every 50th row.
tickets_csv = "data/tickets_large_with_labels.csv"
df = pd.read_csv(tickets_csv)
n_tickets = df.shape[0]
print(f"Anzahl Datenpunkte: {n_tickets}")
df.iloc[::50]

Anzahl Datenpunkte: 200


Unnamed: 0,ticket_id,timestamp,ticket_class,ticket_text
0,T000036,2025-10-01 03:23:01.351191808,sales enquiry,I want different settings for console and cable. Can the A9 save picture modes per input automatically?
50,T000054,2025-10-20 00:47:54.221043968,service issue,Picture comes through fine but there is no audio on my receiver. The TV reports eARC connected yet stays silent.
100,T000024,2025-10-30 19:56:17.222929408,sales enquiry,Our home is Apple centric. Does the A9 support AirPlay casting and can it be added to HomeKit scenes?
150,T000111,2025-11-16 19:31:44.205664000,sales enquiry,"Does the Arc 65 come in more than one color trim, or is it only available in a black finish?"


# Vorverarbeitung & Tokenisierung

In [7]:
# Binary integer target: 1 = "sales enquiry", 0 = everything else ("service issue").
df["label"] = (df["ticket_class"] == "sales enquiry").astype("int")

# Mapping between integer label ids and class names. The reverse mapping is
# derived from id2label so the two can never get out of sync.
id2label = {0: "service issue", 1: "sales enquiry"}
label2id = {name: idx for idx, name in id2label.items()}

# Illustrative tokenization only, to show what tokens and token ids look like.
# This reuses the `tokenizer` created from `checkpoint` instead of loading a
# second copy from a duplicated hard-coded model name.
df["ticket_text_tokens"] = df["ticket_text"].apply(tokenizer.tokenize)
df["ticket_text_tokenized"] = df["ticket_text"].apply(
    lambda text: tokenizer.encode(text, truncation=True)
)

df.iloc[::50]



Unnamed: 0,ticket_id,timestamp,ticket_class,ticket_text,label,ticket_text_tokens,ticket_text_tokenized
0,T000036,2025-10-01 03:23:01.351191808,sales enquiry,I want different settings for console and cable. Can the A9 save picture modes per input automatically?,1,"[i, want, different, settings, for, console, and, cable, ., can, the, a, ##9, save, picture, modes, per, input, automatically, ?]","[101, 1045, 2215, 2367, 10906, 2005, 10122, 1998, 5830, 1012, 2064, 1996, 1037, 2683, 3828, 3861, 11583, 2566, 7953, 8073, 1029, 102]"
50,T000054,2025-10-20 00:47:54.221043968,service issue,Picture comes through fine but there is no audio on my receiver. The TV reports eARC connected yet stays silent.,0,"[picture, comes, through, fine, but, there, is, no, audio, on, my, receiver, ., the, tv, reports, ear, ##c, connected, yet, stays, silent, .]","[101, 3861, 3310, 2083, 2986, 2021, 2045, 2003, 2053, 5746, 2006, 2026, 8393, 1012, 1996, 2694, 4311, 4540, 2278, 4198, 2664, 12237, 4333, 1012, 102]"
100,T000024,2025-10-30 19:56:17.222929408,sales enquiry,Our home is Apple centric. Does the A9 support AirPlay casting and can it be added to HomeKit scenes?,1,"[our, home, is, apple, cent, ##ric, ., does, the, a, ##9, support, airplay, casting, and, can, it, be, added, to, home, ##kit, scenes, ?]","[101, 2256, 2188, 2003, 6207, 9358, 7277, 1012, 2515, 1996, 1037, 2683, 2490, 15341, 9179, 1998, 2064, 2009, 2022, 2794, 2000, 2188, 23615, 5019, 1029, 102]"
150,T000111,2025-11-16 19:31:44.205664000,sales enquiry,"Does the Arc 65 come in more than one color trim, or is it only available in a black finish?",1,"[does, the, arc, 65, come, in, more, than, one, color, trim, ,, or, is, it, only, available, in, a, black, finish, ?]","[101, 2515, 1996, 8115, 3515, 2272, 1999, 2062, 2084, 2028, 3609, 12241, 1010, 2030, 2003, 2009, 2069, 2800, 1999, 1037, 2304, 3926, 1029, 102]"


# Trennung in Trainings- und Testdaten

In [8]:
# Stratified 80/20 split on the label so both parts keep the class balance.
model_columns = ["ticket_text", "label"]
train_df, test_df = train_test_split(
    df[model_columns],
    test_size=0.2,
    random_state=seed,
    stratify=df["label"],
)

In [9]:
# Wrap both frames as Hugging Face Datasets (easy to combine with the models
# used below); the pandas index is not carried over.
train_ds, test_ds = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, test_df)
)

In [10]:
# Tokenize the training and test texts for the model (the tokenization shown
# earlier was for illustration only). The raw text column is dropped afterwards.
map_kwargs = {"batched": True, "remove_columns": ["ticket_text"]}
train_tok = train_ds.map(tokenize_fn, **map_kwargs)
test_tok = test_ds.map(tokenize_fn, **map_kwargs)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

# Training
## Vorbereitung

In [11]:
# Load the pretrained model with a fresh two-class classification head.
# `checkpoint` is the same name the tokenizer was loaded from, so model and
# tokenizer cannot drift apart; the number of labels follows the label mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),  # the two classes 'sales enquiry' and 'service issue'
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration

# Data collator ("padding on the fly"):
# pads the sequences of each batch to a common length.
collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Training hyperparameters (chosen to be feasible on a CPU).
args = TrainingArguments(
    output_dir="./data/sales_service",
    learning_rate=2e-5,                # step size for gradient descent
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=3,                # how often the model "sees" the full training set
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=50,
    seed=seed,
    fp16=False,
    use_cpu=True,                  # force CPU training even if an accelerator is present
    dataloader_pin_memory=False,   # pinned host memory only helps host-to-GPU copies
)

# The Hugging Face Trainer ties together everything defined so far
# (arguments, metrics, train/test datasets, ...).
# NOTE(review): the test split is also the evaluation set that selects the best
# checkpoint (load_best_model_at_end), so metrics reported on it are optimistic.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

## Eigentliches Training starten

In [16]:
# Run the fine-tuning; the returned training summary is shown as the cell result.
train_result = trainer.train()
train_result

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.02148,1.0,1.0,1.0,1.0
2,0.010900,0.003903,1.0,1.0,1.0,1.0
3,0.003800,0.003734,1.0,1.0,1.0,1.0


TrainOutput(global_step=120, training_loss=0.006651169620454311, metrics={'train_runtime': 36.5638, 'train_samples_per_second': 13.128, 'train_steps_per_second': 3.282, 'total_flos': 3489688033344.0, 'train_loss': 0.006651169620454311, 'epoch': 3.0})

In [17]:
# Persist the fine-tuned model and its tokenizer into the same directory.
best_model_dir = "./data/sales_service/best"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

# To reload the saved model and tokenizer later:
# model = AutoModelForSequenceClassification.from_pretrained("./data/sales_service/best")
# tokenizer = AutoTokenizer.from_pretrained("./data/sales_service/best")

('./data/sales_service/best\\tokenizer_config.json',
 './data/sales_service/best\\special_tokens_map.json',
 './data/sales_service/best\\vocab.txt',
 './data/sales_service/best\\added_tokens.json',
 './data/sales_service/best\\tokenizer.json')

# Modell verwenden (Inferenz)

## Beispieltickets, die das Modell noch nie gesehen hat

In [15]:
example_texts = [
    "Please provide a quote for 20 units and delivery options.",
    "My device stopped working after the update. Need help!",
]

# Put the model into inference mode (disables dropout) so the predictions are
# deterministic regardless of what state training left it in.
model.eval()

# Tokenize and move the inputs to the device the model lives on; otherwise the
# forward pass fails with a device mismatch if the model is not on the CPU.
enc = tokenizer(example_texts, truncation=True, padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
    out = model(**enc)  # the actual inference happens here
    probs = F.softmax(out.logits, dim=-1).cpu()  # per-class probabilities, back on the CPU

preds = probs.argmax(dim=-1).tolist()  # predicted class id per text

print("=== Fine tuned model (BERT-based) ===\n")

for text, p, pr in zip(example_texts, preds, probs):
    print(f"Ticket text: '{text}'")
    print(f"\t\t→ Prediction: {id2label[p]}")
    for i, prediction_label in id2label.items():
        print(f"\t\t\tprobability '{prediction_label:<15}' : {pr[i]:.2f}")
    print()


=== Fine tuned model (BERT-based) ===

Ticket text: 'Please provide a quote for 20 units and delivery options.'
		→ Prediction: service issue
			probability 'service issue  ' : 0.58
			probability 'sales enquiry  ' : 0.42

Ticket text: 'My device stopped working after the update. Need help!'
		→ Prediction: service issue
			probability 'service issue  ' : 0.96
			probability 'sales enquiry  ' : 0.04



## Alle Tickets des gesamten Datensatzes (Trainings- und Testdaten) beispielhaft prognostizieren lassen

In [20]:
# Build a dataset from the full frame, tokenize it and run inference.
pred_ds = Dataset.from_pandas(df)
tokenized_pred = pred_ds.map(tokenize_fn, batched=True)
pred_output = trainer.predict(tokenized_pred)

# Turn the raw logits into class probabilities and pick the most likely class.
logits = torch.tensor(pred_output.predictions)
probs = F.softmax(logits, dim=-1).numpy()
labels = probs.argmax(axis=1)

# Store the predicted class name and both class probabilities on the frame.
df["bert_predicted_label"] = pd.Series(labels, index=df.index).map(id2label)
for class_id, column in enumerate(("bert_prob_service_issue", "bert_prob_sales_enquiry")):
    df[column] = probs[:, class_id]

df.iloc[::50]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Unnamed: 0,ticket_id,timestamp,ticket_class,ticket_text,label,ticket_text_tokens,ticket_text_tokenized,bert_predicted_label,bert_prob_service_issue,bert_prob_sales_enquiry
0,T000036,2025-10-01 03:23:01.351191808,sales enquiry,I want different settings for console and cable. Can the A9 save picture modes per input automatically?,1,"[i, want, different, settings, for, console, and, cable, ., can, the, a, ##9, save, picture, modes, per, input, automatically, ?]","[101, 1045, 2215, 2367, 10906, 2005, 10122, 1998, 5830, 1012, 2064, 1996, 1037, 2683, 3828, 3861, 11583, 2566, 7953, 8073, 1029, 102]",sales enquiry,0.013356,0.986643
50,T000054,2025-10-20 00:47:54.221043968,service issue,Picture comes through fine but there is no audio on my receiver. The TV reports eARC connected yet stays silent.,0,"[picture, comes, through, fine, but, there, is, no, audio, on, my, receiver, ., the, tv, reports, ear, ##c, connected, yet, stays, silent, .]","[101, 3861, 3310, 2083, 2986, 2021, 2045, 2003, 2053, 5746, 2006, 2026, 8393, 1012, 1996, 2694, 4311, 4540, 2278, 4198, 2664, 12237, 4333, 1012, 102]",service issue,0.982158,0.017842
100,T000024,2025-10-30 19:56:17.222929408,sales enquiry,Our home is Apple centric. Does the A9 support AirPlay casting and can it be added to HomeKit scenes?,1,"[our, home, is, apple, cent, ##ric, ., does, the, a, ##9, support, airplay, casting, and, can, it, be, added, to, home, ##kit, scenes, ?]","[101, 2256, 2188, 2003, 6207, 9358, 7277, 1012, 2515, 1996, 1037, 2683, 2490, 15341, 9179, 1998, 2064, 2009, 2022, 2794, 2000, 2188, 23615, 5019, 1029, 102]",sales enquiry,0.013746,0.986254
150,T000111,2025-11-16 19:31:44.205664000,sales enquiry,"Does the Arc 65 come in more than one color trim, or is it only available in a black finish?",1,"[does, the, arc, 65, come, in, more, than, one, color, trim, ,, or, is, it, only, available, in, a, black, finish, ?]","[101, 2515, 1996, 8115, 3515, 2272, 1999, 2062, 2084, 2028, 3609, 12241, 1010, 2030, 2003, 2009, 2069, 2800, 1999, 1037, 2304, 3926, 1029, 102]",sales enquiry,0.01401,0.98599


# Vergleich mit einem "traditionellen" Modell (Baseline)

## Modell trainieren

In [21]:
# Baseline: TF-IDF + logistic regression
df_baseline = df

text_col = "ticket_text"
X = df_baseline[text_col].astype(str)
y = df_baseline["label"].astype(int)

# Train/test split using the notebook-wide `seed` (not a separate literal), so
# the baseline split stays tied to the seed used for the fine-tuned model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y
)

# Pipeline: TF-IDF -> logistic regression
baseline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),          # unigrams + bigrams
        max_features=100_000,        # cap the vocabulary size (CPU-friendly)
        min_df=2,                    # drop terms that occur in fewer than 2 documents
    )),
    # n_jobs is omitted on purpose: it only parallelises over classes in
    # one-vs-rest mode and therefore has no effect on this binary problem.
    ("clf", LogisticRegression(
        max_iter=1000,
    ))
])

# Fit; assigning the result (fit returns the pipeline) keeps the cell output empty.
baseline = baseline.fit(X_train, y_train)


## Modell beispielhaft verwenden

In [22]:
example_texts_baseline = [
    "Please provide a quote for 20 units and delivery options.",
    "My device stopped working after the update. Need help!",
]

# Predicted class ids and per-class probabilities for the example tickets.
preds = baseline.predict(example_texts_baseline)
probs = baseline.predict_proba(example_texts_baseline)


print("\n=== Baseline (TF-IDF + Logistic Regression) ===")

for text, pred, pr in zip(example_texts_baseline, preds, probs):
    # Collect the report lines for one ticket and emit them in a single print.
    report = [
        f"Ticket text: '{text}'",
        f"\t\t→ Prediction: {id2label[pred]}",
    ]
    report.extend(
        f"\t\t\tprobability '{class_name:<15}' : {pr[class_id]:.2f}"
        for class_id, class_name in id2label.items()
    )
    print("\n".join(report))
    


=== Baseline (TF-IDF + Logistic Regression) ===
Ticket text: 'Please provide a quote for 20 units and delivery options.'
		→ Prediction: sales enquiry
			probability 'service issue  ' : 0.48
			probability 'sales enquiry  ' : 0.52
Ticket text: 'My device stopped working after the update. Need help!'
		→ Prediction: service issue
			probability 'service issue  ' : 0.69
			probability 'sales enquiry  ' : 0.31


## Vergleich von Transfer Learning mit dem Baseline-Modell auf dem Originaldatensatz

In [23]:
# Let the baseline predict every ticket in the full frame.
all_texts = df[text_col].astype(str)

preds = baseline.predict(all_texts)
probs = baseline.predict_proba(all_texts)

# Predicted class as text, plus the probability of each class.
df["baseline_predicted_label"] = pd.Series(preds, index=df.index).map(id2label)
for class_id, column in enumerate(("baseline_prob_service_issue", "baseline_prob_sales_enquiry")):
    df[column] = probs[:, class_id]

# df.iloc[::50]

# Visualisierung der Wahrscheinlichkeiten in der Prognose
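Das feinabgestimmte (fine-tuned) Modell ist sich bei seinen Vorhersagen deutlich sicherer als das Baseline-Modell (Wahrscheinlichkeiten näher an 0 bzw. 1).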

In [24]:
# Legend labels, defined once so the data columns and the colour map cannot diverge.
baseline_name = "Komplett auf eigenen Daten<br>trainiertes Modell"
bert_name = "BERT + Transfer Learning<br>(fine-tuning auf eigenen Daten)"

# One column of "sales enquiry" probabilities per model, reshaped to long format.
probabilities_wide = pd.DataFrame({
    baseline_name: df["baseline_prob_sales_enquiry"],
    bert_name: df["bert_prob_sales_enquiry"],
})
df_long = probabilities_wide.melt(var_name="Modell", value_name="Wahrscheinlichkeit")

# Overlaid histograms of both models' predicted probabilities.
histogram_options = dict(
    x="Wahrscheinlichkeit",
    color="Modell",
    nbins=20,
    opacity=0.6,
    barmode="overlay",
    title="Verteilung der Vorhersagewahrscheinlichkeiten 'Traditionelles ML' vs. 'Transfer Learning mit BERT'",
    color_discrete_map={baseline_name: "grey", bert_name: "#08300d"},
    height=600,
)
fig = px.histogram(df_long, **histogram_options)

layout_options = dict(
    xaxis_title="Wahrscheinlichkeit (0=service issue, 1=sales enquiry)",
    yaxis_title="Anzahl Tickets",
    legend_title_text="Modell",
    template="plotly_white",
)
fig.update_layout(**layout_options)

fig.show()
