In [1]:
from datasets import load_dataset

# Load the bkonkle/snips-joint-intent dataset; the result is a DatasetDict
# holding a "train" and a "test" split.
dataset = load_dataset("bkonkle/snips-joint-intent")

In [2]:
# Show each split with its feature names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'intent', 'slots'],
        num_rows: 13084
    })
    test: Dataset({
        features: ['input', 'intent', 'slots'],
        num_rows: 700
    })
})

In [3]:
# Inspect one training example: the raw utterance ('input'), its intent label,
# and the space-separated BIO slot tags (one tag per whitespace token).
dataset["train"][0]

{'input': 'listen to westbam alumb allergic on google music',
 'intent': 'PlayMusic',
 'slots': 'O O B-artist O B-album O B-service I-service'}

In [4]:
train_data = dataset["train"]

# Gather the gold labels in a single pass over the training split:
#   true_intents -> one intent string per example
#   true_slots   -> one list of BIO tags per example (the 'slots' string split on whitespace)
true_intents = []
true_slots = []
for example in train_data:
    true_intents.append(example["intent"])
    true_slots.append(example["slots"].split())

In [5]:
import random

# Simulation settings. A fixed seed makes the simulated predictions (and every
# metric computed from them) identical after Restart & Run All.
SEED = 42
INTENT_ERROR_RATE = 0.2  # probability that a gold intent is replaced by "OtherIntent"
SLOT_DROP_RATE = 0.3     # probability that a gold tag is overwritten with "O"

# Dedicated generator, so seeding here does not disturb the global `random` state.
rng = random.Random(SEED)

# Intent prediction: each gold intent is kept with probability ~0.8, otherwise
# it is swapped for a placeholder class that never occurs in the gold labels.
pred_intents = [
    intent if rng.random() > INTENT_ERROR_RATE else "OtherIntent"
    for intent in true_intents
]

# Slot prediction: each gold tag is kept with probability ~0.7, otherwise it is
# set to "O". Gold "O" tags stay correct either way, so only slot-bearing
# tokens can become wrong and the simulation never invents a slot tag.
pred_slots = [
    [tag if rng.random() > SLOT_DROP_RATE else "O" for tag in gold]
    for gold in true_slots
]


In [6]:
from sklearn.metrics import classification_report

# "OtherIntent" only ever appears in the simulated predictions, never in the gold
# labels, so its row has support 0 and all-zero scores. zero_division=0 makes the
# undefined recall for that row come out as 0. The row is still included in the
# macro average, which is why macro avg sits below the per-intent scores.
print("Intent Classification Report:\n")
intent_report = classification_report(true_intents, pred_intents, zero_division=0)
print(intent_report)


Intent Classification Report:

                      precision    recall  f1-score   support

       AddToPlaylist       1.00      0.79      0.88      1818
      BookRestaurant       1.00      0.80      0.89      1881
          GetWeather       1.00      0.80      0.89      1896
         OtherIntent       0.00      0.00      0.00         0
           PlayMusic       1.00      0.79      0.89      1914
            RateBook       1.00      0.79      0.88      1876
  SearchCreativeWork       1.00      0.81      0.89      1847
SearchScreeningEvent       1.00      0.81      0.89      1852

            accuracy                           0.80     13084
           macro avg       0.88      0.70      0.78     13084
        weighted avg       1.00      0.80      0.89     13084



In [7]:
def token_level_slot_metrics(y_true, y_pred, outside_tag="O"):
    """Compute micro-averaged, token-level precision, recall and F1 for slot tags.

    Tokens are compared position by position. Only tags other than
    ``outside_tag`` count as slots:

    * true positive  -- gold and predicted tags are equal and not ``outside_tag``
    * false positive -- tags differ and the predicted tag is not ``outside_tag``
    * false negative -- tags differ and the gold tag is not ``outside_tag``

    A token whose gold and predicted tags are two *different* slot tags therefore
    counts as both a false positive and a false negative.

    Args:
        y_true: Iterable of gold tag sequences (one sequence of tags per sentence).
        y_pred: Iterable of predicted tag sequences, aligned with ``y_true``.
        outside_tag: Tag that marks "no slot". Defaults to ``"O"``.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Each value is ``0.0`` when
        its denominator is zero (e.g. empty input or no slot tags at all).

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` hold a different number of
            sequences, or if an aligned pair of sequences differs in length.
            Silently truncating such input would yield misleading metrics.
    """
    from itertools import zip_longest  # local import keeps this cell self-contained

    missing = object()  # sentinel that marks the shorter side of a misaligned pair
    tp = fp = fn = 0

    paired_sequences = zip_longest(y_true, y_pred, fillvalue=missing)
    for index, (true_seq, pred_seq) in enumerate(paired_sequences):
        if true_seq is missing or pred_seq is missing:
            raise ValueError(
                "y_true and y_pred contain a different number of sequences"
            )
        for t, p in zip_longest(true_seq, pred_seq, fillvalue=missing):
            if t is missing or p is missing:
                raise ValueError(
                    f"sequence {index}: gold and predicted tag counts differ"
                )
            if t == p:
                # Matching outside tags are true negatives and are not counted.
                if t != outside_tag:
                    tp += 1
                continue
            if p != outside_tag:
                fp += 1
            if t != outside_tag:
                fn += 1

    # Always return floats, including the zero-denominator fallbacks.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1


In [8]:
# Score the simulated slot predictions at token level. The simulation only ever
# replaces gold tags with "O", so it produces no false positives; that is why
# precision comes out as 1.00 while recall tracks the fraction of tags kept.
slot_scores = token_level_slot_metrics(true_slots, pred_slots)
p, r, f1 = slot_scores
print(f"\nSlot Labeling Metrics:\nPrecision: {p:.2f}, Recall: {r:.2f}, F1: {f1:.2f}")



Slot Labeling Metrics:
Precision: 1.00, Recall: 0.70, F1: 0.82
