# SNIPS + Embedding-Augmented FLAN-T5
Fine-tuning FLAN-T5 for real-world intent and slot detection using the SNIPS dataset with SBERT embeddings.

In [2]:
!pip install -q transformers datasets accelerate sentence-transformers seqeval

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import pandas as pd, torch, json

In [4]:
# 1. Load SNIPS dataset
#ds = load_dataset('snips_built_in_intents')
#!pip install -U datasets

ds = load_dataset("bkonkle/snips-joint-intent")

README.md:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/93.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13084 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/700 [00:00<?, ? examples/s]

In [5]:
#ds["train"] = ds["train"].select(range(100))
#ds["test"] = ds["test"].select(range(50))

print(ds)
print(ds["train"][0])

DatasetDict({
    train: Dataset({
        features: ['input', 'intent', 'slots'],
        num_rows: 13084
    })
    test: Dataset({
        features: ['input', 'intent', 'slots'],
        num_rows: 700
    })
})
{'input': 'listen to westbam alumb allergic on google music', 'intent': 'PlayMusic', 'slots': 'O O B-artist O B-album O B-service I-service'}


In [6]:
# 2. Format to T5-style examples
def prepare(example):
    intent = example['intent']
    slots = example['slots'].split()
    example['target'] = json.dumps({'intent': intent, 'slots': slots})
    return example

ds = ds.map(prepare)

Map:   0%|          | 0/13084 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [7]:
# 3. Add embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')
def add_embed(ex):
    ex['emb'] = embedder.encode(ex['input']).tolist()
    return ex

ds = ds.map(add_embed)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Map:   0%|          | 0/13084 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [8]:
# 4. Tokenizer and FLAN-T5
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small')

def preprocess(ex):
    prefix = ' '.join(map(lambda x: str(round(x, 4)), ex['emb'][:16]))
    inp = prefix + ' | ' + ex['input']
    tokens = tokenizer(inp, max_length=128, truncation=True)
    tgt = tokenizer(ex['target'], max_length=64, truncation=True)
    tokens['labels'] = tgt['input_ids']
    return tokens

tokenized = ds.map(preprocess)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/13084 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [14]:
# Print layer names
for name, param in model.named_parameters():
    print(name)

shared.weight
encoder.block.0.layer.0.SelfAttention.q.weight
encoder.block.0.layer.0.SelfAttention.k.weight
encoder.block.0.layer.0.SelfAttention.v.weight
encoder.block.0.layer.0.SelfAttention.o.weight
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight
encoder.block.0.layer.0.layer_norm.weight
encoder.block.0.layer.1.DenseReluDense.wi_0.weight
encoder.block.0.layer.1.DenseReluDense.wi_1.weight
encoder.block.0.layer.1.DenseReluDense.wo.weight
encoder.block.0.layer.1.layer_norm.weight
encoder.block.1.layer.0.SelfAttention.q.weight
encoder.block.1.layer.0.SelfAttention.k.weight
encoder.block.1.layer.0.SelfAttention.v.weight
encoder.block.1.layer.0.SelfAttention.o.weight
encoder.block.1.layer.0.layer_norm.weight
encoder.block.1.layer.1.DenseReluDense.wi_0.weight
encoder.block.1.layer.1.DenseReluDense.wi_1.weight
encoder.block.1.layer.1.DenseReluDense.wo.weight
encoder.block.1.layer.1.layer_norm.weight
encoder.block.2.layer.0.SelfAttention.q.weight
encoder.block.2.layer.0.

In [25]:
# 5. Fine-tuning
args = Seq2SeqTrainingArguments(
    output_dir='snips_augmented',
    per_device_train_batch_size=4,
    num_train_epochs=5,
    logging_steps=10,
    eval_strategy='no',
    save_strategy='no',
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model)
)
trainer.train()

  trainer = Seq2SeqTrainer(


ValueError: Expected input batch_size (316) to match target batch_size (256).

In [24]:
# 6. Inference
def predict(text):
    emb = embedder.encode(text)
    prefix = ' '.join(map(lambda x: str(round(x,4)), emb[:16]))
    inp = prefix + ' | ' + text
    tokens = tokenizer(inp, return_tensors='pt').to(model.device)
    out = model.generate(**tokens, max_length=512)
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    print(decoded)
    try:
        parsed = json.loads(decoded)
    except:
        parsed = decoded
    return parsed


#print(predict('Play the last song from Coldplay'))
#print(predict('Find me a restaurant in New York tomorrow night'))

output = predict('Play yello from Coldplay')
print(output)

AttributeError: 'BertForTokenClassification' object has no attribute 'generate'

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

def decode_bio_slots(text, bio_tags):
    tokens = tokenizer.tokenize(text)
    result = {}
    current_slot = None
    current_tokens = []

    for token, tag in zip(tokens, bio_tags):
        if tag.startswith("B-"):
            if current_slot:
                slot_value = tokenizer.convert_tokens_to_string(current_tokens).replace("▁", " ").strip()
                result.setdefault(current_slot, []).append(slot_value)
            current_slot = tag[2:]
            current_tokens = [token]
        elif tag.startswith("I-") and current_slot == tag[2:]:
            current_tokens.append(token)
        else:
            if current_slot:
                slot_value = tokenizer.convert_tokens_to_string(current_tokens).replace("▁", " ").strip()
                result.setdefault(current_slot, []).append(slot_value)
                current_slot = None
                current_tokens = []

    if current_slot and current_tokens:
        slot_value = tokenizer.convert_tokens_to_string(current_tokens).replace("▁", " ").strip()
        result.setdefault(current_slot, []).append(slot_value)

    return result



In [22]:
text = "Play yello from Coldplay"
tags = ["O", "B-music_item", "O", "O", "B-playground", "I-playground", "I-playground", "I-playground", "I-play"]
decode_bio_slots(text, tags)

{'music_item': ['the'], 'playground': ['from Coldplay']}

In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity': 'B-PER', 'score': np.float32(0.9990139), 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}, {'entity': 'B-LOC', 'score': np.float32(0.999645), 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}]
