<a href="https://colab.research.google.com/github/nahbos/AUT-Language-Understanding/blob/main/Ex02/bert_slot_intent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sobhan Moradian Daghigh
### **Ex 02: Slot Filling & Intent detection**

In [1]:
# Copyright @Steven Golovkine: available at [https://stevengolovkine.netlify.app/post/joint-intent-classification-slot-filling-with-transformers/]
# Replicated by @SobhanMoradianDaghigh on 12-8-2022

In [2]:
!pip install transformers==2.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==2.11.0
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
[K     |████████████████████████████████| 674 kB 12.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 64.8 MB/s 
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp38-cp38-manylinux1_x86_64.whl (7.5 MB)
[K     |████████████████████████████████| 7.5 MB 21.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 73.5 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=3e6fc1ed90555fe3fb42b6e22e2c1ed27790711d13c028e35932

In [3]:
# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from transformers import BertTokenizer, TFBertModel
from urllib.request import urlretrieve

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

In [4]:
!wget -nc https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Ex02/dataset/development-en.conllu
!wget -nc https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Ex02/dataset/test-en.conllu
!wget -nc https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Ex02/dataset/train-en.conllu

--2022-12-08 20:05:55--  https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Ex02/dataset/development-en.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1492856 (1.4M) [text/plain]
Saving to: ‘development-en.conllu’


2022-12-08 20:05:55 (59.5 MB/s) - ‘development-en.conllu’ saved [1492856/1492856]

--2022-12-08 20:05:56--  https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Ex02/dataset/test-en.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3084155 (2.9M) [text/plain]
Savin

## Preparing data

In [41]:
def data_preparing(dataset_path):
    prepared = []
    intents = []
    slots = []
    data_list = set(Path(dataset_path).read_text('utf-8').strip().split('\n\n'))
    for data in data_list:
        data_dic = ''
        for line in data.split('\n')[3:]:
            bio = line.split('\t')[-1]    # BIO
            bio = bio if bio != 'NoLabel' else 'O'
            tokens = line.split('\t')[1]  # tokens
            intent = data.split('\n')[1].split(':')[1].strip()  # intent
            data_dic += (tokens + ":" + bio + ' ')
            intents.append(intent)
            slots.append(bio)
        data_dic += '<=> ' + intent
        prepared.append(data_dic)
    return prepared, intents, slots

In [42]:
train, intent_names_train, slots_train = data_preparing('./train-en.conllu')
val, intent_names_val, slots_val       = data_preparing('./development-en.conllu')
test, intent_names_test, slots_test    = data_preparing('./test-en.conllu')

In [7]:
train[:10]

['clear:O all:O alarms:O <=> alarm/cancel_alarm',
 'set:O alarm:O for:B-datetime sundays:I-datetime at:I-datetime 8:I-datetime ::I-datetime 30am:I-datetime <=> alarm/set_alarm',
 'temperature:B-weather/noun palo:B-location alto:I-location today:B-datetime fahrenheit:B-weather/temperatureUnit <=> weather/find',
 'will:O it:O rain:B-weather/attribute this:B-datetime afteroon:I-datetime <=> weather/find',
 'remind:O me:O to:O floss:B-reminder/todo every:B-reminder/recurring_period night:I-reminder/recurring_period .:O <=> reminder/set_reminder',
 "what's:O the:O weather:B-weather/noun like:O for:B-datetime the:I-datetime week:I-datetime ?:O <=> weather/find",
 'can:O i:O have:O the:B-datetime current:I-datetime temperature:B-weather/noun <=> weather/find',
 'how:O cold:B-weather/attribute is:O it:O going:O to:O get:O tonight:B-datetime <=> weather/find',
 'what:O is:O the:O temperature:B-weather/noun right:O now:B-datetime outside:B-location <=> weather/find',
 'will:O it:O be:O above:B-w

In [58]:
intent_names = list(set(intent_names_train + intent_names_val + intent_names_test))
slots = list(set(slots_train + slots_val + slots_test))

In [9]:
def parse_line(line):
    utterance_data, intent_label = line.split(" <=> ")
    items = utterance_data.split()
    words = [item.rsplit(':', 1)[0] for item in items]
    word_labels = [item.rsplit(':', 1)[1] for item in items]
    return {
        'intent_label': intent_label,
        'words': " ".join(words),
        'words_label': " ".join(word_labels),
        'length': len(words)
    }

In [10]:
parse_line(train[0])

{'intent_label': 'alarm/cancel_alarm',
 'words': 'clear all alarms',
 'words_label': 'O O O',
 'length': 3}

In [11]:
df_train = pd.DataFrame([parse_line(line) for line in train])
df_validation = pd.DataFrame([parse_line(line) for line in val])
df_test = pd.DataFrame([parse_line(line) for line in test])

In [12]:
df_train.head()

Unnamed: 0,intent_label,words,words_label,length
0,alarm/cancel_alarm,clear all alarms,O O O,3
1,alarm/set_alarm,set alarm for sundays at 8 : 30am,O O B-datetime I-datetime I-datetime I-datetim...,8
2,weather/find,temperature palo alto today fahrenheit,B-weather/noun B-location I-location B-datetim...,5
3,weather/find,will it rain this afteroon,O O B-weather/attribute B-datetime I-datetime,5
4,reminder/set_reminder,remind me to floss every night .,O O O B-reminder/todo B-reminder/recurring_per...,7


## Intent classification (sentence level)

In [13]:
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [14]:
tokenizer.vocab_size

28996

In [15]:
train_sequence_max_length = max([len(tokenizer.encode(text)) for text in df_train['words']])
train_sequence_max_length

34

In [16]:
def encode_dataset(tokenizer, text_sequences, max_length):
    token_ids = np.zeros(shape=(len(text_sequences), max_length), dtype=np.int32)
    for i, text_sequence in enumerate(text_sequences):
        encoded = tokenizer.encode(text_sequence)
        token_ids[i, 0:len(encoded)] = encoded
    attention_masks = (token_ids != 0).astype(np.int32)
    
    return {'input_ids': token_ids, 'attention_masks': attention_masks}

In [17]:
encoded_train      = encode_dataset(tokenizer, df_train['words'], train_sequence_max_length)
encoded_validation = encode_dataset(tokenizer, df_validation['words'], train_sequence_max_length)
encoded_test       = encode_dataset(tokenizer, df_test['words'], train_sequence_max_length)

In [18]:
intent_map = dict((label, idx) for idx, label in enumerate(intent_names))

In [19]:
intent_map

{'weather/find': 0,
 'reminder/show_reminders': 1,
 'alarm/show_alarms': 2,
 'alarm/cancel_alarm': 3,
 'reminder/set_reminder': 4,
 'weather/checkSunrise': 5,
 'alarm/set_alarm': 6,
 'weather/checkSunset': 7,
 'alarm/snooze_alarm': 8,
 'alarm/modify_alarm': 9,
 'alarm/time_left_on_alarm': 10,
 'reminder/cancel_reminder': 11}

In [20]:
intent_train = df_train['intent_label'].map(intent_map).values
intent_validation = df_validation['intent_label'].map(intent_map).values
intent_test = df_test['intent_label'].map(intent_map).values

**Loading and feeding a pretrained BERT model**

In [21]:
base_bert_model = TFBertModel.from_pretrained('bert-base-cased')
base_bert_model.summary()

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [22]:
class IntentClassificationModel(tf.keras.Model):
    def __init__(self, intent_num_labels=None, model_name='bert-base-cased', dropout_prob=0.1):
        super().__init__(name='joint_intent_slot')
        # Let's preload the pretrained model BERT in the constructor of our classifier model.
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        
        # Define a (Dense) classification layer to compute for each
        # sequence in a batch of samples. The number of output classes
        # is given by the intent_num_labels parameter.
        # Use the default linear activation (no softmax) to compute
        # logits. The softmax normalization will be computed in the
        # loss function instead of the model itself.
        self.intent_classifier = Dense(intent_num_labels)
        
    def call(self, inputs, **kwargs):
        # Use the pretrained model to extract features from our encoded inputs.
        sequence_output, pooled_output = self.bert(inputs, **kwargs)
        
        # The second output of the main BERT layer has shape:
        # (batch_size, output_dim) and gives a "pooled" representation
        # for the full sequence from the hidden state that corresponds
        # to the "[CLS]" token.
        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
        
        # Use the classifier layer to compute the logits from the pooled features.
        intent_logits = self.intent_classifier(pooled_output)
        return intent_logits

In [23]:
# Build the model
intent_model = IntentClassificationModel(intent_num_labels=len(intent_map))

intent_model.compile(optimizer=Adam(learning_rate=3e-5, epsilon=1e-08),
                     loss=SparseCategoricalCrossentropy(from_logits=True),
                     metrics=[SparseCategoricalAccuracy('accuracy')])

In [24]:
# Train the model
history = intent_model.fit(encoded_train, intent_train,
                           epochs=2, batch_size=32,
                           validation_data=(encoded_validation, intent_validation))

Epoch 1/2
Epoch 2/2


In [25]:
def classify(text, tokenizerzer, model, intent_names):
    inputs = tf.constant(tokenizer.encode(text))[None, :] # Batch size = 1
    class_id = model(inputs).numpy().argmax(axis=1)[0]
    return intent_names[class_id]

In [29]:
classify('Wake me up for the first job meeting?', tokenizer, intent_model, intent_names)

'alarm/set_alarm'

**WOW))**

## Slot filling (word level)

In [44]:
slot_names = ["[PAD]"]
slot_names += list(slots)
slot_map = {}
for label in slot_names:
    slot_map[label] = len(slot_map)

In [45]:
slot_map

{'[PAD]': 0,
 'I-reminder/todo': 1,
 'I-demonstrative_reference': 2,
 'B-weather/temperatureUnit': 3,
 'B-reminder/recurring_period': 4,
 'I-reminder/reference': 5,
 'B-timer/noun': 6,
 'B-weather/attribute': 7,
 'B-timer/attributes': 8,
 'I-weather/attribute': 9,
 'I-reminder/recurring_period': 10,
 'B-reminder/reference': 11,
 'I-weather/noun': 12,
 'B-reminder/todo': 13,
 'B-reminder/noun': 14,
 'I-reminder/noun': 15,
 'B-weather/noun': 16,
 'O': 17,
 'I-alarm/alarm_modifier': 18,
 'B-demonstrative_reference': 19,
 'I-reminder/reminder_modifier': 20,
 'B-news/type': 21,
 'I-datetime': 22,
 'B-datetime': 23,
 'I-location': 24,
 'B-location': 25,
 'B-alarm/alarm_modifier': 26,
 'B-reminder/reminder_modifier': 27,
 'B-negation': 28}

In [46]:
def encode_token_labels(text_sequences, slot_names, tokenizer, slot_map, max_length):
    encoded = np.zeros(shape=(len(text_sequences), max_length), dtype=np.int32)
    for i, (text_sequence, word_labels) in enumerate(zip(text_sequences, slot_names)):
        encoded_labels = []
        for word, word_label in zip(text_sequence.split(), word_labels.split()):
            tokens = tokenizer.tokenize(word)
            encoded_labels.append(slot_map[word_label])
            expand_label = word_label.replace("B-", "I-")
            if not expand_label in slot_map:
                expand_label = word_label
            encoded_labels.extend([slot_map[expand_label]] * (len(tokens) - 1))
        encoded[i, 1:len(encoded_labels) + 1] = encoded_labels
    return encoded

In [47]:
slot_train = encode_token_labels(df_train['words'], df_train['words_label'], tokenizer, slot_map, train_sequence_max_length)
slot_validation = encode_token_labels(df_validation['words'], df_validation['words_label'], tokenizer, slot_map, train_sequence_max_length)
slot_test = encode_token_labels(df_test['words'], df_test['words_label'], tokenizer, slot_map, train_sequence_max_length)

In [48]:
class JointIntentAndSlotFillingModel(tf.keras.Model):

    def __init__(self, intent_num_labels=None, slot_num_labels=None,
                 model_name="bert-base-cased", dropout_prob=0.1):
        super().__init__(name="joint_intent_slot")
        self.bert = TFBertModel.from_pretrained(model_name)
        self.dropout = Dropout(dropout_prob)
        self.intent_classifier = Dense(intent_num_labels,
                                       name="intent_classifier")
        self.slot_classifier = Dense(slot_num_labels,
                                     name="slot_classifier")

    def call(self, inputs, **kwargs):
        sequence_output, pooled_output = self.bert(inputs, **kwargs)

        # The first output of the main BERT layer has shape:
        # (batch_size, max_length, output_dim)
        sequence_output = self.dropout(sequence_output,
                                       training=kwargs.get("training", False))
        slot_logits = self.slot_classifier(sequence_output)

        # The second output of the main BERT layer has shape:
        # (batch_size, output_dim)
        # and gives a "pooled" representation for the full sequence from the
        # hidden state that corresponds to the "[CLS]" token.
        pooled_output = self.dropout(pooled_output,
                                     training=kwargs.get("training", False))
        intent_logits = self.intent_classifier(pooled_output)

        return slot_logits, intent_logits

In [49]:
joint_model = JointIntentAndSlotFillingModel(
    intent_num_labels=len(intent_map), slot_num_labels=len(slot_map))

# Define one classification loss for each output:
opt = Adam(learning_rate=3e-5, epsilon=1e-08)
losses = [SparseCategoricalCrossentropy(from_logits=True),
          SparseCategoricalCrossentropy(from_logits=True)]
metrics = [SparseCategoricalAccuracy('accuracy')]
joint_model.compile(optimizer=opt, loss=losses, metrics=metrics)

In [50]:
history = joint_model.fit(
    encoded_train, (slot_train, intent_train),
    validation_data=(encoded_validation, (slot_validation, intent_validation)),
    epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


In [51]:
def show_predictions(text, tokenizer, model, intent_names, slot_names):
    inputs = tf.constant(tokenizer.encode(text))[None, :]  # batch_size = 1
    outputs = model(inputs)
    slot_logits, intent_logits = outputs
    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, 1:-1]
    intent_id = intent_logits.numpy().argmax(axis=-1)[0]
    print("## Intent:", intent_names[intent_id])
    print("## Slots:")
    for token, slot_id in zip(tokenizer.tokenize(text), slot_ids):
        print(f"{token:>10} : {slot_names[slot_id]}")

In [54]:
# Example of classification
show_predictions('What is the Temperature in beirut today?', tokenizer, joint_model, intent_names, slot_names)

## Intent: weather/find
## Slots:
      What : O
        is : O
       the : O
        Te : B-weather/noun
    ##mper : O
   ##ature : B-weather/noun
        in : O
        be : B-location
      ##ir : I-location
      ##ut : I-location
     today : B-datetime
         ? : O


In [55]:
def decode_predictions(text, tokenizer, intent_names, slot_names,
                       intent_id, slot_ids):
    info = {"intent": intent_names[intent_id]}
    collected_slots = {}
    active_slot_words = []
    active_slot_name = None
    for word in text.split():
        tokens = tokenizer.tokenize(word)
        current_word_slot_ids = slot_ids[:len(tokens)]
        slot_ids = slot_ids[len(tokens):]
        current_word_slot_name = slot_names[current_word_slot_ids[0]]
        if current_word_slot_name == "O":
            if active_slot_name:
                collected_slots[active_slot_name] = " ".join(active_slot_words)
                active_slot_words = []
                active_slot_name = None
        else:
            # Naive BIO: handling: treat B- and I- the same...
            new_slot_name = current_word_slot_name[2:]
            if active_slot_name is None:
                active_slot_words.append(word)
                active_slot_name = new_slot_name
            elif new_slot_name == active_slot_name:
                active_slot_words.append(word)
            else:
                collected_slots[active_slot_name] = " ".join(active_slot_words)
                active_slot_words = [word]
                active_slot_name = new_slot_name
    if active_slot_name:
        collected_slots[active_slot_name] = " ".join(active_slot_words)
    info["slots"] = collected_slots
    return info

In [56]:
def nlu(text, tokenizer, model, intent_names, slot_names):
    inputs = tf.constant(tokenizer.encode(text))[None, :]  # batch_size = 1
    outputs = model(inputs)
    slot_logits, intent_logits = outputs
    slot_ids = slot_logits.numpy().argmax(axis=-1)[0, 1:-1]
    intent_id = intent_logits.numpy().argmax(axis=-1)[0]

    return decode_predictions(text, tokenizer, intent_names, slot_names,
                              intent_id, slot_ids)

In [59]:
nlu('What is the Temperature in beirut today?', tokenizer, joint_model, intent_names, slot_names)

{'intent': 'weather/find',
 'slots': {'weather/noun': 'Temperature',
  'location': 'beirut',
  'datetime': 'today?'}}