## Import libraries

importing necessary libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import json

### Data Preparation

In [2]:
e_data=[]
i_data=[]

# loading NER annotations made through OpenAI API
with open("./batch_output.jsonl","r",encoding="utf-8") as lines:
    for line in lines:
        e_data.append(json.loads(line)) 

# loading NADA dataset
intent_data=pd.read_csv("./data.csv")

# sorting NER labels based on the ID to match the NADA dataset
entity_data = sorted(e_data, key=lambda x: int(x['custom_id'].split('-')[1]))


In [3]:
def label_entities(text, entities):
    '''
    a function to match the NER labels with the original text
    
    '''
    cleaned_text = re.sub(r"[^\w\s.,\d]", "", text)
    
    # Split the text into words
    words = cleaned_text.split()
    
    # Create an entity label list initialized with "O"
    labels = ["O"] * len(words)
    
    # Iterate through the entities to label the text
    for entity in entities:
        entity["text"].replace(",","")
        entity_words = entity['text'].split()
        category = entity['category']
        
        for i in range(len(words)):
            if words[i:i+len(entity_words)] == entity_words:
                labels[i] = f"B-{category}"
                for j in range(1, len(entity_words)):
                    labels[i+j] = f"I-{category}"
    
    # Return the cleaned text and the entity labels
    return " ".join(words), " ".join(labels)



combining data from the NADA dataset and NER labels into a list of JSON dictionaries

In [4]:
data=[]
for idx, row in intent_data.iterrows():
    try:
        match = re.search(r'```json\n(.+?)\n```', entity_data[idx]["response"]["body"]["choices"][0]["message"]["content"], re.DOTALL)
        if match:
            json_part = match.group(1)
        entities_labels = json.loads(json_part)
        text,entities = label_entities(row["text"],entities_labels["entities"])
        intent_label = row["title"]
        # break
        data.append({
                "intent_label": intent_label,
                "words": text,
                "word_labels": entities,
                "length": len(text)}) 
    except Exception as e: # skipping labelling errors
        print(e)


Expecting ',' delimiter: line 58 column 22 (char 1130)
Expecting ',' delimiter: line 42 column 26 (char 834)
'category'
Invalid control character at: line 34 column 20 (char 666)
Expecting ',' delimiter: line 84 column 26 (char 1736)
Expecting ',' delimiter: line 48 column 26 (char 1009)
Expecting ',' delimiter: line 30 column 26 (char 588)
Invalid \escape: line 58 column 18 (char 1268)
Expecting ',' delimiter: line 90 column 26 (char 1873)
Expecting ',' delimiter: line 223 column 28 (char 4586)
'entities'
Expecting ',' delimiter: line 30 column 26 (char 573)
Invalid control character at: line 28 column 19 (char 518)
Expecting property name enclosed in double quotes: line 62 column 5 (char 1214)


In [5]:
# coverting data into a dataframe
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,intent_label,words,word_labels,length
0,الأدب العربي-أدبيات,يتناول المهرجان هذا العام العلاقة بين الشعر وا...,O O O O O O O O O O O O O O O O O O O O O O O ...,2721
1,الأدب العربي-أدبيات,لاشك فى أننى لا أرمى من وراء هذا العنوان إلى أ...,O O O O O O O O O O O O O O O O O O O O O O O ...,2380
2,الأدب العربي-أدبيات,لفت الدكتور خالد جودة إلى الإتجاهات المستحدثة ...,O B-PERSON I-PERSON I-PERSON O O O O O O O O O...,2130
3,الأدب العربي-أدبيات,لفت الدكتور خالد جودة إلى الإتجاهات المستحدثة ...,O O B-PERSON I-PERSON O O O O O O O O O O O O ...,2130
4,الأدب العربي-أدبيات,أصدر فاروق حسنى وزير الثقافة المصري قرارا بتعي...,O B-PERSON I-PERSON B-ORG I-ORG I-ORG O O B-PE...,2135
...,...,...,...,...
7291,فلك-علوم بحتة,موقع الكون المجموعة الشمسية أقمار زحل عودة للص...,O O O O O B-GPE O O O O O O O O O O B-GPE B-PE...,1311
7292,فلك-علوم بحتة,موقع الكون المجموعة الشمسية أقمار زحل عودة للص...,O O O O O B-GPE O O O O O O O O O O B-GPE B-PE...,1293
7293,فلك-علوم بحتة,موقع الكون المجموعة الشمسية أقمار زحل عودة للص...,O B-LOC B-LOC I-LOC B-LOC B-GPE O O O O O B-LO...,1293
7294,فلك-علوم بحتة,موقع الكون المجموعة الشمسية حلقات زحل عودة للص...,O O O O O B-GPE O O O O O O O O O O B-GPE O O ...,3118


In [7]:
# splitting data into train / validation / test sets  80%/10%/10%
df_train, temp = train_test_split(df, test_size=0.2, random_state=42)

df_valid, df_test = train_test_split(temp, test_size=0.5, random_state=42)


In [8]:
model_name = "aubmindlab/bert-base-arabertv02" # this value can be changed to any version of BERT (we could not use Answer.AI's ModernBERT because it's trained on English language only)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [9]:
# a function for encoding dataset
def encode_dataset(tokenizer, text_sequences, max_length):
    input_ids = []
    attention_masks = []

    for text_sequence in text_sequences:
        encoded = tokenizer.encode_plus(
            text_sequence,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return {"input_ids": input_ids, "attention_masks": attention_masks}


In [10]:
max_length = 512
encoded_train = encode_dataset(tokenizer, df_train["words"], max_length)
encoded_valid = encode_dataset(tokenizer, df_valid["words"], max_length)
encoded_test = encode_dataset(tokenizer, df_test["words"], max_length)

intent_names = set(df["intent_label"])
intent_names = list(intent_names)
intent_map = {label: idx for idx, label in enumerate(intent_names)}
intent_train = torch.tensor(df_train["intent_label"].map(intent_map).values)
intent_valid = torch.tensor(df_valid["intent_label"].map(intent_map).values)
intent_test = torch.tensor(df_test["intent_label"].map(intent_map).values)

unique_entities = set()

for sentence in df["word_labels"]:
    words = sentence.split()
    unique_entities.update(words)

unique_entities_list = list(unique_entities)
unique_entities_list.append("[PAD]")
# Slot encoding
slot_names = unique_entities_list
slot_map = {label: idx for idx, label in enumerate(slot_names)}


In [11]:
intent_names

['الإقتصاد-علوم إجتماعية',
 'الأدب العربي-أدبيات',
 'عام- فنون',
 'رياضة',
 'فلك-علوم بحتة',
 'القانون-علوم اجتماعية',
 'علم الكمبيوتر-علوم بحتة',
 'علوم صحية-علوم تطبيقية',
 'عام-إسلام-ديانات',
 'السياسة-علوم اجتماعية']

In [12]:
slot_names

['I-GPE',
 'B-MONEY',
 'I-AGE',
 'B-CARDINAL',
 'O',
 'B-AGE',
 'I-PERSON',
 'I-MISC',
 'B-EVENT',
 'I-ORDINAL',
 'I-EVENT',
 'B-ORDINAL',
 'B-LOCATION',
 'I-MONEY',
 'B-MONTH',
 'B-PERSON',
 'I-QUANTITY',
 'B-ORG',
 'B-GPE',
 'I-ORG',
 'I-LOC',
 'B-QUANTITY',
 'B-NORP',
 'I-TIME',
 'I-CARDINAL',
 'B-LOC',
 'B-DATE',
 'B-TIME',
 'B-MISC',
 'B-PERCENT',
 'I-PERCENT',
 'I-DATE',
 'I-LOCATION',
 '[PAD]']

In [None]:
# a function to encode 
def encode_token_labels(text_sequences, slot_sequences, tokenizer, slot_map, max_length):
    encoded_labels = torch.zeros((len(text_sequences), max_length), dtype=torch.long)
    for i, (text, slots) in enumerate(zip(text_sequences, slot_sequences)):
        token_labels = []
        for word, slot in zip(text.split(), slots.split()):
            tokens = tokenizer.tokenize(word)
            token_labels.append(slot_map[slot])
            expand_label = slot.replace("B-", "I-")
            token_labels.extend([slot_map.get(expand_label, slot_map[slot])] * (len(tokens) - 1))
        # encoded_labels[i, 1:1 + len(token_labels)] = torch.tensor(token_labels[:max_length - 2])
        max_tokens = min(len(token_labels), max_length - 2)
        encoded_labels[i, 1:1 + max_tokens] = torch.tensor(token_labels[:max_tokens])
    return encoded_labels

In [None]:
slot_train = encode_token_labels(
    df_train["words"], df_train["word_labels"], tokenizer, slot_map, max_length)
slot_valid = encode_token_labels(
    df_valid["words"], df_valid["word_labels"], tokenizer, slot_map, max_length)
slot_test = encode_token_labels(
    df_test["words"], df_test["word_labels"], tokenizer, slot_map, max_length)

In [15]:
# model definition
class JointIntentAndSlotFillingModel(nn.Module):
    def __init__(self, intent_num_labels, slot_num_labels, model_name, dropout_prob=0.1):
        super(JointIntentAndSlotFillingModel, self).__init__()
        self.bert = BertModel.from_pretrained(model_name) # a pre-trained BERT model from Hugging Face's transformers library.
        self.dropout = nn.Dropout(dropout_prob) # a dropout layer to prevent overfitting.
        self.intent_classifier = nn.Linear(self.bert.config.hidden_size, intent_num_labels) # a fully connected layer for intent classification.
        self.slot_classifier = nn.Linear(self.bert.config.hidden_size, slot_num_labels) # a fully connected layer for slot classification.

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        pooled_output = self.dropout(outputs.pooler_output)

        slot_logits = self.slot_classifier(sequence_output) # logits for slot classification 
        intent_logits = self.intent_classifier(pooled_output) # Logits for intent classification 

        return slot_logits, intent_logits


In [16]:
# model initialization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = JointIntentAndSlotFillingModel(len(intent_map), len(slot_map),model_name=model_name).to(device)

# optimizer and loss functions
optimizer = optim.AdamW(model.parameters(), lr=3e-5, eps=1e-08)
slot_loss_fn = nn.CrossEntropyLoss(ignore_index=0)
intent_loss_fn = nn.CrossEntropyLoss()

epochs = 5
batch_size = 32

In [None]:
train_data = torch.utils.data.TensorDataset(
    encoded_train["input_ids"], encoded_train["attention_masks"], slot_train, intent_train)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

valid_data = torch.utils.data.TensorDataset(
    encoded_valid["input_ids"], encoded_valid["attention_masks"], slot_valid, intent_valid)

valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size)

## Model Training

In [18]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    intent_correct = 0
    total_intent = 0

    for batch in train_loader:
        input_ids, attention_mask, slot_labels, intent_labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        slot_logits, intent_logits = model(input_ids, attention_mask)

        slot_loss = slot_loss_fn(slot_logits.view(-1, len(slot_map)), slot_labels.view(-1))
        intent_loss = intent_loss_fn(intent_logits, intent_labels)

        loss = slot_loss + intent_loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        intent_preds = torch.argmax(intent_logits, dim=1)
        intent_correct += (intent_preds == intent_labels).sum().item()
        total_intent += intent_labels.size(0)

    train_loss = total_loss / len(train_loader)
    train_accuracy = intent_correct / total_intent

    # Validation
    model.eval()
    valid_loss = 0
    intent_correct = 0
    total_intent = 0

    with torch.no_grad():
        for batch in valid_loader:
            input_ids, attention_mask, slot_labels, intent_labels = [x.to(device) for x in batch]
            slot_logits, intent_logits = model(input_ids, attention_mask)

            slot_loss = slot_loss_fn(slot_logits.view(-1, len(slot_map)), slot_labels.view(-1))
            intent_loss = intent_loss_fn(intent_logits, intent_labels)

            loss = slot_loss + intent_loss
            valid_loss += loss.item()

            # Intent accuracy
            intent_preds = torch.argmax(intent_logits, dim=1)
            intent_correct += (intent_preds == intent_labels).sum().item()
            total_intent += intent_labels.size(0)

    valid_loss /= len(valid_loader)
    valid_accuracy = intent_correct / total_intent

    print(f"Epoch {epoch + 1}:")
    print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f} - Valid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}")


Epoch 1:
  Train Loss: 1.2999, Train Accuracy: 0.6909 - Valid Loss: 0.8684, Valid Accuracy: 0.7466
Epoch 2:
  Train Loss: 0.8145, Train Accuracy: 0.7553 - Valid Loss: 0.8082, Valid Accuracy: 0.7466
Epoch 3:
  Train Loss: 0.7626, Train Accuracy: 0.7582 - Valid Loss: 0.7905, Valid Accuracy: 0.7315
Epoch 4:
  Train Loss: 0.7470, Train Accuracy: 0.7569 - Valid Loss: 0.7614, Valid Accuracy: 0.7521
Epoch 5:
  Train Loss: 0.7315, Train Accuracy: 0.7514 - Valid Loss: 0.7563, Valid Accuracy: 0.7466


In [19]:
def evaluate_on_test():
    model.eval()

    intent_preds_list = []
    intent_labels_list = []

    slot_preds_list = []
    slot_labels_list = []

    with torch.no_grad():
        for batch in torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(
                encoded_test["input_ids"],
                encoded_test["attention_masks"],
                slot_test,
                intent_test,
            ),
            batch_size=batch_size,
        ):
            input_ids, attention_mask, slot_labels, intent_labels = [x.to(device) for x in batch]
            slot_logits, intent_logits = model(input_ids, attention_mask)

            # Intent predictions
            intent_preds = torch.argmax(intent_logits, dim=1).cpu().numpy()
            intent_labels = intent_labels.cpu().numpy()
            intent_preds_list.extend(intent_preds)
            intent_labels_list.extend(intent_labels)

            # Slot predictions
            slot_preds = torch.argmax(slot_logits, dim=2).cpu().numpy()
            slot_labels = slot_labels.cpu().numpy()
            for pred, label in zip(slot_preds, slot_labels):
                slot_preds_list.extend(pred[:len(label[label > 0])])
                slot_labels_list.extend(label[label > 0])

    used_slot_names = [slot_names[i] for i in sorted(set(slot_labels_list))]

    # Intent classification report
    print("Intent Classification Report:")
    print(classification_report(intent_labels_list, intent_preds_list, target_names=intent_names))

    # Slot classification report
    print("Slot Classification Report:")
    print(classification_report(slot_labels_list, slot_preds_list, target_names=used_slot_names, labels=sorted(set(slot_labels_list))))

    # Intent accuracy
    print(f"Intent Accuracy: {accuracy_score(intent_labels_list, intent_preds_list):.4f}")



In [20]:
evaluate_on_test()

Intent Classification Report:
                          precision    recall  f1-score   support

الإقتصاد-علوم إجتماعية       1.00      0.74      0.85       144
   الأدب العربي-أدبيات       1.00      1.00      1.00        49
               عام- فنون       1.00      0.60      0.75        47
                   رياضة       1.00      0.83      0.91       127
           فلك-علوم بحتة       1.00      1.00      1.00        36
   القانون-علوم اجتماعية       1.00      0.83      0.91       151
 علم الكمبيوتر-علوم بحتة       1.00      0.26      0.42        38
  علوم صحية-علوم تطبيقية       1.00      1.00      1.00        39
       عام-إسلام-ديانات       1.00      0.36      0.53        55
   السياسة-علوم اجتماعية       0.21      1.00      0.35        44

                accuracy                           0.77       730
               macro avg       0.92      0.76      0.77       730
            weighted avg       0.95      0.77      0.81       730

Slot Classification Report:
              p

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Inference

In [21]:
def show_predictions(text, tokenizer, model, intent_names, slot_names):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_tensors="pt",
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        slot_logits, intent_logits = model(input_ids, attention_mask)

    slot_ids = slot_logits.argmax(dim=-1).cpu().numpy()[0, 1:-1]
    intent_id = intent_logits.argmax(dim=-1).cpu().numpy()[0]

    print("## Intent:", intent_names[intent_id])
    print("## Slots:")
    for token, slot_id in zip(tokenizer.tokenize(text), slot_ids):
        print(f"{token:>10} : {slot_names[slot_id]}")




In [22]:
show_predictions("و قال الدكتور محمد الخالد انه سيباشر عمله في الرياض", tokenizer, model, intent_names, slot_names)

## Intent: القانون-علوم اجتماعية
## Slots:
         و : O
       قال : O
   الدكتور : B-PERSON
      محمد : B-PERSON
    الخالد : I-PERSON
       انه : O
      سيبا : O
      ##شر : O
      عمله : O
        في : O
    الرياض : B-GPE


In [None]:
# Save the model
save_path = "joint_intent_slot_model.pt"
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

# Load the model
loaded_model = JointIntentAndSlotFillingModel(len(intent_map), len(slot_map),model_name=model_name)
loaded_model.load_state_dict(torch.load(save_path))
loaded_model.to(device)
print("Model loaded successfully.")

Model saved to joint_intent_slot_model.pt


  loaded_model.load_state_dict(torch.load(save_path))


Model loaded successfully.


In [24]:
# Save tokenizer
tokenizer.save_pretrained("./tokenizer")


('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json')

In [25]:
# Load tokenizer
loaded_tokenizer = BertTokenizer.from_pretrained("./tokenizer")


In [26]:
show_predictions("و قال الدكتور محمد الخالد انه سيباشر عمله في الرياض", loaded_tokenizer, loaded_model, intent_names, slot_names)


## Intent: القانون-علوم اجتماعية
## Slots:
         و : O
       قال : O
   الدكتور : B-PERSON
      محمد : B-PERSON
    الخالد : I-PERSON
       انه : O
      سيبا : O
      ##شر : O
      عمله : O
        في : O
    الرياض : B-GPE


In [27]:
def decode_predictions(text, tokenizer, intent_names, slot_names,
                       intent_id, slot_ids):
    info = {"intent": intent_names[intent_id]}
    collected_slots = {}
    active_slot_words = []
    active_slot_name = None
    for word in text.split():
        tokens = tokenizer.tokenize(word)
        current_word_slot_ids = slot_ids[:len(tokens)]
        slot_ids = slot_ids[len(tokens):]
        current_word_slot_name = slot_names[current_word_slot_ids[0]]
        if current_word_slot_name == "O":
            if active_slot_name:
                collected_slots[active_slot_name] = " ".join(active_slot_words)
                active_slot_words = []
                active_slot_name = None
        else:
            # Naive BIO: handling: treat B- and I- the same...
            new_slot_name = current_word_slot_name[2:]
            if active_slot_name is None:
                active_slot_words.append(word)
                active_slot_name = new_slot_name
            elif new_slot_name == active_slot_name:
                active_slot_words.append(word)
            else:
                collected_slots[active_slot_name] = " ".join(active_slot_words)
                active_slot_words = [word]
                active_slot_name = new_slot_name
    if active_slot_name:
        collected_slots[active_slot_name] = " ".join(active_slot_words)
    info["slots"] = collected_slots
    return info

In [28]:
def nlu(text, tokenizer, model, intent_names, slot_names):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_tensors="pt",
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        slot_logits, intent_logits = model(input_ids, attention_mask)

    slot_ids = slot_logits.argmax(dim=-1).cpu().numpy()[0, 1:-1]
    intent_id = intent_logits.argmax(dim=-1).cpu().numpy()[0]

    return decode_predictions(text, tokenizer, intent_names, slot_names,
                              intent_id, slot_ids)

nlu("و قال الدكتور محمد الخالد انه سيباشر عمله في الرياض",
    loaded_tokenizer, loaded_model, intent_names, slot_names)

{'intent': 'القانون-علوم اجتماعية',
 'slots': {'PERSON': 'الدكتور محمد الخالد', 'GPE': 'الرياض'}}