In [1]:
import json
import os

with open('training_data/ner/ner_data.json', 'r') as f:
    train_sentences = json.load(f)


In [2]:
from transformers import BertForTokenClassification, BertTokenizerFast, TrainingArguments, Trainer
import torch

trained_ner_location = 'trained_models/ner'

ner_already_trained = os.path.exists(trained_ner_location)

# Load pre-trained model and tokenizer
model = BertForTokenClassification.from_pretrained(trained_ner_location) if ner_already_trained else BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Define a mapping from label names to indices
label_to_index = {"O": 0, "setting_name": 1, "state": 2}
index_to_label = {v: k for k, v in label_to_index.items()}

if not ner_already_trained:
    # Function to encode the examples
    def encode_example(sentence):
        inputs = tokenizer(sentence['text'], is_split_into_words=True, padding='max_length', truncation=True, max_length=128)
        labels = [-100 if token_id==tokenizer.pad_token_id else label_to_index[label] for token_id, label in zip(inputs['input_ids'], sentence['ner'])]  
        labels += [-100] * (128 - len(labels))  # pad labels to the max length
        inputs['labels'] = labels
        return inputs


    # Encode all examples
    train_encodings = [encode_example(s) for s in train_sentences]
    train_encodings = [ {k: torch.tensor(v) for k, v in enc.items()} for enc in train_encodings]

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=20,
        per_device_train_batch_size=64,
        weight_decay=0.01,
        seed=42,  # for reproducibility
        logging_steps=100,  # log loss every 100 steps
        # logging_dir='./logs',
    )

    # Define the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_encodings,
    )

    # Train the model
    trainer.train()
    model.save_pretrained(trained_ner_location)

# Function to extract settings and states
def extract_settings_and_states(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)

    predicted_labels = [index_to_label[p] for p in predictions[0].tolist()]

    # Match up the original tokens with their predicted labels
    original_tokens = tokenizer.tokenize(sentence)
    aligned_labels = []
    current_tokens = []
    current_label = None

    for token, label in zip(original_tokens, predicted_labels):
        # Handle subwords
        if token.startswith("##"):
            if current_tokens:  # ensure current_tokens is not empty
                current_tokens[-1] += token[2:]
        else:                
            # If the label is not 'O', add the token to current_tokens
            # and update current_label
            if label != 'O':
                current_tokens.append(token)
                current_label = label
            else:
                # If current_label is not None, add the assembled entity and its label
                # to aligned_labels
                if current_label:
                    aligned_labels.append([' '.join(current_tokens), current_label])
                    current_tokens = []
                    current_label = None
    
    # Handle the last entity if its label is not 'O'
    if current_label:
        aligned_labels.append([' '.join(current_tokens), current_label])

    return aligned_labels



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Step,Training Loss


# Test model

In [3]:
print("Predicted entities:", extract_settings_and_states("How can I publish my profile?"))
print("Predicted entities:", extract_settings_and_states("How do I disable the connect with me setting?"))
print("Predicted entities:", extract_settings_and_states("How do i make myself visible in the search?"))
print("Predicted entities:", extract_settings_and_states("Can i make myself only findable to logged in users?"))


Predicted entities: [['publish', 'state'], ['profile', 'setting_name']]
Predicted entities: [['disable the connect with me', 'setting_name']]
Predicted entities: [['search', 'setting_name']]
Predicted entities: [['logged in', 'state']]
