In [1]:
%pip install transformers==4.45.0 datasets evaluate seqeval

Collecting transformers==4.45.0
  Using cached transformers-4.45.0-py3-none-any.whl (9.9 MB)
Collecting seqeval
  Using cached seqeval-1.2.2-py3-none-any.whl
Installing collected packages: seqeval, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed seqeval-1.2.2 transformers-4.45.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import Dataset, DatasetDict
import pandas as pd

datasetName = "play_audio"
df = pd.read_json(f"../../train/data/{datasetName}.json")
dataset = Dataset.from_pandas(df)
#dataset = dataset.class_encode_column("ner_tags")
trainTest = dataset.train_test_split(0.2)
testVal = trainTest["test"].train_test_split(0.3)
dataset = DatasetDict({
    "train": trainTest["train"],
    "test": testVal["train"],
    "validation": testVal["test"]
})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'id'],
        num_rows: 2184
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'id'],
        num_rows: 382
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'id'],
        num_rows: 164
    })
})

In [4]:
dataset["train"][0]

{'tokens': ['Why',
  "don't",
  'you',
  'Play',
  'Smells',
  'like',
  'Teen',
  'Spirit',
  'by',
  'Bruno',
  'Mars'],
 'ner_tags': [0, 0, 0, 0, 1, 2, 2, 2, 0, 3, 4],
 'id': 2589}

In [5]:
label_list = [
    "O",
    "B-SongName",
    "I-SongName",
    "B-ArtistName",
    "I-ArtistName"
]

#print(label_list)

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")



In [7]:
example = dataset["train"][0]

tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

tokens

['[CLS]',
 'why',
 'don',
 "'",
 't',
 'you',
 'play',
 'smells',
 'like',
 'teen',
 'spirit',
 'by',
 'bruno',
 'mars',
 '[SEP]']

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [9]:
tokenized_wnut = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2184 [00:00<?, ? examples/s]

Map:   0%|          | 0/382 [00:00<?, ? examples/s]

Map:   0%|          | 0/164 [00:00<?, ? examples/s]

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [11]:
import evaluate

seqeval = evaluate.load("seqeval")

In [12]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):

    predictions, labels = p

    predictions = np.argmax(predictions, axis=2)

    true_predictions = [

        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]

        for prediction, label in zip(predictions, labels)

    ]

    true_labels = [

        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]

        for prediction, label in zip(predictions, labels)

    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {

        "precision": results["overall_precision"],

        "recall": results["overall_recall"],

        "f1": results["overall_f1"],

        "accuracy": results["overall_accuracy"],

    }

In [13]:
id2label = {x: label_list[x] for x in range(len(label_list))}

label2id = {v: k for k, v in id2label.items()}

In [14]:
from transformers import create_optimizer

batch_size = 16
num_train_epochs = 3
num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)




In [15]:
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(label_list), id2label=id2label, label2id=label2id
)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [16]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_wnut["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_wnut["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [17]:
import tensorflow as tf

model.compile(optimizer=optimizer, run_eagerly=True)

In [18]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [19]:
from transformers.keras_callbacks import PushToHubCallback

push_to_hub_callback = PushToHubCallback(
    output_dir="Project_Nigel_Slot_Filling",
    tokenizer=tokenizer
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Nicknotname/Project_Nigel_Slot_Filling into local empty directory.


Download file tf_model.h5:   0%|          | 8.00k/253M [00:00<?, ?B/s]

Clean file tf_model.h5:   0%|          | 1.00k/253M [00:00<?, ?B/s]

In [20]:
callbacks = [metric_callback, push_to_hub_callback]

In [21]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=1, callbacks=callbacks)




<tf_keras.src.callbacks.History at 0x266ccfbb910>

In [25]:
import os

modelName = "0.1"

if not os.path.exists("../models"):
    os.mkdir("../models")
    
modelDir = f"../models/{modelName}/"
if not os.path.exists(modelDir):
    os.mkdir(modelDir)
model.save_pretrained(f"{modelDir}/")

In [23]:
text = "i want you to play too cool to be careless by big steve and the rocking chair"
inputs = tokenizer(text, return_tensors="tf")
logits = model(**inputs).logits
predicted_token_class_ids = tf.math.argmax(logits, axis=-1)

predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()]

predicted_token_class

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-SongName',
 'I-SongName',
 'I-SongName',
 'I-SongName',
 'I-SongName',
 'O',
 'B-ArtistName',
 'I-ArtistName',
 'I-ArtistName',
 'I-ArtistName',
 'I-ArtistName',
 'I-ArtistName',
 'I-SongName']

In [24]:
result = classifier()
print(result)
combined_entities = []
current_entity = None
current_label = None

for item in result:
    entity = item['entity']
    word = item['word']

    if entity.startswith('B-'):
        # Start of a new entity, so save the previous one
        if current_entity:
            combined_entities.append(f"{current_label}: {current_entity}")
        current_entity = word
        current_label = entity[2:]  # Remove 'B-' prefix
    elif entity.startswith('I-') and entity[2:] == current_label:
        # Continuation of the current entity
        current_entity += f" {word}"
    else:
        # Non-B/I entity, just append directly
        if current_entity:
            combined_entities.append(f"{current_label}: {current_entity}")
        current_entity = None
        current_label = None
        combined_entities.append(f"{entity}: {word}")

# Append the last entity if it exists
if current_entity:
    combined_entities.append(f"{current_label}: {current_entity}")

# Display the combined entities
for entity in combined_entities:
    print(entity)

NameError: name 'classifier' is not defined