In [23]:
# Install the notebook's dependencies into the environment of the running kernel.
# NOTE(review): only numpy is version-pinned; pinning the other packages as well would make re-runs reproducible.
%pip install transformers datasets evaluate ipywidgets tensorflow scikit-learn numpy==1.26.1 tf-keras

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget.
# NOTE(review): presumably required so the PushToHubCallback created later can upload — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
import json
import os
from datasets import Dataset, DatasetDict
dataPath = "../../train/data/"
# Fixed seed so that the train/test/validation membership is the same on every run.
SPLIT_SEED = 42

data = {"label": [], "text": []}
# Every item of a "<label>.json" file is labelled with the file's stem.
# sorted() makes the row order independent of the OS directory-listing order.
for file_name in sorted(os.listdir(dataPath)):
    name, extension = os.path.splitext(file_name)
    if extension != ".json":
        # Skip stray entries (sub-directories, OS/editor metadata files) instead of
        # failing inside json.load.
        continue
    # The context manager closes the handle; the encoding is explicit because the
    # platform default is not guaranteed to be UTF-8.
    with open(os.path.join(dataPath, file_name), encoding="utf-8") as json_file:
        items = json.load(json_file)
    for item in items:
        data["label"].append(name)
        # Each item carries a list of tokens; the text is rebuilt by joining them with spaces.
        data["text"].append(" ".join(item["tokens"]))

dataset = Dataset.from_dict(data)
# 80% train; the held-out 20% is split again: 70% of it -> "test", 30% of it -> "validation".
trainTest = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
testVal = trainTest["test"].train_test_split(test_size=0.3, seed=SPLIT_SEED)
dataset = DatasetDict({
    "train": trainTest["train"],
    "test": testVal["train"],
    "validation": testVal["test"]
})
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 4184
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 732
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 314
    })
})

In [26]:
# Distinct label strings found in the training split; their order determines the integer ids assigned from them.
# NOTE(review): the labels are taken from the data file names (e.g. intents), not sentiments — consider renaming.
unique_sentiments = dataset['train'].unique('label')
unique_sentiments

Flattening the indices:   0%|          | 0/4184 [00:00<?, ? examples/s]

['play_audio', 'greeting']

In [27]:
# Give each label a consecutive integer id, following the order of unique_sentiments.
label2id = dict(zip(unique_sentiments, range(len(unique_sentiments))))
print(label2id)
# Reverse lookup: integer id -> label string.
id2label = {idx: label for label, idx in label2id.items()}
print(id2label)

{'play_audio': 0, 'greeting': 1}
{0: 'play_audio', 1: 'greeting'}


In [28]:
def map_sentiment(example):
    """Replace an example's string label with its integer class id.

    Args:
        example: A single dataset row; only its ``'label'`` field is read.

    Returns:
        A dict with one key, ``'label'``, holding the integer id from ``label2id``.
        A label that is already an integer is returned unchanged, so re-running the
        cell on an already-encoded dataset does not fail.

    Raises:
        KeyError: If a string label is not present in ``label2id``.
    """
    label = example['label']
    if isinstance(label, int):
        return {'label': label}
    return {'label': label2id[label]}

# Encode every split, not only "train", so that "test" and "validation" carry the
# same integer labels as the training data.
dataset = dataset.map(map_sentiment)

Map:   0%|          | 0/4184 [00:00<?, ? examples/s]

In [29]:
from transformers import AutoTokenizer
# Load the tokenizer of the "distilbert-base-uncased" checkpoint, the same checkpoint the classifier is later initialised from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [30]:
def preprocess_func(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level ``tokenizer``.

    Truncation is enabled; whatever the tokenizer returns is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [31]:
# Run preprocess_func over every split in batched mode; the result gains input_ids and attention_mask columns.
tokenized_dataset = dataset.map(preprocess_func, batched=True)

Map:   0%|          | 0/4184 [00:00<?, ? examples/s]

Map:   0%|          | 0/732 [00:00<?, ? examples/s]

Map:   0%|          | 0/314 [00:00<?, ? examples/s]

In [32]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer and configured to return TensorFlow tensors;
# it is handed to prepare_tf_dataset as collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [33]:
import evaluate

# Load the "accuracy" metric from the evaluate library; compute_metrics reads this module-level object.
accuracy = evaluate.load("accuracy")

KeyboardInterrupt: 

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the module-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into two items. The first is reduced with ``argmax``
    over its last axis to obtain predicted class ids, which are then compared with
    the second item (the reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for those predictions and references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Training hyperparameters shared with the cells that build the tf.data sets and call fit().
batch_size = 16
num_epochs = 2

# Whole batches per epoch (floor division), multiplied by the epoch count.
# NOTE(review): this is derived from the full tokenized "train" split, but the model is
# later fit on a subset of it (the split is divided again before prepare_tf_dataset), so
# the schedule is longer than the number of steps actually run — confirm this is intended.
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs
print(total_train_steps)

# Optimizer and its learning-rate schedule: initial rate 2e-5, no warm-up steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)


522


In [None]:
from transformers import TFAutoModelForSequenceClassification

# Start from the pretrained "distilbert-base-uncased" checkpoint with a classification head.
# The head size is taken from the label map rather than hard-coded, so adding another
# label file to the data directory does not leave num_labels out of sync with id2label/label2id.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
# Fixed seed so the rows used for training and validation are the same on every run.
SPLIT_SEED = 42

# Hold out 10% of the tokenized "train" split as a test set.
# NOTE(review): the "test" and "validation" splits already present in tokenized_dataset are
# not used from here on, and test_dataset is not referenced again in the visible cells —
# confirm whether this second round of splitting is intended.
train_test_split = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
print(train_test_split)

train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Of the remaining 90%, keep 80% for training and 20% for validation.
train_valid_split = train_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = train_valid_split['train']
valid_dataset = train_valid_split['test']
print(valid_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3765
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 419
    })
})
Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask'],
    num_rows: 753
})


In [None]:
# Options that are identical for the training and validation pipelines.
tf_dataset_options = {"batch_size": batch_size, "collate_fn": data_collator}

# shuffle=True for the training set, shuffle=False for the validation set.
tf_train_set = model.prepare_tf_dataset(train_dataset, shuffle=True, **tf_dataset_options)

tf_validation_set = model.prepare_tf_dataset(valid_dataset, shuffle=False, **tf_dataset_options)

In [None]:
# Compile with the custom optimizer only.
# NOTE(review): no loss is passed — presumably the model's built-in loss is relied on; confirm.
model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback wired to compute_metrics and the validation tf.data set.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Hub-upload callback using "Project_Nigel_Intent_Detection" as its output directory and
# given the tokenizer alongside the model.
# NOTE(review): presumably depends on the notebook_login() call having succeeded — confirm.
push_to_hub_callback = PushToHubCallback(
    output_dir="Project_Nigel_Intent_Detection",
    tokenizer=tokenizer
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Nicknotname/Project_Nigel_Intent_Detection into local empty directory.


In [None]:
# Train for num_epochs epochs on the training set, validating on the validation set,
# with the metric and hub-upload callbacks attached.
callbacks = [metric_callback, push_to_hub_callback]
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

Epoch 1/2

Epoch 2/2


<tf_keras.src.callbacks.History at 0x2695c68fe20>

In [None]:
# Smoke test: classify one hand-written sentence with the fine-tuned model.
text = "Can you play music for me"
inputs = tokenizer(text, return_tensors="tf")

# Take the logits, pick the highest-scoring class of the first (only) input,
# and translate the class id back to its label string via the model config.
result = model(**inputs).logits
predicted_class_id = int(tf.math.argmax(result, axis=-1)[0])
model.config.id2label[predicted_class_id]

'play_audio'

In [36]:
import os

# Version label used as the folder name under ../models.
modelName = "0.1"

modelDir = f"../models/{modelName}/"
# makedirs creates "../models" and the version folder in one call and does nothing when
# they already exist, replacing the separate exists()/mkdir() checks.
os.makedirs(modelDir, exist_ok=True)

# modelDir already ends with "/", so it is passed as-is rather than appending another slash.
# NOTE(review): only the model is saved here; the tokenizer is not written to modelDir —
# confirm whether consumers of this folder need it.
model.save_pretrained(modelDir)