In [None]:
import subprocess

def ensure_t4_gpu():
    # Check current GPU type
    gpu_info = subprocess.run(["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"], capture_output=True, text=True)
    gpu_name = gpu_info.stdout.strip()

    if "T4" in gpu_name:
        print(f"✅ T4 GPU is already assigned: {gpu_name}")
    else:
        print(f"⚠️ Current GPU is {gpu_name}. Restarting runtime for T4 assignment...")
        from google.colab import runtime
        runtime.unassign()  # Force runtime restart

# Run the function
ensure_t4_gpu()


✅ T4 GPU is already assigned: Tesla T4


In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [None]:
import torch
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import json
from sklearn.model_selection import train_test_split

In [None]:
# Load and prepare dataset
with open('total_normalized_data.json', 'r') as file:
    dataset = json.load(file)

dataset = Dataset.from_dict({
    key: [item[key] for item in dataset]
    for key in dataset[0].keys()
})

In [None]:
# Split dataset
dataset_dict = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_val_split = dataset_dict['train'].train_test_split(test_size=0.111, shuffle=True, seed=42)

dataset = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': dataset_dict['test']
})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'intent', 'entities', 'label'],
        num_rows: 2222
    })
    validation: Dataset({
        features: ['text', 'intent', 'entities', 'label'],
        num_rows: 278
    })
    test: Dataset({
        features: ['text', 'intent', 'entities', 'label'],
        num_rows: 625
    })
})

In [None]:
dataset["train"][1]

{'text': 'Update the reminder to book the conference room on October fifth atomic number 85 I PM.',
 'intent': 'update_remainder',
 'entities': [{'date': None,
   'time': None,
   'title': 'book the conference room'},
  {'date': 'October 5th', 'time': None, 'title': None},
  {'date': None, 'time': '1 PM', 'title': None}],
 'label': 'update_remainder'}

In [None]:
# Map intents to numerical labels
unique_intents = list(set(dataset["train"]["intent"]))
intent_to_label = {intent: idx for idx, intent in enumerate(unique_intents)}

def map_intent_to_label(example):
    example["label"] = intent_to_label[example["intent"]]
    return example

dataset = dataset.map(map_intent_to_label)

Map:   0%|          | 0/2222 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

In [None]:
intent_to_label

{'add_task': 0,
 'add_remainder': 1,
 'update_remainder': 2,
 'delete_remainder': 3,
 'delete_task': 4,
 'update_task': 5}

In [None]:
# Tokenize dataset
MODEL_NAME = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2222 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

In [None]:
# Split into train, validation, and test sets
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

In [None]:
# Initialize model
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_intents)
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)



In [None]:
# Define accuracy metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3484,0.204196,0.938849
2,0.1106,0.138377,0.953237
3,0.1023,0.144487,0.94964


TrainOutput(global_step=417, training_loss=0.3036820813477468, metrics={'train_runtime': 265.7264, 'train_samples_per_second': 25.086, 'train_steps_per_second': 1.569, 'total_flos': 438490321302528.0, 'train_loss': 0.3036820813477468, 'epoch': 3.0})

In [None]:
results = trainer.evaluate(test_dataset)
print("Test Results:", results)

Test Results: {'eval_loss': 0.28592729568481445, 'eval_accuracy': 0.9152, 'eval_runtime': 4.4886, 'eval_samples_per_second': 139.243, 'eval_steps_per_second': 8.912, 'epoch': 3.0}


In [None]:
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [None]:
# Load the saved model and tokenizer
MODEL_NAME = "./results"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
label_to_intent  = {v: k for k, v in intent_to_label.items()}

def predict_intent(text, model, tokenizer):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()
    return predicted_class


text = "set an alram for 8 pm tomorrow"
predicted_class = predict_intent(text, model, tokenizer)
print(f"Predicted Intent Class: {label_to_intent[predicted_class]}")

Predicted Intent Class: add_task


In [25]:
import shutil

folder_path = '/content/results'
shutil.make_archive('results', 'zip', folder_path)


'/content/results.zip'

In [None]:
from google.colab import files

files.download('results.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>