In [1]:
import subprocess

def ensure_t4_gpu():
    # Check current GPU type
    gpu_info = subprocess.run(["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"], capture_output=True, text=True)
    gpu_name = gpu_info.stdout.strip()

    if "T4" in gpu_name:
        print(f"✅ T4 GPU is already assigned: {gpu_name}")
    else:
        print(f"⚠️ Current GPU is {gpu_name}. Restarting runtime for T4 assignment...")
        from google.colab import runtime
        runtime.unassign()  # Force runtime restart

# Run the function
ensure_t4_gpu()


✅ T4 GPU is already assigned: Tesla T4


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
# Install necessary libraries (if not installed yet)
# !pip install transformers datasets

import pandas as pd
from datasets import Dataset
from transformers import BertTokenizerFast, BertForTokenClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import numpy as np

# Step 1: Load the dataset
# Assuming you have a CSV file 'ner_dataset.csv' with 'tokens' and 'ner_tags' columns
df = pd.read_csv('ner_dataset.csv')

# Step 2: Prepare the dataset
# Convert the lists in the columns to actual Python lists (if they are stored as strings in CSV)
df['tokens'] = df['tokens'].apply(eval)
df['ner_tags'] = df['ner_tags'].apply(eval)

# Convert to Hugging Face dataset format
dataset = Dataset.from_pandas(df)

# Step 3: Prepare the label map
label_list = ['O', 'B-TITLE', 'I-TITLE', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME']  # Modify based on your dataset
label_map = {label: i for i, label in enumerate(label_list)}

# Function to map NER labels to integers
def map_labels(example):
    example['ner_tags'] = [label_map[label] for label in example['ner_tags']]
    return example

# Step 4: Map the labels in the dataset
dataset = dataset.map(map_labels)

# Step 5: Initialize the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Tokenization function with label alignment
def tokenize_and_align_labels(examples):
    # Tokenize the 'tokens' column, which contains a list of words
    tokenized_inputs = tokenizer(examples['tokens'], padding=True, truncation=True, max_length=128, is_split_into_words=True)

    # Align labels with tokens (convert to the correct shape)
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their word IDs

        # Align labels with tokenized words and convert to integers using the label_map
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)

    # Add the labels to the tokenized inputs
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

# Step 6: Apply tokenization and label alignment to the dataset
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Split the dataset into train and test sets (if not already split)
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 7: Initialize the BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))

# Step 8: Define the compute_metrics function to evaluate accuracy
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=2)  # Get the most probable predictions (class with the highest score)
    labels = p.label_ids
    # Mask out -100 (padding) labels so they don't contribute to the accuracy calculation
    mask = labels != -100
    preds = preds[mask]
    labels = labels[mask]
    accuracy = accuracy_score(labels, preds)
    return {"accuracy": accuracy}

# Step 9: Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy="epoch",     # Evaluate after each epoch
    save_strategy="epoch",          # Save the model after each epoch
)

# Step 10: Set up the Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=compute_metrics      # use the compute_metrics function to calculate accuracy
)

# Step 11: Start the training process
trainer.train()

# Step 12: Evaluate on the test set after training
test_results = trainer.evaluate(test_dataset)
print("Test Accuracy:", test_results['eval_accuracy'])

# Step 13: Evaluate on the train set after training (optional)
train_results = trainer.evaluate(train_dataset)
print("Train Accuracy:", train_results['eval_accuracy'])

# Optional: Save the model and tokenizer
model.save_pretrained('./ner_model')
tokenizer.save_pretrained('./ner_model')


Map:   0%|          | 0/812 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/812 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.44878,0.895577
2,No log,0.060241,0.986077
3,No log,0.045007,0.992219


Test Accuracy: 0.9922194922194922
Train Accuracy: 0.9971889640812077


('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

In [4]:
from transformers import BertForTokenClassification, BertTokenizerFast

# Load the trained model and tokenizer
model = BertForTokenClassification.from_pretrained('./ner_model')
tokenizer = BertTokenizerFast.from_pretrained('./ner_model')


In [15]:
import torch

# Function for inference
def predict_ner(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, is_split_into_words=True, max_length=128)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted labels
    predictions = torch.argmax(outputs.logits, dim=2)

    # Decode the predictions (convert label indices back to label names)
    predicted_labels = predictions[0].cpu().numpy()  # Assuming batch size is 1
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Map predictions back to label names
    label_map = {i: label for i, label in enumerate(label_list)}  # Use the same label list from earlier
    predicted_labels = [label_map[label] for label in predicted_labels]

    # Filter out special tokens [CLS], [SEP], and padding
    filtered_tokens = []
    filtered_labels = []
    for token, label in zip(tokens, predicted_labels):
        if token not in ['[CLS]', '[SEP]'] :  # Ignore special tokens and labels that are "O"
            filtered_tokens.append(token)
            filtered_labels.append(label)

    # Return the tokens and their predicted labels
    return list(zip(filtered_tokens, filtered_labels))

# Example text for NER
example_text = "delete the alram of meeting with fazil which is set for 8 pm tomorrow"
# example_text = "set an alarm for meeting with fazil tomorrow at 8 pm"
example_text = example_text.split()

# Get NER predictions
predictions = predict_ner(example_text)
print(predictions)


[('del', 'O'), ('##ete', 'O'), ('the', 'O'), ('al', 'O'), ('##ram', 'O'), ('of', 'O'), ('meeting', 'B-TITLE'), ('with', 'I-TITLE'), ('fa', 'I-TITLE'), ('##zi', 'I-TITLE'), ('##l', 'I-TITLE'), ('which', 'O'), ('is', 'O'), ('set', 'O'), ('for', 'O'), ('8', 'B-TIME'), ('pm', 'I-TIME'), ('tomorrow', 'B-DATE')]


In [16]:
def extract_entities(a):
    title, date, time = [], [], []

    entity_dict = {"B-TITLE": title, "I-TITLE": title,
                   "B-DATE": date, "I-DATE": date,
                   "B-TIME": time, "I-TIME": time}

    for word, tag in a:
        if tag in entity_dict:
            if word.startswith("##"):
                entity_dict[tag][-1] += word[2:]  # Merge with previous word
            else:
                entity_dict[tag].append(word)  # Add new word

    # Join words to form meaningful phrases
    title = " ".join(title) if title else None
    date = " ".join(date) if date else None
    time = " ".join(time) if time else None

    return [title, date, time]

a = predictions

print(extract_entities(a))


['meeting with fazil', 'tomorrow', '8 pm']
