In [None]:

%%capture
!pip install datasets
!pip install seqeval

# import necessary libraries
import pandas as pd
import json
from datasets import load_dataset
from transformers import BertTokenizerFast, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification, pipeline
import numpy as np
import datasets

# Load the "eriktks/conll2003" dataset with `datasets.load_dataset`.
# Any failure is printed and then re-raised, so nothing below runs on error.
try:
    data = load_dataset("eriktks/conll2003")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Load the "bert-base-uncased" fast tokenizer. The alignment function defined
# further down calls `word_ids()` on this tokenizer's output.
try:
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    raise

# Tokenize one training record (index 1) as a quick sanity check.
# Despite its name, `example_text` is a whole dataset record; only its
# "tokens" field is passed to the tokenizer. `tokenized_input` is not read
# again anywhere in the visible code.
try:
    example_text = data['train'][1]
    tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)
except Exception as e:
    print(f"Error tokenizing example text: {e}")
    raise

# define tokenization function
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align NER tags to the tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and a parallel "ner_tags" entry (one tag id per word).
        label_all_tokens: When truthy, every token of a word receives that
            word's tag. Otherwise only the first token of each word is tagged
            and the remaining tokens of that word get -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one list of label ids per sentence.

    Raises:
        Exception: Any error from tokenization or alignment is printed and
            re-raised unchanged.
    """
    try:
        encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
        aligned = []
        for row, word_tags in enumerate(examples["ner_tags"]):
            row_labels = []
            last_word = None
            for current_word in encoded.word_ids(batch_index=row):
                if current_word is None:
                    # Position with no source word (e.g. a special token).
                    # -100 entries are dropped again in compute_metrics;
                    # presumably the loss ignores them too -- TODO confirm.
                    tag = -100
                elif current_word != last_word or label_all_tokens:
                    # First token of a word, or a continuation token that
                    # should inherit the word's tag.
                    tag = word_tags[current_word]
                else:
                    tag = -100
                row_labels.append(tag)
                last_word = current_word
            aligned.append(row_labels)
        encoded["labels"] = aligned
        return encoded
    except Exception as e:
        print(f"Error during tokenization and alignment: {e}")
        raise

# Tokenize every split of the dataset. `batched=True` matches
# tokenize_and_align_labels, which iterates over a batch of "ner_tags" rows.
try:
    tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)
except Exception as e:
    print(f"Error mapping dataset: {e}")
    raise

# Load the base checkpoint with a token-classification head.
# NOTE(review): num_labels=9 is hard-coded; confirm it equals
# len(label_list) (the "ner_tags" class names read further down).
try:
    model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Training configuration: evaluate once per epoch, batch size 16 for both
# training and evaluation, 2 epochs.
# "test-ner" is the first positional argument -- presumably the output
# directory; verify against the TrainingArguments signature.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    "test-ner",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
)

# Collator handed to the Trainer below; built from the same tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load the seqeval metric and the tag names of the "ner_tags" feature.
# `label_list` maps integer tag ids to names; it is read by compute_metrics
# and by the config update after training.
# NOTE(review): `datasets.load_metric` is deprecated and reportedly absent
# from newer `datasets` releases (the separate `evaluate` package replaces
# it) -- confirm against the installed version.
# The error message below mentions only the metric, but the same handler
# also covers a failure of the `label_list` lookup.
try:
    metric = datasets.load_metric("seqeval")
    label_list = data["train"].features["ner_tags"].feature.names
except Exception as e:
    print(f"Error loading metric: {e}")
    raise

# define compute metrics function
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: Pair ``(logits, labels)``. The logits are arg-maxed over
            axis 2 to obtain one predicted tag id per position; ``labels``
            holds the reference tag ids, with -100 marking positions to skip.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" results.

    Raises:
        Exception: Any error is printed and re-raised unchanged.
    """
    try:
        logits, gold = eval_preds
        pred_ids = np.argmax(logits, axis=2)

        predictions = []
        true_labels = []
        for pred_row, gold_row in zip(pred_ids, gold):
            # Keep only positions with a real reference label, then map both
            # the predicted and the reference ids to their tag names.
            kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
            predictions.append([label_list[p] for p, _ in kept])
            true_labels.append([label_list[g] for _, g in kept])

        scores = metric.compute(predictions=predictions, references=true_labels)
        return {
            "precision": scores["overall_precision"],
            "recall": scores["overall_recall"],
            "f1": scores["overall_f1"],
            "accuracy": scores["overall_accuracy"],
        }
    except Exception as e:
        print(f"Error computing metrics: {e}")
        raise

# Build the Trainer and run training.
# Trains on the "train" split and evaluates on the "validation" split, using
# the collator and compute_metrics defined above.
# Construction and training share one try block, so the "model training"
# message is also printed when the Trainer itself fails to build.
try:
    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()
except Exception as e:
    print(f"Error during model training: {e}")
    raise

# Save the trained model to "ner_model" and the tokenizer to a separate
# "tokenizer" directory. The inference step further down reuses the in-memory
# tokenizer rather than reloading it from disk.
try:
    model.save_pretrained("ner_model")
    tokenizer.save_pretrained("tokenizer")
except Exception as e:
    print(f"Error saving model or tokenizer: {e}")
    raise

# Rewrite the saved model config so it carries human-readable label names.
# - Both file handles are opened with `with`, so they are closed (and the
#   write is flushed) before the model is reloaded from "ner_model" below.
# - "id2label" keys are strings because JSON object keys must be strings.
# - "label2id" values are written as integers, the id type the labels have
#   in `label_list`, instead of stringified ids.
try:
    config_path = "ner_model/config.json"
    with open(config_path, encoding="utf-8") as config_file:
        config = json.load(config_file)
    config["id2label"] = {str(i): label for i, label in enumerate(label_list)}
    config["label2id"] = {label: i for i, label in enumerate(label_list)}
    with open(config_path, "w", encoding="utf-8") as config_file:
        json.dump(config, config_file)
except Exception as e:
    print(f"Error updating model config: {e}")
    raise

# Reload the model from the "ner_model" directory, which also picks up the
# config.json rewritten in the previous step.
try:
    model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")
except Exception as e:
    print(f"Error loading fine-tuned model: {e}")
    raise

# Run the reloaded model on a sample paragraph through a "ner" pipeline and
# print one line per returned item. Each item is read via its 'word',
# 'entity' and 'score' keys; the score is formatted to four decimals.
try:
    ner_pipeline = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)
    test_sentence = "Apple Inc. is a technology company based in Cupertino, California. Tim Cook is the CEO of Apple. The company was founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne. Apple's headquarters are located in the United States."
    ner_results = ner_pipeline(test_sentence)
    for entity in ner_results:
        print(f"Word: {entity['word']}, Entity: {entity['entity']}, Confidence: {entity['score']:.4f}")
except Exception as e:
    print(f"Error during inference: {e}")
    raise

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
