In [None]:
!pip install datasets evaluate transformers seqeval
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np

In [None]:
# Load the "conll2003" dataset through `datasets.load_dataset`.
raw_datasets = load_dataset("conll2003")

In [None]:
# Inspect the object returned by load_dataset (cell output only, no state change).
raw_datasets

In [None]:
# Inspect the feature schema of the training split (column names and types).
raw_datasets['train'].features

In [None]:
# Look at the first training example as a plain record.
raw_datasets['train'][0]

In [None]:
# Tag names of the `ner_tags` column, in id order (position == integer tag id).
train_features = raw_datasets["train"].features
labels = train_features["ner_tags"].feature.names

In [None]:
# Two-way lookup between integer tag ids and tag names.
id2label = dict(enumerate(labels))
label2id = {name: tag_id for tag_id, name in id2label.items()}

In [None]:
# Show both mappings, one per line.
print(id2label, label2id, sep="\n")

In [None]:
# Checkpoint name shared by the tokenizer here and by the model loaded later.
model_checkpoint = "bert-base-cased"
# Tokenizer matching the checkpoint; used for all tokenization below.
tokenizer=AutoTokenizer.from_pretrained(model_checkpoint)


In [None]:
# Tokenize the first training sentence and print, side by side, the original
# words, the sub-word tokens, the full encoding, the word-level tags and the
# token -> word index mapping.
# Index the row first: dataset[0]['col'] reads a single row, whereas
# dataset['col'][0] materializes the entire column before taking one item.
first_example = raw_datasets['train'][0]
print(first_example['tokens'])
encoding = tokenizer(first_example['tokens'], is_split_into_words=True)
print(encoding.tokens())
print(encoding)
print(first_example['ner_tags'])
print(encoding.word_ids())

In [None]:
def aligen(word_id,label):
    """Expand word-level tag ids to one label per sub-word token.

    Args:
        word_id: sequence with, for every token, the index of the word it was
            produced from, or None for tokens that belong to no word.
        label: word-level integer tag ids, indexed by word index.

    Returns:
        A list with one entry per token:
        * -100 for tokens whose word index is None,
        * the word's own tag for the first token of a word,
        * for the following tokens of the same word, the word's tag with odd
          ids bumped by one.

    Note:
        The odd -> odd + 1 rule assumes a tag scheme in which every odd id is a
        "B-" tag immediately followed by its "I-" counterpart - confirm this
        against the dataset's label list before reusing with other data.
    """
    new_label = []
    current_word = None
    for idd in word_id:
        if idd is None:
            # Token without a word: mark it with -100 and forget the current
            # word so a later token is never treated as a continuation across it.
            new_label.append(-100)
            current_word = None
            continue
        tag = label[idd]
        if idd == current_word and tag % 2 == 1:
            # Continuation of the same word: odd id -> next (even) id.
            tag += 1
        current_word = idd
        new_label.append(tag)
    return new_label

In [None]:
def tokenize_function(example):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Intended for use with a batched `map`: `example['tokens']` is a list of
    word lists and `example['ner_tags']` the matching list of word-level tag
    id lists.

    Args:
        example: batch mapping with the keys 'tokens' and 'ner_tags'.

    Returns:
        The tokenizer output for the batch with an extra 'labels' entry holding,
        for every sentence, the labels produced by `aligen` for its tokens.
    """
    tokenize_input = tokenizer(example['tokens'], is_split_into_words=True, truncation=True)
    # word_ids(idx) gives the token -> word mapping of the idx-th sentence in
    # the batch; aligen spreads that sentence's word tags over its tokens.
    tokenize_input['labels'] = [
        aligen(tokenize_input.word_ids(idx), label)
        for idx, label in enumerate(example['ner_tags'])
    ]
    return tokenize_input
        
    

In [None]:
# Tokenize every split in batches and drop the original columns so that only
# the tokenizer outputs and the aligned 'labels' remain.
# NOTE(review): this rebinds `raw_datasets` to the tokenized version. Because the
# source columns are removed, re-running this cell (or the earlier cells that
# read 'tokens' / 'ner_tags') requires reloading the dataset first.
raw_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
# Inspect the datasets again after the map call above replaced their columns.
raw_datasets

In [None]:
# Sanity check: the first example must have exactly one label per input id.
# Fetch the row once instead of materializing two whole columns.
first_tokenized = raw_datasets['train'][0]
print(len(first_tokenized['input_ids']))
print(len(first_tokenized['labels']))

In [None]:
# Collator that pads inputs and labels of a batch to a common length.
# return_tensors defaults to "pt" (PyTorch); request TensorFlow tensors
# explicitly so this TensorFlow notebook does not depend on torch being installed.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Collate the first two training examples and look at the padded labels.
first_two = [raw_datasets["train"][row] for row in range(2)]
batch = data_collator(first_two)
batch["labels"]

In [None]:
# Model input columns shared by all three tf.data pipelines.
feature_columns = ['input_ids', 'token_type_ids', 'attention_mask']

# Training data: shuffled, labels delivered separately from the inputs.
train_data = raw_datasets['train'].to_tf_dataset(
    columns=feature_columns,
    label_cols='labels',
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)
# Validation data: same layout, but not shuffled - the order of examples does
# not matter for evaluation, so shuffling only adds work.
valid_data = raw_datasets['validation'].to_tf_dataset(
    columns=feature_columns,
    label_cols='labels',
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
)
# Test data: 'labels' is kept inside the input dict (no label_cols) so each
# batch can be read as batch['labels'] during manual evaluation.
test_data = raw_datasets['test'].to_tf_dataset(
    columns=feature_columns + ['labels'],
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:

# Load the checkpoint with a token-classification head and store both label
# mappings in the model config.
model=TFAutoModelForTokenClassification.from_pretrained(model_checkpoint,label2id=label2id,id2label=id2label)

In [None]:
# Verify that the id -> label mapping was stored in the model config.
model.config.id2label

In [None]:


# Total number of optimizer steps = batches per epoch x number of epochs;
# the learning-rate schedule is built for exactly that many steps.
num_epochs = 5
num_train_steps = num_epochs * len(train_data)

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    weight_decay_rate=0.01,
)
# No loss is passed to compile here.
model.compile(optimizer=optimizer)

In [None]:
# Fine-tune for num_epochs epochs, evaluating on the validation set each epoch.
model.fit(train_data, validation_data=valid_data, epochs=num_epochs)

In [None]:
# Collect gold and predicted tag names for every test token that has a real
# label (positions labelled -100 are skipped).
# NOTE: `labels` is rebound here to the flat list of gold tag names.
labels = []
predictions = []
for batch in test_data:
    # predict_on_batch runs a single forward pass on the batch; calling
    # predict() inside a Python loop adds per-call overhead for every batch.
    logits = model.predict_on_batch(batch)['logits']
    predicted_ids = np.argmax(logits, axis=-1)
    gold_ids = np.asarray(batch['labels'])
    # Boolean masking flattens row by row, so the token order is the same as
    # iterating over sentences and then over tokens.
    keep = gold_ids != -100
    labels.extend(id2label[int(tag)] for tag in gold_ids[keep])
    predictions.extend(id2label[int(tag)] for tag in predicted_ids[keep])

In [None]:
# Load the 'seqeval' metric through the `evaluate` library.
metrics=evaluate.load('seqeval')

In [None]:
# Both lists must have the same length (one entry per evaluated token).
print(len(predictions), len(labels), sep="\n")


In [None]:
# `labels` and `predictions` are flat lists over the whole test set, so wrapping
# each in a list passes the entire test set to the metric as ONE sequence.
# NOTE(review): sentence boundaries are therefore not visible to the metric -
# confirm this is acceptable, or collect one list per sentence instead.
metrics.compute(references=[labels],predictions=[predictions])

In [None]:
from transformers import pipeline

# Wrap the in-memory fine-tuned model and its tokenizer in an inference
# pipeline (a saved checkpoint could be passed as `model` instead), then tag
# one sample sentence.
token_classifier = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")