In [34]:
%pip install transformers datasets huggingface_hub ipywidgets pyarrow==11.0.0 transformers[torch] accelerate



In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import pipeline

token_classifier = pipeline(
  "token-classification",
  "HooshvareLab/bert-base-parsbert-ner-uncased",
  grouped_entities=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-ner-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical

In [4]:
token_classifier(
    "سلام من مهدی هستم،‌ امروز جمعه است و در حال پیاده سازی تمرین چهارم درس پردازش زبان طبیعی هستم."
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'person',
  'score': 0.99806577,
  'word': 'مهدی',
  'start': 8,
  'end': 12},
 {'entity_group': 'date',
  'score': 0.93445253,
  'word': 'امروز جمعه',
  'start': 20,
  'end': 30}]

In [5]:
from datasets import load_dataset

train_path = "train.csv"
dev_path = "validation.csv"
test_path = "test.csv"
data = load_dataset(
    "arman",
    data_files={
        "train": train_path,
        "validation": dev_path,
        "test": test_path,
    },
)

In [6]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10241
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5121
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7681
    })
})

In [7]:
label_names = ['B-pers','I-pers', 'B-pro', 'I-pro','B-loc','I-loc','B-fac','I-fac','B-event','I-event','B-org','I-org','O']

In [8]:
len(label_names)

13

In [9]:
def map_labels(example):
    example['text'] = example['text'].split(' ')  # Split text by space
    ner_tags = example['label'].split(' ')  # Split labels by space
    for i, tag in enumerate(ner_tags):
        ner_tags[i] = label_names.index(tag)
    example['label'] = ner_tags  # Map label to index
    return example

# Apply the function to create a new dataset
data = data.map(map_labels)


In [11]:
data["train"].column_names

['text', 'label']

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-ner-uncased")

In [11]:
example = data["train"][10]

inputs = tokenizer(
    example["text"],
    is_split_into_words=True,
)

inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
inputs["input_ids"]

[2,
 3500,
 34801,
 2028,
 1,
 7247,
 4317,
 4132,
 2232,
 2192,
 331,
 2037,
 2426,
 2031,
 15675,
 3673,
 4785,
 12473,
 34801,
 300,
 15661,
 4119,
 28238,
 1157,
 12189,
 10008,
 50239,
 1173,
 1198,
 331,
 2692,
 12288,
 6558,
 12556,
 300,
 2202,
 2101,
 2334,
 4401,
 7209,
 2232,
 3359,
 15,
 4]

In [13]:
len(inputs["input_ids"])

44

In [14]:
len(example["label"])

38

In [15]:
def align_labels_with_input_ids(word_ids, old_labels):
  """
  Returns new labels which are of the same length as the word ids.

  Example inputs:

  # word_ids = [None, 0, 0, 0, 1, 2, 3, 4, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 14, 14, 14, 14, 14, 14, 14, 15, None]
  # labels = [1, 2, 0, 0, 9, 10, 0, 0, 9, 0, 0, 0, 0, 0, 9, 0]

  Output:

  """

  new_labels = []
  prev_word_id = None

  for word_id in word_ids:
    if word_id is None:
      # if the word_id is None, i.e. the token is [CLS] or [SEP]
      new_labels.append(-100)
    else:
      label = old_labels[word_id]
      if prev_word_id == word_id and label % 2 == 1:
        # label is intermediate i.e. I-XXX
          label += 1
      new_labels.append(label)
    prev_word_id = word_id

  return new_labels

In [16]:
new_labels = align_labels_with_input_ids(inputs.word_ids(), example["label"])

In [17]:
len(inputs["input_ids"])

44

In [18]:
len(new_labels)

44

In [19]:
def tokenize_and_align(examples):
  # tokenize examples
  model_inputs = tokenizer(
      examples["text"],
      truncation=True,
      is_split_into_words=True,
  )

  # align labels
  model_inputs["labels"] = []
  # iterate over each example
  for i in range(len(model_inputs["input_ids"])):
    # get word_ids
    word_ids = model_inputs.word_ids(i)
    # get labels
    ner_tags = examples["label"][i]
    # compute new labels
    new_labels = align_labels_with_input_ids(word_ids, ner_tags)
    # store new labels
    model_inputs["labels"].append(new_labels)

  return model_inputs

In [20]:
tokenized_datasets = data.map(
    tokenize_and_align,
    batched=True,
    remove_columns=data["train"].column_names,
)

Map:   0%|          | 0/5121 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 10241
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5121
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 7681
    })
})

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [23]:
batch_pre_collation = [ tokenized_datasets["train"][i] for i in range(5) ]

In [24]:
for example in batch_pre_collation:
  print(f">>> Length: {len(example['input_ids'])}")

>>> Length: 238
>>> Length: 41
>>> Length: 55
>>> Length: 28
>>> Length: 21


In [25]:
batch_collated = data_collator(batch_pre_collation)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [26]:
for example in batch_collated["input_ids"]:
  print(f">>> Length: {len(example)}")

>>> Length: 238
>>> Length: 238
>>> Length: 238
>>> Length: 238
>>> Length: 238


In [27]:
columns = ["attention_mask", "input_ids", "labels", "token_type_ids"]
batch_size = 16

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=columns,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=columns,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [28]:
id2label = { i:k for i, k in enumerate(label_names) }
label2id = { v:k for k, v in id2label.items() }

In [29]:
id2label

{0: 'B-pers',
 1: 'I-pers',
 2: 'B-pro',
 3: 'I-pro',
 4: 'B-loc',
 5: 'I-loc',
 6: 'B-fac',
 7: 'I-fac',
 8: 'B-event',
 9: 'I-event',
 10: 'B-org',
 11: 'I-org',
 12: 'O'}

In [30]:
label2id

{'B-pers': 0,
 'I-pers': 1,
 'B-pro': 2,
 'I-pro': 3,
 'B-loc': 4,
 'I-loc': 5,
 'B-fac': 6,
 'I-fac': 7,
 'B-event': 8,
 'I-event': 9,
 'B-org': 10,
 'I-org': 11,
 'O': 12}

In [31]:
from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification

# AutoModelForTokenClassification.from_pretrained(
#     "HooshvareLab/bert-base-parsbert-ner-uncased",
#     label2id=label2id,
#     id2label=id2label,
#     ignore_mismatched_sizes=True,
# )
config = AutoConfig.from_pretrained("HooshvareLab/bert-base-parsbert-ner-uncased")

config.label2id = label2id
config.id2label = id2label
config._num_labels = len(label2id)

parsbert = AutoModel.from_pretrained('HooshvareLab/bert-base-parsbert-ner-uncased')
model = AutoModelForTokenClassification.from_config(config)
model.bert = parsbert
# Now you can use the model for your tasks


In [32]:
model.config.num_labels

13

In [45]:
!pip install accelerate -U




In [46]:
!pip install transformers[torch] accelerate




In [48]:
%load_ext autoreload
%autoreload 2

In [33]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0568,0.003001


Epoch,Training Loss,Validation Loss
1,0.0568,0.003001
2,0.0047,0.000938
3,0.0026,0.000556


TrainOutput(global_step=1923, training_loss=0.016977896029541285, metrics={'train_runtime': 657.0357, 'train_samples_per_second': 46.76, 'train_steps_per_second': 2.927, 'total_flos': 1278768416373246.0, 'train_loss': 0.016977896029541285, 'epoch': 3.0})

In [41]:
# trainer.save_model("parsbert-finetuned-ner-arman")


In [56]:
trainer.model.save_pretrained("./models/bert-finetuned-ner-arman")
# trainer.save_model("parsbert-finetuned-ner-arman")


In [59]:
model.push_to_hub('bert-finetuned-ner-arman')

model.safetensors:   0%|          | 0.00/651M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MahdiGheidi/bert-finetuned-ner-arman/commit/6ad46eba40878de4ab9d6835c631e8982755f35a', commit_message='Upload BertForTokenClassification', commit_description='', oid='6ad46eba40878de4ab9d6835c631e8982755f35a', pr_url=None, pr_revision=None, pr_num=None)

In [60]:
tokenizer.push_to_hub('bert-finetuned-ner-arman')

CommitInfo(commit_url='https://huggingface.co/MahdiGheidi/bert-finetuned-ner-arman/commit/3bc0f5f87d00a8a59c68832d593ee552b3fea8bc', commit_message='Upload tokenizer', commit_description='', oid='3bc0f5f87d00a8a59c68832d593ee552b3fea8bc', pr_url=None, pr_revision=None, pr_num=None)

In [61]:
from transformers import pipeline

token_classifier = pipeline(
  "token-classification",
  "MahdiGheidi/bert-finetuned-ner-arman",
  grouped_entities=True,
)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/651M [00:00<?, ?B/s]

Some weights of the model checkpoint at MahdiGheidi/bert-finetuned-ner-arman were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.81M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



In [67]:
token_classifier(
    "در استان اردبیل چشمه‌های آبگرم وجود دارند. همچنین در کشور ایران در فصل بهار و تابستان می‌توان زیبایی‌های فراوانی را یافت."
)

[{'entity_group': 'loc',
  'score': 0.99951667,
  'word': 'استان اردبیل',
  'start': 3,
  'end': 15},
 {'entity_group': 'loc',
  'score': 0.733691,
  'word': 'کشور ایران',
  'start': 53,
  'end': 63}]