In [None]:
!pip install datasets
!pip install tokenizers
!pip install transformers
!pip install seqeval
!pip install accelerate==0.24.1
!pip install wandb

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.14

# NER þjálfun

In [None]:
from datasets import load_dataset

# Load the Icelandic ("is") configuration of the WikiANN dataset.
dataset = load_dataset("wikiann", "is")

# Names of the NER tags, read from the feature schema of the train split.
ner_tag_feature = dataset["train"].features["ner_tags"]
label_names = ner_tag_feature.feature.names

Downloading builder script:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/617k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/131k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the multilingual BERT checkpoint.
checkpoint_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize pre-split words and align the word-level NER tags with the sub-tokens.

  Intended to be used with `Dataset.map(..., batched=True)`: the returned
  encoding (input_ids, token_type_ids, attention_mask) plus the new "labels"
  entry are added to the dataset as columns.

  Args:
    all_samples_per_split: batch with a "tokens" entry (one list of words per
      sample) and a "ner_tags" entry (one list of integer tag ids per sample).

  Returns:
    The tokenizer's batch encoding with an added "labels" entry holding one
    label id per sub-token. Positions that do not belong to any word (special
    tokens) get -100; every sub-token of a word gets that word's tag id.
  """
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True)

  total_adjusted_labels = []
  for k, existing_label_ids in enumerate(all_samples_per_split["ner_tags"]):
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    # Look the tag up by the word id itself rather than by a running counter:
    # a counter falls out of step with the tags as soon as a word produces no
    # sub-tokens and its word id is therefore skipped.
    adjusted_label_ids = [
      -100 if wid is None else existing_label_ids[wid]
      for wid in word_ids_list
    ]
    total_adjusted_labels.append(adjusted_label_ids)

  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

# Apply the tokenization/label alignment to every split, one batch at a time.
tokenized_dataset = dataset.map(
    function=tokenize_adjust_labels,
    batched=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

1000


In [None]:
from transformers import DataCollatorForTokenClassification

# Collator that builds the training batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import getpass
import os
import wandb

# SECURITY: never store the Weights & Biases API key in the notebook. A key was
# previously hard-coded in this cell; it must be treated as leaked and revoked.
# The key is now taken from the environment, or asked for interactively
# (without echoing it) when the environment does not provide one.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass.getpass("Weights & Biases API key: ")

# Where the training run is logged.
os.environ["WANDB_ENTITY"]="ofurtumi"
os.environ["WANDB_PROJECT"]="finetune_bert_ner"

In [None]:
import numpy as np
from datasets import load_metric
from transformers import AutoModelForTokenClassification, Trainer, TrainingArguments

# seqeval metric object used to score the predicted tag sequences.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Score a batch of predictions with the seqeval metric.

    Args:
        p: pair ``(predictions, labels)``. The predictions are reduced with an
            argmax over axis 2; the labels are label ids in which -100 marks
            positions to ignore.

    Returns:
        Dict with the four ``overall_*`` values reported by the metric plus one
        ``<key>_f1`` entry for every other key in the metric result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Translate ids to tag names, dropping ignored positions (label -100).
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        pred_tags = []
        gold_tags = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                continue
            pred_tags.append(label_names[pred_id])
            gold_tags.append(label_names[gold_id])
        true_predictions.append(pred_tags)
        true_labels.append(gold_tags)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Keep the overall numbers as they are ...
    overall_keys = (
        "overall_precision",
        "overall_recall",
        "overall_f1",
        "overall_accuracy",
    )
    flattened_results = {key: results[key] for key in overall_keys}

    # ... and reduce every remaining entry to its f1 value.
    for key, value in results.items():
        if key not in flattened_results:
            flattened_results[key + "_f1"] = value["f1"]

    return flattened_results


  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
# Multilingual BERT with a fresh token-classification head, one output per NER tag.
model = AutoModelForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(label_names))

# Evaluation and logging run once per epoch. The previous step-based setup
# (evaluation_strategy="steps", logging_steps=1000) never triggered: the whole
# run is only 441 optimizer steps (7 epochs of 1000 samples at batch size 16),
# so neither training loss nor validation metrics were ever reported.
training_args = TrainingArguments(
    output_dir="./fine_tune_bert_output",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    report_to="wandb",
    # NOTE(review): the run name says "ep_10" while num_train_epochs is 7 - confirm
    # which one is intended.
    run_name = "ep_10_tokenized_11",
    save_strategy='no'
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# Close the Weights & Biases run once training is done.
wandb.finish()

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mofurtumi[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁
train/global_step,▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,7.0
train/global_step,441.0
train/total_flos,171426586878624.0
train/train_loss,0.20669
train/train_runtime,4667.2368
train/train_samples_per_second,1.5
train/train_steps_per_second,0.094


In [None]:
# Run the fine-tuned model on the test split.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

# Pick the highest-scoring class for every token.
predictions = np.argmax(predictions, axis=2)

# Map ids to tag names, skipping positions whose gold label is -100.
true_predictions = [
    [label_names[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_names[gold_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 0.8496058217101273,
  'recall': 0.8563569682151589,
  'f1': 0.8529680365296803,
  'number': 1636},
 'ORG': {'precision': 0.7937995674116799,
  'recall': 0.7943722943722944,
  'f1': 0.7940858276235123,
  'number': 1386},
 'PER': {'precision': 0.8893178893178894,
  'recall': 0.8847631241997439,
  'f1': 0.8870346598202824,
  'number': 781},
 'overall_precision': 0.8373983739837398,
 'overall_recall': 0.8396003155403629,
 'overall_f1': 0.8384978991596639,
 'overall_accuracy': 0.9297525576968831}

Meðal F1-skorið (overall_f1) á prófunargögnunum var um það bil 0,84.

## heimildir:

https://www.analyticsvidhya.com/blog/2022/06/fine-tune-bert-model-for-named-entity-recognition-in-google-colab/

https://www.freecodecamp.org/news/getting-started-with-ner-models-using-huggingface/

# Plan

```mermaid
timeline
title Lokaverkefni
section Scraping
    29/10-4/11: snerpa
    5/11-11/11: sol.heimsnet
    12/11-18/11: guitarparty
section Gagnavinnsla
    19/11-26/11: Hreinsun: Filtering: Flokkun
section Þjálfun
    27/11-3/12: Þjálfa gpt2
section Skil
    4/12-10/12: Skýrsla
```

Hérna er tímalína fyrir verkefnið, mesti tíminn mun fara í að safna og vinna úr gögnum. Þar sem að fókus á verkefnið er að búa til þetta gagnasafn þá set ég minni tíma í að þjálfa módel.

Ég er byrjaður að safna gögnum og reikna með að hver síða muni taka um það bil viku. Þetta hljómar eins og mikill tími til að scrape-a eina síðu, það er vegna þess að ég padda aðeins til að gera ráð fyrir vinnu og öðrum áföngum.

Gagnavinnslan felst í því að taka nú söfnuðu gögnin mín og taka út endurtekna texta, ef einhverjir eru og setja á form sem hægt væri að nota til að fínþjálfa gpt2

Þjálfunin útskýrir sig frekar sjálf, þjálfa gpt-2 til að geta búið til texta.

Skil, þar sem það má gera video þá mun ég líklega gera það. Þar mun ég sýna rannsóknarferlið (að leita að síðum með texta), skröpunina, vinnslu og að lokum þjálfun