In [1]:
# asennetaan transformers ja dataset

!pip install --upgrade --force-reinstall tqdm
!pip --quiet install transformers
!pip --quiet install datasets
!pip --quiet install sklearn

Collecting tqdm
  Downloading tqdm-4.62.2-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 166 kB/s 
[?25hInstalling collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.61.2
    Uninstalling tqdm-4.61.2:
      Successfully uninstalled tqdm-4.61.2
Successfully installed tqdm-4.62.2


In [2]:
# tuodaan kirjastot

from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorForTokenClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch

In [3]:
# haetaan aineisto
!wget -nc -O PER-ORG-relations-combined-group-2.tsv http://dl.turkunlp.org/TKO_8964_2021/PER-ORG-relations-combined-group-2.tsv
data = pd.read_csv(("PER-ORG-relations-combined-group-2.tsv"), header=0, names=['label','text'], sep='\t')

print(data)

File ‘PER-ORG-relations-combined-group-2.tsv’ already there; not retrieving.
                                       label  \
0421f5ce-eab2-3eb4-9bec-a4067cd3c870  reject   
17c04447-9a9f-37aa-9eba-73b368f33895  reject   
2ac2440b-9cfe-3537-a81d-f8ff139f3ff5  accept   
d147dc04-7271-3fec-bad8-611bb47a6c89  accept   
2ffd7378-a465-30d6-a1e9-01ab31827676  reject   
...                                      ...   
bedd86bc-292a-362d-a217-f59663f3fa28  accept   
d08e9051-1c74-318f-a936-9c77b3c5bb51  accept   
9bedf73b-e82a-35e4-9ca7-1a4d052c45be  accept   
5ee79906-1306-3d56-a47a-38aa399967bf  reject   
ea9b68b2-202a-3105-9c0b-82ecc7f10e6d  accept   

                                                                                   text  
0421f5ce-eab2-3eb4-9bec-a4067cd3c870  Kuolleita 1676 – Pietro Francesco Caletti - Br...  
17c04447-9a9f-37aa-9eba-73b368f33895  <ORG>Terve . fi</ORG> - sivusto 29 . 10 . 2014...  
2ac2440b-9cfe-3537-a81d-f8ff139f3ff5  22 . syyskuuta 1937 <ORG>NKVD : n</ORG

In [4]:
# jaetaan aineisto train- ja validation-osiin

x_train, x_valid, y_train, y_valid = train_test_split(data["text"], data["label"], test_size=0.2)
print("\nx_train:\n")
print(x_train.head())
print(x_train.shape)
print(y_train.head())

print("\nx_valid:\n")
print(x_valid.head())
print(x_valid.shape)
print(y_valid.head())


x_train:

3d7685a2-9339-31f1-80d1-2efcd7da0f3d    Vuoden 1979 vaalien jälkeen Koivisto nousi pää...
90561712-1244-373e-9b4b-3741c155beb9    Esille julkisuuteen nousi erityisesti hallituk...
ca2ed8ac-c5b4-3a42-886a-9e73a055a6bf    Johanna Uotinen & Sari Tuuva & <PERSON>Marja V...
d1f31a11-906a-3a1b-a0d3-1adb82ff8a95    Kokoonpanoista senverran , että lukossa pelasi...
e8a9aeaa-c027-38cf-b4fd-5bf0327ac3da    Syntyneitä 356 eaa . – Aleksanteri Suuri , mak...
Name: text, dtype: object
(1520,)
3d7685a2-9339-31f1-80d1-2efcd7da0f3d    reject
90561712-1244-373e-9b4b-3741c155beb9    accept
ca2ed8ac-c5b4-3a42-886a-9e73a055a6bf    accept
d1f31a11-906a-3a1b-a0d3-1adb82ff8a95    accept
e8a9aeaa-c027-38cf-b4fd-5bf0327ac3da    reject
Name: label, dtype: object

x_valid:

be951c25-881b-3311-a3a8-addcec6a3a02    19 . huhtikuuta – Prikaatikenraali <PERSON>Kou...
ee85681e-a875-3d29-8a02-7126dd3083fe    Jep , pointsit <PERSON>Ronkaiselle</PERSON> , ...
4727cf10-b03e-34e4-86b5-b60bd52aeda8    Berglingin l

In [5]:
# setting model, dataset and hyperparameters

MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'

LEARNING_RATE=1e-4
BATCH_SIZE=8
TRAIN_EPOCHS=1

# Magic value that comes from pytorch's default ignore_index
DUMMY_LABEL_ID = -100

In [6]:
dataset = x_train
num_labels = len(set(dataset))
num_labels

1520

In [7]:
print(dataset)
print(f'number of distinct labels: {num_labels}')

3d7685a2-9339-31f1-80d1-2efcd7da0f3d    Vuoden 1979 vaalien jälkeen Koivisto nousi pää...
90561712-1244-373e-9b4b-3741c155beb9    Esille julkisuuteen nousi erityisesti hallituk...
ca2ed8ac-c5b4-3a42-886a-9e73a055a6bf    Johanna Uotinen & Sari Tuuva & <PERSON>Marja V...
d1f31a11-906a-3a1b-a0d3-1adb82ff8a95    Kokoonpanoista senverran , että lukossa pelasi...
e8a9aeaa-c027-38cf-b4fd-5bf0327ac3da    Syntyneitä 356 eaa . – Aleksanteri Suuri , mak...
                                                              ...                        
45689f05-6569-36c4-8e2e-fabba10cb498    Bartlett , A . C . & Staten , R . T . 2003 : T...
cd8b9020-0659-35fb-b5b5-da6a5e52e491    Se ohitti Tenavat vasta Schulzin kuoleman jälk...
eb8bde30-f05c-3b03-bb01-8a1ad83c0324    Kun silloinen pääministeri <PERSON>Urho Kekkon...
8bf165a4-ddca-3502-8368-0a1c500406a3    1906 ( 1906 , <ORG>Turun kansallisseura</ORG> ...
9b1a4277-4200-3f2a-98cf-b884b8e72e98    Hän osti keskustaoikeistolaisen sanomalehden <...
Name: text

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/816k [00:00<?, ?B/s]

In [9]:
list(x_train)

['Vuoden 1979 vaalien jälkeen Koivisto nousi pääministeriksi ja <ORG>SDP : n</ORG> epäviralliseksi ehdokkaaksi <PERSON>Kekkosen</PERSON> seuraajaksi .',
 'Esille julkisuuteen nousi erityisesti hallituksen puheenjohtaja <PERSON>Tapio Hintikka</PERSON> , kun emoyhtiö <ORG>Soneralle</ORG> etsittiin uutta toimitusjohtajaa .',
 'Johanna Uotinen & Sari Tuuva & <PERSON>Marja Vehviläinen</PERSON> & Seppo Knuuttila ( toim . ) Verkkojen kokijat paikallista tietoyhteiskuntaa tekemässä <ORG>Suomen Kansantietouden Tutkijain Seura</ORG> , Joensuu , 2001 .',
 'Kokoonpanoista senverran , että lukossa pelasi ainakin tämä <ORG>SM - liigan</ORG> pistepörssin suvereenisti voittamaan tuleva <PERSON>Larouche</PERSON> ( vai mikä se nyt on ) ja teki jopa ässien " junnuja " vastaan maalin . . . rankkarista .',
 'Syntyneitä 356 eaa . – Aleksanteri Suuri , makedoniankreikkalainen kuningas ja sotilas ( k . 323 eaa . ) 1304 – Francesco Petrarca , italialainen runoilija ja humanisti ( Sonetteja Lauralle ) ( k . 137

In [47]:
# Convert labels to numbers for torch.tensor
label_map = {
    '-100': -100,
    'accept': 0,
    'reject': 1
}

def encode_dataset(texts, labels):
	words = list(map(lambda t: t.split(), texts))
	tokenized = tokenizer(words, is_split_into_words=True, padding=True, truncation=True)
	labels_with_separators = []
	prev_word_idx = None
	for word_idx in tokenized.word_ids():
			if word_idx is None or word_idx == prev_word_idx:
					# Special token (e.g. [SEP]) or part of the previous word
					labels_with_separators.append(DUMMY_LABEL_ID)
			else:
					# Word start
					labels_with_separators.append(label_map[labels[word_idx]])
			prev_word_idx = word_idx
	assert len(labels_with_separators) == len(tokenized.tokens())
	return (tokenized, labels_with_separators)


encoded_train_data, encoded_train_labels = encode_dataset(x_train, y_train)
encoded_validation_data, encoded_validation_labels = encode_dataset(x_valid, y_valid)
(len(encoded_train_data.tokens()), len(encoded_train_labels))

(512, 512)

In [64]:
class TdaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        try:
            item = {key: torch.tensor(val[idx])
                    for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]

            return item
        except Exception as e:
            print("Fail", idx)  # Debug
            raise e

    def __len__(self):
      r = min(len(self.labels), len(self.encodings['input_ids']))
      return r


train_dataset = TdaDataset(encoded_train_data, encoded_train_labels)
val_dataset = TdaDataset(encoded_validation_data, encoded_validation_labels)


In [66]:
min(len(train_dataset.labels), len(train_dataset))

called 512


512

TODO: Ylläolevan ajaminen (ensimmäistä kertaa), tulostaa virheen

"Token indices sequence length is longer than the specified maximum sequence length for this model (148796 > 512). Running this sequence through the model will result in indexing errors"

Mistä johtuu? 

Edit: truncation=True ehkä korjaa?

In [49]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolv

In [67]:
train_args = TrainingArguments(
    'output_dir',    # output directory for checkpoints and predictions
    save_strategy='no',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [14]:
def compute_accuracy(pred):
    y_pred = pred.predictions.argmax(axis=1)
    y_true = pred.label_ids
    return { 'accuracy': sum(y_pred == y_true) / len(y_true) }

In [68]:
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    label_pad_token_id=DUMMY_LABEL_ID
)

trainer = Trainer(
      model,
      train_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      # data_collator=data_collator,
      compute_metrics=compute_accuracy
)
trainer.train()

***** Running training *****
  Num examples = 512
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 64


called 512
called 512
called 512
called 512
called 512
called 512
called 512


KeyboardInterrupt: 