In [2]:
# asennetaan transformers ja dataset

!pip install --upgrade --force-reinstall tqdm
!pip --quiet install transformers
!pip --quiet install datasets
!pip --quiet install sklearn

Collecting tqdm
  Using cached tqdm-4.62.2-py2.py3-none-any.whl (76 kB)
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.62.2
    Uninstalling tqdm-4.62.2:
      Successfully uninstalled tqdm-4.62.2
Successfully installed tqdm-4.62.2


In [3]:
# tuodaan kirjastot

from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorForTokenClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch

In [4]:
# haetaan aineisto
!wget -nc -O PER-ORG-relations-combined-group-2.tsv http://dl.turkunlp.org/TKO_8964_2021/PER-ORG-relations-combined-group-2.tsv
data = pd.read_csv(("PER-ORG-relations-combined-group-2.tsv"), header=0, names=['label','text'], sep='\t')

print(data)

--2021-09-01 20:16:52--  http://dl.turkunlp.org/TKO_8964_2021/PER-ORG-relations-combined-group-2.tsv
Resolving dl.turkunlp.org (dl.turkunlp.org)... 195.148.30.23
Connecting to dl.turkunlp.org (dl.turkunlp.org)|195.148.30.23|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 505912 (494K) [application/octet-stream]
Saving to: ‘PER-ORG-relations-combined-group-2.tsv’


2021-09-01 20:16:53 (641 KB/s) - ‘PER-ORG-relations-combined-group-2.tsv’ saved [505912/505912]

                                       label  \
0421f5ce-eab2-3eb4-9bec-a4067cd3c870  reject   
17c04447-9a9f-37aa-9eba-73b368f33895  reject   
2ac2440b-9cfe-3537-a81d-f8ff139f3ff5  accept   
d147dc04-7271-3fec-bad8-611bb47a6c89  accept   
2ffd7378-a465-30d6-a1e9-01ab31827676  reject   
...                                      ...   
bedd86bc-292a-362d-a217-f59663f3fa28  accept   
d08e9051-1c74-318f-a936-9c77b3c5bb51  accept   
9bedf73b-e82a-35e4-9ca7-1a4d052c45be  accept   
5ee79906-1306-3d56-a47a-38aa399

In [5]:
# jaetaan aineisto train- ja validation-osiin

x_train, x_valid, y_train, y_valid = train_test_split(data["text"], data["label"], test_size=0.2)
print("\nx_train:\n")
print(x_train.head())
print(x_train.shape)
print(y_train.head())

print("\nx_valid:\n")
print(x_valid.head())
print(x_valid.shape)
print(y_valid.head())


x_train:

a86d1cad-c0ba-3f17-b1f8-ee9dcd53bb24    Ei mene <PERSON>Kimi</PERSON> <ORG>Red Bullill...
2bc6211b-62d9-3d38-b886-4f7ab5688f98    R RAC Rada Ragga metal Raggamuffin eli ragga R...
0c7338d8-f29a-3b8f-8a29-a70877669e39    Aiheesta muualla <ORG>Allmusic . com</ORG> : V...
22d8f381-5e22-3ff3-9fd6-64c9d9c5f6a3    <PERSON>Reith</PERSON> istui <ORG>Australian p...
99f71a38-5ebe-31d2-9c20-e1e1b2f27e5b    <PERSON>Duba</PERSON> siirtyikin kaudeksi <ORG...
Name: text, dtype: object
(1520,)
a86d1cad-c0ba-3f17-b1f8-ee9dcd53bb24    reject
2bc6211b-62d9-3d38-b886-4f7ab5688f98    reject
0c7338d8-f29a-3b8f-8a29-a70877669e39    reject
22d8f381-5e22-3ff3-9fd6-64c9d9c5f6a3    accept
99f71a38-5ebe-31d2-9c20-e1e1b2f27e5b    accept
Name: label, dtype: object

x_valid:

0bb6e5db-a62c-30f5-84ad-57887d6aee4f    <PERSON>Paula Talaskivi</PERSON> ( <ORG>Helsin...
55032fd4-7da4-37b2-89e2-14e902dc4c50    <PERSON>Da Silvan</PERSON> ensimmäinen maaotte...
ee1dfdb5-0f06-3c44-b261-59f7566fff0d    <PERSON>Guil

In [6]:
# setting model, dataset and hyperparameters

MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'

LEARNING_RATE=1e-5
BATCH_SIZE=64
TRAIN_EPOCHS=1

# Magic value that comes from pytorch's default ignore_index
DUMMY_LABEL_ID = -100

In [7]:
dataset = x_train
num_labels = len(set(dataset))
num_labels

1520

In [8]:
print(dataset)
print(f'number of distinct labels: {num_labels}')

a86d1cad-c0ba-3f17-b1f8-ee9dcd53bb24    Ei mene <PERSON>Kimi</PERSON> <ORG>Red Bullill...
2bc6211b-62d9-3d38-b886-4f7ab5688f98    R RAC Rada Ragga metal Raggamuffin eli ragga R...
0c7338d8-f29a-3b8f-8a29-a70877669e39    Aiheesta muualla <ORG>Allmusic . com</ORG> : V...
22d8f381-5e22-3ff3-9fd6-64c9d9c5f6a3    <PERSON>Reith</PERSON> istui <ORG>Australian p...
99f71a38-5ebe-31d2-9c20-e1e1b2f27e5b    <PERSON>Duba</PERSON> siirtyikin kaudeksi <ORG...
                                                              ...                        
21bb9d9e-0712-36dd-9a8d-2c391c3d36a2    Jaetulla toisella sijalla olivat <PERSON>Antti...
f9b81455-418c-3dd4-9291-0d3a1427faa8    Perhe <PERSON>Thorin</PERSON> isä Björgolfur G...
5e49f726-d9ef-3099-9b7f-d6b307f48d34    Aiheesta muualla <PERSON>Juhana</PERSON> <ORG>...
a6611404-4080-3917-8aaf-0a7ae770ca84    Kappaleet CD1 The Sound Of Silence Wednesday M...
045f6546-7e5a-317b-a34c-38eb2a52db93    <ORG>Forbes Magazinen</ORG> mukaan <PERSON>Cop...
Name: text

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/816k [00:00<?, ?B/s]

In [10]:
list(x_train)

['Ei mene <PERSON>Kimi</PERSON> <ORG>Red Bullille</ORG> !',
 'R RAC Rada Ragga metal Raggamuffin eli ragga Ragtime Rai Rambutan Ramopunk Rap Rapcore ( eli rap metal ) Rapsodia Rautalankamusiikki <ORG>Rave</ORG> Reggae Reggaeton Rekilaulu Renessanssimusiikki Rhythm and blues Rhythmic noise ( power noise ) Ricercar Rock Rock and roll Rock - ooppera Rockabilly Rocksteady <PERSON>Rokokoon</PERSON> musiikki eli Galantti tyyli Romantiikan musiikki Rumba Runolaulu Rytmimusiikki Siirry kirjaimeen → A B C D E F G H I J K L M N O P Q R S T U V W X Y Z Å Ä Ö',
 'Aiheesta muualla <ORG>Allmusic . com</ORG> : Viola Concerto ( completed in 1949 by <PERSON>Tibor Serly</PERSON> ) , Sz . 120 , BB 128',
 '<PERSON>Reith</PERSON> istui <ORG>Australian parlamentin</ORG> jäsenenä vuosina 1982 – 1983 ja 1984 – 2001 .',
 '<PERSON>Duba</PERSON> siirtyikin kaudeksi <ORG>HC Znojemští Orliin</ORG> .',
 '* * * Helsingissä 15 . 8 . 2012 <PERSON>Sami Myllyniemi</PERSON> Tilastotutkija Nuorisotutkimusverkosto Leena Su

In [55]:
def encode_dataset(texts, labels):
	words = list(map(lambda t: t.split(), texts))
	tokenized = tokenizer(words, is_split_into_words=True, padding=True, truncation=True)
	labels_with_separators = []
	prev_word_idx = None
	for word_idx in tokenized.word_ids():
			if word_idx is None or word_idx == prev_word_idx:
					# Special token (e.g. [SEP]) or part of the previous word
					labels_with_separators.append(DUMMY_LABEL_ID)
			else:
					# Word start
					labels_with_separators.append(labels[word_idx])
			prev_word_idx = word_idx
	assert len(labels_with_separators) == len(tokenized.tokens())
	return (tokenized, labels_with_separators)


encoded_train_data, encoded_train_labels = encode_dataset(x_train, y_train)
encoded_validation_data, encoded_validation_labels = encode_dataset(x_valid, y_valid)
(len(encoded_train_data.tokens()), len(encoded_train_labels))

(512, 512)

In [31]:
(len(encoded_train_data), len(y_train))

(3, 1520)

In [12]:
set(encoded_train_labels)

{-100, 'accept', 'reject'}

In [56]:

# Convert labels to numbers for torch.tensor
label_map = {
    '-100': -100,
    'accept': 0,
    'reject': 1
}


class TdaDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        try:
            item = {key: torch.tensor(val[idx])
                    for key, val in self.encodings.items()}
            # item['labels'] = label_map[str(self.labels[idx])]
            item['labels'] = self.labels[idx]
            if item['labels'] == -100:
              item['labels'] = '[SEP]'
            
            print(item, item['labels'])

            return item
        except Exception as e:
            print("Fail", idx)  # Debug
            raise e

    def __len__(self):
        return len(self.labels)


train_dataset = TdaDataset(encoded_train_data, encoded_train_labels)
val_dataset = TdaDataset(encoded_validation_data, encoded_validation_labels)


In [50]:
# Debug
{key: torch.tensor(val[29]) for key, val in val_dataset.encodings.items()}
torch.tensor(label_map[train_dataset.labels[29]])

tensor(1)

TODO: Ylläolevan ajaminen (ensimmäistä kertaa), tulostaa virheen

"Token indices sequence length is longer than the specified maximum sequence length for this model (148796 > 512). Running this sequence through the model will result in indexing errors"

Mistä johtuu? 

Edit: truncation=True ehkä korjaa?

In [15]:
lengths = [len(x) for x in x_train]
print(max(lengths), len(x_train))

lengths = [len(x) for x in y_train]
print(max(lengths), len(y_train))


3439 1520
6 1520


In [51]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

loading configuration file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e27939251243299384d3c49756d6710f25a683fa4d5e00e6f42fe6cc59202f07.1b2c5b5f39fed7ac39db55c0d2566730a96257ac7215ad6c2a8a109e2ccf1ccd
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50105
}

loading weights file https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolv

In [57]:
train_args = TrainingArguments(
    'output_dir',    # output directory for checkpoints and predictions
    save_strategy='no',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=TRAIN_EPOCHS,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [53]:
def compute_accuracy(pred):
    y_pred = pred.predictions.argmax(axis=1)
    y_true = pred.label_ids
    return { 'accuracy': sum(y_pred == y_true) / len(y_true) }

In [58]:
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    label_pad_token_id=DUMMY_LABEL_ID
)

trainer = Trainer(
      model,
      train_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      data_collator=data_collator,
      compute_metrics=compute_accuracy
)
trainer.train()

***** Running training *****
  Num examples = 512
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 8


labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100
labels -100


  sequence_length = torch.tensor(batch["input_ids"]).shape[1]


TypeError: object of type 'int' has no len()

***** Running training *****
  Num examples = 4
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1


IndexError: ignored