In [1]:
!nvidia-smi

Thu May 12 14:24:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers
!pip install datasets
!pip install seqeval



In [3]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
from google.colab import drive
from datasets import load_dataset
from transformers import AutoTokenizer
from datasets import ClassLabel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
import numpy as np
import pandas as pd
from datasets import load_metric
from transformers import Trainer
from sklearn.metrics import classification_report

drive.mount('/content/drive')
google_path = 'drive/MyDrive/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
all_langs = [
    'ru',
    'uk',
    'ka',
    'he',
    'en',
    'de',
    'be',
    'kk',
    'az',
    'hy',
]

labels = ClassLabel(num_classes=10, names=all_langs)

In [5]:
import datasets

dataset = datasets.load_from_disk(
                       google_path+'dataset',
                       )
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 89996
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

In [6]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
label_all_tokens = True

In [7]:
def tokenize(examples):
    tokenized_inputs = tokenizer(examples["text"], truncation=True, is_split_into_words=True, max_length=128)

    labels = []
    for i, label in enumerate(examples["label"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize, batched=True)

  0%|          | 0/90 [00:00<?, ?ba/s]

Loading cached processed dataset at drive/MyDrive/dataset/test/cache-0602777371272ba2.arrow


In [8]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

Loading cached shuffled indices for dataset at drive/MyDrive/dataset/test/cache-2ebd8a3075a31572.arrow


In [9]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",
     num_labels=10, 
)
model.config.id2label = {id: lang for id, lang in enumerate(all_langs)}
model.config.label2id = {lang: id for id, lang in enumerate(all_langs)}

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-st

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [11]:
metric = load_metric("seqeval")

In [12]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_langs[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_langs[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [13]:
batch_size = 16

training_args = TrainingArguments(
    output_dir=google_path+"token-lang-xlm-roberta-base",
    overwrite_output_dir=True,
    logging_strategy = "epoch",
    save_strategy = "steps",
    # evaluation_strategy = "epoch",
    evaluation_strategy = "steps",
    save_steps=2000,
    eval_steps=2000,
    save_total_limit=1,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

In [14]:
train_dataset = train_dataset.remove_columns(['label'])
eval_dataset = eval_dataset.remove_columns(['label'])

In [15]:
len(train_dataset['labels'][0]),len(train_dataset['input_ids'][0])

(82, 82)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 89996
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5625


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
2000,No log,0.036881,0.894798,0.91433,0.904458,0.989305
4000,No log,0.029172,0.919623,0.933586,0.926552,0.991883


The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to drive/MyDrive/token-lang-xlm-roberta-base/checkpoint-2000
Configuration saved in drive/MyDrive/token-lang-xlm-roberta-base/checkpoint-2000/config.json
Model weights saved in drive/MyDrive/token-lang-xlm-roberta-base/checkpoint-2000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `XLMRobertaForTokenClassification.forward` and have been ignored: text. If text are not expected by `XLMRobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 16
Saving model checkpoint to drive/MyDrive/token-la

TrainOutput(global_step=5625, training_loss=0.05178787434895833, metrics={'train_runtime': 5253.1903, 'train_samples_per_second': 17.132, 'train_steps_per_second': 1.071, 'total_flos': 5822978416659840.0, 'train_loss': 0.05178787434895833, 'epoch': 1.0})

In [None]:
trainer.save_model()

Saving model checkpoint to drive/MyDrive/ml-lang-xlm-roberta-base
Configuration saved in drive/MyDrive/ml-lang-xlm-roberta-base/config.json
Model weights saved in drive/MyDrive/ml-lang-xlm-roberta-base/pytorch_model.bin


In [None]:
model_path = 'drive/MyDrive/ml-lang-xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [None]:
model.config.id2label = {id: lang for id, lang in enumerate(all_langs)}
model.config.label2id = {lang: id for id, lang in enumerate(all_langs)}

In [None]:
model.config

XLMRobertaConfig {
  "_name_or_path": "drive/MyDrive/ml-lang-xlm-roberta-base",
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "ru",
    "1": "uk",
    "2": "ka",
    "3": "he",
    "4": "en",
    "5": "de",
    "6": "be",
    "7": "kk",
    "8": "az",
    "9": "hy"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "az": 8,
    "be": 6,
    "de": 5,
    "en": 4,
    "he": 3,
    "hy": 9,
    "ka": 2,
    "kk": 7,
    "ru": 0,
    "uk": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "tor

In [None]:
preds = trainer.predict(eval_dataset)
preds = preds.predictions

The following columns in the test set  don't have a corresponding argument in `XLMRobertaForSequenceClassification.forward` and have been ignored: he, text, uk, en, ka, kk, hy, be, az, de, ru. If he, text, uk, en, ka, kk, hy, be, az, de, ru are not expected by `XLMRobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 9906
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(torch.Tensor(preds))
probs

tensor([[0.9976, 0.0113, 0.0602,  ..., 0.0392, 0.0769, 0.9977],
        [0.7149, 0.0705, 0.0783,  ..., 0.0460, 0.9995, 0.0232],
        [0.9826, 0.0191, 0.9967,  ..., 0.0125, 0.9987, 0.0108],
        ...,
        [0.0100, 0.9966, 0.0565,  ..., 0.0212, 0.9942, 0.0138],
        [0.0047, 0.0048, 0.0232,  ..., 0.0070, 0.0074, 0.0080],
        [0.0117, 0.0125, 0.9928,  ..., 0.9895, 0.0143, 0.0074]])

In [None]:
probs[probs > 0.5] = 1
probs[probs <= 0.5] = 0
probs

tensor([[1., 0., 0.,  ..., 0., 0., 1.],
        [1., 0., 0.,  ..., 0., 1., 0.],
        [1., 0., 1.,  ..., 0., 1., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 1., 0., 0.]])

In [None]:
probs = probs.int().tolist()
pd.DataFrame(data=probs, columns=all_langs)

Unnamed: 0,ru,uk,ka,he,en,de,be,kk,az,hy
0,1,0,0,0,1,0,1,0,0,1
1,1,0,0,0,0,0,1,0,1,0
2,1,0,1,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
9901,0,0,0,0,1,0,0,0,0,0
9902,1,1,0,0,0,0,0,0,1,0
9903,0,1,0,0,1,0,0,0,1,0
9904,0,0,0,1,0,0,0,0,0,0


In [None]:
sigmoid = torch.nn.Sigmoid()

preds = model(**tokenizer(input, return_tensors='pt')).logits
probs = sigmoid(preds)
probs = probs.detach()[0]
for ind, prob in enumerate(probs):
    if prob > 0.5:
        print(labels.int2str(ind))

ru
uk
