In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)
    cnt = 0
    raw_text = file_path.read_text('utf-8').strip()
    cnt = 0
    tokens = []
    tags = []
    token_docs = []
    tag_docs = []
    for line in raw_text.split('\n'):
        token, tag = line.split('\t')
        if token == '.':
            cnt += 1
            
        tokens.append(token)
        tags.append(tag)
        if cnt == 3:
            token_docs.append(tokens)
            tag_docs.append(tags)
            cnt = 0
            tokens = []
            tags = []

    return token_docs, tag_docs

texts, tags = read_wnut('drive/MyDrive/train_data.txt')

In [3]:
print(texts[1], tags[1], sep='\n')
print(len(texts))

['اینکه', 'یک', 'رسانه', 'عربی', 'قطری', 'و', 'تاحدی', 'اسلامی', 'بتواند', 'تا', 'این', 'حد', 'جدی', 'و', 'موفق', 'عمل', 'کند', '،', 'بدون', 'تعارف', 'مایه', 'مباهات', 'بسیار', 'است', '؛', 'اما', 'آزادی', 'و', 'استقلال', 'رسانه\u200cای', 'در', 'دهکده', 'جهانی', 'سیاست\u200cزده', 'چیزی', 'است', 'که', 'باور', 'کردنش', 'بسیار', 'مشکل', 'است', '.', 'الجزیره', 'در', 'بحران', 'افغانستان', 'تاکنون', 'موفق', 'عمل', 'کرده', 'است', '،', 'اما', 'ابهامات', 'و', 'تردیدهای', 'بسیاری', 'نیز', 'برانگیخته', 'است', '.', 'یافتن', 'پاسخ', 'بعضی', 'از', 'سؤال\u200cها', 'شاید', 'مهم\u200cترین', 'دلیلی', 'بود', 'که', 'میزبان', 'غسان\u200cبن\u200cجدو', '،', 'رئیس', 'دفتر', 'الجزیره', 'در', 'تهران', 'و', 'سرپرست', 'منطقه\u200cای', 'این', 'شبکه', 'شدیم', 'تا', 'صریح', 'و', 'بی\u200cرودربایستی', 'سؤال\u200cهای', 'خود', 'را', 'از', 'او', 'بکنیم', 'و', 'بیشتر', 'با', 'شبکه', 'تلویزیونی', '-', 'خبری', 'الجزیره', 'و', 'فعالیت\u200cهای', 'آن', 'آشنا', 'شویم', '.']
['O', 'O', 'ye', 'ye', 'O', 'O', 'O', 'O', 'O', 'O', 

In [4]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.1, train_size=.5)

In [5]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [6]:
unique_tags

{'@e', 'O', 'e', 've', 'y', 'ye'}

In [7]:
id2tag

{0: '@e', 1: 'ye', 2: 'O', 3: 'y', 4: 've', 5: 'e'}

In [8]:
!pip install transformers
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained("HooshvareLab/distilbert-fa-zwnj-base")
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 27.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 69.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 8.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attemp

Downloading:   0%|          | 0.00/416k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/500 [00:00<?, ?B/s]

In [9]:
label_all_tokens = True
import numpy as np

def encode_tags(tags, tokenized_inputs):
    labels = []
    tag_labels = [[tag2id[tag] for tag in doc] for doc in tags]
    for i, label in enumerate(tag_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    return labels



In [10]:
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [11]:
print(train_labels[:10])

[[-100, 2, 2, 2, 5, 5, 2, 2, 2, 2, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 5, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 1, 5, 5, 5, 5, 2, 2, 1, 2, 2, 2, 2, 5, 2, 2, 2, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 2, 2, 2, 1, 1, 1, 2, 2, 5, 2, 2, 5, 5, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 5, 2, 2, 2, 5, 2, 2, 2, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -1

In [12]:
import torch

class WNUTDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)

In [13]:
from transformers import DistilBertForTokenClassification
model = DistilBertForTokenClassification.from_pretrained("HooshvareLab/distilbert-fa-zwnj-base", num_labels=len(unique_tags))

Downloading:   0%|          | 0.00/289M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base and are newly initialized: ['classifier.bias', 'classifier.weight']
Y

In [14]:
from transformers import Trainer, TrainingArguments


In [15]:

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy = "epoch",
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


In [16]:
!pip install seqeval
!pip install datasets
from datasets import load_metric

metric = load_metric("seqeval")

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 44.4 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 48.9 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 43.0 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 43.7 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 2.5 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=59b3397802921e3b11c7ae5a2ea19ff7721e3827cc0dfe6b41692918df17968b
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting datasets
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[K     |████████████████████████████████

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [17]:
u_tags =list(unique_tags)
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [u_tags[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [u_tags[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [18]:

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)



In [19]:
trainer.train()

***** Running training *****
  Num examples = 44285
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8304


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0412,0.044728,0.925697,0.932128,0.928901,0.985456
2,0.0317,0.042201,0.933072,0.939019,0.936036,0.986589
3,0.0202,0.043499,0.934126,0.941488,0.937793,0.987101


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8858
  Batch size = 64
Saving model checkpoint to 

TrainOutput(global_step=8304, training_loss=0.05186508407076975, metrics={'train_runtime': 7188.6916, 'train_samples_per_second': 18.481, 'train_steps_per_second': 1.155, 'total_flos': 1.735917301435392e+16, 'train_loss': 0.05186508407076975, 'epoch': 3.0})

In [20]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 8858
  Batch size = 64




{'epoch': 3.0,
 'eval_accuracy': 0.9871010103354375,
 'eval_f1': 0.9377926803983672,
 'eval_loss': 0.04349938780069351,
 'eval_precision': 0.9341261928500214,
 'eval_recall': 0.9414880636223999,
 'eval_runtime': 181.5843,
 'eval_samples_per_second': 48.782,
 'eval_steps_per_second': 0.765}