# Load and Test Different Tokenizers for Relation Extraction

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
from persian_re.tokenizers import BertEntityMarkerTokenizer,BertPEEMTokenizer
from persian_re.settings import MODEL_NAME_OR_PATH, MAX_LEN
from persian_re.preprocess import PerlexData
from persian_re.utils import pprint_relation_statement

## Load Data

In [3]:
# Obtain the PerLex dataset wrapper; later cells read `x_train`, `y_train`
# and `id2labels` from it.
data = PerlexData.get_instance()

### sample data
**entities and relation type**:

In [4]:
# Take the training example at index 12 and resolve its numeric label to the
# relation name via `id2labels`, then show both.
sample_rs, sample_label = data.x_train[12], data.id2labels[data.y_train[12]]
pprint_relation_statement(sample_rs)
print(sample_label)

جمله: کمیته پولیتزر یک <e1>[92mنقل‌قول [0m</e1>رسمی منتشر کرد که <e2>[91mدلایل [0m</e2>این جایزه را توضیح می دهد .
Message-Topic(e1,e2)


## Entity Marker Tokenizer

In [2]:
# Load the entity-marker tokenizer from the checkpoint named by MODEL_NAME_OR_PATH.
# The warning printed below is expected: the checkpoint declares its tokenizer
# class as 'BertTokenizer', while it is loaded here through BertEntityMarkerTokenizer.
em_tokenizer: BertEntityMarkerTokenizer = BertEntityMarkerTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BertEntityMarkerTokenizer'.


In [3]:
# Show the tokenizer's repr; the entity markers <e1>, </e1>, <e2>, </e2> are
# listed under `additional_special_tokens`.
em_tokenizer

PreTrainedTokenizer(name_or_path='HooshvareLab/bert-fa-zwnj-base', vocab_size=42000, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['<e1>', '</e1>', '<e2>', '</e2>']})

**token ids:**

In [6]:
# Break the marked sentence into word pieces and look up each piece's vocabulary id.
tokens = em_tokenizer.tokenize(sample_rs)
token_ids = em_tokenizer.convert_tokens_to_ids(tokens)

# Print an id/token listing, one word piece per line.
print("id\t\t\ttoken")
for token_id, token in zip(token_ids, tokens):
    print(f'{token_id}\t\t{token}')

id			token
5049		کمیته
28982		پولیتزر
1961		یک
42000		<e1>
3470		نقل
9323		##قول
42001		</e1>
3291		رسمی
2596		منتشر
1960		کرد
1932		که
42002		<e2>
4701		دلایل
42003		</e2>
1930		این
3817		جایزه
1937		را
3730		توضیح
1924		می
2194		دهد
121		.


**encoding**: including *input_ids* (token ids), *token_type_ids* (segment ids) and *attention_mask*

In [7]:
# Encode the marked sentence into fixed-length model inputs.
encoding = em_tokenizer.encode_plus(
    sample_rs,
    add_special_tokens=True,      # wrap the sequence in '[CLS]' ... '[SEP]'
    truncation=True,              # cut anything beyond MAX_LEN
    padding='max_length',         # pad shorter inputs up to MAX_LEN
    max_length=MAX_LEN,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt',          # PyTorch tensors with a leading batch dimension
)

In [8]:
# List the fields that encode_plus returned.
encoding.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [9]:
# Display the raw encoding: each field is a single-row tensor, padded with
# zeros after the real tokens.
encoding

{'input_ids': tensor([[    2,  5049, 28982,  1961, 42000,  3470,  9323, 42001,  3291,  2596,
          1960,  1932, 42002,  4701, 42003,  1930,  3817,  1937,  3730,  1924,
          2194,   121,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [10]:
# Pull the single encoded sequence (batch index 0) out of each field as a plain list.
ids, segment_ids, attention_mask = (
    encoding[field][0].tolist()
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
)
# Map the ids back to token strings so special and padding tokens are visible.
tokens = em_tokenizer.convert_ids_to_tokens(ids)

In [13]:
# One row per sequence position: token text, vocabulary id, segment id and attention flag.
pd.DataFrame(
    {
        'token': tokens,
        'id': ids,
        'segment_id': segment_ids,
        'attention': attention_mask,
    }
)

Unnamed: 0,token,id,segment_id,attention
0,[CLS],2,0,1
1,کمیته,5049,0,1
2,پولیتزر,28982,0,1
3,یک,1961,0,1
4,<e1>,42000,0,1
...,...,...,...,...
59,[PAD],0,0,0
60,[PAD],0,0,0
61,[PAD],0,0,0
62,[PAD],0,0,0


## Entity Marker + Positional Embedding Tokenizer

In [5]:
# Load the entity-marker + positional-embedding tokenizer from the same checkpoint.
# The warning printed below is expected: the checkpoint declares its tokenizer
# class as 'BertTokenizer', while it is loaded here through BertPEEMTokenizer.
peem_tokenizer: BertPEEMTokenizer = BertPEEMTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BertPEEMTokenizer'.


### Visualize Output
**token ids:**

In [6]:
# Break the marked sentence into word pieces and look up each piece's vocabulary id.
tokens = peem_tokenizer.tokenize(sample_rs)
token_ids = peem_tokenizer.convert_tokens_to_ids(tokens)

# Print an id/token listing, one word piece per line.
print("id\t\t\ttoken")
for token_id, token in zip(token_ids, tokens):
    print(f'{token_id}\t\t{token}')

id			token
5049		کمیته
28982		پولیتزر
1961		یک
42000		<e1>
3470		نقل
9323		##قول
42001		</e1>
3291		رسمی
2596		منتشر
1960		کرد
1932		که
42002		<e2>
4701		دلایل
42003		</e2>
1930		این
3817		جایزه
1937		را
3730		توضیح
1924		می
2194		دهد
121		.


**encoding**: including *input_ids* (token ids), *token_type_ids* (segment ids) and *attention_mask*

In [7]:
# Encode the marked sentence into fixed-length model inputs with the PEEM tokenizer.
encoding = peem_tokenizer.encode_plus(
    sample_rs,
    add_special_tokens=True,      # wrap the sequence in '[CLS]' ... '[SEP]'
    truncation=True,              # cut anything beyond MAX_LEN
    padding='max_length',         # pad shorter inputs up to MAX_LEN
    max_length=MAX_LEN,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='pt',          # PyTorch tensors with a leading batch dimension
)

In [8]:
# Display the raw encoding. In the recorded output, `token_type_ids` is 2 over
# the <e1>...</e1> span and 3 over the <e2>...</e2> span, and 0 elsewhere;
# `input_ids` and `attention_mask` match the entity-marker tokenizer's output.
encoding

{'input_ids': tensor([[    2,  5049, 28982,  1961, 42000,  3470,  9323, 42001,  3291,  2596,
          1960,  1932, 42002,  4701, 42003,  1930,  3817,  1937,  3730,  1924,
          2194,   121,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [9]:
# Pull the single encoded sequence (batch index 0) out of each field as a plain list.
ids, segment_ids, attention_mask = (
    encoding[field][0].tolist()
    for field in ('input_ids', 'token_type_ids', 'attention_mask')
)
# Map the ids back to token strings so special and padding tokens are visible.
tokens = peem_tokenizer.convert_ids_to_tokens(ids)

In [10]:
# One row per sequence position: token text, vocabulary id, segment id and attention flag.
pd.DataFrame(
    {
        'token': tokens,
        'id': ids,
        'segment_id': segment_ids,
        'attention': attention_mask,
    }
)

Unnamed: 0,token,id,segment_id,attention
0,[CLS],2,0,1
1,کمیته,5049,0,1
2,پولیتزر,28982,0,1
3,یک,1961,0,1
4,<e1>,42000,2,1
...,...,...,...,...
59,[PAD],0,0,0
60,[PAD],0,0,0
61,[PAD],0,0,0
62,[PAD],0,0,0
