In [2]:
import ast
from transformers import AutoTokenizer

from datasets import load_dataset

ds = load_dataset("hezarai/arman-ner")
ds['train']['tokens'][0]

Downloading readme:   0%|          | 0.00/804 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/301k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20484 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2561 [00:00<?, ? examples/s]

['به',
 'عنوان',
 'مثال',
 'وقتی',
 'نشریات',
 'مدافع',
 'اصول',
 'و',
 'ارزشها',
 'و',
 'منادی',
 'انقلاب',
 'و',
 'اسلام',
 'در',
 'بالاترین',
 'درجه',
 '،',
 'اولین',
 'و',
 'درشت\u200cترین',
 'تیتر',
 'نشریه',
 'خود',
 'را',
 'در',
 'صدر',
 'صفحه',
 'نخستین',
 '،',
 'به',
 'تکذیب',
 'اظهارات',
 'و',
 'نظریات',
 'مشاور',
 'رئیس\u200cجمهور',
 'با',
 'همین',
 'ترکیب',
 'عبارتی',
 'و',
 'البته',
 'از',
 'قول',
 'دیگران',
 'اختصاص',
 'می\u200cدهند',
 '،',
 'آیا',
 'در',
 'موارد',
 'مشابه',
 'نیز',
 'هر',
 'گاه',
 'خبر',
 'تکذیب',
 'متوجه',
 'و',
 'معطوف',
 'به',
 'شخصی',
 'باشد',
 'كه',
 'در',
 'زمره',
 'مشاوران',
 'يك',
 'مقام',
 'بلندمرتبه\u200cی',
 'دیگر',
 'است',
 '،',
 'خبر',
 'را',
 'عینا',
 'به',
 'همین',
 'درشتی',
 'و',
 'با',
 'همین',
 'ترکیب',
 'عبارتی',
 'در',
 'صدر',
 'صفحه',
 'نخست',
 'به',
 'چاپ',
 'می\u200cرساند',
 'و',
 'در',
 'آن',
 'مورد',
 'هم',
 'به',
 'جای',
 'ذکر',
 'نام',
 'يا',
 'عضویت',
 'آن',
 'شخص',
 'در',
 'گروه',
 'و',
 'کمیته\u200cی',
 'خاص',
 'صرفا',
 'بر'

In [3]:
from huggingface_hub.hf_api import HfFolder

HfFolder.save_token("")

In [4]:
from transformers import AutoTokenizer
##%%
tokenizer = AutoTokenizer.from_pretrained("PartAI/TookaBERT-Base")

tokenizer_config.json:   0%|          | 0.00/463 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.59M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
import ast
from torch.nn.utils.rnn import pad_sequence

class NERDataset(Dataset):
    def __init__(self, token, ner_tags, tokenizer, idx2tag):
        self.token = token
        self.tags = ner_tags
        self.tokenizer = tokenizer
        self.idx2tag = idx2tag

    def __len__(self):
        return len(self.token)

    def __getitem__(self, idx):
        words = self.token[idx]
        tags = self.tags[idx]
        
        tokens = []
        label_ids = []

        for word, tag in zip(words, tags):
     
            word_tokens = self.tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            label_ids.extend([tag] * len(word_tokens))

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_mask_padded,
        'labels': labels_padded
    }

# Define tags and mappings
tags =  ['O',
   'B-pro',
 'I-pro',
   'B-pers',
   'I-pers',
   'B-org',
   'I-org',
   'B-loc',
  'I-loc',
  'B-fac',
   'I-fac',
  'B-event',
   'I-event']


tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for idx, tag in enumerate(tags)}

tokenizer = AutoTokenizer.from_pretrained("PartAI/TookaBERT-Base")


train_dataset = NERDataset(ds['train']['tokens'][:15000], ds['train']['ner_tags'][:15000], tokenizer, tag2idx)
val_dataset = NERDataset(ds['test']['tokens'], ds['test']['ner_tags'], tokenizer, tag2idx)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [8]:
next(iter(train_dataset))

{'input_ids': tensor([  691,  1078,  2688,  1480, 11063,  4265,  2832,   680, 26149,   680,
         23475,  1584,   680,  1062,   687,  5142,  2975,    51,    45,  1766,
           680, 10295,  1817,  7722,  6488,   765,   711,   687,  4255,  2043,
          2943,    51,    45,   691,  7068,  4569,   680, 15160,  2417,  4327,
           694,  1150,  2830,  8859,   680,  1474,   698,  4138,  2563,  2662,
          2942,    51,    45,  1747,   687,  2066,  3511,   890,   854,  4208,
           990,  7068,  2990,   680, 10390,   691,  2570,   902,   708,   687,
         13141,  7848,   753,  1818, 29222,  1300, 24322,   935,   707,    51,
            45,   990,   711, 28520,   691,  1150, 31914,   680,   694,  1150,
          2830,  8859,   687,  4255,  2043,  1854,   691,  3068,  9144,   680,
           687,   741,   979,   728,   691,  1261,  2734,  1056,   821,  4689,
           741,  1504,   687,  1285,   680, 39808,  1663,  4886,   706,   986,
          1078,  2417,  7610,  1753, 39

In [9]:

model = AutoModelForTokenClassification.from_pretrained("PartAI/TookaBERT-Base", num_labels=len(tags), label2id=tag2idx, id2label=idx2tag)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at PartAI/TookaBERT-Base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(48000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [10]:

epochs = 4
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

    model.eval()
    eval_loss = 0
    for batch in val_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            eval_loss += outputs.loss.item()

    avg_eval_loss = eval_loss / len(val_dataloader)
    print(f"Validation Loss: {avg_eval_loss}")

Epoch 1/4, Loss: 0.06101022660732269
Validation Loss: 0.03467475574422262
Epoch 2/4, Loss: 0.13634473085403442
Validation Loss: 0.016704145081787705
Epoch 3/4, Loss: 0.032689567655324936
Validation Loss: 0.012533964911238957
Epoch 4/4, Loss: 0.0015748543664813042
Validation Loss: 0.0061502935837281


Entity: ▁منصور, Label: B-pers
Entity: ▁گرگان, Label: B-loc
Entity: ▁روز, Label: B-event
Entity: ▁مزرعه, Label: I-event
Entity: ▁سویا, Label: I-event
Entity: ▁ایستگاه, Label: B-fac
Entity: ▁تحقیقات, Label: I-fac
Entity: ▁کشاورزی, Label: I-fac
Entity: ▁گرگان, Label: I-fac


In [11]:
def predict_ner(sentence):
    
    tokens = tokenizer.tokenize(sentence)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
    attention_mask = torch.tensor([[1] * len(input_ids[0])], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)
    predictions = predictions.detach().cpu().numpy()[0]

    predicted_tags = [idx2tag[pred] for pred in predictions]

    return list(zip(tokens, predicted_tags))

sentence = 'سید محمود محدث مدیر اکتشاف شرکت ملی نفت ایران در مصاحبه با واحد مرکزی خبر با اعلام این خبر افزود : با حفاریهای بیشتر در میدان نفتی چنگوله انتظار داریم ذخائر این میدان افزایش یابد .'

predicted_ner = predict_ner(sentence)


for token, tag in predicted_ner:
    print(f"{token}: {tag}")

▁سید: B-pers
▁محمود: I-pers
▁محدث: I-pers
▁مدیر: O
▁اکتشاف: B-org
▁شرکت: I-org
▁ملی: I-org
▁نفت: I-org
▁ایران: I-org
▁در: O
▁مصاحبه: O
▁با: O
▁واحد: B-org
▁مرکزی: I-org
▁خبر: I-org
▁با: O
▁اعلام: O
▁این: O
▁خبر: O
▁افزود: O
▁: O
:: O
▁با: O
▁حفاری: O
های: O
▁بیشتر: O
▁در: O
▁میدان: B-loc
▁نفتی: I-loc
▁چنگ: I-loc
وله: I-loc
▁انتظار: O
▁داریم: O
▁ذخائر: O
▁این: O
▁میدان: O
▁افزایش: O
▁یابد: O
▁.: O


In [45]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("NLPclass/Named-entity-recognition")
model = AutoModelForTokenClassification.from_pretrained("NLPclass/Named-entity-recognition")
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)


sentence = "گرگان ـ مراسم روز مزرعه سویا با هدف نظم بخشیدن به ارتباط محققان ، مروجان و کشاورزان سویاکار روز یکشنبه در ایستگاه تحقیقات کشاورزی گرگان برگزار شد و من منصور هستم "

predicted_ner = ner_pipeline(sentence)
for entity in predicted_ner:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}")

Entity: ▁گرگان, Label: B-loc
Entity: ▁روز, Label: B-event
Entity: ▁مزرعه, Label: I-event
Entity: ▁سویا, Label: I-event
Entity: ▁ایستگاه, Label: B-fac
Entity: ▁تحقیقات, Label: I-fac
Entity: ▁کشاورزی, Label: I-fac
Entity: ▁گرگان, Label: I-fac
Entity: ▁منصور, Label: B-pers


In [16]:
from huggingface_hub.hf_api import HfFolder,HfApi

HfFolder.save_token("")

In [17]:
api=HfApi()

In [28]:
repo_id=''
repo_url=api.create_repo(repo_id,exist_ok=True)

In [29]:
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

model.safetensors:   0%|          | 0.00/489M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NLPclass/Named-entity-recognition/commit/9772984c0d0f580bdef3a70566dfca67e0cd044b', commit_message='Upload tokenizer', commit_description='', oid='9772984c0d0f580bdef3a70566dfca67e0cd044b', pr_url=None, pr_revision=None, pr_num=None)