<a href="https://colab.research.google.com/github/overfit-ir/persian-twitter-ner/blob/master/ner-survey-models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [3]:
# Quietly install the Hugging Face transformers library and sentencepiece.
# NOTE(review): sentencepiece is presumably needed by the ALBERT tokenizer
# loaded below — confirm. Neither package is version-pinned, so a fresh run
# may pick up different releases than the ones that produced the saved outputs.
!pip -q install transformers
!pip  -q install sentencepiece

[K     |████████████████████████████████| 1.8MB 8.5MB/s 
[K     |████████████████████████████████| 2.9MB 50.5MB/s 
[K     |████████████████████████████████| 890kB 50.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2MB 6.7MB/s 
[?25h

In [12]:
import pandas as pd
import numpy as np
from transformers import (
    pipeline, 
    AutoConfig, 
    AutoTokenizer, 
    AutoModel, 
    AutoModelForTokenClassification
)

In [5]:
# Load the tokenizer and the token-classification model from one checkpoint id.
checkpoint = "m3hrdadfi/albert-fa-base-v2-ner"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# Inference only, so put the model in eval mode. The trailing print() is
# presumably there so the cell ends on a statement and the value returned by
# model.eval() is not echoed as cell output.
model.eval()
print()




In [8]:
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data1.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data2.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data3.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data4.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data5.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data6.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data7.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data8.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data9.txt'
!wget -q --show-progress 'https://raw.githubusercontent.com/overfit-ir/persian-twitter-ner/master/twitter_data/persian-ner-twitter-data/persian-ner-twitter-data10.txt'
!mkdir data && mv persian-ner* data/



# Convert to Text

In [9]:
from pathlib import Path
import re

def convert_lines_to_text(file_path):
    """Read a ``token<TAB>tag`` file into parallel token and tag lists.

    The file holds one ``token\\ttag`` pair per line. Documents are separated
    by a blank line, which may itself contain stray spaces or tabs.

    Args:
        file_path: Location of the annotated file (``str`` or ``pathlib.Path``).

    Returns:
        A ``(token_docs, tag_docs)`` tuple of equally long lists.
        ``token_docs[i]`` is the list of tokens of the i-th document and
        ``tag_docs[i]`` the list of tags aligned with it. Both lists are
        empty when the file contains no annotated lines.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    file_path = Path(file_path)

    # Decode explicitly as UTF-8: the data is Persian text and the platform
    # default encoding is not guaranteed to be UTF-8.
    raw_text = file_path.read_text(encoding='utf-8').strip()

    token_docs = []
    tag_docs = []
    # A blank or whitespace-only line terminates a document.
    for doc in re.split(r'\n[ \t]*\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if not line.strip():
                # Leftover separator lines, and the single empty chunk that
                # an empty file produces.
                continue
            # Split on the last tab so the tag is always the final column.
            token, sep, tag = line.rpartition('\t')
            if not sep:
                raise ValueError(
                    f'expected "token<TAB>tag" in {file_path}, got {line!r}'
                )
            tokens.append(token)
            tags.append(tag)
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

In [10]:
# Parse the first data file into per-document token lists and tag lists.
first_file = 'data/persian-ner-twitter-data1.txt'
texts, tags = convert_lines_to_text(first_file)

In [14]:
# Show the first document: its tokens on one line, the aligned tags on the next.
print(texts[0], tags[0], sep='\n')

['خرداد', '1399', '«', 'کیهان', '»', 'میراث', 'سناتور', 'مصطفی', 'مصباح\u200cزاده', '78', 'ساله', 'شد', 'کیهان', 'لندن', 'تنها', 'رسانه', 'ایرانی', 'در', 'تبعید', 'است', 'که', '«', 'از', 'جنگ', 'دوم', 'جهانی', 'تاکنون', 'قدمت', 'و', 'یک', 'موسسه', 'غصب', 'شده', 'در', 'تهران', '»', 'دارد']
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'B-ORG', 'B-LOC', 'O', 'O', 'B-NAT', 'O', 'O', 'O', 'O', 'O', 'O', 'B-EVE', 'I-EVE', 'I-EVE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O']


In [15]:
# Rebuild the first document as one string: every token is followed by a
# single space, so the result ends with a trailing space.
s = ''.join(f'{word} ' for word in texts[0])
s

'خرداد 1399 « کیهان » میراث سناتور مصطفی مصباح\u200cزاده 78 ساله شد کیهان لندن تنها رسانه ایرانی در تبعید است که « از جنگ دوم جهانی تاکنون قدمت و یک موسسه غصب شده در تهران » دارد '

In [16]:
# Inspect the subword pieces the tokenizer produces for the sentence.
# In the saved output the pieces do not line up one-to-one with the dataset
# tokens: digits and guillemets are preceded by a separate '▁' piece and the
# word containing a zero-width non-joiner is split into two pieces.
tokenizer.tokenize(s)

['▁خرداد',
 '▁',
 '1399',
 '▁',
 '«',
 '▁کیهان',
 '▁',
 '»',
 '▁میراث',
 '▁سناتور',
 '▁مصطفی',
 '▁مصباح',
 '▁زاده',
 '▁',
 '78',
 '▁ساله',
 '▁شد',
 '▁کیهان',
 '▁لندن',
 '▁تنها',
 '▁رسانه',
 '▁ایرانی',
 '▁در',
 '▁تبعید',
 '▁است',
 '▁که',
 '▁',
 '«',
 '▁از',
 '▁جنگ',
 '▁دوم',
 '▁جهانی',
 '▁تاکنون',
 '▁قدمت',
 '▁و',
 '▁یک',
 '▁موسسه',
 '▁غصب',
 '▁شده',
 '▁در',
 '▁تهران',
 '▁',
 '»',
 '▁دارد']

# Benchmark

In [17]:
# Bundle the loaded model and tokenizer into an 'ner' pipeline.
albert_ner = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
)

In [18]:
# Tag the rebuilt sentence. In the saved output each hit is a dict with the
# matched piece ('word'), its label ('entity'), a 'score', the piece 'index'
# and the 'start'/'end' character offsets.
# NOTE(review): the labels are spelled out ('B-organization', 'B-person', ...)
# while the dataset tags are abbreviated ('B-ORG', 'B-PER', ...) — presumably
# they need mapping before the two can be compared.
albert_ner(s)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'end': 18,
  'entity': 'B-organization',
  'index': 6,
  'score': 0.6452239155769348,
  'start': 13,
  'word': '▁کیهان'},
 {'end': 39,
  'entity': 'B-person',
  'index': 11,
  'score': 0.9977455139160156,
  'start': 34,
  'word': '▁مصطفی'},
 {'end': 45,
  'entity': 'I-person',
  'index': 12,
  'score': 0.9973155856132507,
  'start': 40,
  'word': '▁مصباح'},
 {'end': 50,
  'entity': 'I-person',
  'index': 13,
  'score': 0.8139748573303223,
  'start': 46,
  'word': '▁زاده'},
 {'end': 67,
  'entity': 'B-organization',
  'index': 18,
  'score': 0.8885950446128845,
  'start': 62,
  'word': '▁کیهان'},
 {'end': 72,
  'entity': 'I-organization',
  'index': 19,
  'score': 0.8778548240661621,
  'start': 68,
  'word': '▁لندن'},
 {'end': 165,
  'entity': 'B-location',
  'index': 41,
  'score': 0.9980483055114746,
  'start': 160,
  'word': '▁تهران'}]

# Benchmark 2

In [None]:
# Drop lines starting with '#', keep tab-separated columns 2 and 3, and turn
# the tab between them into a space. Reads data/dev.txt.tmp and writes
# ./dev.txt.tmp.
!grep -v "^#" data/dev.txt.tmp | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
# !cat NER-de-dev.tsv | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
# !cat NER-de-test.tsv | grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp

In [None]:
# Clone the transformers repository and install the package from that clone.
# The clone also supplies the files under transformers/examples/ that later
# cells reference.
# NOTE(review): no tag or commit is checked out, so the example paths used
# later depend on whatever the default branch looks like at clone time.
!git clone https://github.com/huggingface/transformers
!pip install transformers/

In [None]:
# Install sentencepiece for this section.
# NOTE(review): the Setup section already installs it — presumably repeated
# so this section can be run on its own; confirm or drop.
!pip install sentencepiece

In [None]:
# Download preprocess.py from the stefan-it/fine-tuned-berts-seq repository;
# it is run on the dev split in the next step.
!wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"

In [None]:
# Run the downloaded preprocess.py on dev.txt.tmp with the ParsBERT checkpoint
# name and 128 as arguments, capturing its stdout as dev.txt. Only the dev
# split is processed; the train/test lines are left disabled.
# NOTE(review): 128 is presumably the maximum sequence length (it matches the
# --max_seq_length passed to run_ner.py) — confirm against preprocess.py.
# !python3 preprocess.py train.txt.tmp HooshvareLab/bert-base-parsbert-uncased 128 > train.txt
!python3 preprocess.py dev.txt.tmp HooshvareLab/bert-base-parsbert-uncased 128 > dev.txt
# !python3 preprocess.py test.txt.tmp m3hrdadfi/albert-fa-base-v2-ner 128 > test.txt

In [None]:
# Install the requirements listed for the token-classification example in the
# cloned repository.
# NOTE(review): this path has no `legacy/` component, whereas run_ner.py is
# taken from transformers/examples/legacy/token-classification/ — confirm the
# requirements file exists at this path in the cloned revision.
!pip install -r transformers/examples/token-classification/requirements.txt

In [None]:
# Move the preprocessed dev split into data/, the directory later passed to
# run_ner.py as --data_dir.
!mv dev.txt data/

In [None]:
# Collect the distinct, non-empty values of the second space-separated column
# of data/dev.txt into labels.txt (sorted, one per line).
!cut -d " " -f 2 data/dev.txt | grep -v "^$" | sort | uniq > labels.txt

In [None]:
# Run the legacy run_ner.py example in evaluation-only mode (--do_eval, no
# --do_train) on data/ with the labels from labels.txt, writing to eval/.
# NOTE(review): no training happens here — confirm that the named checkpoint
# already carries a token-classification head matching labels.txt, otherwise
# the scores come from an untrained classifier layer.
!python3 transformers/examples/legacy/token-classification/run_ner.py --data_dir data/ \
--labels labels.txt \
--model_name_or_path HooshvareLab/bert-base-parsbert-uncased \
--output_dir eval/ \
--max_seq_length  128 \
--save_steps 750 \
--seed 1 \
--do_eval

In [None]:
# Install the conllu package.
# NOTE(review): nothing visible in this notebook imports conllu — presumably
# it is a dependency of the run_ner.py example; confirm.
!pip install conllu

In [None]:
import pandas as pd

In [None]:
# Rewrite data-fa/train.csv in place, replacing every comma with a tab.
# NOTE(review): this is a plain text replacement, so commas inside quoted
# fields are converted too — confirm the file has no such fields.
with open('data-fa/train.csv', 'r+', encoding='utf-8') as file:
  tsv_text = file.read().replace(',', '\t')
  # Rewind before writing. The previous `file.seek = 0` was an attribute
  # assignment rather than a call, so the cursor was never moved back to the
  # start of the file.
  file.seek(0)
  file.write(tsv_text)
  # Drop anything left beyond the text just written.
  file.truncate()

In [None]:
# Print data-fa/train.csv to inspect the result of the rewrite.
# NOTE(review): this prints the whole file into the cell output; a `head`
# would be enough for a sanity check if the file is large.
!cat data-fa/train.csv

In [None]:
# Download the train/dev/test files from Google Drive and reduce each to
# "column2 column3" lines (lines starting with '#' removed, tab -> space).
#
# The sharing links of the form /file/d/<id>/view return the HTML viewer page
# rather than the file, so the pipelines would be fed HTML. The same file ids
# are requested through the direct-download endpoint instead.
# NOTE(review): for large files Drive may answer with a confirmation page
# instead of the content — check the first lines of the *.tmp outputs.
!curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
!curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
!curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp