# NLTK

In [None]:
import nltk

# Data packages fetched for the NLTK example: tokenizer, POS tagger,
# named-entity chunker, and the word-list corpus.
NLTK_RESOURCES = ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words')

# nltk.download() signals failure through its return value rather than raising,
# so collect every package that failed and stop here instead of letting a later
# cell fail with a LookupError.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
  raise RuntimeError(f"NLTK resources failed to download: {failed_resources}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
# Sample English sentence used as NER input.
text = (
    "The UK will host the 26th UN Climate Change of the Parties (COP26) "
    "in Glasgow on 31 October - 12 November 2021"
)

In [None]:
# Tokenize -> POS-tag -> NE-chunk each sentence, then print every labelled chunk
# as "<label> <words joined by spaces>".
for sentence in nltk.sent_tokenize(text):
  tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
  for node in nltk.ne_chunk(tagged_tokens):
    # Only nodes exposing `label` are reported; everything else is skipped.
    if not hasattr(node, 'label'):
      continue
    entity_words = ' '.join(leaf[0] for leaf in node)
    print(node.label(), entity_words)

ORGANIZATION UK
PERSON Climate Change
GPE Parties
ORGANIZATION COP26
GPE Glasgow


# SPACY

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')
text = "The UK will host the 26th UN Climate Change of the Parties (COP26) in Glasgow on 31 October - 12 November 2021"
doc = nlp(text)

# One shared layout for the header and every entity row: a 50-wide column
# followed by three 15-wide columns, all left-aligned and tab-separated.
row_layout = "{:<50}\t{:<15}\t{:<15}\t{:<15}"
print(row_layout.format('Entity', 'Start', 'End', 'Label'))
for ent in doc.ents:
  print(row_layout.format(ent.text, ent.start_char, ent.end_char, ent.label_))

Entity                                            	Start          	End            	Label          
UK                                                	4              	6              	GPE            
26th                                              	21             	25             	ORDINAL        
UN Climate Change of the Parties                  	26             	58             	ORG            
Glasgow                                           	70             	77             	GPE            
31 October - 12 November 2021                     	81             	110            	DATE           


# BERT

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.15.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Tokenizer and token-classification model are both loaded from the same checkpoint id.
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

# NOTE(review): `nlp` is also the name the spaCy cell binds to its pipeline;
# this assignment rebinds it to the transformers NER pipeline.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
text = "The UK will host the 26th UN Climate Change of the Parties (COP26) in Glasgow on 31 October - 12 November 2021"


In [None]:
# Run the NER pipeline on the sample sentence and show one prediction per line.
ner_results = nlp(text)
if ner_results:
  # Guarded so that an empty result prints nothing at all.
  print("\n".join(str(prediction) for prediction in ner_results))

{'entity': 'B-LOC', 'score': 0.99947864, 'index': 2, 'word': 'UK', 'start': 4, 'end': 6}
{'entity': 'B-MISC', 'score': 0.8333227, 'index': 7, 'word': 'UN', 'start': 26, 'end': 28}
{'entity': 'I-MISC', 'score': 0.97171885, 'index': 8, 'word': 'Climate', 'start': 29, 'end': 36}
{'entity': 'I-MISC', 'score': 0.7681219, 'index': 9, 'word': 'Change', 'start': 37, 'end': 43}
{'entity': 'I-MISC', 'score': 0.89536256, 'index': 10, 'word': 'of', 'start': 44, 'end': 46}
{'entity': 'I-MISC', 'score': 0.9358373, 'index': 11, 'word': 'the', 'start': 47, 'end': 50}
{'entity': 'I-MISC', 'score': 0.90816504, 'index': 12, 'word': 'Parties', 'start': 51, 'end': 58}
{'entity': 'B-MISC', 'score': 0.9764014, 'index': 14, 'word': 'CO', 'start': 60, 'end': 62}
{'entity': 'B-LOC', 'score': 0.99941874, 'index': 19, 'word': 'Glasgow', 'start': 70, 'end': 77}


# PyThaiNLP

In [None]:
!pip3 install pythainlp[ner]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install python-crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-crfsuite
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.9


In [None]:
from pythainlp.tag import NER

# Named-entity tagger created with the "thainer" argument.
ner = NER("thainer")

# Thai sample sentence, split across lines at existing spaces; the pieces
# concatenate back to the exact original string.
text = (
    "กรมอุตุนิยมวิทยาประกาศ "
    "ประเทศไทยจะเข้าสู่ฤดูหนาวอย่างเป็นทางการในวันที่ 2 พฤศจิกายน 2564 "
    "และ จะสิ้นสุดประมาณปลายเดือนกุมภาพันธ์ 2565"
)

# Last expression of the cell, so the tagging result is displayed as the cell output.
ner.tag(text)

[('กรมอุตุนิยมวิทยา', 'B-ORGANIZATION'),
 ('ประกาศ', 'O'),
 (' ', 'O'),
 ('ประเทศ', 'B-LOCATION'),
 ('ไทย', 'I-LOCATION'),
 ('จะ', 'O'),
 ('เข้าสู่', 'O'),
 ('ฤดูหนาว', 'O'),
 ('อย่าง', 'O'),
 ('เป็นทางการ', 'O'),
 ('ใน', 'O'),
 ('วันที่', 'O'),
 (' ', 'O'),
 ('2', 'B-DATE'),
 (' ', 'I-DATE'),
 ('พฤศจิกายน', 'I-DATE'),
 (' ', 'I-DATE'),
 ('2564', 'I-DATE'),
 (' ', 'O'),
 ('และ', 'O'),
 (' ', 'O'),
 ('จะ', 'O'),
 ('สิ้น', 'O'),
 ('สุดประมาณ', 'O'),
 ('ปลายเดือน', 'B-DATE'),
 ('กุมภาพันธ์', 'I-DATE'),
 (' ', 'I-DATE'),
 ('2565', 'I-DATE')]