STEMMING AND LEMMATISATION

In [1]:
import nltk
nltk.download('wordnet') # คลังคำศัพท์ คำความหมายคล้ายกัน Synonym vocab
from nltk.stem.porter import PorterStemmer #import stemming function
from nltk.stem.wordnet import WordNetLemmatizer #import Lemmatisation fuction
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def stem_vs_lemma(text):
    print(f"{'word':<12}\t{'lemma':<12}\t{'stem':<12}")
    for word in text:
        print(f'{word:12}\t{lemma.lemmatize(word):12}\t{stemmer.stem(word):12}')

In [3]:
word_list = ['fly', 'flies', 'flying', 'flew', 'flown']
word_list2 = ['dance', 'dancing', 'danced',]

In [4]:
stem_vs_lemma(word_list)

word        	lemma       	stem        
fly         	fly         	fli         
flies       	fly         	fli         
flying      	flying      	fli         
flew        	flew        	flew        
flown       	flown       	flown       


In [5]:
stem_vs_lemma(word_list2)

word        	lemma       	stem        
dance       	dance       	danc        
dancing     	dancing     	danc        
danced      	danced      	danc        


Stopword Removal

In [6]:
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:5]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we']

In [7]:
def stopwords_removal(text):
    return [i for i in text if i not in stopwords]

In [8]:
text1 = "When your legs don't work like they used to before And I can't sweep you off of your feet Will your mouth still remember the taste of my love Will your eyes still smile from your cheeks"

In [9]:
' '.join(stopwords_removal(text1.split()))

"When legs work like used And I can't sweep feet Will mouth still remember taste love Will eyes still smile cheeks"

Normalisation
-   2moro, 2mrrw , 2mrw , tomrw -> tomorrow
-   otw -> on the way , FYI -> for your information, TTYL -> talk to you later
-    emoji -> word , :), :D ,:-) -> smile

In [10]:
norm_dict = {'2moro':'tomorrow',
             '2mrrw':'tomorrow',
             '2morrow':'tomorrow',
             'tmorw':'tomorrow',
             'b4':'before',
             'otw':'on the way',
             ':)':'smile',}

In [11]:
def nomalise(text):
    return [norm_dict[i] for i in text if i in norm_dict.keys() ]

In [12]:
text_list = ['2moro','2mrrw','2morrow','otw']
nomalise(text_list)

['tomorrow', 'tomorrow', 'tomorrow', 'on the way']

Noise Removal
-   clean all noise such as # .. ? <> // 

In [13]:
import pandas as pd
import re

In [14]:
def noise_removal(text):
    text = re.sub("(<.*?>)","",text)
    text = re.sub("(\\W|\\d)"," ",text)
    text = text.strip() #remove whitespace
    return text


In [15]:
raw_data = 'this is <a> banggg**?'
noise_removal(raw_data)

'this is  banggg'

Text enrichment

In [16]:
from nltk.corpus import wordnet
synonym = wordnet.synsets("program") # find synonym or related word of "program"
synonym

[Synset('plan.n.01'),
 Synset('program.n.02'),
 Synset('broadcast.n.02'),
 Synset('platform.n.02'),
 Synset('program.n.05'),
 Synset('course_of_study.n.01'),
 Synset('program.n.07'),
 Synset('program.n.08'),
 Synset('program.v.01'),
 Synset('program.v.02')]

In [17]:
synonym2 = wordnet.synsets("game")
synonym2

[Synset('game.n.01'),
 Synset('game.n.02'),
 Synset('game.n.03'),
 Synset('game.n.04'),
 Synset('game.n.05'),
 Synset('game.n.06'),
 Synset('game.n.07'),
 Synset('plot.n.01'),
 Synset('game.n.09'),
 Synset('game.n.10'),
 Synset('game.n.11'),
 Synset('bet_on.v.01'),
 Synset('crippled.s.01'),
 Synset('game.s.02')]

TOKENIZATION

In [18]:
from pythainlp import word_tokenize

In [19]:
text = "เหตุเกิดจากความเหงา ที่ทำให้รู้ว่ารักเธอเท่าไหร่"

DICTIONARY BASE

In [20]:
print("logest",word_tokenize(text, engine="longest"))
print("maximum",word_tokenize(text, engine="mm"))
print("new maximum",word_tokenize(text, engine="newmm"))
print("new maximum-safe",word_tokenize(text, engine="newmm-safe"))
#print("icu",word_tokenize(text, engine="icu"))

logest ['เหตุ', 'เกิด', 'จาก', 'ความเหงา', ' ', 'ที่', 'ทำให้', 'รู้', 'ว่า', 'รัก', 'เธอ', 'เท่าไหร่']
maximum ['เหตุ', 'เกิด', 'จากความเหงา', ' ', 'ที่', 'ทำให้', 'รู้', 'ว่า', 'รัก', 'เธอ', 'เท่าไหร่']
new maximum ['เหตุ', 'เกิด', 'จาก', 'ความเหงา', ' ', 'ที่', 'ทำให้', 'รู้', 'ว่า', 'รัก', 'เธอ', 'เท่าไหร่']
new maximum-safe ['เหตุ', 'เกิด', 'จาก', 'ความเหงา', ' ', 'ที่', 'ทำให้', 'รู้', 'ว่า', 'รัก', 'เธอ', 'เท่าไหร่']


MACHINE LEARNING BASED

In [21]:
print("deepcut",word_tokenize(text, engine="deepcut"))
#print("attacut",word_tokenize(text, engine="attacut"))



deepcut ['เหตุ', 'เกิด', 'จาก', 'ความ', 'เหงา', ' ', 'ที่', 'ทำ', 'ให้', 'รู้', 'ว่า', 'รัก', 'เธอ', 'เท่า', 'ไหร่']


In [22]:
#%%timeit
print("deepcut",word_tokenize(text, engine="deepcut"))


deepcut ['เหตุ', 'เกิด', 'จาก', 'ความ', 'เหงา', ' ', 'ที่', 'ทำ', 'ให้', 'รู้', 'ว่า', 'รัก', 'เธอ', 'เท่า', 'ไหร่']


Part Of Speech tagging

In [23]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [24]:
sentence = 'I love Opall'
token = nltk.word_tokenize(sentence)
token

['I', 'love', 'Opall']

In [26]:
nltk.pos_tag(token)

[('I', 'PRP'), ('love', 'VBP'), ('Opall', 'RB')]

In [27]:
from pythainlp import sent_tokenize, word_tokenize

In [28]:
text = 'ฉันรักธรรมศาสตร์เพราะธรรมศาสตร์สอนให้ฉันรักประชาชน'

In [31]:
sent = word_tokenize(text,engine='newmm')
sent

['ฉัน',
 'รัก',
 'ธรรมศาสตร์',
 'เพราะ',
 'ธรรมศาสตร์',
 'สอน',
 'ให้',
 'ฉัน',
 'รัก',
 'ประชาชน']

In [32]:
from pythainlp.tag import pos_tag, pos_tag_sents

In [33]:
pos_tag(sent) # Default engine = "perceptron" corpus = "orchid"

[('ฉัน', 'PPRS'),
 ('รัก', 'VACT'),
 ('ธรรมศาสตร์', 'NCMN'),
 ('เพราะ', 'JSBR'),
 ('ธรรมศาสตร์', 'NCMN'),
 ('สอน', 'VACT'),
 ('ให้', 'JSBR'),
 ('ฉัน', 'PPRS'),
 ('รัก', 'VACT'),
 ('ประชาชน', 'NCMN')]

In [34]:
pos_tag(sent, corpus="pud") # pud = Parallel Universal Dependencies

[('ฉัน', 'PRON'),
 ('รัก', 'VERB'),
 ('ธรรมศาสตร์', 'NOUN'),
 ('เพราะ', 'ADP'),
 ('ธรรมศาสตร์', 'NOUN'),
 ('สอน', 'VERB'),
 ('ให้', 'VERB'),
 ('ฉัน', 'PRON'),
 ('รัก', 'VERB'),
 ('ประชาชน', 'NOUN')]

NER Name Entities Recognition

In [4]:
text = "The UK will host the 26th UN Climate Change Conference of the Parties (COP26) in Glasglow on 31 Octorber - 12 November 2021"

NLTK

In [1]:
import nltk
nltk.download('punkt')
nltk.download('average_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading average_perceptron_tagger: Package
[nltk_data]     'average_perceptron_tagger' not found in index
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\peaks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [5]:
for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk,'label'):
            print(chunk.label(),' '.join(c[0] for c in chunk))

ORGANIZATION UK
PERSON Climate Change Conference
GPE Parties
ORGANIZATION COP26
GPE Glasglow


SPACY

In [6]:
import spacy

In [19]:
nlp_spacy = spacy.load("en_core_web_sm")
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text:<50}\t{ent.label_:<15}")

UK                                                	GPE            
26th                                              	ORDINAL        
UN Climate Change Conference of the Parties       	EVENT          
Glasglow                                          	GPE            
31                                                	CARDINAL       
November 2021                                     	DATE           


BERT

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline




In [20]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
nlp_BERT = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
result = nlp_BERT(text)
for entity in result:
    print(entity)

{'entity': 'B-LOC', 'score': 0.999511, 'index': 2, 'word': 'UK', 'start': 4, 'end': 6}
{'entity': 'B-MISC', 'score': 0.90101033, 'index': 7, 'word': 'UN', 'start': 26, 'end': 28}
{'entity': 'I-MISC', 'score': 0.98907846, 'index': 8, 'word': 'Climate', 'start': 29, 'end': 36}
{'entity': 'I-MISC', 'score': 0.97872615, 'index': 9, 'word': 'Change', 'start': 37, 'end': 43}
{'entity': 'I-MISC', 'score': 0.9932528, 'index': 10, 'word': 'Conference', 'start': 44, 'end': 54}
{'entity': 'I-MISC', 'score': 0.9913018, 'index': 11, 'word': 'of', 'start': 55, 'end': 57}
{'entity': 'I-MISC', 'score': 0.99368155, 'index': 12, 'word': 'the', 'start': 58, 'end': 61}
{'entity': 'I-MISC', 'score': 0.98914206, 'index': 13, 'word': 'Parties', 'start': 62, 'end': 69}
{'entity': 'B-MISC', 'score': 0.98339003, 'index': 15, 'word': 'CO', 'start': 71, 'end': 73}
{'entity': 'I-MISC', 'score': 0.8735296, 'index': 16, 'word': '##P', 'start': 73, 'end': 74}
{'entity': 'I-MISC', 'score': 0.86632967, 'index': 17, 'wo

PyThaiNLP

In [23]:
#from pythainlp.tag.named_entity import ThaiNameTagger
#ner = ThaiNameTagger()
text = 'สวัสดีครับ เราเคยรู้จักกันหรือเปล่า หน้าตาคุ้นๆ แค่ผมมองคุณยังไม่ค่อยชัด'
#ner.get_ner(text)
