# Testing Different Monolingual Filipino and English Part of Speech (POS) Taggers

Import FilWordNet Corpus

In [5]:
import pandas as pd
import random

filword_corpus = pd.read_csv("processed_corpus_oct_2022.csv")

Generate random string from FilWordNet

In [7]:
randInd = random.randrange(len(filword_corpus))
filword_randtext = filword_corpus.text[randInd]

print(filword_randtext)

And medyo nasa side ng nsfw ngayon this maybe one time thing kaya YOLO


## ENGPOSTs Testing

### spaCy Testing

Import spaCy and model

In [8]:
import spacy
spacy_nlp = spacy.load("en_core_web_sm")

Generate POS Tags

In [9]:
def print_spacy(sentence):
    
    doc = spacy_nlp(sentence)
    
    for token in doc:
        print(token, ": ", token.pos_, ": ", spacy.explain(token.pos_))

print_spacy(filword_randtext)

And :  CCONJ :  coordinating conjunction
medyo :  VERB :  verb
nasa :  PROPN :  proper noun
side :  PROPN :  proper noun
ng :  PROPN :  proper noun
nsfw :  PROPN :  proper noun
ngayon :  VERB :  verb
this :  DET :  determiner
maybe :  ADV :  adverb
one :  NUM :  numeral
time :  NOUN :  noun
thing :  NOUN :  noun
kaya :  VERB :  verb
YOLO :  VERB :  verb


### Flair Testing (with FW tag)

Import Flair and tagger to use (pos-english)

In [10]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/pos-english")

2022-11-02 03:04:07,537 loading file /home/makorino/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-11-02 03:04:08,119 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


Generate POS Tags

In [11]:
# make example sentence
sentence = Sentence(filword_randtext)

# predict NER tags
tagger.predict(sentence)

# print sentence
print(sentence)

# print predicted NER spans
print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('pos'):
    print(entity)

Sentence: "And medyo nasa side ng nsfw ngayon this maybe one time thing kaya YOLO" → ["And"/CC, "medyo"/NN, "nasa"/NN, "side"/NN, "ng"/UH, "nsfw"/FW, "ngayon"/FW, "this"/DT, "maybe"/RB, "one"/CD, "time"/NN, "thing"/NN, "kaya"/NN, "YOLO"/NNP]
The following NER tags are found:


## FILPOSTs Testing

### FSPOST (Go & Nocon, 2017)

Use FSPOST pipeline

In [13]:
import os
import nltk
from nltk.tag.stanford import StanfordPOSTagger

# These are Windows formatted directories
#model = 'model//filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
#jar = 'lib//stanford-postagger.jar'

# These are Linux formatted directories
model = 'model/filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
jar = 'lib/stanford-postagger.jar'

fspost = StanfordPOSTagger(model, path_to_jar=jar)  # Load Tagger Model
fspost._SEPARATOR = '|'  # Set separator for proper tuple formatting (word, tag)

def set_java_path(file_path):
    """
    Function for setting java path to make Stanford POS Tagger work. Makes use of the 'os' library. Input "" to use
    default java path, otherwise set the location.
    Args:
        file_path (str): The java file path / location.
    """
    if file_path == "":
        java_path = "C:/Program Files/Java/jdk1.8.0_111/bin/java.exe"
        print("Java path set by default")
    else:
        java_path = file_path
        print("Java path set from given")
    os.environ['JAVAHOME'] = java_path

def tag_string(sentence):
    """
    Function for tagging a sentence/string. Output is a (word, pos) tuple. To output a POS-only string, enclose this
    function with 'format_pos' function. Ex. fspost.format_pos(fspost.tag_string('this is a string')). Same goes for
    Stanford's word|tag notation, use 'format_stanford' function.
    Args:
        sentence (str): The string to be tagged.
    Returns:
        tagged_string: a list of string tokens containing POS labeled (word, pos) tuples.
    """
    tokens = sentence.split()  # Tokenize Sentence by whitespaces
    # print(tokens)
    tagged_string = fspost.tag(tokens)
    return tagged_string

def tag_string_list(sentence_list):
    """
    Function for tagging a list of sentences. Output is a list of (word, pos) tuple. To output a POS-only string,
    enclose the elements in this function with 'format_pos' function. Same goes for Stanford's word|tag notation, use
    'format_stanford' function.
    Args:
        sentence_list (list): The list of strings to be tagged.
    Returns:
        tagged_list: a list of strings containing POS labelled (word, pos) tuples.
    """
    progress_ctr = 0
    tagged_list = []  # Initialize an empty list
    for sentence in sentence_list:
        tagged_tuple = tag_string(sentence)  # Tag each sentence in the list
        tagged_list.append(tagged_tuple)  # Insert tagged sentence in the new list
        progress_ctr += 1
        print(progress_ctr, "/", len(sentence_list))  # Progress Counter
    return tagged_list

[REQUIRED] Set JDK Path

In [14]:
# WINDOWS
# set_java_path("C:/Program Files/Java/jdk-19/bin/java.exe")

# LINUX
set_java_path("/usr/lib/jvm/java-11-openjdk-amd64/bin/")

Java path set from given


Generate POS Tags

In [15]:
tag_string(filword_randtext)

[('And', 'NNP'),
 ('medyo', 'JJC'),
 ('nasa', 'RBL'),
 ('side', 'FW'),
 ('ng', 'CCB'),
 ('nsfw', 'FW'),
 ('ngayon', 'RBW'),
 ('this', 'FW'),
 ('maybe', 'FW'),
 ('one', 'FW'),
 ('time', 'FW'),
 ('thing', 'FW'),
 ('kaya', 'CCR'),
 ('YOLO', 'NNPA')]

In [17]:
! python ./filipino-pos/main.py \
    --do_predict \
    --checkpoint checkpoint \
    --sentence 'ginagamit ang matematika sa agham .'

Traceback (most recent call last):
  File "./filipino-pos/main.py", line 173, in <module>
    main()
  File "./filipino-pos/main.py", line 150, in main
    with open(args.checkpoint + '/settings.bin', 'rb') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'checkpoint/settings.bin'
