## TODO:
1. Import ENGPOST (spaCy or CoreNLP)
2. Import FILPOST (FSPOST)
3. Test both POSTs
    - how to use?
    - accuracy

# spaCy Testing

In [1]:
import spacy

In [2]:
# Load the "en_core_web_sm" spaCy pipeline once; print_spacy() reads this module-level object.
spacy_nlp = spacy.load("en_core_web_sm")

In [3]:
def print_spacy(sentence):
    """
    Print the spaCy part-of-speech tag of every token in a sentence.

    One line is printed per token: the token, its coarse POS tag, and the description that
    spacy.explain() gives for that tag.
    Args:
        sentence (str): The string to be tagged by the module-level 'spacy_nlp' pipeline.
    """
    for tok in spacy_nlp(sentence):
        pos_tag = tok.pos_
        print(tok, ": ", pos_tag, ": ", spacy.explain(pos_tag))

# Sample sentence mixing Filipino and English words, run through the spaCy pipeline.
print_spacy('Hintayin po natin ang next train na paparating sa station')

Hintayin :  PROPN :  proper noun
po :  PROPN :  proper noun
natin :  X :  other
ang :  PROPN :  proper noun
next :  PROPN :  proper noun
train :  NOUN :  noun
na :  ADP :  adposition
paparating :  VERB :  verb
sa :  NOUN :  noun
station :  NOUN :  noun


# FSPOST Testing

In [5]:
import os
import nltk
from nltk.tag.stanford import StanfordPOSTagger

# Relative paths (resolved against the current working directory) to the FSPOST tagger model
# and the Stanford POS Tagger jar.
model = 'model//filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
jar = 'lib//stanford-postagger.jar'

# Module-level tagger instance shared by tag_string() and, through it, tag_string_list().
fspost = StanfordPOSTagger(model, path_to_jar=jar)  # Load Tagger Model
# NOTE(review): _SEPARATOR is a private attribute of the NLTK tagger; presumably it is the
# delimiter used to split the tagger's "word|tag" output -- confirm against the installed NLTK.
fspost._SEPARATOR = '|'  # Set separator for proper tuple formatting (word, tag)

def set_java_path(file_path=""):
    """
    Function for setting the java path to make Stanford POS Tagger work. Makes use of the 'os' library by
    storing the chosen path in the 'JAVAHOME' environment variable of the current process.

    Call with no argument, "" or None to use the default java path, otherwise pass the location.
    Args:
        file_path (str): The java file path / location. Any falsy value ("" or None) selects the
            built-in default path.
    """
    if not file_path:
        # Falsy covers both "" and None; None previously reached os.environ and raised TypeError.
        java_path = "C:/Program Files/Java/jdk1.8.0_111/bin/java.exe"
        print("Java path set by default")
    else:
        java_path = file_path
        print("Java path set from given")
    os.environ['JAVAHOME'] = java_path

def tag_string(sentence):
    """
    Function for tagging a sentence/string with the module-level 'fspost' tagger.

    The sentence is tokenized on whitespace only, so punctuation attached to a word stays part of
    that token.
    Args:
        sentence (str): The string to be tagged.
    Returns:
        tagged_string: a list of POS labeled (word, pos) tuples, one per token. An empty or
        whitespace-only sentence returns an empty list without calling the tagger.
    """
    tokens = sentence.split()  # Tokenize sentence by whitespace
    if not tokens:
        # Nothing to tag: skip the round trip through the Java-backed tagger.
        return []
    tagged_string = fspost.tag(tokens)
    return tagged_string

def tag_string_list(sentence_list):
    """
    Tag every sentence in a list, printing a running "done / total" counter after each one.

    Each sentence is tagged with 'tag_string', and the results are collected in input order.
    Args:
        sentence_list (list): The list of strings to be tagged.
    Returns:
        tagged_list: a list with one entry per input sentence, each entry being the list of
        (word, pos) tuples returned by 'tag_string'.
    """
    tagged_list = []
    for done, sentence in enumerate(sentence_list, start=1):
        tagged_list.append(tag_string(sentence))  # Tag and collect each sentence
        print(done, "/", len(sentence_list))  # Progress Counter
    return tagged_list

In [6]:
# TODO: Set PATH of your JDK
# The path below is machine-specific; replace it with the location of your own java executable.
set_java_path("C:/Program Files/Java/jdk-19/bin/java.exe")
# Last expression of the cell, so the notebook displays the returned list of (word, tag) tuples.
tag_string('Hintayin po natin ang next train na paparating sa station')

Java path set from given


[('Hintayin', 'VBOF'),
 ('po', 'RBS'),
 ('natin', 'PRP'),
 ('ang', 'DTC'),
 ('next', 'FW'),
 ('train', 'FW'),
 ('na', 'CCP'),
 ('paparating', 'VBTF'),
 ('sa', 'CCT'),
 ('station', 'FW')]

## Compare the performances of the two monolingual taggers

In [None]:
# Interactive comparison: tag each entered sentence with both taggers until the user types 'n'.
while True:
    sentence = input("Enter sentence ([n] to stop): ")

    if sentence == 'n':
        break

    print("\nspaCy:")
    print_spacy(sentence)
    print("\nFSPOST:")
    print(tag_string(sentence))

Enter sentence ([n] to stop): sana all

spaCy:
sana :  VERB :  verb
all :  PRON :  pronoun

FSPOST:
[('sana', 'VBS'), ('all', 'FW')]
