# Testing Different Monolingual Filipino and English Part of Speech (POS) Taggers

### PLEASE TAKE NOTE!!!
- [IMPORTANT] Always restart the kernel, clear all outputs, and save before exiting to avoid git conflicts.
- The current formatting of this .ipynb is not final; it will be reformatted when testing sample data from FilWordNet.

In [None]:
import nltk
import string
import pandas as pd
import numpy as np
from lingua import Language, LanguageDetectorBuilder #used for language identification

Initialize the language identification model

In [None]:
# Restrict detection to the two languages this notebook tags.
languages = [Language.ENGLISH, Language.TAGALOG]
# Build the lingua detector once; get_lang_per_token calls it for every token.
detector = LanguageDetectorBuilder.from_languages(*languages).build()

Initialize the dataframes that will hold the sentences and their POS tags

In [None]:
# Column layout shared by every result dataframe: the sentence text plus three
# per-token sequences (generalized tag, tagger-specific tag, tagset name).
df_format = {
    "text": [],
    "general_tags": [],
    "specific_tags": [],
    "token_tagset": []
}

In [None]:
# Result tables for the Flair-FSPOST ("ff") pairing, one per combination strategy.
tagged_texts_combi1_ff = pd.DataFrame(df_format)
tagged_texts_combi2_ff = pd.DataFrame(df_format)

# Result tables for the Spacy-FSPOST ("sf") pairing, one per combination strategy.
tagged_texts_combi1_sf = pd.DataFrame(df_format)
tagged_texts_combi2_sf = pd.DataFrame(df_format)

# Show the still-empty tables to confirm the column layout.
display(tagged_texts_combi1_ff)
display(tagged_texts_combi2_ff)

display(tagged_texts_combi1_sf)
display(tagged_texts_combi2_sf)

## Loading the test data

Let us load the .json input file

In [None]:
# Load the test data from "input_data.json" in the working directory.
# NOTE(review): the cell that consumes this frame reads a "token" and a "tag" key from every
# non-null cell, so each cell is presumably a dict with those keys — confirm against the file.
input_dataframe = pd.read_json("input_data.json")
display(input_dataframe)

In [None]:
# Scratch lists reused for every row, plus the final list of sentence strings.
tokens_temp = []
tags_temp = []
input_sentence = []

for i in range(len(input_dataframe)):
    # Start each row with empty scratch lists.
    tokens_temp.clear()
    tags_temp.clear()

    row = input_dataframe.iloc[i]

    # count() skips missing cells, so only the populated cells of the row are visited.
    for j in range(row.count()):
        cell = row[j]
        tokens_temp.append(cell["token"])
        # NOTE(review): tags_temp is filled here but not read anywhere in the visible notebook.
        tags_temp.append(cell["tag"])

    # Rebuild the sentence as a single whitespace-separated string.
    sentence_temp = ' '.join(map(str, tokens_temp))

    input_sentence.append(sentence_temp)

## POS TAGGERS

Let us import the monolingual taggers: Flair and spaCy as the English POS taggers, and FSPOST as the Filipino POS tagger.

In [None]:
# FLAIR POS TAGGER
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the "flair/pos-english" model once; eng_tagger_flair reuses this instance for every call.
flair_tagger = SequenceTagger.load("flair/pos-english")

In [None]:
# SPACY POS TAGGER
import spacy

# Load the "en_core_web_sm" pipeline once; eng_tagger_spacy reuses this instance for every call.
spacy_tagger = spacy.load("en_core_web_sm")

In [None]:
# FSPOST POS TAGGER
import os
from nltk.tag.stanford import StanfordPOSTagger

# Model and jar locations, Windows-style (kept for reference, disabled)
#model = 'model//filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
#jar = 'lib//stanford-postagger.jar'

# Model and jar locations, Linux-style; both are relative to the working directory
model = 'model/filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
jar = 'lib/stanford-postagger.jar'

fspost = StanfordPOSTagger(model, path_to_jar=jar)  # Load Tagger Model
fspost._SEPARATOR = '|'  # Set separator for proper tuple formatting (word, tag)

def set_java_path(file_path):
    """
    Record the Java location used by the Stanford POS Tagger in the 'JAVAHOME' environment variable (via 'os').
    Pass "" to fall back to the built-in default location; any other value is stored exactly as given.
    Args:
        file_path (str): The java file path / location, or "" for the default.
    """
    use_default = file_path == ""
    java_path = "C:/Program Files/Java/jdk1.8.0_111/bin/java.exe" if use_default else file_path
    # Report which source the path came from before exporting it.
    print("Java path set by default" if use_default else "Java path set from given")
    os.environ['JAVAHOME'] = java_path

def tag_string(sentence):
    """
    Tag a single sentence/string with the FSPOST tagger.
    The sentence is tokenized on whitespace only, so punctuation attached to a word stays part of that token.
    Args:
        sentence (str): The string to be tagged.
    Returns:
        The result of fspost.tag for the token list; callers in this notebook index it as a list of
        (word, pos) tuples.
    """
    return fspost.tag(sentence.split())

def tag_string_list(sentence_list):
    """
    Tag every sentence in a list with 'tag_string', printing a "done / total" progress line after each one.
    Args:
        sentence_list (list): The list of strings to be tagged.
    Returns:
        tagged_list: one 'tag_string' result per input sentence, in input order.
    """
    tagged_list = []
    for progress_ctr, sentence in enumerate(sentence_list, start=1):
        tagged_list.append(tag_string(sentence))
        print(progress_ctr, "/", len(sentence_list))  # Progress Counter
    return tagged_list

In [None]:
# WINDOWS (disabled): path to the java executable
# set_java_path("C:/Program Files/Java/jdk-19/bin/java.exe")

# LINUX: path to the JDK's bin directory.
# NOTE(review): this is a directory, not the executable itself — confirm NLTK resolves it on the target machine.
set_java_path("/usr/lib/jvm/java-11-openjdk-amd64/bin/")

### Create functions to be used for POS Tagging

In [None]:
# Returns the eng POS tag (Flair version)
def eng_tagger_flair(input_string):
    """Wrap input_string in a Flair Sentence, run flair_tagger on it in place, and return the tagged Sentence."""
    tagged_sentence = Sentence(input_string)
    flair_tagger.predict(tagged_sentence)
    return tagged_sentence

In [None]:
def eng_tagger_spacy(input_string):
    """Run the loaded spaCy pipeline on input_string and return its result (callers read .tag_ per token)."""
    doc = spacy_tagger(input_string)
    return doc

In [None]:
# Returns the fil POS tag
def fil_tagger(input_string):
    """Tag input_string with the Filipino (FSPOST) tagger by delegating to tag_string."""
    tagged = tag_string(input_string)
    return tagged

In [None]:
# Generalized tag -> the specific English tags that collapse into it.
# Groups are checked in order, mirroring the original branch order.
_ENG_TAG_GROUPS = (
    ("NOUN", ("NN", "NNS")),
    ("PROPN", ("NNP", "NNPS")),
    ("PR", ("PRP", "PRP$", "WP", "WP$")),
    ("DT", ("DT", "WDT")),
    ("CONJ", ("CC",)),
    ("IN", ("IN",)),
    ("VB", ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")),
    ("JJ", ("JJ", "JJR", "JJS")),
    ("CD", ("CD",)),
    ("RB", ("RB", "RBR", "RBS", "WRB", "RP")),
    ("UH", ("UH",)),
    ("FW", ("FW",)),
    ("PUNC", (".", ",", ":", "NFP", "(", ")", "''", '""', "``", "`", "-RRB-", "-LRB-")),
    ("SYM", ("HYPH", "SYM", "$", "\"", "LS")),
)


def convert_eng(pos_tag):
    """
    Convert a specific English POS tag into its generalized tag.
    Args:
        pos_tag (str): The specific tag produced by an English tagger.
    Returns:
        The generalized tag of the first group that lists pos_tag, or pos_tag itself when no group lists it.
    """
    for general_tag, specific_tags in _ENG_TAG_GROUPS:
        if pos_tag in specific_tags:
            return general_tag
    # Unmapped tags pass through unchanged.
    return pos_tag

In [None]:
# Generalized tag -> the specific Filipino tags that collapse into it.
# Groups are checked in order, mirroring the original branch order.
_FIL_TAG_GROUPS = (
    ("NOUN", ("NNC", "NNCA")),
    ("PROPN", ("NNP", "NNPA")),
    ("PR", ("PRS", "PRP", "PRSP", "PRO", "PRQ", "PRQP", "PRL", "PRC", "PRF", "PRI")),
    ("DT", ("DTC", "DTCP", "DTP", "DTPP")),
    ("LM", ("LM",)),
    ("CONJ", ("CCT", "CCR", "CCB", "CCA")),
    ("CCP", ("CCP",)),
    ("IN", ("CCU",)),
    ("VB", ("VBW", "VBS", "VBH", "VBN", "VBTS", "VBTR", "VBTF", "VBTP",
            "VBAF", "VBOF", "VBOB", "VBOL", "VBOI", "VBRF")),
    ("JJ", ("JJD", "JJC", "JJCC", "JJCS", "JJCN")),
    ("CD", ("JJN", "CDB")),
    ("RB", ("RBD", "RBN", "RBK", "RBP", "RBB", "RBR", "RBQ", "RBT",
            "RBF", "RBW", "RBM", "RBL", "RBI", "RBS")),
    ("UH", ("RBJ",)),
    ("FW", ("FW",)),
    ("PUNC", ("PMP", "PME", "PMQ", "PMC", "PMSC")),
    ("SYM", ("PMS",)),
)


def convert_fil(pos_tag):
    """
    Convert a specific Filipino POS tag into its generalized tag.
    Args:
        pos_tag (str): The specific tag produced by the Filipino tagger.
    Returns:
        The generalized tag of the first group that lists pos_tag, or pos_tag itself when no group lists it.
    """
    for general_tag, specific_tags in _FIL_TAG_GROUPS:
        if pos_tag in specific_tags:
            return general_tag
    # Unmapped tags pass through unchanged.
    return pos_tag

In [None]:
langid = [] # will be used to store language id

def reset_variables(general, specific, token_tagset):
    """Empty the three per-sentence scratch lists in place; the module-level langid list is left untouched."""
    for scratch in (general, specific, token_tagset):
        scratch.clear()

def reset_variables_combi2(gen_eng, spec_eng, gen_fil, spec_fil):
    """Empty the four per-language scratch lists used by combination 2, in place."""
    for scratch in (gen_eng, spec_eng, gen_fil, spec_fil):
        scratch.clear()

In [None]:
# Variables to be used for Flair-FSPOST Tagger.
# These module-level lists are cleared and refilled for each sentence by the tagging functions.

pos_tags_general_ff = [] # will be used to store generalized pos tags
pos_tags_specific_ff = [] # will be used to store specific pos tags
token_tagset_ff = [] # will be used to store the name of the tagset used for specific tags

# Temporary lists to be used for combi 2: whole-sentence English and Filipino tags before one is chosen per token
pos_tags_general_eng_ff = []
pos_tags_specific_eng_ff = []
pos_tags_general_fil_ff = []
pos_tags_specific_fil_ff = []

In [None]:
# Variables to be used for Spacy-FSPOST Tagger.
# These module-level lists are cleared and refilled for each sentence by the tagging functions.

pos_tags_general_sf = [] # will be used to store generalized pos tags
pos_tags_specific_sf = [] # will be used to store specific pos tags
token_tagset_sf = [] # will be used to store the name of the tagset used for specific tags

# Temporary lists to be used for combi 2: whole-sentence English and Filipino tags before one is chosen per token
pos_tags_general_eng_sf = []
pos_tags_specific_eng_sf = []
pos_tags_general_fil_sf = []
pos_tags_specific_fil_sf = []

In [None]:
def append_to_dataframe(input_sentence, general_tags, specific_tags, tagset, tagged_texts):
    """
    Return a new dataframe made of tagged_texts plus one extra row for input_sentence.
    Args:
        input_sentence (str): The sentence text stored in the "text" column.
        general_tags (list): Generalized tags, stored in "general_tags".
        specific_tags (list): Specific tags, stored in "specific_tags".
        tagset (list): Tagset names, stored in "token_tagset".
        tagged_texts (pd.DataFrame): The dataframe to extend; it is not modified.
    Returns:
        pd.DataFrame: the concatenation of tagged_texts and the new row, with a fresh 0..n-1 index.
    """
    # np.array copies each list, so the stored row is unaffected when the caller later clears its lists.
    record = {
        "text": input_sentence,
        "general_tags": np.array(general_tags),
        "specific_tags": np.array(specific_tags),
        "token_tagset": np.array(tagset),
    }
    new_row = pd.DataFrame.from_records([record])
    return pd.concat([tagged_texts, new_row], ignore_index=True)

In [None]:
def join_string(string_list):
    """Return the items of string_list, each converted with str(), joined by single spaces."""
    return ' '.join(map(str, string_list))

In [None]:
#nltk.download('punkt')

In [None]:
def tokenized_text_with_punc(input_text):
    """Tokenize input_text with nltk.word_tokenize and return the resulting token list."""
    return nltk.word_tokenize(input_text)

In [None]:
def get_lang_per_token(text_wo_punc):
    """
    Identify the language of every token so the caller can decide which tagger to use for it.
    Args:
        text_wo_punc (list): The tokens to classify.
    Returns:
        list: one detector.detect_language_of result per token, in token order.
    """
    return [detector.detect_language_of(token) for token in text_wo_punc]

In [None]:
def dataframe_to_csv(dataframes, output_name):
    """
    Write a result dataframe to a CSV file.
    The three tag columns are first converted, in place, so that every cell holds a plain list.
    Args:
        dataframes (pd.DataFrame): The result dataframe; its tag columns are overwritten.
        output_name (str): The path of the CSV file to write.
    """
    for column in ('general_tags', 'specific_tags', 'token_tagset'):
        dataframes[column] = dataframes[column].map(list)
    dataframes.to_csv(output_name)

In [None]:
def isMultipleTags(tag):
    """Return the part of tag before its first underscore, or tag unchanged when it contains none."""
    first_tag, _, _ = tag.partition('_')
    return first_tag

## Language Identification then Monolingual Tagging (COMBI 1)

In [None]:
def lang_id_then_mono_tag(input_string):
    """
    Combination 1: identify the language of each token first, then tag that token alone with the matching tagger.

    English tokens are tagged by both Flair and spaCy; every other token (Tagalog or anything the detector does not
    label as English) is tagged by FSPOST. One row per sentence is appended to the module-level dataframes
    tagged_texts_combi1_ff (Flair-FSPOST) and tagged_texts_combi1_sf (Spacy-FSPOST).
    Args:
        input_string (str): The sentence to tag.
    """
    global tagged_texts_combi1_ff, tagged_texts_combi1_sf

    tokens = tokenized_text_with_punc(input_string)

    # Clear the shared scratch lists left over from the previous sentence.
    reset_variables(pos_tags_general_ff, pos_tags_specific_ff, token_tagset_ff)  # Flair-FSPOST version
    reset_variables(pos_tags_general_sf, pos_tags_specific_sf, token_tagset_sf)  # Spacy-FSPOST version

    token_languages = get_lang_per_token(tokens)

    for word, language in zip(tokens, token_languages):
        if language == Language.ENGLISH:
            # Flair-FSPOST: only the first label of the single-token sentence is used.
            flair_tag = isMultipleTags(eng_tagger_flair(word).get_labels('pos')[0].value)
            pos_tags_general_ff.append(convert_eng(flair_tag))
            pos_tags_specific_ff.append(flair_tag)
            token_tagset_ff.append("Flair")

            # Spacy-FSPOST: only the first spaCy token's fine-grained tag is used.
            spacy_tag = isMultipleTags(eng_tagger_spacy(word)[0].tag_)
            pos_tags_general_sf.append(convert_eng(spacy_tag))
            pos_tags_specific_sf.append(spacy_tag)
            token_tagset_sf.append("Spacy")
        else:
            # Tagalog and unidentified tokens share the Filipino tagger; both pairings get the same tag.
            fil_tag = isMultipleTags(fil_tagger(word)[0][1])
            general_tag = convert_fil(fil_tag)

            pos_tags_general_ff.append(general_tag)
            pos_tags_specific_ff.append(fil_tag)
            token_tagset_ff.append("MGNN")

            pos_tags_general_sf.append(general_tag)
            pos_tags_specific_sf.append(fil_tag)
            token_tagset_sf.append("MGNN")

    tagged_texts_combi1_ff = append_to_dataframe(input_string, pos_tags_general_ff,
                                                 pos_tags_specific_ff, token_tagset_ff,
                                                 tagged_texts_combi1_ff)
    tagged_texts_combi1_sf = append_to_dataframe(input_string, pos_tags_general_sf,
                                                 pos_tags_specific_sf, token_tagset_sf,
                                                 tagged_texts_combi1_sf)

In [None]:
# Run combination 1 over every input sentence.
j = 0  # number of progress messages printed so far
for i in range(len(input_sentence)):
    lang_id_then_mono_tag(input_sentence[i])
    
    # Print a progress message once every 100 sentences (i = 0, 100, 200, ...).
    if i % 100 == 0:
        print("pass ", j)
        j = j + 1
display(tagged_texts_combi1_ff)
display(tagged_texts_combi1_sf)

## Monolingual Tagging then Language Identification (COMBI 2)

In [None]:
def mono_tag_then_lang_id(input_string):
    """
    Combination 2: tag the whole sentence with every monolingual tagger first, then pick one tag per token
    according to that token's identified language.

    English tokens take the Flair / spaCy tag at the same position; every other token takes the FSPOST tag at the
    same position. One row per sentence is appended to the module-level dataframes tagged_texts_combi2_ff
    (Flair-FSPOST) and tagged_texts_combi2_sf (Spacy-FSPOST).

    NOTE(review): positions come from four separate tokenizations (nltk.word_tokenize, Flair, spaCy and a
    whitespace split inside tag_string). Nothing here checks that they yield the same number of tokens, so a
    mismatch can raise IndexError or shift tags — confirm the input guarantees alignment.
    Args:
        input_string (str): The sentence to tag.
    """
    global tagged_texts_combi2_ff, tagged_texts_combi2_sf

    # Clear the scratch lists of the Flair version.
    reset_variables(pos_tags_general_ff, pos_tags_specific_ff, token_tagset_ff)
    reset_variables_combi2(pos_tags_general_eng_ff, pos_tags_specific_eng_ff,
                           pos_tags_general_fil_ff, pos_tags_specific_fil_ff)

    # Clear the scratch lists of the Spacy version.
    reset_variables(pos_tags_general_sf, pos_tags_specific_sf, token_tagset_sf)
    reset_variables_combi2(pos_tags_general_eng_sf, pos_tags_specific_eng_sf,
                           pos_tags_general_fil_sf, pos_tags_specific_fil_sf)

    tokens = tokenized_text_with_punc(input_string)

    # Whole-sentence English tagging with both English taggers.
    flair_sentence = eng_tagger_flair(input_string)
    spacy_doc = eng_tagger_spacy(input_string)

    # English tags, Flair tagger
    for label in flair_sentence.get_labels('pos'):
        flair_tag = isMultipleTags(label.value)
        pos_tags_general_eng_ff.append(convert_eng(flair_tag))
        pos_tags_specific_eng_ff.append(flair_tag)

    # English tags, Spacy tagger
    for spacy_token in spacy_doc:
        spacy_tag = isMultipleTags(spacy_token.tag_)
        pos_tags_general_eng_sf.append(convert_eng(spacy_tag))
        pos_tags_specific_eng_sf.append(spacy_tag)

    # Whole-sentence Filipino tagging; both pairings receive the same FSPOST tags.
    for tagged_pair in fil_tagger(input_string):
        fil_tag = isMultipleTags(tagged_pair[1])
        general_tag = convert_fil(fil_tag)

        pos_tags_general_fil_ff.append(general_tag)
        pos_tags_specific_fil_ff.append(fil_tag)

        pos_tags_general_fil_sf.append(general_tag)
        pos_tags_specific_fil_sf.append(fil_tag)

    # Language identification per token, then pick the tag at the same position.
    token_languages = get_lang_per_token(tokens)

    for position in range(len(tokens)):
        if token_languages[position] == Language.ENGLISH:
            pos_tags_general_ff.append(pos_tags_general_eng_ff[position])
            pos_tags_specific_ff.append(pos_tags_specific_eng_ff[position])
            token_tagset_ff.append("Flair")

            pos_tags_general_sf.append(pos_tags_general_eng_sf[position])
            pos_tags_specific_sf.append(pos_tags_specific_eng_sf[position])
            token_tagset_sf.append("Spacy")
        else:
            # Tagalog and unidentified tokens both fall back to the Filipino tags.
            pos_tags_general_ff.append(pos_tags_general_fil_ff[position])
            pos_tags_specific_ff.append(pos_tags_specific_fil_ff[position])
            token_tagset_ff.append("MGNN")

            pos_tags_general_sf.append(pos_tags_general_fil_sf[position])
            pos_tags_specific_sf.append(pos_tags_specific_fil_sf[position])
            token_tagset_sf.append("MGNN")

    tagged_texts_combi2_ff = append_to_dataframe(input_string, pos_tags_general_ff,
                                                 pos_tags_specific_ff, token_tagset_ff,
                                                 tagged_texts_combi2_ff)
    tagged_texts_combi2_sf = append_to_dataframe(input_string, pos_tags_general_sf,
                                                 pos_tags_specific_sf, token_tagset_sf,
                                                 tagged_texts_combi2_sf)
    

In [None]:
# Run combination 2 over every input sentence.
j = 0  # number of progress messages printed so far
for i in range(len(input_sentence)):
    # Disabled debugging aid: wrap the call to print the sentences that make the tagger fail.
    #try:
    mono_tag_then_lang_id(input_sentence[i])
    
    # Print a progress message once every 100 sentences (i = 0, 100, 200, ...).
    if i % 100 == 0:
        print("pass ", j)
        j = j + 1
            
    #except:
    #print(i, ': ', input_sentence[i])
        
display(tagged_texts_combi2_ff)
display(tagged_texts_combi2_sf)

In [None]:
# Export the four result tables. dataframe_to_csv also converts their tag columns to lists in place.
dataframe_to_csv(tagged_texts_combi1_ff, "Flair-FSPOST-Combination-1.csv")
dataframe_to_csv(tagged_texts_combi2_ff, "Flair-FSPOST-Combination-2.csv")

dataframe_to_csv(tagged_texts_combi1_sf, "Spacy-FSPOST-Combination-1.csv")
dataframe_to_csv(tagged_texts_combi2_sf, "Spacy-FSPOST-Combination-2.csv")

In [None]:
# Show all four result tables: both combinations for Flair-FSPOST, then both for Spacy-FSPOST.
display(tagged_texts_combi1_ff)
display(tagged_texts_combi2_ff)
display(tagged_texts_combi1_sf)
display(tagged_texts_combi2_sf)

In [None]:
# Debug cell: inspect the raw FSPOST output for one input sentence.
#test = tag_string("Noong nakaraang Nobyembre , isang tuta na kakaiba rin ang hitsura ang isinilang naman sa Mati city .")
#test = tag_string("Pumanaw ang 65 taong gulang")
test = tag_string(input_sentence[1])
# Each element is indexed as a (word, tag) pair elsewhere in this notebook and has no 'get_item'
# attribute, so print the pair itself.
print(test[2])
dir(test[0])

In [None]:
# Debug cell: the notebook defines no plain 'token_tagset'; print the tagset lists of both pairings instead.
print(token_tagset_ff)
print(token_tagset_sf)
display(input_sentence[24])

In [None]:
# Debug cell: the notebook defines no plain 'tagged_texts_combi1'; inspect the Flair-FSPOST table.
print(tagged_texts_combi1_ff['general_tags'][0])

# TO BE REMOVED EVERYTHING BELOW

In [None]:
# NOTE(review): stale cell. reset_variables is defined with three parameters in this notebook but receives
# two arguments here, and none of the sample_text_* names are defined in the visible source.
reset_variables(sample_text_pos_tags_general, sample_text_pos_tags_specific)
reset_variables_combi2(sample_text_pos_tags_general_eng, sample_text_pos_tags_specific_eng,
                       sample_text_pos_tags_general_fil, sample_text_pos_tags_specific_fil)

In [None]:
# Flair (English) pos tagging
# NOTE(review): stale cell. eng_tagger and sample_text are not defined in the visible source; the helper
# defined in this notebook is eng_tagger_flair.
token_eng = eng_tagger(sample_text)

for i in range(len(token_eng.get_labels('pos'))):
    sample_text_pos_tags_general_eng.append(convert_eng(token_eng.get_labels('pos')[i].value))
    # NOTE(review): the "specific" list also receives the convert_eng result, not the raw tag.
    sample_text_pos_tags_specific_eng.append(convert_eng(token_eng.get_labels('pos')[i].value))

In [None]:
# FSPOST (Filipino) pos tagging
# Tokenize with NLTK and re-join with spaces so the whitespace split inside the tagger sees the same tokens.
# NOTE(review): stale cell. sample_text is not defined in the visible source.
sentence = nltk.word_tokenize(sample_text)
sentence = join_string(sentence)
token_fil = fil_tagger(sentence)

for i in range(len(token_fil)):
    sample_text_pos_tags_general_fil.append(convert_fil(token_fil[i][1]))
    # NOTE(review): the "specific" list also receives the convert_fil result, not the raw tag.
    sample_text_pos_tags_specific_fil.append(convert_fil(token_fil[i][1]))

In [None]:
# The language identification model cannot identify punctuation or symbols,
# so create a copy of the sentence with every string.punctuation character removed.
# NOTE(review): stale cell. sample_text is not defined in the visible source.
text_without_punc = sample_text.translate(str.maketrans('', '', string.punctuation))

# Tokenize both the punctuation-free copy and the original text
text_without_punc_tokenized = nltk.word_tokenize(text_without_punc)
sample_text_tokenized = nltk.word_tokenize(sample_text)

# If the code above raises a LookupError, run the nltk.download('punkt') cell earlier in the notebook first.

In [None]:
# Identify the language of each token to determine which tagger to use, printing each result.
# NOTE(review): stale cell. sample_text_langid is not defined in the visible source.
for i in range(len(text_without_punc_tokenized)):
    sample_text_langid.append(detector.detect_language_of(text_without_punc_tokenized[i]))
    print(text_without_punc_tokenized[i], ": ", sample_text_langid[i])

In [None]:
# Walk the full token list (i) alongside the punctuation-free token list (j) and pick a tag per token.
# NOTE(review): stale cell. The sample_text_* lists and eng_tagger are not defined in the visible source.
j = 0
for i in range(len(sample_text_tokenized)):
    # Token also exists in the punctuation-free list: choose the tag by its identified language.
    if(sample_text_tokenized[i] == text_without_punc_tokenized[j]):
        if(sample_text_langid[j] == Language.TAGALOG):
            sample_text_pos_tags_general.append(sample_text_pos_tags_general_fil[i])
            sample_text_pos_tags_specific.append(sample_text_pos_tags_specific_fil[i])
            
        elif(sample_text_langid[j] == Language.ENGLISH):
            token = eng_tagger(text_without_punc_tokenized[j])
            sample_text_pos_tags_general.append(convert_eng(token.get_labels('pos')[0].value))
            sample_text_pos_tags_specific.append(token.get_labels('pos')[0].value)
            
        # NOTE(review): a language that is neither TAGALOG nor ENGLISH appends nothing here, yet the
        # print below still indexes position i — confirm this cannot happen.
        j = j + 1
        
        # Wrap around so j never indexes past the end of the punctuation-free list.
        if(j == len(text_without_punc_tokenized)):
            j = 0
    else:
        # Token was dropped with the punctuation: reuse the Filipino tags at the same position.
        # The result of this fil_tagger call is not used afterwards.
        token = fil_tagger(sample_text_tokenized[i])
        sample_text_pos_tags_general.append(sample_text_pos_tags_general_fil[i])
        sample_text_pos_tags_specific.append(sample_text_pos_tags_specific_fil[i])
        
    print(sample_text_tokenized[i], ": General POS Tag -> ", sample_text_pos_tags_general[i], " - Specific POS Tag -> ", sample_text_pos_tags_specific[i])

In [None]:
# NOTE(review): stale cell. append_to_dataframe is defined with five parameters in this notebook but receives
# four arguments here, its return value is discarded, and tagged_texts_combi2 is not defined in the visible source.
print(sample_text_pos_tags_specific)
append_to_dataframe(sample_text, sample_text_pos_tags_general, sample_text_pos_tags_specific, tagged_texts_combi2)

## DO NOT RUN ANYTHING BELOW

In [None]:
# Scratch cell: print the first POS label value Flair assigns to a single word.
# NOTE(review): eng_tagger is not defined in the visible source; the helper defined in this notebook is
# eng_tagger_flair.
#test = Sentence(sample_text)
#tagger.predict(test)

test = eng_tagger("ako")

#print(test.get_labels('pos')[0])

#print(test[0].get_labels('pos').value)

label = test.get_labels('pos')[0].value
print(test.get_labels('pos')[0].value)

#for label in test.get_labels('pos'):
    #print(label.value)
    
#print(test)
#print(test.to_tagged_string())
#for entity in test.get_spans('pos'):
    #print(entity)

In [None]:
# Scratch cell: print the first item of the Filipino tagger's output.
# NOTE(review): sample_text is not defined in the visible source.
#test = tag_string(sample_text)
test = fil_tagger(sample_text)
print(test[0])
#print(dir(test2[0].__getattribute__('pos')))

### Flair Testing (with FW tag)

Import Flair and tagger to use (pos-english)

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger
# Load the English POS model again under the name 'tagger', which the testing cells below use.
tagger = SequenceTagger.load("flair/pos-english")
#tagger = SequenceTagger.load("pos")
#tagger = SequenceTagger.load("pos")

Generate POS tags with infinite loop for testing

In [None]:
# Interactive Flair test loop. Type C or c to cancel loop.
# The typed text is kept in its own variable so it does not overwrite the notebook-level 'input_sentence'
# list that the tagging cells iterate over.
while True:
    input_sentence_flair = input("Enter sample sentence: ")

    if input_sentence_flair == "c" or input_sentence_flair == "C":
        break

    sentence_test = Sentence(input_sentence_flair)
    tagger.predict(sentence_test)

    print("\n")
    print(sentence_test)

    # Print the predicted POS spans (the loaded model is a POS tagger, not an NER model).
    print('The following POS tags are found:')
    for entity in sentence_test.get_spans('pos'):
        print(entity)

    print("\n")

### FSPOST (Go & Nocon, 2017)

Use FSPOST pipeline

In [None]:
import os
import nltk
from nltk.tag.stanford import StanfordPOSTagger

# Model and jar locations, Windows-style (kept for reference, disabled)
#model = 'model//filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
#jar = 'lib//stanford-postagger.jar'

# Model and jar locations, Linux-style; both are relative to the working directory
model = 'model/filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
jar = 'lib/stanford-postagger.jar'

fspost = StanfordPOSTagger(model, path_to_jar=jar)  # Load Tagger Model
fspost._SEPARATOR = '|'  # Set separator for proper tuple formatting (word, tag)

def set_java_path(file_path):
    """
    Record the Java location used by the Stanford POS Tagger in the 'JAVAHOME' environment variable (via 'os').
    Pass "" to fall back to the built-in default location; any other value is stored exactly as given.
    Args:
        file_path (str): The java file path / location, or "" for the default.
    """
    use_default = file_path == ""
    java_path = "C:/Program Files/Java/jdk1.8.0_111/bin/java.exe" if use_default else file_path
    # Report which source the path came from before exporting it.
    print("Java path set by default" if use_default else "Java path set from given")
    os.environ['JAVAHOME'] = java_path

def tag_string(sentence):
    """
    Tag a single sentence/string with the FSPOST tagger.
    The sentence is tokenized on whitespace only, so punctuation attached to a word stays part of that token.
    Args:
        sentence (str): The string to be tagged.
    Returns:
        The result of fspost.tag for the token list; callers in this notebook index it as a list of
        (word, pos) tuples.
    """
    return fspost.tag(sentence.split())

def tag_string_list(sentence_list):
    """
    Tag every sentence in a list with 'tag_string', printing a "done / total" progress line after each one.
    Args:
        sentence_list (list): The list of strings to be tagged.
    Returns:
        tagged_list: one 'tag_string' result per input sentence, in input order.
    """
    tagged_list = []
    for progress_ctr, sentence in enumerate(sentence_list, start=1):
        tagged_list.append(tag_string(sentence))
        print(progress_ctr, "/", len(sentence_list))  # Progress Counter
    return tagged_list

[REQUIRED] Set JDK Path

In [None]:
# WINDOWS (disabled): path to the java executable
# set_java_path("C:/Program Files/Java/jdk-19/bin/java.exe")

# LINUX: path to the JDK's bin directory.
# NOTE(review): this is a directory, not the executable itself — confirm NLTK resolves it on the target machine.
set_java_path("/usr/lib/jvm/java-11-openjdk-amd64/bin/")

In [None]:
# Interactive FSPOST test loop: tag each typed sentence and print the result.
# Type C or c to cancel loop
while True:
    input_sentence_mgnn = input("Enter sample sentence: ")

    if input_sentence_mgnn in ("c", "C"):
        break

    print(tag_string(input_sentence_mgnn))


## Get sample sentence from FilWordNet

Import FilWordNet Corpus

In [None]:
import pandas as pd
import random

# Load the FilWordNet corpus from the working directory.
# The filtering cell reads its 'source_type' and 'text' columns.
filword_corpus = pd.read_csv("processed_corpus_oct_2022.csv")

Generate random string from FilWordNet

In [None]:
# Keep only the sentences whose source type is online_forums, social_media or news_sites
raw_sentences = filword_corpus[filword_corpus.source_type.isin(['online_forums', 'social_media', 'news_sites'])]

# Drop the rows whose text contains a substring XX_...
# The patterns are raw strings: '\w' inside a normal string literal is an invalid escape sequence.
raw_sentences = raw_sentences.loc[~raw_sentences['text'].str.contains(r'XX_\w{1,}')]
# Drop the rows whose text contains a character outside ASCII dec 32-126
sentences = raw_sentences.loc[~raw_sentences['text'].str.contains(r'[^\x20-\x7E]')]

# Reset the index to start at 0. Since some rows were removed from the original data,
# positions and index labels would otherwise no longer match.
sentences = sentences.reset_index(drop=True)

In [None]:

# Pick one random row position and fetch its text.
# This relies on 'sentences' having a 0-based index, which the reset_index call in the filtering cell provides.
randInd = random.randrange(len(sentences))
filword_randtext = sentences.text[randInd]

print(filword_randtext)

In [None]:
# Wrap the random FilWordNet sentence for Flair
sentence = Sentence(filword_randtext)
# Predict POS tags in place (the loaded model is "flair/pos-english", a POS tagger)
tagger.predict(sentence)

# print sentence
print(sentence)

# Print the predicted POS spans
print('The following POS tags are found:')
for entity in sentence.get_spans('pos'):
    print(entity)

## Combined POS Tagger - Combination 1

### Language Identification then Monolingual Tagger

## ENGPOSTs Testing

### spaCy Testing

Import spaCy and model

In [None]:
import spacy
# Load the "en_core_web_sm" pipeline under the name 'spacy_nlp', which print_spacy uses.
spacy_nlp = spacy.load("en_core_web_sm")

Generate POS Tags

In [None]:
def print_spacy(sentence):
    """Print every spaCy token of sentence with its pos_ tag and spaCy's explanation of that tag."""
    for token in spacy_nlp(sentence):
        coarse_tag = token.pos_
        print(token, ": ", coarse_tag, ": ", spacy.explain(coarse_tag))

print_spacy(filword_randtext)

### NLTK Testing (default ENGPOST, with FW tag)

In [None]:
import nltk

# [IMPORTANT] if this is your first time running this Python Notebook, run this:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Tokenize the random FilWordNet sentence and tag it with NLTK's default tagger.
# The last expression is the cell's output; the imported pos_tag name is the same function as nltk.pos_tag.
text = word_tokenize(filword_randtext)
nltk.pos_tag(text)

## FILPOSTs Testing

### LSTM Based Filipino POS Tagger (Cruz, 2020)  ***unfinished***

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as datautils

from tqdm import tqdm

from utils.utils import predict, normalize, produce_vocab, proc_set, init_weights, accuracy
from utils.model import LSTMTagger

import argparse
import os

def main(sentence='Hello', seed=1234, checkpoint='checkpoint'):
    """Tag `sentence` with the LSTM tagger stored in a checkpoint directory.

    Only the prediction path of the original command-line script is kept.
    The argparse setup and the training branch were disabled (they lived in
    string literals, with `parse_args()` commented out), which left `args`
    undefined and made every call fail with NameError. The options that the
    prediction path still needs are now keyword parameters carrying the
    defaults the parser declared, so a bare `main()` keeps working.

    Args:
        sentence: Text passed to `predict`.
        seed: Value passed to `torch.manual_seed`.
        checkpoint: Directory containing `settings.bin` and `model.bin`.

    Returns:
        Whatever `predict` returns for `sentence`; the same value is printed.

    Raises:
        FileNotFoundError: If a checkpoint file cannot be opened.
    """
    torch.manual_seed(seed)

    # Load the vocabularies and the hyperparameters saved alongside the model.
    # NOTE(review): torch.load unpickles its input; only load checkpoints
    # from a trusted source.
    with open(os.path.join(checkpoint, 'settings.bin'), 'rb') as f:
        (word_vocab, word2idx, idx2word, tags_vocab, tag2idx, idx2tag, msl,
         embedding_dim, hidden_dim, dropout, bidirectional, num_layers,
         recur_dropout) = torch.load(f)

    # Produce a blank model with the saved hyperparameters
    model = LSTMTagger(word_vocab_sz=len(word_vocab),
                       tag_vocab_sz=len(tags_vocab),
                       embedding_dim=embedding_dim,
                       hidden_dim=hidden_dim,
                       dropout=dropout,
                       num_layers=num_layers,
                       recur_dropout=recur_dropout,
                       bidirectional=bidirectional)

    # Load the weights and put the model in eval mode. map_location='cpu'
    # lets a checkpoint that was saved on a GPU load on a CPU-only machine;
    # the model is run on the CPU either way.
    with open(os.path.join(checkpoint, 'model.bin'), 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cpu'))
    model = model.cpu()
    model.eval()

    preds = predict(sentence, word2idx, idx2tag, word_vocab, msl, model)
    print(preds)
    return preds

# Run the tagger only when this code is executed as the main module
if __name__ == '__main__':
    main()
