# Testing Different Monolingual Filipino and English Part of Speech (POS) Taggers

### PLEASE TAKE NOTE!!!
- [IMPORTANT] Always refresh kernel, clear outputs, and save before exiting to avoid git conflicts
- Current formatting of this .ipynb is not final, will reformat when testing sample data from FilWordnet

In [None]:
import nltk
import string
import pandas as pd
import numpy as np
from lingua import Language, LanguageDetectorBuilder #used for language identification

Initialize language Identification Model

In [None]:
# Restrict language identification to the two languages handled by the taggers below.
languages = [Language.ENGLISH, Language.TAGALOG]
# Shared detector; get_lang_per_token() calls detector.detect_language_of() on every token.
detector = LanguageDetectorBuilder.from_languages(*languages).build()

Initialize the dataframes that will hold the sentences and their POS tags.

In [None]:
# Column layout shared by every result dataframe: the sentence text plus three
# per-token lists (generalized tag, tagger-specific tag, tagset name).
df_format = {
    column: []
    for column in ("text", "general_tags", "specific_tags", "token_tagset")
}

In [None]:
# One empty result dataframe per strategy (combi1 / combi2) and tagger pairing:
# *_ff is filled from the Flair + FSPOST buffers, *_sf from the spaCy + FSPOST buffers.
tagged_texts_combi1_ff = pd.DataFrame(df_format)
tagged_texts_combi2_ff = pd.DataFrame(df_format)

tagged_texts_combi1_sf = pd.DataFrame(df_format)
tagged_texts_combi2_sf = pd.DataFrame(df_format)

# Show the (still empty) frames to confirm the column layout.
display(tagged_texts_combi1_ff)
display(tagged_texts_combi2_ff)

display(tagged_texts_combi1_sf)
display(tagged_texts_combi2_sf)

## Loading the test data

Let us load the .json input file

In [None]:
# Load the annotated test data. The cells below read a "token" and a "tag" entry
# from every non-empty cell of a row, so each row is treated as one sentence.
input_dataframe = pd.read_json("input_data_validated_cleaned_v2.json")
display(input_dataframe)

In [None]:
# Rebuild every test sentence as one space-joined string of its tokens.
tokens_temp = []    # tokens of the row currently being processed
tags_temp = []      # gold tags of the row currently being processed
input_sentence = []  # one string per row of input_dataframe

for row_idx in range(len(input_dataframe)):
    row = input_dataframe.iloc[row_idx]
    tokens_temp.clear()
    tags_temp.clear()

    # count() is the number of non-missing cells in the row.
    for col_idx in range(row.count()):
        cell = row[col_idx]
        tokens_temp.append(cell["token"])
        tags_temp.append(cell["tag"])

    input_sentence.append(' '.join(str(item) for item in tokens_temp))

## POS TAGGERS

Let us import the monolingual taggers: Flair and spaCy for English POS tagging, and FSPOST for Filipino POS tagging.

In [None]:
# FLAIR POS TAGGER
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the "flair/pos-english" sequence tagger once; eng_tagger_flair() reuses it for every call.
flair_tagger = SequenceTagger.load("flair/pos-english")

In [None]:
# SPACY POS TAGGER
import spacy

# Load the "en_core_web_sm" pipeline once; eng_tagger_spacy() reuses it for every call.
spacy_tagger = spacy.load("en_core_web_sm")

In [None]:
# FSPOST POS TAGGER
import os
from nltk.tag.stanford import StanfordPOSTagger

# These are Windows formatted directories
#model = 'model//filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
#jar = 'lib//stanford-postagger.jar'

# These are Linux formatted directories
# Both paths are relative, so they resolve against the current working directory.
model = 'model/filipino-left5words-owlqn2-distsim-pref6-inf2.tagger'
jar = 'lib/stanford-postagger.jar'

fspost = StanfordPOSTagger(model, path_to_jar=jar)  # Load Tagger Model
# NOTE: _SEPARATOR is a private attribute of the NLTK wrapper; overriding it may break on NLTK upgrades.
fspost._SEPARATOR = '|'  # Set separator for proper tuple formatting (word, tag)

def set_java_path(file_path):
    """
    Point the JAVAHOME environment variable at a Java executable or directory.

    Args:
        file_path (str): Java location to use. Pass "" to fall back to the
            hard-coded default (a Windows JDK 1.8 path).
    """
    use_default = file_path == ""
    java_path = "C:/Program Files/Java/jdk1.8.0_111/bin/java.exe" if use_default else file_path
    print("Java path set by default" if use_default else "Java path set from given")
    os.environ['JAVAHOME'] = java_path

def tag_string(sentence):
    """
    Tag a single sentence with the FSPOST tagger.

    The sentence is split on whitespace only, and the resulting tokens are
    handed to the module-level ``fspost`` tagger.

    Args:
        sentence (str): The string to be tagged.
    Returns:
        The value returned by ``fspost.tag``: a list of (word, pos) tuples,
        one per whitespace-separated token.
    """
    whitespace_tokens = sentence.split()
    return fspost.tag(whitespace_tokens)

def tag_string_list(sentence_list):
    """
    Tag every sentence in a list with ``tag_string`` and print progress.

    Args:
        sentence_list (list): The strings to be tagged.
    Returns:
        list: One ``tag_string`` result per input sentence, in input order.
    """
    tagged_list = []
    for position, sentence in enumerate(sentence_list, start=1):
        tagged_list.append(tag_string(sentence))
        print(position, "/", len(sentence_list))  # progress: done / total
    return tagged_list

In [None]:
# Tell the Stanford tagger wrapper where Java lives by setting JAVAHOME (see set_java_path).
# WINDOWS
# set_java_path("C:/Program Files/Java/jdk-19/bin/java.exe")

# LINUX
# NOTE(review): this is a directory while the Windows example is an executable — confirm both forms are accepted.
set_java_path("/usr/lib/jvm/java-11-openjdk-amd64/bin/")

### Create functions to be used for POS Tagging

In [None]:
# English POS tagging with Flair
def eng_tagger_flair(input_string):
    """Wrap the input in a Flair ``Sentence``, run ``flair_tagger`` on it and return the tagged sentence."""
    tagged_sentence = Sentence(input_string)
    flair_tagger.predict(tagged_sentence)  # annotates the sentence in place
    return tagged_sentence

In [None]:
def eng_tagger_spacy(input_string):
    """Run the spaCy pipeline (``spacy_tagger``) on the input and return the processed document."""
    processed_doc = spacy_tagger(input_string)
    return processed_doc

In [None]:
# Filipino POS tagging with FSPOST
def fil_tagger(input_string):
    """Tag the input with FSPOST via ``tag_string`` and return its list of (word, pos) tuples."""
    tagged_tokens = tag_string(input_string)
    return tagged_tokens

In [None]:
# Generalized tag -> the English tagger tags that collapse into it.
_ENG_TAG_GROUPS = {
    "NOUN": ("NN", "NNS"),
    "PROPN": ("NNP", "NNPS"),
    "PR": ("PRP", "PRP$", "WP", "WP$"),
    "DT": ("DT", "WDT"),
    "CONJ": ("CC",),
    "IN": ("IN",),
    "VB": ("VB",),
    "VBPT": ("VBD", "VBN"),
    "VBPR": ("VBP", "VBZ", "VBG"),
    "JJ": ("JJ", "JJR", "JJS"),
    "CD": ("CD",),
    "RB": ("RB", "RBR", "RBS", "WRB", "RP"),
    "UH": ("UH",),
    "FW": ("FW",),
    "PUNC": (".", ",", ":", "NFP", "(", ")", "''", '""', "``", "`", "-RRB-", "-LRB-"),
    "SYM": ("HYPH", "SYM", "$", "\"", "LS"),
}

# Flattened lookup: specific tag -> generalized tag.
_ENG_GENERAL_TAG = {
    specific: general
    for general, members in _ENG_TAG_GROUPS.items()
    for specific in members
}


def convert_eng(pos_tag):
    """
    Map a specific English POS tag to its generalized tag.

    Args:
        pos_tag (str): Tag produced by an English tagger.
    Returns:
        str: The generalized tag, or ``pos_tag`` unchanged when it has no mapping.
    """
    return _ENG_GENERAL_TAG.get(pos_tag, pos_tag)

In [None]:
# Generalized tag -> the Filipino (FSPOST) tags that collapse into it.
_FIL_TAG_GROUPS = {
    "NOUN": ("NNC", "NNCA"),
    "PROPN": ("NNP", "NNPA"),
    "PR": ("PRS", "PRP", "PRSP", "PRO", "PRQ", "PRQP", "PRL", "PRC", "PRF", "PRI"),
    "DT": ("DTC", "DTCP", "DTP", "DTPP"),
    "LM": ("LM",),
    "CONJ": ("CCT", "CCR", "CCB", "CCA"),
    "CCP": ("CCP",),
    "IN": ("CCU",),
    "VB": ("VBW", "VBS", "VBH", "VBN", "VBAF", "VBOF", "VBOB", "VBOL", "VBOI", "VBRF"),
    "VBPT": ("VBTS", "VBTP"),
    "VBPR": ("VBTR",),
    "VBFT": ("VBTF",),
    "JJ": ("JJD", "JJC", "JJCC", "JJCS", "JJCN"),
    "CD": ("JJN", "CDB"),
    "RB": ("RBD", "RBN", "RBK", "RBP", "RBB", "RBR", "RBQ", "RBT", "RBF", "RBW", "RBM", "RBL",
           "RBI", "RBS"),
    "UH": ("RBJ",),
    "FW": ("FW",),
    "PUNC": ("PMP", "PME", "PMQ", "PMC", "PMSC"),
    "SYM": ("PMS",),
}

# Flattened lookup: specific tag -> generalized tag.
_FIL_GENERAL_TAG = {
    specific: general
    for general, members in _FIL_TAG_GROUPS.items()
    for specific in members
}


def convert_fil(pos_tag):
    """
    Map a specific Filipino POS tag to its generalized tag.

    Args:
        pos_tag (str): Tag produced by the Filipino tagger.
    Returns:
        str: The generalized tag, or ``pos_tag`` unchanged when it has no mapping.
    """
    return _FIL_GENERAL_TAG.get(pos_tag, pos_tag)

In [None]:
langid = []  # module-level placeholder for language ids; reset_variables leaves it untouched


def reset_variables(general, specific, token_tagset):
    """Empty the three per-sentence result buffers in place; returns None."""
    for buffer in (general, specific, token_tagset):
        buffer.clear()

def reset_variables_combi2(gen_eng, spec_eng, gen_fil, spec_fil):
    """Empty the four per-language temporary tag buffers in place; returns None."""
    for buffer in (gen_eng, spec_eng, gen_fil, spec_fil):
        buffer.clear()

In [None]:
# Working buffers for the Flair-FSPOST tagger pair (cleared and refilled per sentence).

# Final per-token results: generalized tags, specific tags, and the name of the
# tagset each specific tag came from.
pos_tags_general_ff, pos_tags_specific_ff, token_tagset_ff = [], [], []

# Temporary per-language tag lists used only by combi 2.
pos_tags_general_eng_ff, pos_tags_specific_eng_ff = [], []
pos_tags_general_fil_ff, pos_tags_specific_fil_ff = [], []

In [None]:
# Working buffers for the Spacy-FSPOST tagger pair (cleared and refilled per sentence).

# Final per-token results: generalized tags, specific tags, and the name of the
# tagset each specific tag came from.
pos_tags_general_sf, pos_tags_specific_sf, token_tagset_sf = [], [], []

# Temporary per-language tag lists used only by combi 2.
pos_tags_general_eng_sf, pos_tags_specific_eng_sf = [], []
pos_tags_general_fil_sf, pos_tags_specific_fil_sf = [], []

In [None]:
def append_to_dataframe(input_sentence, general_tags, specific_tags, tagset, tagged_texts):
    """
    Return a copy of ``tagged_texts`` with one extra row for the given sentence.

    The three tag lists are copied into numpy arrays, so clearing the caller's
    lists afterwards does not affect the stored row.

    Args:
        input_sentence (str): The sentence text.
        general_tags (list): Generalized tag per token.
        specific_tags (list): Tagger-specific tag per token.
        tagset (list): Name of the tagset used per token.
        tagged_texts (pd.DataFrame): Frame to extend; it is not modified.
    Returns:
        pd.DataFrame: New frame with the row appended and the index renumbered.
    """
    record = {
        "text": input_sentence,
        "general_tags": np.array(general_tags),
        "specific_tags": np.array(specific_tags),
        "token_tagset": np.array(tagset),
    }
    new_row = pd.DataFrame.from_records([record])
    return pd.concat([tagged_texts, new_row], ignore_index = True)

In [None]:
def join_string(string_list):
    """Return the items of ``string_list`` converted to ``str`` and joined with single spaces."""
    return ' '.join(map(str, string_list))

In [None]:
#nltk.download('punkt')

In [None]:
def tokenized_text_with_punc(input_text):
    """Return the token list produced by ``nltk.word_tokenize`` for ``input_text``."""
    return nltk.word_tokenize(input_text)

In [None]:
def get_lang_per_token(text_wo_punc):
    """
    Detect the language of every token on its own.

    Args:
        text_wo_punc (list): Tokens of one sentence.
    Returns:
        list: One ``detector.detect_language_of`` result per token, in order;
        callers use it to decide which tagger's output to keep.
    """
    return [detector.detect_language_of(token) for token in text_wo_punc]

In [None]:
def dataframe_to_csv(dataframes, output_name):
    """
    Convert the three tag columns to plain Python lists and write the frame to CSV.

    The tag columns hold numpy arrays (see ``append_to_dataframe``). Calling
    ``list()`` on such an array keeps numpy scalar elements, and with
    NumPy >= 2 their repr (``np.str_('NOUN')``) would be written into the CSV,
    so the cells would no longer be plain Python literals. ``ndarray.tolist()``
    yields native ``str`` elements instead; other iterables are still converted
    with ``list()``.

    Args:
        dataframes (pd.DataFrame): Result frame; its 'general_tags',
            'specific_tags' and 'token_tagset' columns are replaced in place.
        output_name: Path or buffer forwarded to ``DataFrame.to_csv``.
    """
    def to_plain_list(tags):
        # tolist() converts numpy scalars to native Python objects.
        if isinstance(tags, np.ndarray):
            return tags.tolist()
        return list(tags)

    for column in ('general_tags', 'specific_tags', 'token_tagset'):
        dataframes[column] = dataframes[column].map(to_plain_list)
    dataframes.to_csv(output_name)

In [None]:
def isMultipleTags(tag):
    """
    Reduce a compound tag to its first component.

    Args:
        tag (str): A POS tag, possibly several tags joined by '_'.
    Returns:
        str: The part before the first '_', or ``tag`` itself when it has no '_'.
    """
    head, separator, _ = tag.partition('_')
    return head if separator else tag

## Language Identification then Monolingual Tagging (COMBI 1)

In [None]:
def lang_id_then_mono_tag(input_string):
    """
    COMBI 1: identify the language of each token, then tag it with the matching tagger.

    The sentence is tokenized with ``tokenized_text_with_punc``. Every token whose
    detected language is English is tagged on its own by both Flair and spaCy;
    every other token (Tagalog or undetermined) is tagged on its own by FSPOST.
    Compound tags are reduced to their first part with ``isMultipleTags``.

    Side effects:
        Clears and refills the module-level ``*_ff`` and ``*_sf`` result buffers,
        and rebinds ``tagged_texts_combi1_ff`` / ``tagged_texts_combi1_sf`` to
        frames extended with one row for this sentence.

    Args:
        input_string (str): The sentence to tag.
    """
    global tagged_texts_combi1_ff, tagged_texts_combi1_sf

    tokens = tokenized_text_with_punc(input_string)

    # Start from empty result buffers for both tagger pairings.
    reset_variables(pos_tags_general_ff, pos_tags_specific_ff, token_tagset_ff)  # Flair-FSPOST
    reset_variables(pos_tags_general_sf, pos_tags_specific_sf, token_tagset_sf)  # Spacy-FSPOST

    languages_per_token = get_lang_per_token(tokens)

    for token_text, language in zip(tokens, languages_per_token):
        if language == Language.ENGLISH:
            # Flair-FSPOST: keep the tag Flair assigns to the first token it sees.
            flair_tag = isMultipleTags(eng_tagger_flair(token_text).get_labels('pos')[0].value)
            pos_tags_general_ff.append(convert_eng(flair_tag))
            pos_tags_specific_ff.append(flair_tag)
            token_tagset_ff.append("Flair")

            # Spacy-FSPOST: keep the tag of the first spaCy token.
            spacy_tag = isMultipleTags(eng_tagger_spacy(token_text)[0].tag_)
            pos_tags_general_sf.append(convert_eng(spacy_tag))
            pos_tags_specific_sf.append(spacy_tag)
            token_tagset_sf.append("Spacy")
        else:
            # Tagalog and undetermined tokens both go to FSPOST; the same tag is
            # recorded for the Flair-FSPOST and Spacy-FSPOST results.
            fil_tag = isMultipleTags(fil_tagger(token_text)[0][1])
            general_fil_tag = convert_fil(fil_tag)

            pos_tags_general_ff.append(general_fil_tag)
            pos_tags_specific_ff.append(fil_tag)
            token_tagset_ff.append("MGNN")

            pos_tags_general_sf.append(general_fil_tag)
            pos_tags_specific_sf.append(fil_tag)
            token_tagset_sf.append("MGNN")

    tagged_texts_combi1_ff = append_to_dataframe(input_string, pos_tags_general_ff,
                                                 pos_tags_specific_ff, token_tagset_ff,
                                                 tagged_texts_combi1_ff)
    tagged_texts_combi1_sf = append_to_dataframe(input_string, pos_tags_general_sf,
                                                 pos_tags_specific_sf, token_tagset_sf,
                                                 tagged_texts_combi1_sf)

In [None]:
# Run COMBI 1 over every test sentence, reporting progress every 100 sentences.
j = 0  # number of progress messages printed so far
for i, sentence in enumerate(input_sentence):
    lang_id_then_mono_tag(sentence)

    if i % 100 != 0:
        continue
    print("pass ", j)
    j += 1

display(tagged_texts_combi1_ff)
display(tagged_texts_combi1_sf)

## Monolingual Tagging then Language Identification (COMBI 2)

In [None]:
def mono_tag_then_lang_id(input_string):
    """
    COMBI 2: tag the whole sentence with every tagger, then pick a tag per token by language.

    The sentence is tokenized once with ``tokenized_text_with_punc`` and that same
    token sequence is given to Flair, spaCy and FSPOST. Previously each tagger
    tokenized the raw string itself, so their tag lists could have different
    lengths than the NLTK token list used for language identification; indexing
    them by token position then raised ``IndexError`` or silently picked the tag
    of a different token.

    For each token, the English tag (Flair / spaCy) is kept when the token is
    detected as English; otherwise (Tagalog or undetermined) the FSPOST tag is kept.

    Side effects:
        Clears and refills the module-level ``*_ff`` and ``*_sf`` buffers
        (final and per-language), and rebinds ``tagged_texts_combi2_ff`` /
        ``tagged_texts_combi2_sf`` to frames extended with one row for this sentence.

    Args:
        input_string (str): The sentence to tag.
    Raises:
        ValueError: If a tagger returns a different number of tags than there
            are tokens, since the tags could not be aligned with the tokens.
    """
    global tagged_texts_combi2_ff, tagged_texts_combi2_sf

    # Local import so this function does not rely on a new top-of-file import.
    from spacy.tokens import Doc

    # Reset temp variables, Flair version
    reset_variables(pos_tags_general_ff, pos_tags_specific_ff, token_tagset_ff)
    reset_variables_combi2(pos_tags_general_eng_ff, pos_tags_specific_eng_ff,
                           pos_tags_general_fil_ff, pos_tags_specific_fil_ff)

    # Reset temp variables, Spacy version
    reset_variables(pos_tags_general_sf, pos_tags_specific_sf, token_tagset_sf)
    reset_variables_combi2(pos_tags_general_eng_sf, pos_tags_specific_eng_sf,
                           pos_tags_general_fil_sf, pos_tags_specific_fil_sf)

    # Single tokenization shared by language identification and all taggers.
    input_text_tokenized = tokenized_text_with_punc(input_string)
    token_count = len(input_text_tokenized)

    # Flair (English): a list of strings is treated as a pre-tokenized sentence.
    token_eng_flair = eng_tagger_flair(input_text_tokenized)
    for label in token_eng_flair.get_labels('pos'):
        new_token = isMultipleTags(label.value)
        pos_tags_general_eng_ff.append(convert_eng(new_token))
        pos_tags_specific_eng_ff.append(new_token)

    # Spacy (English): build the Doc from the shared tokens so spaCy's own
    # tokenizer is skipped. Every token but the last is followed by a space.
    spaces = [True] * token_count
    if spaces:
        spaces[-1] = False
    token_eng_spacy = eng_tagger_spacy(Doc(spacy_tagger.vocab, words=input_text_tokenized,
                                           spaces=spaces))
    for spacy_token in token_eng_spacy:
        new_token = isMultipleTags(spacy_token.tag_)
        pos_tags_general_eng_sf.append(convert_eng(new_token))
        pos_tags_specific_eng_sf.append(new_token)

    # FSPOST (Filipino): tag the shared tokens directly instead of re-splitting
    # the raw string on whitespace.
    token_fil = fspost.tag(input_text_tokenized)
    for _, fil_tag in token_fil:
        new_token = isMultipleTags(fil_tag)
        general_fil_tag = convert_fil(new_token)

        # Same Filipino tags for the Flair-FSPOST and Spacy-FSPOST results.
        pos_tags_general_fil_ff.append(general_fil_tag)
        pos_tags_specific_fil_ff.append(new_token)
        pos_tags_general_fil_sf.append(general_fil_tag)
        pos_tags_specific_fil_sf.append(new_token)

    # Fail loudly rather than misattribute tags if a tagger is out of step.
    for tagger_name, tagger_tags in (("Flair", pos_tags_specific_eng_ff),
                                     ("Spacy", pos_tags_specific_eng_sf),
                                     ("FSPOST", pos_tags_specific_fil_ff)):
        if len(tagger_tags) != token_count:
            raise ValueError("%s returned %d tags for %d tokens in sentence: %r"
                             % (tagger_name, len(tagger_tags), token_count, input_string))

    # Get languages per token ( Language Identification )
    langid = get_lang_per_token(input_text_tokenized)

    for i, language in enumerate(langid):
        if language == Language.ENGLISH:
            pos_tags_general_ff.append(pos_tags_general_eng_ff[i])
            pos_tags_specific_ff.append(pos_tags_specific_eng_ff[i])
            token_tagset_ff.append("Flair")

            pos_tags_general_sf.append(pos_tags_general_eng_sf[i])
            pos_tags_specific_sf.append(pos_tags_specific_eng_sf[i])
            token_tagset_sf.append("Spacy")
        else:
            # Tagalog and undetermined tokens both take the FSPOST tag.
            pos_tags_general_ff.append(pos_tags_general_fil_ff[i])
            pos_tags_specific_ff.append(pos_tags_specific_fil_ff[i])
            token_tagset_ff.append("MGNN")

            pos_tags_general_sf.append(pos_tags_general_fil_sf[i])
            pos_tags_specific_sf.append(pos_tags_specific_fil_sf[i])
            token_tagset_sf.append("MGNN")

    tagged_texts_combi2_ff = append_to_dataframe(input_string, pos_tags_general_ff,
                                                 pos_tags_specific_ff, token_tagset_ff,
                                                 tagged_texts_combi2_ff)
    tagged_texts_combi2_sf = append_to_dataframe(input_string, pos_tags_general_sf,
                                                 pos_tags_specific_sf, token_tagset_sf,
                                                 tagged_texts_combi2_sf)

    

In [None]:
# Run COMBI 2 over every test sentence, reporting progress every 100 sentences.
# A try/except that printed the index and text of a failing sentence was used
# here while debugging; it is disabled, so any error stops the run.
j = 0  # number of progress messages printed so far
for i, sentence in enumerate(input_sentence):
    mono_tag_then_lang_id(sentence)

    if i % 100 != 0:
        continue
    print("pass ", j)
    j += 1

display(tagged_texts_combi2_ff)
display(tagged_texts_combi2_sf)

In [None]:
# Write one CSV per tagger pairing and strategy; the evaluation section reads these files back.
dataframe_to_csv(tagged_texts_combi1_ff, "Flair-FSPOST-Combination-1.csv")
dataframe_to_csv(tagged_texts_combi2_ff, "Flair-FSPOST-Combination-2.csv")

dataframe_to_csv(tagged_texts_combi1_sf, "Spacy-FSPOST-Combination-1.csv")
dataframe_to_csv(tagged_texts_combi2_sf, "Spacy-FSPOST-Combination-2.csv")

In [None]:
# Show the frames after export; dataframe_to_csv has replaced their tag columns with lists.
display(tagged_texts_combi1_ff)
display(tagged_texts_combi2_ff)
display(tagged_texts_combi1_sf)
display(tagged_texts_combi2_sf)

## Results evaluation

Read the output CSV files of each combination.

In [None]:
# Reload the exported results. The tag columns come back as strings and are
# turned into lists again by fixed_columns() below.
ff_combi1_output = pd.read_csv("Flair-FSPOST-Combination-1.csv")
ff_combi2_output = pd.read_csv("Flair-FSPOST-Combination-2.csv")
sf_combi1_output = pd.read_csv("Spacy-FSPOST-Combination-1.csv")
sf_combi2_output = pd.read_csv("Spacy-FSPOST-Combination-2.csv")

### Declaring functions to be used for results evaluation

In [None]:
def fixed_columns(output):
    """
    Turn the stringified tag columns of a reloaded CSV back into Python objects.

    Args:
        output (pd.DataFrame): Frame whose 'general_tags', 'specific_tags' and
            'token_tagset' cells are strings holding Python expressions.
    Returns:
        pd.DataFrame: The same frame object, with the three columns replaced in place.
    """
    # NOTE(review): eval executes whatever the CSV cell contains; only use this on
    # files produced by this notebook. ast.literal_eval would be safer, but it
    # rejects cells that contain numpy scalar reprs such as np.str_('NOUN').
    for column in ('general_tags', 'specific_tags', 'token_tagset'):
        output[column] = output[column].apply(eval)

    return output

In [None]:
def to_1D(series):
    """Flatten an iterable of iterables into a single ``pd.Series``, preserving order."""
    flattened = []
    for inner in series:
        flattened.extend(inner)
    return pd.Series(flattened)

### Taking the count of each POS tag per combination

In [None]:
# Convert the stringified tag columns of every reloaded result back into lists.
ff_combi1_output = fixed_columns(ff_combi1_output)
ff_combi2_output = fixed_columns(ff_combi2_output)
sf_combi1_output = fixed_columns(sf_combi1_output)
sf_combi2_output = fixed_columns(sf_combi2_output)

In [None]:
# For each result set: frequency of every generalized tag over all tokens,
# and the total number of tagged tokens (the sum of those frequencies).
ff_combi1_tag_counts = to_1D(ff_combi1_output['general_tags']).value_counts()
ff_combi1_total = ff_combi1_tag_counts.sum()

ff_combi2_tag_counts = to_1D(ff_combi2_output['general_tags']).value_counts()
ff_combi2_total = ff_combi2_tag_counts.sum()

sf_combi1_tag_counts = to_1D(sf_combi1_output['general_tags']).value_counts()
sf_combi1_total = sf_combi1_tag_counts.sum()

sf_combi2_tag_counts = to_1D(sf_combi2_output['general_tags']).value_counts()
sf_combi2_total = sf_combi2_tag_counts.sum()

In [None]:
def print_tag_counts(tag_counts, total, tagger):
    """Print the per-tag counts, then a summary line with the tagger name and token total."""
    print(tag_counts)
    summary_parts = (tagger, " total tokens: ", total, "\n")
    print(*summary_parts)

In [None]:
# Report the tag distribution and token total of every combination.
print_tag_counts(ff_combi1_tag_counts, ff_combi1_total, "Flair-FSPOST Combi1")
print_tag_counts(ff_combi2_tag_counts, ff_combi2_total, "Flair-FSPOST Combi2")
print_tag_counts(sf_combi1_tag_counts, sf_combi1_total, "Spacy-FSPOST Combi1")
print_tag_counts(sf_combi2_tag_counts, sf_combi2_total, "Spacy-FSPOST Combi2")

### Taking the overall accuracy of each combination

Let us first read the correct tags from our input data

In [None]:
# Collect the gold tags of every test sentence as one numpy array per row.
pos_temp = []   # gold tags of the row currently being processed
tags_list = []  # one array of gold tags per row of input_dataframe

for row_idx in range(len(input_dataframe)):
    row = input_dataframe.iloc[row_idx]
    pos_temp.clear()

    # count() is the number of non-missing cells in the row.
    for col_idx in range(row.count()):
        pos_temp.append(row[col_idx]["tag"])

    # np.array copies the values, so clearing pos_temp later is safe.
    tags_list.append(np.array(pos_temp))

Let us print the number of tags present in the test data.

In [None]:
# Frequency of every gold tag in the test data, and the total number of gold tokens.
test_data_counts = to_1D(tags_list).value_counts()
test_data_counts_total = test_data_counts.sum()

print_tag_counts(test_data_counts, test_data_counts_total, "Test data counts")

Almost all POS tags in our tagset are present in our test data; only ADD, PDT and AFX are missing.

In [None]:
def get_overall_accuracy(output, gold_tags=None):
    """
    Compute the per-sentence tagging accuracy of one result set.

    A gold 'VB' counts as correct when the predicted generalized tag is any of
    'VB', 'VBPT', 'VBPR' or 'VBFT'; every other gold tag must match exactly.
    Predictions are read as ``output['general_tags'][i][j]`` for gold token
    ``j`` of sentence ``i``.

    Args:
        output: Mapping or dataframe with a 'general_tags' entry holding one
            sequence of predicted tags per sentence.
        gold_tags: One sequence of gold tags per sentence. Defaults to the
            module-level ``tags_list``.
    Returns:
        list: Fraction of correctly tagged tokens for each sentence that has at
        least one gold tag. Sentences without gold tags are skipped, since their
        accuracy is undefined (they used to raise ZeroDivisionError).
    Raises:
        IndexError: If a sentence has fewer predicted tags than gold tags.
    """
    if gold_tags is None:
        gold_tags = tags_list

    verb_tags = ('VB', 'VBPT', 'VBPR', 'VBFT')
    predicted_rows = output['general_tags']

    accuracy = []
    for i, gold_row in enumerate(gold_tags):
        if len(gold_row) == 0:
            continue

        predicted_row = predicted_rows[i]
        counter = 0
        for j, gold in enumerate(gold_row):
            predicted = predicted_row[j]
            if gold == 'VB':
                if predicted in verb_tags:
                    counter += 1
            elif gold == predicted:
                counter += 1

        accuracy.append(counter / len(gold_row))

    return accuracy

In [None]:
def print_overall_accuracy(output, tagger):
    """Print the mean of the per-sentence accuracies from ``get_overall_accuracy`` for ``tagger``."""
    per_sentence = get_overall_accuracy(output)
    mean_accuracy = sum(per_sentence) / len(per_sentence)
    print(tagger, ' accuracy: %f' % mean_accuracy)

In [None]:
# Mean per-sentence accuracy of every combination.
print_overall_accuracy(ff_combi1_output, "Flair-FSPOST Combi1")
print_overall_accuracy(sf_combi1_output, "Spacy-FSPOST Combi1")
print_overall_accuracy(ff_combi2_output, "Flair-FSPOST Combi2")
print_overall_accuracy(sf_combi2_output, "Spacy-FSPOST Combi2")

### Taking the accuracy of each POS tag

In [None]:
# Gold tags for which a per-tag accuracy is reported. Only 'VB' is listed for verbs:
# get_accuracy_per_tags accepts any of VB / VBPT / VBPR / VBFT as a match for it.
possible_tags = ['NOUN', 'PROPN', 'PR', 'DT', 'LM', 'CONJ', 'CCP', 'IN', 'VB', 'JJ', 'CD', 'RB', 'UH',
                 'TS', 'FW', 'PUNC', 'SYM', 'EX', 'TO', 'ADD', 'POS', 'PDT', 'XX', 'MD', 'AFX']

In [None]:
def get_accuracy_per_tags(output, tag, gold_tags=None):
    """
    Compute the accuracy of one result set on the tokens whose gold tag is ``tag``.

    For ``tag == 'VB'`` a prediction counts as correct when it is any of
    'VB', 'VBPT', 'VBPR' or 'VBFT'; for every other tag it must match exactly.
    Predictions are read as ``output['general_tags'][i][j]`` for gold token
    ``j`` of sentence ``i``.

    Args:
        output: Mapping or dataframe with a 'general_tags' entry holding one
            sequence of predicted tags per sentence.
        tag (str): The gold tag to evaluate.
        gold_tags: One sequence of gold tags per sentence. Defaults to the
            module-level ``tags_list``.
    Returns:
        float or None: Percentage (0-100) of correctly tagged tokens, or None
        when ``tag`` never occurs in the gold tags.
    Raises:
        IndexError: If a sentence has no prediction at the position of a
            matching gold token.
    """
    if gold_tags is None:
        gold_tags = tags_list

    # Tags that count as a correct prediction for this gold tag.
    accepted = ('VB', 'VBPT', 'VBPR', 'VBFT') if tag == 'VB' else (tag,)
    predicted_rows = output['general_tags']

    counter_right = 0
    counter_total = 0
    for i, gold_row in enumerate(gold_tags):
        for j, gold in enumerate(gold_row):
            if gold != tag:
                continue
            counter_total += 1
            if predicted_rows[i][j] in accepted:
                counter_right += 1

    if counter_total == 0:
        return None
    return counter_right / counter_total * 100

In [None]:
def print_accuracy_per_tags(output, tagger):
    """
    Print the accuracy of one result set for every tag in ``possible_tags``.

    Tags for which ``get_accuracy_per_tags`` returns None (the tag does not
    occur in the gold data) are skipped.

    Args:
        output: Result set passed through to ``get_accuracy_per_tags``.
        tagger (str): Label printed as the header of the report.
    """
    print("Tagger: ", tagger)
    for tag in possible_tags:
        accuracy = get_accuracy_per_tags(output, tag)
        # Identity test: None means "no gold tokens", while 0.0 is a valid accuracy.
        if accuracy is not None:
            print('POS Tag: ', tag, ' accuracy: %f' % accuracy)

    print("\n")

In [None]:
# Per-tag accuracy report for every combination.
print_accuracy_per_tags(ff_combi1_output, "Flair-FSPOST Combi1")
print_accuracy_per_tags(sf_combi1_output, "Spacy-FSPOST Combi1")
print_accuracy_per_tags(ff_combi2_output, "Flair-FSPOST Combi2")
print_accuracy_per_tags(sf_combi2_output, "Spacy-FSPOST Combi2")

### Generating confusion matrix

In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

Taking the actual and predicted values

In [None]:
def get_tags(tag_list, tag):
    """
    Flatten per-sentence tags into one list of booleans marking where ``tag`` occurs.

    Args:
        tag_list: Indexable collection with one sequence of tags per sentence.
        tag (str): The tag to look for.
    Returns:
        list: One ``True``/``False`` per token, in sentence order then token order.
    """
    flags = []
    for row_idx in range(len(tag_list)):
        row = tag_list[row_idx]
        # bool() keeps the entries plain Python booleans even for numpy comparisons.
        flags.extend(bool(row[col_idx] == tag) for col_idx in range(len(row)))
    return flags

In [None]:
def generate_confu_matrix(combi, top_tags):
    """
    Plot a one-vs-rest confusion matrix and print metrics for each tag in ``top_tags``.

    For every tag, the gold tags (module-level ``tags_list``) and the predicted
    tags (``combi['general_tags']``) are reduced to booleans with ``get_tags``
    ("is this token tagged with the tag?"), and the resulting binary problem is
    plotted and scored.

    Args:
        combi: Result set with a 'general_tags' entry holding one sequence of
            predicted tags per sentence.
        top_tags (list): Tags to evaluate, one plot and one metrics block each.
    """
    for tag in top_tags:
        actual_tags = get_tags(tags_list, tag)
        predicted_tags = get_tags(combi['general_tags'], tag)

        # Say which tag the following plot and metrics belong to.
        print("Tag: ", tag)

        # Pin the label order so the matrix is always 2x2 and matches display_labels,
        # even when only one of the two classes occurs in the data.
        confusion_matrix = metrics.confusion_matrix(actual_tags, predicted_tags,
                                                    labels = [False, True])

        cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix,
                                                    display_labels = [False, True])

        cm_display.plot()
        plt.show()

        accuracy = metrics.accuracy_score(actual_tags, predicted_tags)
        precision = metrics.precision_score(actual_tags, predicted_tags)
        # Specificity is the recall of the negative class.
        specificity = metrics.recall_score(actual_tags, predicted_tags, pos_label = 0)
        print("Accuracy: ", accuracy)
        print("Precision: ", precision)
        print("Specificity: ", specificity)

In [None]:
# Tags to draw confusion matrices for, one list per combination
# (currently the same two tags for all four).
ff_combi1_top_tags = ['CCP', 'NOUN']
sf_combi1_top_tags = ['CCP', 'NOUN']
ff_combi2_top_tags = ['CCP', 'NOUN']
sf_combi2_top_tags = ['CCP', 'NOUN']

# Confusion matrices for Flair-FSPOST, combination 1.
generate_confu_matrix(ff_combi1_output, ff_combi1_top_tags)

In [None]:
# Confusion matrices for Spacy-FSPOST, combination 1.
generate_confu_matrix(sf_combi1_output, sf_combi1_top_tags)

In [None]:
# Confusion matrices for Flair-FSPOST, combination 2.
generate_confu_matrix(ff_combi2_output, ff_combi2_top_tags)

In [None]:
# Confusion matrices for Spacy-FSPOST, combination 2.
generate_confu_matrix(sf_combi2_output, sf_combi2_top_tags)