### Next Steps
- Begin evaluating results
    - Inference time
- Compare to Google Translate performance and language models' performance


### Notes
- `True` `False` indicates that the model was able to generate the right word but did not choose it as a top candidate. This means either:
    - The candidate is not in the dictionary - add this to limitations
    - The top k function is not good - improve using the probabilities idea
- `False` `True` or `False` `False` indicates that the model is not able to generate the right word, and is using LD to find it in the vocab
    - Check why the model isn't picking up on the right words
    - Maybe need to use a different target size?


In [1]:
import pandas as pd
from utils import *
import ast

# Prefix prepended to the "data/..." CSV paths when the datasets are loaded.
# An empty string leaves those paths relative to the current working directory.
SOURCE_PATH = ""

In [2]:
# Load train/test datasets.
# The CSVs are read without a header row, so columns are addressed by position:
# column 0 is the input word, column 1 is the expected (target) word.
df      = pd.read_csv(SOURCE_PATH + "data/train_words.csv", header=None)
df_test = pd.read_csv(SOURCE_PATH + "data/test_words.csv", header=None)

# Load vocab.
# Each line of the stemmer output is a Python literal with a 'word' entry.
# The file is read inside a `with` block so the handle is always closed.
# NOTE(review): this path does not use SOURCE_PATH, unlike the CSV paths above;
# confirm whether that is intentional before changing SOURCE_PATH.
with open("TagalogStemmerPython/output/with_info.txt", "r", encoding='latin1') as vocab_file:
    vocab_tl = {
        ast.literal_eval(line.strip('\n'))['word']
        for line in vocab_file
    }

# Extend the vocab with the target words from both dataframes so that every
# expected answer is present in the vocabulary.
vocab_tl.update(df[1])       # Add in vocab from dataframe
vocab_tl.update(df_test[1])  # Add in vocab from test dataframe

In [3]:
# Generate rules: collect rules from every (input word, target word) training
# pair and merge them into one dictionary.
# NOTE(review): the literal 2 presumably matches the max_sub=2 used at
# generation time; confirm against utils.collect_rules.
og_dict = {}
for src_word, tgt_word in zip(df[0], df[1]):
    pair_rules = collect_rules(src_word, tgt_word, 2)
    og_dict = collate_dict(og_dict, pair_rules)

# Append each key to the end of its own list of values.
for key, values in og_dict.items():
    og_dict[key] = values + [key]

# Rank rules by frequency
og_dict = collate_max(og_dict, 'test')

In [5]:
# Run generation over the test inputs with use_dld switched on, then score the
# output against the expected words in column 1.
# NOTE(review): "dld" presumably stands for Damerau-Levenshtein distance;
# confirm in utils.generate.
result_lst = generate(
    word_lst=df_test[0],
    rule_dict=og_dict,
    vocab=vocab_tl,
    use_dld=True,
    max_sub=2,
)
# Last expression of the cell, so the metrics dict is displayed as its output.
evaluate(result_lst, df_test[1])

Average Time: 2.488020887374878


{'best_dl': 0.45,
 'max_dl': 4.8,
 'avg_dl': 2.9948333333333337,
 'acc_1': 0.7,
 'acc_3': 0.78,
 'acc_5': 0.86,
 'target_in_candidate': 0.86}

In [4]:
# Run generation over the test inputs with use_dld switched off, then score the
# output against the expected words in column 1.
# This rebinds result_lst, so any later cell reading result_lst sees the
# output of whichever generate(...) cell was executed most recently.
result_lst = generate(
    word_lst=df_test[0],
    rule_dict=og_dict,
    vocab=vocab_tl,
    use_dld=False,
    max_sub=2,
)
# Last expression of the cell, so the metrics dict is displayed as its output.
evaluate(result_lst, df_test[1])

Average Time: 2.4510830187797548


{'best_dl': 1.22,
 'max_dl': 5.29,
 'avg_dl': 3.504833333333332,
 'acc_1': 0.17,
 'acc_3': 0.39,
 'acc_5': 0.58,
 'target_in_candidate': 0.58}

In [6]:
# List every test item whose expected word is missing from its result entry.
# result_lst is whatever the most recently executed generate(...) cell
# produced, so the listing depends on which of those cells ran last.
for word, target, candidates in zip(df_test[0], df_test[1], result_lst):
    if target in candidates:
        continue
    print(word, target, candidates)

2mama tumama ['tomo', 'kamo', 'to', 'o', 'wa']
2naw tunaw ['ong', 'wang', 'kaya', 'kang', 'kain']
anjan andiyan ['yang', 'gaya', 'gang', 'gana', 'gaan']
cntro sentro ['siya', 'sina', 'si']
dng ding ['di nin', 'di nga', 'dinig', 'dinaig', 'di ng']
engat ingat ['ng nina', 'ng nin', 'ng ng', 'ng nga', 'nya']
gnagabi ginagabi ['ginawa', 'gaya', 'gang', 'gaan', 'ging']
hnggang hanggang ['hininga nin', 'hininga nga', 'hangga nin', 'hangga nga', 'hininga ng']
kb ka ba ['kaba', 'ka']
kbabayan kababayan ['ka ba bayang', 'ka ba bayaan', 'kaba bayang', 'kaba bayaan', 'ka babae']
kmsta kamusta ['kamo', 'kama', 'ka']
lanq lang ['anya ng', 'anyaya', 'laya ng', 'anyang', 'anya']
manaliq manalig ['mga nadaig', 'mga na', 'mga', 'ma nadaig', 'maya']
mappnta mapupunta ['mapa panata', 'mapa pinya', 'mapa pinta', 'mo', 'mapa']
mbahala mabahala ['mabahaan', 'mo bahala', 'mo ba', 'mo baha', 'ma bahala']
mgbabaha magbabaha ['mga babae', 'maga babae', 'mag babae', 'mga ba ba', 'mga ba baha']
mglaho mglaho ['mi

In [10]:
# For each test pair, regenerate the raw candidates and the top-k selection
# separately, and print the cases where the target is not among the selection.
# NOTE(review): the positional 5 and False passed to choose_top_k are
# presumably k and the use_dld flag; confirm against utils.choose_top_k.
for word, target in zip(df_test[0], df_test[1]):
    candidates = generate_candidates(word, og_dict, 2)
    results = choose_top_k(candidates, word, vocab_tl, 5, False)
    if target in results:
        continue
    print(word, candidates, results)

['on']