In [7]:
import pandas as pd
import numpy as np
from utils import *
import ast

# Prefix prepended to the train/test CSV paths when they are loaded; the empty
# string makes those paths relative to the current working directory.
SOURCE_PATH = ""

In [8]:
# Load train/test datasets. The CSVs have no header row, so columns are
# addressed by position: the rest of the notebook reads column 0 as the input
# word and column 1 as its target form.
df      = pd.read_csv(SOURCE_PATH + "data/train_words.csv", header=None)
df_test = pd.read_csv(SOURCE_PATH + "data/test_words.csv", header=None)

# Load vocab. A context manager is used so the file handle is closed as soon
# as the lines have been read.
with open("TagalogStemmerPython/output/with_info.txt", "r", encoding='latin1') as vocab_file:
    f = vocab_file.readlines()

# Each line is parsed as a Python literal and its 'word' entry is kept.
vocab_tl = set(ast.literal_eval(item.strip('\n'))['word'] for item in f)
vocab_tl = set(df[1]).union(vocab_tl) # Add in vocab from dataframe
vocab_tl = set(df_test[1]).union(vocab_tl) # Add in vocab from test dataframe

In [3]:
# Build the rule dictionary by folding in the rules collected from every
# training pair (column 0 -> column 1).
og_dict = {}
for i in range(len(df)):
    pair_rules = collect_rules(df[0][i], df[1][i], 2)
    og_dict = collate_dict(og_dict, pair_rules)

# Append each key to the end of its own value list.
for key, val in og_dict.items():
    og_dict[key] = val + [key]

# Rank rules by frequency
og_dict = collate_max(og_dict, 'test')

In [4]:
# Produce a candidate list for every test word with use_dld enabled, then
# score the candidates against the targets in column 1.
result_lst = generate(
    word_lst=df_test[0],
    rule_dict=og_dict,
    vocab=vocab_tl,
    use_dld=True,
    max_sub=2,
)
evaluate(result_lst, df_test[1])

Average Time: 2.5701652789115905


{'best_dl': 0.46,
 'max_dl': 4.73,
 'avg_dl': 2.9108333333333345,
 'acc_1': 0.77,
 'acc_3': 0.82,
 'acc_5': 0.85,
 'target_in_candidate': 0.85}

In [4]:
# Same run as the use_dld=True configuration, but with use_dld disabled, for
# comparison.
# NOTE(review): this rebinds result_lst, so when cells run top to bottom the
# error analysis that follows inspects these use_dld=False results - confirm
# that is the intended model.
result_lst = generate(
    word_lst=df_test[0],
    rule_dict=og_dict,
    vocab=vocab_tl,
    use_dld=False,
    max_sub=2,
)
evaluate(result_lst, df_test[1])

Average Time: 2.4510830187797548


{'best_dl': 1.22,
 'max_dl': 5.29,
 'avg_dl': 3.504833333333332,
 'acc_1': 0.17,
 'acc_3': 0.39,
 'acc_5': 0.58,
 'target_in_candidate': 0.58}

Error Analysis

In [5]:
# Identify observations where target was not first choice. An empty candidate
# list counts as a miss rather than raising IndexError on lst[0].
error_idx = [pos for (pos, (lst, target)) in enumerate(zip(result_lst, df_test[1]))
             if len(lst) == 0 or target != lst[0]]
# error_idx holds positions produced by enumerate, so select rows positionally.
df_error = df_test.iloc[error_idx].reset_index(drop=True)

# Generate candidates
df_error['Candidates'] = df_error[0].apply(lambda x: generate_candidates(x, og_dict, 2))
df_error['In_Candidates'] = [r in cand for (r, cand) in zip(df_error[1], df_error['Candidates'])]

# Get score of target (None when the target is not among the candidates)
df_error['Target_Score'] = [cand[target] if target in cand else None
                            for (cand, target) in zip(df_error['Candidates'], df_error[1])]

In [6]:
# Percentile of target among candidates: position of the target's score in the
# descending-sorted candidate scores (0 = highest score), as a percentage of
# the number of candidates. Only rows whose target is among the candidates
# are considered.
df_error_in_cand = df_error.loc[df_error.In_Candidates==True]
cand_percentile = []
for (target, cand) in zip(df_error_in_cand['Target_Score'], df_error_in_cand['Candidates']):
    scores_desc = sorted(cand.values(), reverse=True)
    cand_percentile.append(100*scores_desc.index(target)/len(scores_desc))
np.median(cand_percentile), np.mean(cand_percentile)

(8.571428571428571, 12.659164700352214)

In [7]:
# Restrict to the errors whose target never appeared among the candidates
df_error_no_cand = df_error.loc[df_error.In_Candidates==False].reset_index(drop=True)

# Tallies of observations with/without missing rules, plus the rules themselves
all_rules_count, missing_rules_count = 0, 0
good_rules_lst, bad_rules_lst = [], []

for (orig, correct) in zip(df_error_no_cand[0], df_error_no_cand[1]):
    # Split the rules needed for this correction into those which are and are
    # not found in og_dict
    good_rules, bad_rules = compare_rules(orig, correct, og_dict)

    # An observation either has every rule available or misses at least one
    if len(bad_rules) == 0:
        all_rules_count += 1
    else:
        missing_rules_count += 1

    # Accumulate rules across observations
    good_rules_lst += good_rules
    bad_rules_lst += bad_rules

# Percent of observations missing at least one rule needed to make correction
n_observations = missing_rules_count + all_rules_count
print(100*missing_rules_count/n_observations)

# Percent of distinct rules missing from the og_dict
unique_good, unique_bad = set(good_rules_lst), set(bad_rules_lst)
print(100*len(unique_bad)/(len(unique_good)+len(unique_bad)))

100.0
39.39393939393939


In [8]:
# Rebuild the rule dictionary from scratch, this time folding in both the
# training pairs and the error pairs whose target was not among the candidates.
# NOTE(review): df_error_no_cand is derived from df_test, so any evaluation of
# this dictionary on those words is no longer held-out - confirm intended.
og_dict = {}
for i in range(len(df)):
    train_rules = collect_rules(df[0][i], df[1][i], 2)
    og_dict = collate_dict(og_dict, train_rules)

for i in range(len(df_error_no_cand)):
    error_rules = collect_rules(df_error_no_cand[0][i], df_error_no_cand[1][i], 2)
    og_dict = collate_dict(og_dict, error_rules)

# Append each key to the end of its own value list.
for key, val in og_dict.items():
    og_dict[key] = val + [key]

# Rank rules by frequency
og_dict = collate_max(og_dict, 'test')

In [12]:
# Check which words are still not solved by the updated model
result_lst_no_cand = generate(word_lst=df_error_no_cand[0],
                              rule_dict=og_dict,
                              vocab=vocab_tl,
                              use_dld=True,
                              max_sub=2)

# Print every word whose top suggestion is still not the target. An empty
# result list is reported as unsolved rather than raising IndexError.
for (orig, target, result) in zip(df_error_no_cand[0],
                                  df_error_no_cand[1],
                                  result_lst_no_cand):
    if len(result) == 0 or target != result[0]:
        print(orig, target, result)

2mama tumama ['kama', 'tumama', 'kamo', 'tomo ama', 'tomo']
kb ka ba ['ka', 'kaba']
na2wa natuwa ['nawa', 'naawa', 'nakaw', 'natuwa', 'nawa']
nde hindi ['ng', 'na', 'hinde', 'on', 'hindi']
sya siya ['sya', 'saya', 'sa', 'siya', 'si']


### DLD Study

In [14]:
# Distance-only baseline: for every test word, keep the 5 vocabulary entries
# with the smallest Damerau-Levenshtein distance.
# The vocabulary is materialised once so scores and words stay aligned and the
# list is not rebuilt per word.
vocab_lst = list(vocab_tl)

result_lst = []
for word in df_test[0]:
    dl_scores = damerau_levenshtein_distance_seqs(word, vocab_lst)
    temp_vocab_lst = list(zip(vocab_lst, dl_scores))
    # Break distance ties alphabetically. Sorting on distance alone left ties
    # in set-iteration order, which changes between kernel restarts because
    # string hashing is randomised, making the top-5 non-reproducible.
    temp_result = sorted(temp_vocab_lst, key=lambda x: (x[1], x[0]))[:5]
    result_lst.append([i[0] for i in temp_result])

In [15]:
# Score the current result_lst (one candidate list per test word) against the
# test targets in column 1.
evaluate(result_lst, df_test[1])

{'best_dl': 0.59,
 'max_dl': 3.32,
 'avg_dl': 2.2819999999999996,
 'acc_1': 0.45,
 'acc_3': 0.67,
 'acc_5': 0.72,
 'target_in_candidate': 0.72}