In [3]:
#!/usr/bin/env python3

import os
import argparse
import math
import json
from collections import defaultdict, Counter
import pandas as pd
import editdistance

class Evaluation:

    def get_args(self):
        '''Parses commandline arguments'''

        parser = argparse.ArgumentParser(description = \
        "Evalute predicted lexicon against gold lexicon")
        parser.add_argument("--gold_lexicon", type = str, \
        required = True, help = "Path to gold lexicon")
        parser.add_argument("--pred_lexicon", type = str, \
        required = True, help = "Path to predicted lexicon")
        parser.add_argument("--eval_type", type = str, \
        default = "loose", help = "Type of evaluation: {loose, hard, softgold}")
        parser.add_argument("--OUTPATH", type=str, \
        help = "Path to JSON file to store results")

        return parser.parse_args()

    def read_files(self, filepath):

        with open(filepath) as f:
            return json.load(f)

    def eval(self, gold_lexicon, pred_lexicon, type = "loose"):
        '''
        Basic accuracy-based evaluation
        (type) loose: Considers any gold option equally correct
        (type) hard: Considers only the best gold option to be correct
        '''
        accuracy, found = 0, 0
        
        correct = dict()
        
        for word in gold_lexicon:
            if word not in pred_lexicon:
                continue
            found += 1
            # best_pred = max(pred_lexicon[word].keys(), key = lambda x:pred_lexicon[word][x])
            #For now, pred_targets only contains a single element
            # so it is irrelevant whether we do intersection
            # or check whether all pred targets lie in gold targets
            if type=="loose":
                if set(pred_lexicon[word].keys()).intersection(set(gold_lexicon[word].keys())):
                    accuracy += 1
                    correct[word] = {"gold":set(gold_lexicon[word].keys()), \
                                     "pred":set(pred_lexicon[word].keys()), \
                                     "both":set(pred_lexicon[word].keys()).intersection(set(gold_lexicon[word].keys())) \
                                    }
                    

            elif type=="hard":
                best_gold_score = max(gold_lexicon[word].values())
                best_golds = {word for word in gold_lexicon if gold_lexicon[word]==best_gold_score}
                if best_pred in best_golds:
                    accuracy += 1

            elif type=="softgold":
                best_gold_score = max(gold_lexicon[word].values())
                if best_pred in gold_lexicon[word]:
                    accuracy += gold_lexicon[word][best_pred]/best_gold_score

        result = {
        "accuracy":accuracy/found,
        "found":found,
        "total":len(gold_lexicon)
        }
        return result, correct

    def save_results(self, lang, result, OUTPATH):

        try:
            results = self.read_files(OUTPATH)

        except:
            results = dict()
        results[lang] = result
        with open(OUTPATH, "w") as f:
            json.dump(results, f, ensure_ascii = False, indent=2)

    def driver(self, gold_lexicon, pred_lexicon, eval_type, OUTPATH=None):
        # Get the target language (currently, we take anchor as source)
        lang = gold_lexicon.split("/")[-1].split("_")[1].split(".")[0]
        # Read lexicons
        gold_lexicon = self.read_files(gold_lexicon)
        pred_lexicon = self.read_files(pred_lexicon)
        # Evaluate
        result = self.eval(gold_lexicon, pred_lexicon, type = eval_type)
#         if OUTPATH:
#             self.save_results(lang, result, OUTPATH)
        return result


    def main(self):
        args = self.get_args()
        self.driver(args.gold_lexicon, args.pred_lexicon, \
        args.eval_type, args.OUTPATH)

# if __name__ == "__main__":
#     obj = Evaluation()
#     obj.main()



In [110]:
lang = "bhojpuri"
f0 = "../mli_od/lexicons_ned_top5/hindi-urdu_{}.json".format(lang)
f1 = "../mli_od/lexicons_jw_top5/hindi-urdu_{}.json".format(lang)
f2 = "../mli_em_od/lexicons_top5/hindi-urdu_{}.json".format(lang)
f3 = "../mli_sem_od/lexicons_K50_top5/hindi-urdu_{}.json".format(lang)
f4 = "../mli_sem_emod/lexicons_top5/hindi-urdu_{}.json".format(lang)
gold = "eval_data/lexicons/hindi-urdu_source/hindi-urdu_{}.json".format(lang)
eval_type = "loose"

In [111]:
obj = Evaluation()

In [112]:
apps = {"NED":f0, "JW":f1, "EMT":f2, "SEM_JW":f3, "SEM_EMT":f4}
all_results = dict()
for app, f in apps.items():
    gold_lexicon = obj.read_files(gold)
    pred_lexicon = obj.read_files(f)
    res, correct = obj.eval(gold_lexicon, pred_lexicon, type = eval_type)
    all_results[app] = {"res":res, "corr":correct}
    print(app, res)

NED {'accuracy': 0.3130434782608696, 'found': 115, 'total': 139}
JW {'accuracy': 0.28695652173913044, 'found': 115, 'total': 139}
EMT {'accuracy': 0.3217391304347826, 'found': 115, 'total': 139}
SEM_JW {'accuracy': 0.30434782608695654, 'found': 115, 'total': 139}
SEM_EMT {'accuracy': 0.2956521739130435, 'found': 115, 'total': 139}


In [58]:
non_id = 0
for word, d in all_results["od"]["corr"].items():
    for b in d["both"]:
        if word != b:
            print(word, all_results["od"]["corr"][word])
            non_id += 1

print(non_id, len(all_results["od"]["corr"]))

लिखा {'gold': {'लिखलास', 'लिखल', 'लिखला'}, 'pred': {'लिखाई', 'लिखना', 'लिखा', 'लिखता', 'लिखला'}, 'both': {'लिखला'}}
करते {'gold': {'करे'}, 'pred': {'करते', 'करे', 'कतरे', 'कुर्ते', 'करत'}, 'both': {'करे'}}
2 33


In [59]:
non_id = 0
for word, d in all_results["emod"]["corr"].items():
    for b in d["both"]:
        if word != b:
            print(word, all_results["emod"]["corr"][word])
            non_id += 1

print(non_id, len(all_results["emod"]["corr"]))

क्या {'gold': {'का', 'करे', 'कर'}, 'pred': {'कहा', 'क्या', 'कला', 'का'}, 'both': {'का'}}
तुम {'gold': {'तू'}, 'pred': {'तू', 'तुम', 'तु', 'त'}, 'both': {'तू'}}
मैं {'gold': {'हम', 'में'}, 'pred': {'मैं', 'सें', 'ं', 'में'}, 'both': {'में'}}
को {'gold': {'के'}, 'pred': {'कि', 'का', 'के', 'को'}, 'both': {'के'}}
कितना {'gold': {'केतना'}, 'pred': {'कितना', 'कतना', 'कातना', 'केतना'}, 'both': {'केतना'}}
की {'gold': {'राज्य', 'का'}, 'pred': {'की', 'कि', 'का', 'के'}, 'both': {'का'}}
6 37


In [60]:
non_id = 0
for word, d in all_results["sem_od"]["corr"].items():
    for b in d["both"]:
        if word != b:
            print(word, all_results["sem_od"]["corr"][word])
            non_id += 1

print(non_id, len(all_results["sem_od"]["corr"]))

वहाँ {'gold': {'यहाँ', 'लोग'}, 'pred': {'वहां', 'तहाँ', 'वहाँ', 'यहाँ', 'जहाँ'}, 'both': {'यहाँ'}}
को {'gold': {'के'}, 'pred': {'का', 'के', 'जो', 'की', 'को'}, 'both': {'के'}}
कैसी {'gold': {'कैसन'}, 'pred': {'कैसा', 'कैसन', 'बैसी', 'कैसी', 'कैसो'}, 'both': {'कैसन'}}
3 35


In [61]:
non_id = 0
for word, d in all_results["sem_emod"]["corr"].items():
    for b in d["both"]:
        if word != b:
            print(word, all_results["sem_emod"]["corr"][word])
            non_id += 1

print(non_id, len(all_results["sem_emod"]["corr"]))

सकते {'gold': {'सका'}, 'pred': {'सकती', 'सकता', 'सका', 'सकते', 'सको'}, 'both': {'सका'}}
वहाँ {'gold': {'यहाँ', 'लोग'}, 'pred': {'जहाँ', 'वहीं', 'वहाँ', 'यहाँ', 'वह'}, 'both': {'यहाँ'}}
2 34


In [123]:
for word, d in all_results["SEM_EMT"]["corr"].items():
    if word not in all_results["NED"]["corr"]:
        print(word, all_results["SEM_EMT"]["corr"][word])

दोपहर {'gold': {'दोपहर', 'मे'}, 'pred': {'द', 'दोपहरी', 'दुपहर', 'धूप', 'दोपहर'}, 'both': {'दोपहर'}}
सकते {'gold': {'सका'}, 'pred': {'सकती', 'सकता', 'सका', 'सकते', 'सको'}, 'both': {'सका'}}
वहाँ {'gold': {'यहाँ', 'लोग'}, 'pred': {'जहाँ', 'वहीं', 'वहाँ', 'यहाँ', 'वह'}, 'both': {'यहाँ'}}


In [74]:
word = "सोये"
pred_lexicon1 = obj.read_files(apps["od"])
pred_lexicon2 = obj.read_files(apps["emod"])
pred_lexicon3 = obj.read_files(apps["sem_od"])
pred_lexicon4 = obj.read_files(apps["sem_emod"])
print(pred_lexicon1[word])
print(pred_lexicon2[word])
print(pred_lexicon3[word])
print(pred_lexicon4[word])
print(gold_lexicon[word])

{'सोये': 1.0, 'सोभे': 0.8500000000000001, 'सोझे': 0.8500000000000001, 'सोने': 0.8500000000000001, 'सो': 0.8500000000000001}
{'सो': -0.05207981979827417, 'सोये': -7.4355311687131325, 'सब': -9.654800060333967, 'सर': -9.654800060333967}
{'सोये': -0.0, 'होये': -0.516082356373469, 'सोया': -0.5234620988368988, 'रोये': -0.549056887626648, 'धोये': -0.5537111659844716}
{'सोये': -2.4813055518103906, 'स': -5.283347210041681, 'सोए': -11.40666876898273, 'सोया': -12.121608503495125, 'सोयं': -12.121608503495125}
{'गेल': 1, 'सत्ला': 2, 'हा': 1}


In [64]:
word = "कितना"
pred_lexicon1 = obj.read_files(apps["od"])
pred_lexicon2 = obj.read_files(apps["emod"])
pred_lexicon3 = obj.read_files(apps["sem_od"])
pred_lexicon4 = obj.read_files(apps["sem_emod"])
print(pred_lexicon1[word])
print(pred_lexicon2[word])
print(pred_lexicon3[word])
print(pred_lexicon4[word])

{'कितना': 1.0, 'कतना': 0.94, 'कतिना': 0.94, 'किनार': 0.88, 'कितने': 0.88}
{'कितना': -1.728983063083883, 'कतना': -10.01525251586695, 'केतना': -10.01525251586695, 'कातना': -10.01525251586695}
{'कितना': -0.0, 'जितना': -0.5228662371635437, 'कितनी': -0.5454433393478393, 'कितने': -0.564395432472229, 'इतना': -0.5961363782485326}
{'कितना': -0.5733946293728067, 'कितने': -2.497571134177482, 'जितना': -11.084716542114833, 'कितनी': -12.034405554390837, 'इतना': -13.184235842520081}


In [65]:
word = "कहानी"
pred_lexicon1 = obj.read_files(apps["od"])
pred_lexicon2 = obj.read_files(apps["emod"])
pred_lexicon3 = obj.read_files(apps["sem_od"])
pred_lexicon4 = obj.read_files(apps["sem_emod"])
print(pred_lexicon1[word])
print(pred_lexicon2[word])
print(pred_lexicon3[word])
print(pred_lexicon4[word])

{'कहानी': 1.0, 'कहनी': 0.9466666666666665, 'कानी': 0.94, 'हानी': 0.9333333333333332, 'कहा': 0.8933333333333333}
{'कहानी': 0.0, 'कहतानी': -10.139586815010555, 'करानी': -10.171413246622231, 'कानी': -10.171413246622231}
{'कहानी': -0.0, 'हानी': -0.3273595492045085, 'रामकहानी': -0.3636296724279723, 'चुहानी': -0.4890451868375142, 'कानी': -0.49181364297866825}
{'कहानी': -1.6559746531357284, 'हानी': -12.167296565877754, 'सुहानी': -17.141049463457836, 'घानी': -22.38807353561832, 'नानी': -22.38807353561832}


In [70]:
word = "जवाब"
pred_lexicon1 = obj.read_files(apps["od"])
pred_lexicon2 = obj.read_files(apps["emod"])
pred_lexicon3 = obj.read_files(apps["sem_od"])
pred_lexicon4 = obj.read_files(apps["sem_emod"])
print(pred_lexicon1[word])
print(pred_lexicon2[word])
print(pred_lexicon3[word])
print(pred_lexicon4[word])
print(gold_lexicon[word])

{'जवाब': 1.0, 'जाब': 0.9249999999999999, 'जवना': 0.8666666666666667, 'जव': 0.8666666666666667, 'जवान': 0.8666666666666667}
{'जाब': -0.06342435270371753, 'जवाब': -2.788761581485382, 'बाब': -9.926870855702662, 'दाब': -9.926870855702662}
{'जवाब': -0.0, 'वाब': -0.40016142527262377, 'जबाब': -0.5223091900348663, 'जवाल': -0.5513108452161153, 'नवाब': -0.5664635797341664}
{'जवाब': -0.3516394750572899, 'वाब': -2.7024857716159985, 'जबाब': -3.1581863725111723, 'कवाब': -10.247346840274457, 'बताब': -14.99303185187371}
{'उत्तर': 1}


In [71]:
word = "ज्यादा"
pred_lexicon1 = obj.read_files(apps["od"])
pred_lexicon2 = obj.read_files(apps["emod"])
pred_lexicon3 = obj.read_files(apps["sem_od"])
pred_lexicon4 = obj.read_files(apps["sem_emod"])
print(pred_lexicon1[word])
print(pred_lexicon2[word])
print(pred_lexicon3[word])
print(pred_lexicon4[word])
print(gold_lexicon[word])

{'ज्यादा': 1.0, 'ज्यादातर': 0.9249999999999999, 'ज्यादे': 0.9, 'मर्यादा': 0.8492063492063492, 'याद': 0.8333333333333334}
{'ज्यादा': -3.6642873167588736, 'जादा': -8.917683687330632, 'जामा': -10.07190279914727, 'ज़्यादा': -13.80387413176943}
{'ज्यादा': -0.0, 'ज्यादातर': -0.30796434581279764, 'ज़्यादा': -0.3627715059689113, 'ज्यादे': -0.37114316821098325, 'मर्यादा': -0.59316729103762}
{'ज्यादा': -0.4545150852530514, 'ज़्यादा': -10.675292054993621, 'ज्यादातर': -14.958532819651762, 'ज्यादेतर': -16.82530244250128, 'जितना': -34.74932816807329}
{'जयदा': 1, 'लोग': 1, 'हैं': 1}


In [103]:
word = "कहानी"
pred_lexicon1 = obj.read_files(apps["od"])
pred_lexicon2 = obj.read_files(apps["emod"])
pred_lexicon3 = obj.read_files(apps["sem_od"])
pred_lexicon4 = obj.read_files(apps["sem_emod"])
print(pred_lexicon1[word])
print(pred_lexicon2[word])
print(pred_lexicon3[word])
print(pred_lexicon4[word])
print(gold_lexicon[word])

KeyError: 'od'

In [113]:
pred_all = dict()
for a in apps:
    pred_all[a] = obj.read_files(apps[a])

pred_all["Gold"] = gold_lexicon

In [126]:
word = "सकते"
comp = dict()
for a in pred_all:
    comp[a] = list(pred_all[a][word].keys())[:5]
    comp[a] = {i+1: comp[a][i] if i < len(comp[a]) else "-" for i in range(5)}
df = pd.DataFrame(comp)
df

Unnamed: 0,NED,JW,EMT,SEM_JW,SEM_EMT,Gold
1,सकते,सकते,सकते,सकते,सकते,सका
2,सूते,सके,सकेत,सकता,सकती,-
3,सहते,सकता,घसकत,सकती,सकता,-
4,सके,सकती,सहकत,सके,सको,-
5,सकता,सकति,-,सकें,सका,-


In [109]:
pred_all["JW"][word]

{'यात्रा': 1.0,
 'यात्राएँ': 0.9249999999999999,
 'यात्री': 0.9,
 'यात्रायें': 0.8999999999999999,
 'मात्रा': 0.888888888888889}

In [108]:
all_results["JW"]["corr"][word]

KeyError: 'यात्रा'

In [67]:
# all_results["sem_emod"]["/corr"].items()

In [68]:
gold_lexicon.keys()

dict_keys(['क्या', 'तुमने', 'दोपहर', 'का', 'खाना', 'खाया', 'तुम', 'मेरे', 'साथ', 'आओगे', 'कैसे', 'आये', 'खोलें', 'वो', 'आयेगा', 'तुम्हारा', 'पसंदीडा', 'रंग', 'कौन', 'सा', 'हैं', 'कल', 'बस', 'से', 'आयी', 'आखिरी', 'सवाल', 'थे', 'सबसे', 'मीठा', 'फल', 'कौनसा', 'एक', 'मुझे', 'अपना', 'पेन', 'दे', 'सकते', 'हो', 'आप', 'क्यों', 'सोये', 'सेव', 'रहा', 'नाम', 'वे', 'वहाँ', 'बैठे', 'परीक्षा', 'लिख', 'उसने', 'यह', 'मैनेजर', 'कैबिन', 'कहाँ', 'किताब', 'खोलेंगे', 'आइये', 'हें', 'मैं', 'जवाब', 'होगा', 'हाँ', 'तुम्हें', 'प्यार', 'करता', 'करती', 'हूँ', 'दौड़ना', 'कार', 'चलते', 'खोलना', 'चलना', 'कितने', 'लिए', 'हिन्दी', 'न्युज़पेपर', 'अच्छा', 'कौंसी', 'कहानी', 'बताय्', 'बैठना', 'लिया', 'था', 'आपका', 'खत', 'लिखा', 'खा', 'बाग', 'लाया', 'बॉक्स', 'उठा', 'उसे', 'जाने', 'को', 'कहा', 'में', 'कितना', 'पैसा', 'दिया', 'किस', 'भारतीय', 'राज्य', 'की', 'जनसंख्या', 'ज्यादा', 'रास्ते', 'चाहिए', 'जायेगा', 'खायेगा', 'मैने', 'पिछले', 'सफ्तह', 'फिल्म', 'देखी', 'सकता', 'हाथ', 'रात', 'भर', 'सोया', 'करूँ', 'खाइये', 'यात्रा', 'क

In [75]:
print(gold_lexicon)

{'क्या': {'का': 21, 'करे': 1, 'कर': 1}, 'तुमने': {'तू': 6}, 'दोपहर': {'दोपहर': 1, 'मे': 1}, 'का': {'खाना': 1, 'का': 1}, 'खाना': {'खैला': 1}, 'खाया': {'हवा': 1, 'खैलास': 1}, 'तुम': {'तू': 16}, 'मेरे': {'हमरा': 1, 'हमरी': 1}, 'साथ': {'साथ': 2}, 'आओगे': {'चाल्बा': 1}, 'कैसे': {'कैसे': 3, 'आयला': 1}, 'आये': {'हव': 2, 'आयलाह': 1, 'आइल': 1}, 'खोलें': {'खुल्ला': 1}, 'वो': {'ऊऊ': 7, 'हौ': 2, 'रात': 1}, 'आयेगा': {'आई': 1}, 'तुम्हारा': {'तोहर': 2}, 'पसंदीडा': {'पसंदीडा': 1}, 'रंग': {'रंग': 1}, 'कौन': {'कौन': 1}, 'सा': {'सा': 1}, 'हैं': {'हा': 8, 'बा': 2, 'बाड़ा': 1}, 'कल': {'कल': 2}, 'बस': {'बस': 1}, 'से': {'से': 1, 'आयला': 1}, 'आयी': {'आइलास': 1}, 'आखिरी': {'अंतिम': 1}, 'सवाल': {'प्रशना': 3, 'रहल': 2}, 'थे': {'हा': 1, 'थे': 1}, 'सबसे': {'सबसे': 3, 'कौन': 2}, 'मीठा': {'मीठ्': 1}, 'फल': {'फल': 1, 'कौन': 1}, 'कौनसा': {'सा': 2}, 'एक': {'आ': 1}, 'मुझे': {'हमरा': 3, 'हम': 1}, 'अपना': {'के': 1, 'अपना': 1}, 'पेन': {'पेन': 2}, 'दे': {'दे': 1}, 'सकते': {'सका': 3}, 'हो': {'तारा': 3, 'ला': 2}, 'आप': {'क्या