In [1]:
import sys
sys.path.append("../evaluation_languages_home/eval_scripts/")
from eval import Evaluation

In [25]:
#!/usr/bin/env python3

import os
import argparse
import math
import json
from collections import defaultdict, Counter
import pandas as pd
import editdistance
import Levenshtein as lv

class MLI_EM_PHONOLOGICAL:
    '''Induce a bilingual lexicon from two monolingual corpora using string similarity.

    Two strategies are provided:

    * ``build_lexicon`` / ``ned_match`` pair every source word with the target
      word that has the lowest normalized edit distance (NED).
    * ``build_ops`` / ``find_new_equivalents`` collect word pairs that differ by
      exactly one character substitution, so that frequent substitutions can be
      treated as sound changes and used to propose new equivalents.

    The class keeps no instance state; every method works on its arguments.
    '''

    def get_args(self):
        '''Parse the command-line arguments consumed by ``main``.'''
        parser = argparse.ArgumentParser(description = "Build a lexicon from 2 corpora using NED.")
        parser.add_argument("--source_file", type=str, default=None, required=True, help="Source corpus filepath")
        parser.add_argument("--target_file", type=str, default=None, required=True, help="Target corpus filepath")
        parser.add_argument("--max_lexicon_length", type=int, default=math.inf, help="Maximum length of extracted lexicon")
        parser.add_argument("--min_source_freq", type=int, default=2, help="Min freq of source side words")
        parser.add_argument("--min_target_freq", type=int, default=2, help="Min freq of target side words")
        parser.add_argument("--OUTPATH", type=str, help="Path for saving lexicon (JSON)")

        return parser.parse_args()

    def read_file(self, filepath):
        '''Read a text file and return its full contents as a string.

        The file is decoded as UTF-8 explicitly so that non-ASCII corpora are
        read the same way regardless of the platform's default encoding.
        '''
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()

    def get_lexicon_words(self, source_words, target_words, max_lexicon_length, min_source_freq, min_target_freq):
        '''Select the source and target words eligible for the lexicon.

        Args:
            source_words: Counter of source word frequencies.
            target_words: Counter of target word frequencies.
            max_lexicon_length: keep at most this many of the most frequent
                source words; ``math.inf`` or ``None`` means no limit.
            min_source_freq: minimum frequency for a source word to be kept.
            min_target_freq: minimum frequency for a target word to be kept.

        Returns:
            ``(cand_source_words, cand_target_words)``: two defaultdicts
            mapping word -> frequency (missing words read as 0).
        '''
        # Counter.most_common only accepts an int or None, so an unbounded
        # limit (the default of --max_lexicon_length) is mapped to None.
        if max_lexicon_length is None or max_lexicon_length == math.inf:
            limit = None
        else:
            limit = max_lexicon_length

        cand_source_words = defaultdict(int, {w: f for w, f in source_words.most_common(limit) if f >= min_source_freq})
        cand_target_words = defaultdict(int, {w: f for w, f in target_words.items() if f >= min_target_freq})

        return cand_source_words, cand_target_words

    def ned_match(self, word, cand_target_words):
        '''Find the candidate with the lowest normalized edit distance to ``word``.

        Only candidates that share the first character with ``word`` are
        considered. NED is the edit distance divided by the length of the
        longer of the two words.

        Returns:
            ``(best_word, min_dist)``. If no candidate qualifies, the result
            is ``("", 2)``; 2 is a sentinel above the maximum possible NED of 1.
        '''
        min_dist, best_word = 2, ""
        if not word:
            return best_word, min_dist

        for cand in cand_target_words:
            # Empty candidates cannot match and would break the first-character test.
            if not cand or cand[0] != word[0]:
                continue

            ned = editdistance.eval(word, cand)/max(len(word), len(cand))
            if ned < min_dist:
                min_dist = ned
                best_word = cand

        return best_word, min_dist

    @staticmethod
    def _single_substitution(word, cand):
        '''Return ``(source_char, target_char)`` if the words differ by exactly one substitution, else ``None``.

        This is equivalent to the minimal edit script consisting of a single
        "replace" operation: the words must have equal length and differ at
        exactly one position.
        '''
        if len(word) != len(cand):
            return None

        change = None
        for char1, char2 in zip(word, cand):
            if char1 != char2:
                if change is not None:
                    # Second mismatch: more than one edit is needed.
                    return None
                change = (char1, char2)
        return change

    def build_ops(self, word, cand_target_words, ops_freq = None, best_candidates = None):
        '''Record every candidate that differs from ``word`` by one substitution.

        Args:
            word: source word.
            cand_target_words: iterable of target words.
            ops_freq: nested mapping ``source_char -> target_char -> count``
                updated in place; a fresh one is created when omitted.
            best_candidates: mapping ``source word -> set of candidates``
                updated in place; a fresh one is created when omitted.

        Returns:
            ``(ops_freq, best_candidates)``, the (possibly new) accumulators.
        '''
        if ops_freq is None:
            ops_freq = defaultdict(lambda: defaultdict(int))
        if best_candidates is None:
            best_candidates = defaultdict(set)

        for cand in cand_target_words:
            change = self._single_substitution(word, cand)
            if change is None:
                continue
            char1, char2 = change
            ops_freq[char1][char2] += 1
            best_candidates[word].add(cand)

        return ops_freq, best_candidates

    def find_new_equivalents(self, sound_changes, best_candidates):
        '''Keep the candidate pairs whose substitution is an accepted sound change.

        Args:
            sound_changes: mapping ``source_char -> collection of target_chars``
                listing the accepted substitutions.
            best_candidates: mapping ``source word -> iterable of candidates``,
                as produced by ``build_ops``.

        Returns:
            defaultdict mapping ``source word -> {candidate: "NA"}``; "NA" is
            a placeholder score.
        '''
        new_eqs = defaultdict(dict)
        for source, cands in best_candidates.items():
            for cand in cands:
                change = self._single_substitution(source, cand)
                if change is None:
                    continue
                char1, char2 = change
                # .get avoids inserting empty entries into a defaultdict argument.
                if char2 in sound_changes.get(char1, ()):
                    new_eqs[source][cand] = "NA"

        return new_eqs

    def build_lexicon(self, cand_source_words, cand_target_words, source_words, target_words):
        '''Build a bilingual lexicon by NED matching.

        Each source word is mapped to its best target word with score
        ``1 - NED``. Source words with no matching candidate are left out.
        ``source_words`` and ``target_words`` are accepted for interface
        compatibility and are not used.
        '''
        lexicon = defaultdict(dict)

        for word in cand_source_words:
            best_word, ned = self.ned_match(word, cand_target_words)
            if not best_word:
                # No candidate passed the filter; do not record "" with score -1.
                continue
            lexicon[word][best_word] = 1 - ned

        return lexicon

    def save_lexicon(self, lexicon, OUTPATH):
        '''Write the lexicon to ``OUTPATH`` as UTF-8 JSON, creating parent directories if needed.'''
        out_dir = os.path.dirname(OUTPATH)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(OUTPATH, "w", encoding="utf-8") as f:
            json.dump(lexicon, f, ensure_ascii = False, indent = 2)


    def driver(self, source_file, target_file, max_lexicon_length = math.inf, min_source_freq = 2, min_target_freq = 2, OUTPATH = None, counts_only = True):
        '''Read both corpora and either return word counts or build the lexicon.

        Args:
            source_file: path of the source corpus (whitespace-tokenized text).
            target_file: path of the target corpus.
            max_lexicon_length: maximum number of source words in the lexicon.
            min_source_freq: minimum frequency of source words.
            min_target_freq: minimum frequency of target words.
            OUTPATH: if given, the lexicon is saved there as JSON.
            counts_only: when True (the default, kept for existing callers)
                return right after counting; when False run the full
                NED-based lexicon extraction.

        Returns:
            ``(source_words, target_words)`` Counters when ``counts_only`` is
            True, otherwise the lexicon built by ``build_lexicon``.
        '''
        # Read files
        source_corpus = self.read_file(source_file)
        target_corpus = self.read_file(target_file)

        # Count word frequencies
        source_words = Counter(source_corpus.split())
        target_words = Counter(target_corpus.split())

        if counts_only:
            return source_words, target_words

        # Filter
        max_lexicon_length = min(max_lexicon_length, len(source_words))
        cand_source_words, cand_target_words = self.get_lexicon_words(source_words, target_words, max_lexicon_length, min_source_freq, min_target_freq)

        # Build lexicon
        lexicon = self.build_lexicon(cand_source_words, cand_target_words, source_words, target_words)

        # Save lexicon
        if OUTPATH:
            self.save_lexicon(lexicon, OUTPATH)

        return lexicon

    def main(self):
        '''Command-line entry point: parse arguments and run the full lexicon extraction.'''
        args = self.get_args()
        self.driver(args.source_file, args.target_file,
                    args.max_lexicon_length, args.min_source_freq, args.min_target_freq,
                    args.OUTPATH, counts_only = False)



In [29]:
# Corpus and gold-lexicon locations

DATADIR = "../data/crawled_cleaned/"

# Language names; each corpus is stored as <DATADIR><language>.txt
anchor = "hindi-urdu"
target = "bhojpuri"

source_file = f"{DATADIR}{anchor}.txt"
target_file = f"{DATADIR}{target}.txt"
gold_file = "../evaluation_languages_home/eval_data/lexicons/hindi-urdu_source/hindi-urdu_bhojpuri.json"

In [26]:
# Instantiate the class above and count the words of both corpora.
# driver() returns one Counter (word -> frequency) per corpus.

obj = MLI_EM_PHONOLOGICAL()
source_words, target_words = obj.driver(source_file, target_file)

# Number of distinct words on the source and target side.
print(len(source_words), len(target_words))

In [9]:
# Accumulate substitution statistics by running build_ops on the first 2000
# distinct source words (in the order they were first seen in the corpus).

ops_freq = defaultdict(lambda: defaultdict(int))
best_candidates = defaultdict(set)

for source in list(source_words)[:2000]:
    ops_freq, best_candidates = obj.build_ops(
        source, target_words, ops_freq, best_candidates
    )
    

In [50]:
# Find accepted changes by keeping the N most frequent substitutions in ops_freq

N = 10

# Flatten the nested counts into {(source_char, target_char): frequency}.
all_changes = {(src_char, tgt_char): ops_freq[src_char][tgt_char] for src_char in ops_freq for tgt_char in ops_freq[src_char]}

best_changes = sorted(all_changes.items(), key = lambda x: x[1], reverse = True)[:N]

# The loop variables are deliberately not named `target`: a top-level for-loop
# variable outlives the loop and would overwrite the `target` language name
# defined in the file-paths cell.
accepted_changes = defaultdict(lambda: set())
for (src_char, tgt_char), _freq in best_changes:
    accepted_changes[src_char].add(tgt_char)


In [51]:
# Display the accepted changes: source character -> set of target characters.
accepted_changes

defaultdict(<function __main__.<lambda>()>,
            {'े': {'ा', 'ी'},
             'ा': {'ि', 'ी', 'े', 'ो'},
             'ी': {'ा', 'े'},
             'न': {'त'},
             'त': {'न'}})

In [52]:
# Find new equivalents based on accepted_changes: keep the word pairs from
# best_candidates whose single substituted character pair is an accepted change.

new_eqs = obj.find_new_equivalents(accepted_changes, best_candidates)

In [53]:
# Check the container type returned by find_new_equivalents.
type(new_eqs)

collections.defaultdict

In [54]:
# Evaluate new_eqs against the gold lexicon.
# Evaluation is the project class imported from eval_scripts/eval.py in the first cell.
# NOTE(review): the meaning of type="loose" is defined there — confirm before interpreting the result.
obj_eval = Evaluation()
gold_lexicon = obj_eval.read_files(gold_file)
result = obj_eval.eval(gold_lexicon, new_eqs, type = "loose")

In [55]:
# NOTE(review): the recorded run reports accuracy 0.0 with found=41 and total=139;
# check how Evaluation.eval derives these fields before drawing conclusions.
result

{'accuracy': 0.0, 'found': 41, 'total': 139}

In [56]:
# Inspect the proposed equivalents: source word -> {candidate: "NA"}.
new_eqs

defaultdict(<function __main__.MLI_EM_PHONOLOGICAL.find_new_equivalents.<locals>.<lambda>()>,
            {'मोरा': {'मोरि': 'NA', 'मोरे': 'NA', 'मोरी': 'NA'},
             'अभागा': {'अभागि': 'NA', 'अभागी': 'NA', 'अभागे': 'NA'},
             'न': {'त': 'NA'},
             'जागा': {'जागो': 'NA',
              'जागे': 'NA',
              'जोगा': 'NA',
              'जागी': 'NA',
              'जागि': 'NA'},
             'ले': {'ली': 'NA', 'ला': 'NA'},
             'भागा': {'भागी': 'NA', 'भागि': 'NA', 'भागे': 'NA'},
             'हाय': {'होय': 'NA', 'हिय': 'NA'},
             'ना': {'नी': 'NA', 'ने': 'NA', 'ता': 'NA'},
             'मोरी': {'मोरा': 'NA', 'मोरे': 'NA'},
             'पे': {'पा': 'NA', 'पी': 'NA'},
             'बैठा': {'बैठो': 'NA', 'बैठे': 'NA', 'बैठी': 'NA', 'बैठि': 'NA'},
             'मोरे': {'मोरा': 'NA', 'मोरी': 'NA'},
             'माथे': {'माथा': 'NA'},
             'का': {'को': 'NA', 'की': 'NA', 'कि': 'NA', 'के': 'NA'},
             'नथुनी': {'नथुना': 'NA'},
      

In [57]:
# Load the hindi-urdu / marathi gold lexicon; the following cells look at how many
# targets it lists per source entry.
gold_lexicon_mar = obj_eval.read_files("../evaluation_languages_home/eval_data/lexicons/hindi-urdu_source/hindi-urdu_marathi.json")

In [58]:
# Number of target entries listed for each source entry of the Marathi gold lexicon.
target_num = [len(targets) for targets in gold_lexicon_mar.values()]

In [60]:
# Mean number of targets per source entry.
sum(target_num)/len(target_num)

1.2028985507246377

In [61]:
# Largest number of targets listed for a single source entry.
max(target_num)

3