In [29]:
import morfessor
import fasttext
import numpy as np
from scipy.spatial import distance
from scipy import stats
import json

In [37]:
# Load the pre-trained Morfessor segmentation model from marathi_data/.
# NOTE(review): the binary model format is presumably pickle-based — confirm,
# and only load model files from a trusted source.
# NOTE(review): the name `io` is also the name of a standard-library module;
# it would shadow that module if it were ever imported in this notebook.
io = morfessor.MorfessorIO()
mr_segment = io.read_binary_model_file('marathi_data/mr.morfessor')



In [64]:
# Get Hindi embeddings
# Alternative model, left commented out so it can be swapped in by hand
# (presumably what the result cells call "fasttext 100" — confirm):
# hin_embeddings = fasttext.load_model("hin_embeddings/hin.embeddings.100.bin")
# Currently active model (presumably what the result cells call "cc 300" — confirm).
# Every later cell that reads `hin_embeddings` depends on which line was last run.
hin_embeddings = fasttext.load_model("cc.hi.300.bin")



In [41]:
# Split a word into morphs with the loaded Morfessor model
def get_segments(word):
    """Return the segmentation of ``word`` produced by ``mr_segment``.

    Only the first element of the ``viterbi_segment`` result is returned;
    presumably the result is a ``(segments, cost)`` pair — confirm against
    the Morfessor documentation.
    """
    viterbi_result = mr_segment.viterbi_segment(word)
    return viterbi_result[0]

In [68]:
# Identity morph mapping: each morph stands in for itself on the Hindi side
def get_h_map_self(morph_l):
    """Return ``morph_l`` unchanged (the "self" mapping)."""
    same_morph = morph_l
    return same_morph


In [61]:
# Map morphs to Hindi morphs : baseline ned

# The files are opened as UTF-8 explicitly: they may contain raw Devanagari
# text, and the platform default encoding is not UTF-8 everywhere.
with open("morph_mappings/baseline_ned.json", "r", encoding="utf-8") as f:
    morph_mapping_str = json.load(f)

# JSON object keys are always strings; turn them back into integer indices.
# Keys index into morph_voc_l and values index into morph_voc_h (see get_h_map_ned).
morph_mapping = {int(k): v for k, v in morph_mapping_str.items()}

with open("tag_distributions/tag_dist_l.json", "r", encoding="utf-8") as l_file:
    tag_dist_l = json.load(l_file)

with open("tag_distributions/tag_dist_h.json", "r", encoding="utf-8") as h_file:
    tag_dist_h = json.load(h_file)

# The vocabularies are the key lists of the tag distributions. The index of a
# morph is its position in the JSON file's key order, so the mapping above is
# only valid for these exact files.
morph_voc_l = list(tag_dist_l.keys())
morph_voc_h = list(tag_dist_h.keys())

def get_h_map_ned(morph_l):
    """Map a morph to its Hindi counterpart using the baseline NED mapping.

    The morph's position in ``morph_voc_l`` is looked up in ``morph_mapping``
    and the resulting index selects an entry of ``morph_voc_h``.

    Returns ``morph_l`` itself when no mapping is available, i.e. when the
    morph is not in ``morph_voc_l`` (ValueError from ``list.index``) or its
    index has no entry in ``morph_mapping`` (KeyError). An out-of-range target
    index still raises IndexError, since that means the mapping and the Hindi
    vocabulary files do not belong together.
    """
    try:
        # Linear scan of the vocabulary; fine for small evaluation sets.
        idx_l = morph_voc_l.index(morph_l)
        return morph_voc_h[morph_mapping[idx_l]]
    except (ValueError, KeyError):
        return morph_l



In [51]:
# Peek at the first five entries of the index mapping
# (key: index into morph_voc_l, value: index into morph_voc_h).
list(morph_mapping.items())[:5]

[(0, 5550), (1, 2108), (2, 7600), (3, 524), (4, 9136)]

In [52]:
# Spot-check the NED mapping for one morph. A result equal to the input means
# either the morph maps to an identical Hindi morph or it was not found in
# morph_voc_l and the fallback returned it unchanged.
get_h_map_ned("चे")

'चे'

In [69]:
# Get the Hindi-space embedding of a word from the embeddings of its mapped morphs
def get_h_embedding(word_l, h_map=None):
    """Embed a word by summing the Hindi embeddings of its mapped morphs.

    Parameters
    ----------
    word_l : str
        Word to embed; it is segmented with ``get_segments``.
    h_map : callable, optional
        Function taking one morph and returning the morph to look up in
        ``hin_embeddings``. Defaults to ``get_h_map_self`` (identity), which
        is what this function previously hard-coded. Pass e.g.
        ``get_h_map_ned`` to evaluate another mapping without editing this cell.

    Returns
    -------
    The sum over axis 0 of the per-morph vectors ``hin_embeddings[morph]``.
    If segmentation yields no morphs, the result is the scalar ``0.0``
    rather than a vector.
    """
    if h_map is None:
        # Resolved at call time so the default follows the current definition.
        h_map = get_h_map_self
    morph_list_l = get_segments(word_l)
    morph_list_h = [h_map(morph) for morph in morph_list_l]
    word_embedding = np.asarray([hin_embeddings[morph] for morph in morph_list_h])
    return np.sum(word_embedding, axis=0)

In [54]:
# Model similarity score for a single word pair
def get_model_judgment(word_pair):
    """Return the cosine similarity of the embeddings of the two words.

    ``word_pair`` is unpacked into exactly two words; each is embedded with
    ``get_h_embedding`` and the similarity is one minus the cosine distance.
    """
    first_word, second_word = word_pair
    first_vec = get_h_embedding(first_word)
    second_vec = get_h_embedding(second_word)
    return 1 - distance.cosine(first_vec, second_vec)
        
    

In [55]:
# Model similarity scores for every word pair
def get_model_judgments(word_pairs):
    """Return a list with one ``get_model_judgment`` score per pair, in input order."""
    scores = []
    for pair in word_pairs:
        scores.append(get_model_judgment(pair))
    return scores

In [56]:
# Get word sim dataset
# Process word sim dataset
def get_word_sim_dataset(path):
    """Load a word-similarity dataset from a tab-separated text file.

    Each non-blank line holds at least three tab-separated columns:
    first word, second word, similarity score. Extra columns are ignored.
    Blank lines (including the empty string left by a trailing newline)
    are skipped.

    Parameters
    ----------
    path : str
        Path of the dataset file; it is read as UTF-8.

    Returns
    -------
    (word_pairs, judgments)
        ``word_pairs`` is a list of ``(word1, word2)`` tuples and
        ``judgments`` the list of scores as floats, in file order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three columns.
    ValueError
        If a score cannot be parsed as a float.
    """
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as dataset_file:
        lines = dataset_file.read().split("\n")

    word_pairs = []
    judgments = []
    for line in lines:
        if not line.strip():
            continue
        fields = line.split("\t")  # split once, reuse for all three columns
        word_pairs.append((fields[0], fields[1]))
        judgments.append(float(fields[2]))
    return word_pairs, judgments

In [42]:
# Run label: SELF (identity) morph mapping, "fasttext 100" embeddings.
# NOTE(review): the code in this cell is identical to the other evaluation
# cells. The configuration named in the label is whatever get_h_embedding and
# hin_embeddings were bound to when the cell ran, so the printed result cannot
# be reproduced from this cell alone.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
model_judgments = get_model_judgments(word_pairs)
# Rank correlation between the model's similarities and the human ratings
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4760644906469756, pvalue=3.259849667020419e-07)


In [70]:
# Run label: SELF (identity) morph mapping, "cc 300" embeddings.
# NOTE(review): the code in this cell is identical to the other evaluation
# cells. The configuration named in the label is whatever get_h_embedding and
# hin_embeddings were bound to when the cell ran, so the printed result cannot
# be reproduced from this cell alone.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
model_judgments = get_model_judgments(word_pairs)
# Rank correlation between the model's similarities and the human ratings
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4272678268388106, pvalue=6.083299566974009e-06)


In [62]:
# Run label: BASELINE NED morph mapping, "fasttext 100" embeddings.
# NOTE(review): the code in this cell is identical to the other evaluation
# cells. The configuration named in the label is whatever get_h_embedding and
# hin_embeddings were bound to when the cell ran, so the printed result cannot
# be reproduced from this cell alone.

word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
model_judgments = get_model_judgments(word_pairs)
# Rank correlation between the model's similarities and the human ratings
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.39474028542336564, pvalue=3.3742445552604146e-05)


In [65]:
# Run label: BASELINE NED morph mapping, "cc 300" embeddings.
# NOTE(review): the code in this cell is identical to the other evaluation
# cells. The configuration named in the label is whatever get_h_embedding and
# hin_embeddings were bound to when the cell ran, so the printed result cannot
# be reproduced from this cell alone.

word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
model_judgments = get_model_judgments(word_pairs)
# Rank correlation between the model's similarities and the human ratings
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4185873937726948, pvalue=9.779397990726483e-06)


In [None]:
# Look up the embedding of each Hindi morph
# Compose the morph embeddings into a single word vector and return it

In [None]:
# Compute the cosine similarity for each word pair

In [None]:
# Compute the Spearman correlation between model and human judgments