In [1]:
import morfessor
import fasttext
import numpy as np
from scipy.spatial import distance
from scipy import stats
import json
import io

In [2]:
# Load the trained Morfessor segmentation model for Marathi.
# The I/O helper gets its own name: rebinding `morfessor` would shadow the
# imported module, so re-running this cell would fail because the helper
# instance has no `MorfessorIO` attribute.
morfessor_io = morfessor.MorfessorIO()
mr_segment = morfessor_io.read_binary_model_file('marathi_data/mr.morfessor')



In [50]:
# Get Hindi embeddings

def load_vectors(fname):
    """Load word vectors from a text file into a dict.

    The first line must contain two integers (conventionally the number of
    vectors and their dimension); it is parsed only to validate the header.
    Every following non-blank line is ``<token> <float> <float> ...`` with
    single-space separators.

    Args:
        fname: Path of the vector file, decoded as UTF-8 with undecodable
            bytes ignored.

    Returns:
        dict mapping each token to a 1-D ``numpy`` float array.

    Raises:
        ValueError: if the header is not exactly two integers or a component
            cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the handle is closed, even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens == ['']:
                # Blank line (e.g. at end of file): would otherwise create a '' key.
                continue
            data[tokens[0]] = np.asarray([float(value) for value in tokens[1:]])
    return data


# Alternative Hindi embedding sources are kept commented out; only one
# assignment to hin_embeddings is active at a time.
# hin_embeddings = fasttext.load_model("hin_embeddings/hin.embeddings.300.bin")
# Active choice: the cc.hi.300 fastText binary model.
hin_embeddings = fasttext.load_model("hin_embeddings/cc.hi.300.bin")
# NOTE: this alternative would bind a plain dict (from load_vectors) instead of a fastText model.
# hin_embeddings = load_vectors("hin_embeddings/wiki.hi.vec")



In [66]:
# Get Marathi embeddings for testing direct approaches
# NOTE: mr_embeddings is also assigned in a later cell (fastText cc.mr.300.bin),
# so whichever of the two cells ran last decides which embeddings are evaluated.
# Here it is a dict built by load_vectors; the other options are kept commented out.

# mr_embeddings = load_vectors("mr_embeddings/mr.crosslingual.100.vec")
mr_embeddings = load_vectors("mr_embeddings/mr.crosslingual3.300.vec")
# mr_embeddings = load_vectors("mr_embeddings/mr.crosslingual.300.vec")
# mr_embeddings = load_vectors("mr_embeddings/mr.crosslingual2.300.vec")
# mr_embeddings_backup = fasttext.load_model("mr_embeddings/hi_mr.embeddings.300.bin")
# mr_embeddings_backup = fasttext.load_model("hin_embeddings/hin.embeddings.300.bin")

In [57]:
# Bind mr_embeddings to the cc.mr.300 fastText binary model.
# NOTE: this rebinds the same global that another cell fills with a dict from
# load_vectors; the evaluation cells use whichever assignment ran last.
mr_embeddings = fasttext.load_model('mr_embeddings/cc.mr.300.bin')
# mr_embeddings = load_vectors("mr_embeddings/mr.cc.crosslingual.300.vec")
# mr_embeddings = fasttext.load_model("mr_embeddings/hi_mr.embeddings.300.bin")
# mr_embeddings = fasttext.load_model("mr_embeddings/mr.embeddings.300.bin")
# mr_embeddings = fasttext.load_model("mr_embeddings/mr.embeddings.unsegmented.300.bin")



In [40]:
# Backup embeddings: get_l_embedding consults these when a morph is missing
# from mr_embeddings (or when its backup flag is set).
mr_embeddings_backup = fasttext.load_model("mr_embeddings/mr.embeddings.300.bin")



In [7]:
# Segment a Marathi word with the loaded Morfessor model
def get_segments(word):
    """Return the segmentation of ``word`` produced by ``mr_segment``.

    Only element 0 of the value returned by ``mr_segment.viterbi_segment``
    is kept; callers iterate over it as the word's morphs.
    """
    segmentation_result = mr_segment.viterbi_segment(word)
    morphs = segmentation_result[0]
    return morphs

In [30]:
# Identity mapping: the morph is used as its own Hindi counterpart
def get_h_map_self(morph_l):
    """Return ``morph_l`` unchanged."""
    morph_h = morph_l
    return morph_h


In [106]:
# Map morphs to Hindi morphs : baseline ned

# JSON object keys are always strings; convert them back to the integer
# positions that get_h_map_ned looks up. The files are opened explicitly as
# UTF-8 so decoding does not depend on the platform default, and the `with`
# block closes each file by itself (no explicit close needed).
with open("morph_mappings/baseline_ned.json", "r", encoding="utf-8") as f:
    morph_mapping_str = json.load(f)
morph_mapping = {int(k): v for k, v in morph_mapping_str.items()}

# Disabled: tag-based variant of the mapping. get_h_map_ned_tag needs
# morph_mapping_tag, so it cannot run until this block is re-enabled.
# with open("morph_mappings/baseline_ned_tag.json", "r") as f:
#     morph_mapping_str = json.load(f)
#     morph_mapping_tag = {int(k):v for k,v in morph_mapping_str.items()}
#     f.close()

with open("tag_distributions/tag_dist_l.json", "r", encoding="utf-8") as l_file:
    tag_dist_l = json.load(l_file)

with open("tag_distributions/tag_dist_h.json", "r", encoding="utf-8") as h_file:
    tag_dist_h = json.load(h_file)

# Morph vocabularies in file key order; the mapping refers to morphs by their
# position in these lists.
morph_voc_l = list(tag_dist_l.keys())
morph_voc_h = list(tag_dist_h.keys())

def get_h_map_ned(morph_l):
    """Map a morph to its counterpart in ``morph_voc_h`` via ``morph_mapping``.

    The position of ``morph_l`` in ``morph_voc_l`` is looked up in
    ``morph_mapping``; the resulting integer indexes ``morph_voc_h``.

    Returns:
        The mapped morph, or ``morph_l`` itself (after printing "error") when
        the morph is not in ``morph_voc_l``, its position has no entry in
        ``morph_mapping``, or the mapped index is out of range.
    """
    try:
        return morph_voc_h[morph_mapping[morph_voc_l.index(morph_l)]]
    except (ValueError, KeyError, IndexError):
        # ValueError: unknown morph; KeyError: no mapping entry for its
        # position; IndexError: mapping points outside morph_voc_h.
        print("error")
        return morph_l

def get_h_map_ned_tag(morph_l):
    """Map a morph to its counterpart in ``morph_voc_h`` via ``morph_mapping_tag``.

    Works like ``get_h_map_ned`` but reads the ``morph_mapping_tag`` table.
    NOTE: in the visible notebook ``morph_mapping_tag`` is only assigned in
    commented-out code, so calling this raises ``NameError`` unless that
    table has been defined first.

    Returns:
        The mapped morph, or ``morph_l`` itself (after printing "error") when
        the morph is not in ``morph_voc_l``, its position has no entry in
        ``morph_mapping_tag``, or the mapped index is out of range.
    """
    try:
        return morph_voc_h[morph_mapping_tag[morph_voc_l.index(morph_l)]]
    except (ValueError, KeyError, IndexError):
        # ValueError: unknown morph; KeyError: no mapping entry for its
        # position; IndexError: mapping points outside morph_voc_h.
        print("error")
        return morph_l



In [88]:
# Peek at the first five (position in morph_voc_l, position in morph_voc_h) entries.
list(morph_mapping.items())[:5]

[(0, 4708), (1, 2108), (2, 7600), (3, 524), (4, 27751)]

In [91]:
# Spot-check the mapping for a single morph.
get_h_map_ned("गाव")

'राव'

In [32]:
# Build a word vector from Hindi embeddings of the word's mapped morphs
def get_h_embedding(word_l):
    """Return the sum of the ``hin_embeddings`` vectors of the mapped morphs of ``word_l``.

    The word is segmented with ``get_segments``, each morph is passed through
    ``get_h_map_self``, and the vectors looked up in ``hin_embeddings`` are
    summed along axis 0.
    """
    mapped_morphs = []
    for morph in get_segments(word_l):
        mapped_morphs.append(get_h_map_self(morph))
    morph_vectors = []
    for mapped in mapped_morphs:
        morph_vectors.append(hin_embeddings[mapped])
    stacked = np.asarray(morph_vectors)
    return np.sum(stacked, axis=0)

In [73]:
# Get a Marathi embedding for a word, falling back to its morphs
def get_l_embedding(word_l, backup=False):
    """Return ``(vector, backup)`` for ``word_l``.

    If ``mr_embeddings[word_l]`` succeeds, that vector is returned directly.
    When the lookup raises ``KeyError`` the word is segmented with
    ``get_segments`` and morph vectors are collected in order from
    ``mr_embeddings``. At the first morph missing there, its vector is taken
    from ``mr_embeddings_backup`` and all later morphs are ignored. If
    ``backup`` is true, the collected vectors are discarded and only the
    first morph's vector from ``mr_embeddings_backup`` is used. The collected
    vectors are summed along axis 0.

    The ``backup`` flag is returned exactly as it was passed in.
    """
    try:
        whole_word_vector = mr_embeddings[word_l]
    except KeyError:
        pass
    else:
        return whole_word_vector, backup

    morphs = get_segments(word_l)
    morph_vectors = []
    for morph in morphs:
        try:
            primary_vector = mr_embeddings[morph]
        except KeyError:
            # Missing in the primary embeddings: use the backup vector and stop,
            # leaving the remaining morphs out of the sum.
            morph_vectors.append(mr_embeddings_backup[morph])
            break
        morph_vectors.append(primary_vector)
    if backup:
        # Backup mode: only the first morph, taken from the backup embeddings.
        morph_vectors = [mr_embeddings_backup[first] for first in morphs[:1]]
    return np.sum(np.asarray(morph_vectors), axis=0), backup

In [39]:
# Model similarity score for a single word pair
def get_model_judgment(word_pair):
    """Return the cosine similarity of the two words' embeddings.

    ``word_pair`` is unpacked into exactly two words. Each is embedded with
    ``get_l_embedding``; the flag returned for the first word is passed on to
    the lookup of the second. Returns ``None`` if either embedding is ``None``.
    """
    first_word, second_word = word_pair
    first_vec, backup = get_l_embedding(first_word)
    second_vec, backup = get_l_embedding(second_word, backup)
    if first_vec is None or second_vec is None:
        return None
    # Disabled alternative: compare get_h_embedding(word1) with get_h_embedding(word2).
    # distance.cosine is a distance, so 1 - distance gives the similarity.
    return 1 - distance.cosine(first_vec, second_vec)
        
    

In [34]:
# Model similarity scores for all word pairs
def get_model_judgments(word_pairs):
    """Apply ``get_model_judgment`` to every pair and return the results as a list, in input order."""
    return list(map(get_model_judgment, word_pairs))

In [35]:
# Load and parse a word-similarity dataset
def get_word_sim_dataset(path):
    """Read a tab-separated word-similarity file.

    Every non-blank line must contain at least three tab-separated fields:
    first word, second word, and a numeric similarity score. Blank lines
    (including one produced by a trailing newline) are skipped.

    Args:
        path: Location of the dataset file; it is read as UTF-8.

    Returns:
        ``(word_pairs, judgments)`` where ``word_pairs`` is a list of
        ``(word1, word2)`` tuples and ``judgments`` is the list of scores as
        floats, in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if a score cannot be parsed as a float.
    """
    word_pairs = []
    judgments = []
    # The context manager closes the file; the original left the handle open.
    with open(path, "r", encoding="utf-8") as dataset_file:
        for line in dataset_file.read().split("\n"):
            if not line.strip():
                continue
            fields = line.split("\t")
            word_pairs.append((fields[0], fields[1]))
            judgments.append(float(fields[2]))
    return word_pairs, judgments

In [65]:
# Hindi embeddings on Marathi word sim
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model (cosine similarity of the two embeddings).
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.5489330335743691, pvalue=1.6062365044424024e-09)


In [52]:
# Hindi embeddings on Marathi word sim
# NOTE: same code and label as another cell; the different result comes from the
# kernel state (which embeddings were loaded) at the time this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model (cosine similarity of the two embeddings).
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.3994444165362607, pvalue=2.6625810172676956e-05)


In [74]:
# CROSSLINGUAL 500000 Hindi sentences
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model (cosine similarity of the two embeddings).
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4876080751868102, pvalue=1.5227681833473377e-07)


In [36]:
# SELF fastText 300
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model (cosine similarity of the two embeddings).
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4362318992097995, pvalue=3.674277345653242e-06)


In [42]:
# SELF fastText 100
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model (cosine similarity of the two embeddings).
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4760644906469756, pvalue=3.259849667020419e-07)


In [70]:
# SELF Hindi cc 300
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model (cosine similarity of the two embeddings).
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4272678268388106, pvalue=6.083299566974009e-06)


In [62]:
# BASELINE NED fastText 100
# NOTE: what is evaluated depends on the embeddings and function definitions in the kernel when this cell ran.

# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.39474028542336564, pvalue=3.3742445552604146e-05)


In [65]:
# BASELINE NED cc 300
# NOTE: what is evaluated depends on the embeddings and function definitions in the kernel when this cell ran.

# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

SpearmanrResult(correlation=0.4185873937726948, pvalue=9.779397990726483e-06)


In [108]:
# BASELINE NED Hindi 300
# NOTE: what is evaluated depends on the embeddings and function definitions in the kernel when this cell ran.
# Each "error" line in the output is printed by a get_h_map_ned* fallback (morph not found).

# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

error
error
error
SpearmanrResult(correlation=0.3936806517938399, pvalue=3.5574243600696806e-05)


In [104]:
# BASELINE NED TAG cc 300
# NOTE: what is evaluated depends on the embeddings and function definitions in the kernel when this cell ran.
# Each "error" line in the output is printed by a get_h_map_ned* fallback (morph not found).

# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

error
error
error
SpearmanrResult(correlation=0.36379149108221437, pvalue=0.0001470211585492415)


In [103]:
# BASELINE NED TAG Hindi 300
# NOTE: what is evaluated depends on the embeddings and function definitions in the kernel when this cell ran.
# Each "error" line in the output is printed by a get_h_map_ned* fallback (morph not found).

# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

error
error
error
SpearmanrResult(correlation=0.411683720125784, pvalue=1.4133900483873763e-05)


In [26]:
# SELF hin wiki 300
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran;
# no output was saved for this cell.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)

In [29]:
# CROSSLINGUAL Hindi segmented with 300 fastText (trained from segmented data), Marathi as backup
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)


SpearmanrResult(correlation=0.4921998209147555, pvalue=1.1161682576661716e-07)


In [32]:
# Downloaded 300 fastText Marathi
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)



SpearmanrResult(correlation=0.5223833243012493, pvalue=1.2878400849307844e-08)


In [35]:
# Marathi 300 fastText (trained on segmented data)
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)




SpearmanrResult(correlation=0.45898726381961713, pvalue=9.561449920225694e-07)


In [26]:
# Marathi 100 fastText (trained on segmented data)
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)





SpearmanrResult(correlation=0.39192529911462526, pvalue=3.881478379336467e-05)


In [46]:
# Marathi CROSSLINGUAL 300, using cc 300 Hindi pretrained, 50k Marathi segmented to train - backup 300 fastText Marathi
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Drop pairs the model could not score (None). The human scores are filtered
# first, while model_judgments still holds the None markers, so the two lists stay aligned.
human_judgments = [human_judgments[idx] for idx in range(len(human_judgments)) if model_judgments[idx] is not None]
model_judgments = list(filter(lambda x: x is not None, model_judgments))
# Number of pairs that remain after filtering.
print(len(human_judgments))
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)


104
SpearmanrResult(correlation=0.44282517512684916, pvalue=2.5124182999621907e-06)


In [56]:
# Marathi JOINT 300, using segmented Hindi and Mar
# NOTE(review): this cell loads evaluation/hin.word_sim.txt although the label says
# Marathi — confirm which dataset was intended.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/hin.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)



SpearmanrResult(correlation=0.48868002988329123, pvalue=1.6486611801230756e-15)


In [92]:
# CROSSLINGUAL Hindi segmented with 300 fastText (trained from segmented data), Marathi as backup
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Drop pairs the model could not score (None). The human scores are filtered
# first, while model_judgments still holds the None markers, so the two lists stay aligned.
human_judgments = [human_judgments[idx] for idx in range(len(human_judgments)) if model_judgments[idx] is not None]
model_judgments = list(filter(lambda x: x is not None, model_judgments))
# Number of pairs that remain after filtering.
print(len(human_judgments))

# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)



104
SpearmanrResult(correlation=0.49047116725731515, pvalue=1.2553038614035856e-07)


In [95]:
# CROSSLINGUAL Hindi segmented with 300 fastText (trained from segmented data), Marathi as backup
# NOTE: what is evaluated depends on the embeddings and function definitions in the kernel
# when this cell ran; the saved output is a truncated word/segmentation debug dump.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Drop pairs the model could not score (None). The human scores are filtered
# first, while model_judgments still holds the None markers, so the two lists stay aligned.
human_judgments = [human_judgments[idx] for idx in range(len(human_judgments)) if model_judgments[idx] is not None]
model_judgments = list(filter(lambda x: x is not None, model_judgments))
# Number of pairs that remain after filtering.
print(len(human_judgments))

# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)




कळफलक
['कळ', 'फलक']
टेलिफोन
['टेलि', 'फोन']
दूरदर्शन
['दूर', 'दर्शन']
बटाटा
['बटाट', 'ा']
अंडी
['अ', 'ंडी']
कोबी
['को', 'बी']
यासिर
['यास', 'िर']
शांतता
['शांत', 'ता']
अराफात
['अ', 'राफा', 'त']
दहशतवादी
['दहशतवाद', 'ी']
अराफात
['अ', 'राफा', 'त']
पॉपकॉर्न
['पॉप', 'कॉर्', 'न']
भौतिकशास्त्र
['भौतिक', 'शास्त्र']
प्रोटॉन
['प्रो', 'ट', 'ॉन']
रसायनशास्त्र
['रसायन', 'शास्त्र']
भौतिकशास्त्र
['भौतिक', 'शास्त्र']
रसायनशास्त्र
['रसायन', 'शास्त्र']
रत्नजडित
['रत्न', 'जड', 'ित']
शेगडी
['शे', 'गडी']
कोंबडा
['कोंब', 'डा']
कोंबडा
['कोंब', 'डा']
स्मशानभूमी
['स्मशान', 'भूमी']
काच
['का', 'च']
जादूगार
['जादू', 'गार']
प्राणीसंग्रहालय
['प्राणी', 'संग्रहालय']
मानसशास्त्र
['मानस', 'शास्त्र']
मनोदोषचिकित्सा
['मनो', 'दोष', 'चिकित्सा']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्त्र']
मानसशास्त्र
['मानस', 'शास्

In [146]:
# CROSSLINGUAL Hindi segmented with 300 fastText (trained from segmented data) with 2 million sentences, Marathi as backup
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Disabled: filtering of pairs the model could not score (None).
# human_judgments = [human_judgments[idx] for idx in range(len(human_judgments)) if model_judgments[idx] is not None]
# model_judgments = list(filter(lambda x: x is not None, model_judgments))
# Sanity check: both lists must have the same length.
print(len(human_judgments))
print(len(model_judgments))

# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)





104
104
SpearmanrResult(correlation=0.500965877522097, pvalue=6.090206269207641e-08)


In [21]:
# CROSSLINGUAL Hindi segmented with 300 fastText (trained from segmented data), JOINT as backup
# NOTE: what is evaluated depends on the embeddings bound in the kernel when this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Disabled: filtering of pairs the model could not score (None).
# human_judgments = [human_judgments[idx] for idx in range(len(human_judgments)) if model_judgments[idx] is not None]
# model_judgments = list(filter(lambda x: x is not None, model_judgments))
# Sanity check: both lists must have the same length.
print(len(human_judgments))
print(len(model_judgments))

# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)






104
104
SpearmanrResult(correlation=0.3547685501762518, pvalue=0.00021974351085621348)


In [28]:
# CROSSLINGUAL Hindi segmented with 300 fastText (trained from segmented data), JOINT as backup
# NOTE: same code and label as another cell; the different result comes from the
# kernel state at the time this cell ran.
# Load the (word1, word2) pairs and their human similarity scores.
word_pairs, human_judgments = get_word_sim_dataset("evaluation/mr.word_sim.txt")
# Score every pair with the model.
model_judgments = get_model_judgments(word_pairs)
# Disabled: filtering of pairs the model could not score (None).
# human_judgments = [human_judgments[idx] for idx in range(len(human_judgments)) if model_judgments[idx] is not None]
# model_judgments = list(filter(lambda x: x is not None, model_judgments))
# Sanity check: both lists must have the same length.
print(len(human_judgments))
print(len(model_judgments))

# Spearman rank correlation between model scores and human scores.
spearman_correlation = stats.spearmanr(model_judgments, human_judgments)
print(spearman_correlation)







104
104
SpearmanrResult(correlation=0.24638087391975202, pvalue=0.011694614084061028)


In [38]:
# for p in word_pairs:
#     print(p[0], p[1])
#     print(get_h_map_ned(p[0]), get_h_map_ned(p[1]))
#     print(get_h_map_ned_tag(p[0]), get_h_map_ned_tag(p[1]))
#     print("\n\n\n")

In [132]:
# Check the length of the vector stored for one word in mr_embeddings.
len(mr_embeddings["मांजर"])

300