In [1]:
import morfessor
import fasttext
import numpy as np
from scipy.spatial import distance
from scipy import stats
import json
import io
import random

In [2]:
#Load Morfessor model

morfessor = morfessor.MorfessorIO()
mr_segment = morfessor.read_binary_model_file('../data/konkani_data/kon.morfessor')



In [3]:
# Load vectors

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray(list(map(float, tokens[1:])))
    return data






In [None]:
# mr_embeddings_backup = fasttext.load_model("nep_embeddings/nep.embeddings.300.bin")
# mr_embeddings = load_vectors("nep_embeddings/nep.embeddings.crosslingual.300.vec")
# mr_embeddings = fasttext.load_model("nep_embeddings/nep.embeddings.unsegmented.300.bin")
# mr_embeddings = fasttext.load_model("nep_embeddings/cc.ne.300.bin")
# mr_embeddings = fasttext.load_model("nep_embeddings/nep.embeddings.300.bin")

# mr_embeddings = load_vectors("mr_embeddings/mr.cc.crosslingual.300.vec")
# mr_embeddings = fasttext.load_model("mr_embeddings/hi_mr.embeddings.300.bin")

# mr_embeddings = fasttext.load_model("mr_embeddings/mr.embeddings.100.bin")
# hin_embeddings = fasttext.load_model("hin_embeddings/hin.embeddings.300.bin")
# hin_embeddings = fasttext.load_model("hin_embeddings/cc.hi.300.bin")
# hin_embeddings = load_vectors("hin_embeddings/wiki.hi.vec")

In [46]:
# Segment morph using morfessor
def get_segments(word):
    return mr_segment.viterbi_segment(word)[0]

In [49]:
# Get embedding for list of lrl morphs
def get_l_embedding(word_l, backup=False):
#     try:
#         return mr_embeddings[word_l], backup
#     except KeyError:
    morph_list_l = get_segments(word_l)
#     print(morph_list_l)
    embedding_list = list()
    for m_l in morph_list_l: 
        try:
            embedding_list.append(mr_embeddings[m_l])
        except KeyError:
            embedding_list.append(mr_embeddings_backup[m_l])
            backup = True
#             break
    if backup:
        embedding_list = [mr_embeddings_backup[m_l] for m_l in morph_list_l]
    word_embedding = np.asarray(embedding_list)
    return np.sum(word_embedding, axis=0), backup

In [41]:
# Get model judgment for one word pair
def get_model_cosim(word_pair):
    (word1, word2) = word_pair
#     print(word1)
    e1, backup = get_l_embedding(word1)
#     print(word2)
    e2, backup = get_l_embedding(word2, backup)
    cos_sim = 1 - distance.cosine(e1,e2)
#     cos_sim = 1 - distance.cosine(get_h_embedding(word1), get_h_embedding(word2))
    return cos_sim
        
    

In [34]:
def get_model_judgment(question, option_set):
    all_cosims = list()
    for opt in option_set:
        all_cosims.append(get_model_cosim((question, opt)))
        
    return option_set[np.argmax(all_cosims)]

In [35]:
# Get m j for all pairs
def get_model_judgments(questions, options):
#     print(len(questions))
#     print(len(options))
    judgments = [get_model_judgment(question, option_set) for question, option_set in list(zip(questions, options))]
    return judgments

In [36]:

def get_wbst_dataset(path):
    with open(path, "r") as f:
        wbst = json.load(f)
        
    questions = list()
    answers = list()
    options = list()
    for q in wbst:
        questions.append(q)
        answers.append(wbst[q]["answer"])
        opt = wbst[q]["detractors"] + [wbst[q]["answer"]]
        random.shuffle(opt)
        options.append(opt)
    return questions, answers, options

In [37]:
# Get embeddings 
lang_code = "kon"
type_exp = "iter"
is_first_run = False
#unseg
if type_exp == "unseg":
    path = "../embeddings/kon_embeddings/kon.embeddings.unsegmented.300.bin"

#seg
if type_exp == "seg":
    path = "../embeddings/kon_embeddings/kon.embeddings.300.bin"

#iter
if type_exp == "iter":
    path = "../embeddings/kon_embeddings/kon.embeddings.crosslingual.300.vec"
    backup_path =  "../embeddings/kon_embeddings/kon.embeddings.unsegmented.300.bin"

#pret
if type_exp == "pret":
    path = '../embeddings/kon_embeddings/cc.gom.300.bin'


if path.endswith(".vec"):
    mr_embeddings = load_vectors(path)
    mr_embeddings_backup = fasttext.load_model(backup_path)

else:
    mr_embeddings = fasttext.load_model(path)






In [50]:
# Evaluate lrl embeddings

if is_first_run:
    test_results = dict()
    
parameters = [(10, 6), (10, 5), (20, 6), (20, 5)]
for pidx, (MIN_FREQ, N) in enumerate(parameters):
    print(pidx)
    
    questions, answers, options = get_wbst_dataset("../eval_data/{}_wbst/{}.wbst-{}-{}.json".format(lang_code, lang_code, MIN_FREQ, N))
    
    
    model_judgments = get_model_judgments(questions, options)
    accuracy = 0
    for idx, ans in enumerate(answers):
        if model_judgments[idx]==ans:
            accuracy+=1
    print(accuracy*100/len(answers))
        
    if pidx not in test_results:
        test_results[pidx] = dict()
        test_results[pidx]["parameters"] = (MIN_FREQ, N)
        test_results[pidx]["size"] = len(questions)
    test_results[pidx][type_exp] = accuracy*100/len(answers)

0
20.155038759689923
1
21.705426356589147
2
32.608695652173914
3
39.130434782608695


In [30]:
test_results

{0: {'parameters': (10, 6),
  'size': 129,
  'unseg': 25.58139534883721,
  'seg': 20.930232558139537,
  'iter': 20.155038759689923,
  'pret': 51.93798449612403},
 1: {'parameters': (10, 5),
  'size': 129,
  'unseg': 31.007751937984494,
  'seg': 27.131782945736433,
  'iter': 21.705426356589147,
  'pret': 55.03875968992248},
 2: {'parameters': (20, 6),
  'size': 46,
  'unseg': 26.08695652173913,
  'seg': 26.08695652173913,
  'iter': 32.608695652173914,
  'pret': 60.869565217391305},
 3: {'parameters': (20, 5),
  'size': 46,
  'unseg': 36.95652173913044,
  'seg': 36.95652173913044,
  'iter': 39.130434782608695,
  'pret': 69.56521739130434}}

In [53]:
import pandas as pd
df = pd.DataFrame(test_results).transpose()
cols = ["parameters", "size", "unseg", "seg", "iter", "pret"]
df[cols]

Unnamed: 0,parameters,size,unseg,seg,iter,pret
0,"(10, 6)",129,25.581395,20.930233,20.155039,51.937984
1,"(10, 5)",129,31.007752,27.131783,21.705426,55.03876
2,"(20, 6)",46,26.086957,26.086957,32.608696,60.869565
3,"(20, 5)",46,36.956522,36.956522,39.130435,69.565217


In [23]:
# Crosslingual with 300 backup
# MIN = 10, N = 4
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/nep_wbst/nep.wbst-10-5.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1414
1414
69.16548797736917


In [145]:
# Crosslingual 2M with 300 backup
# MIN = 50, N = 4 

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
60.777683854606934


In [153]:
# 300
# MIN = 10, N = 4

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
58.918005071851226


In [157]:
# Pretrained
# MIN = 10, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
84.69991546914623


In [176]:
# Crosslingual with 300 backup
# MIN = 10, N = 5 ans freq > 10
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

435
435
59.08045977011494


In [179]:
# 300
# MIN = 10, N = 5 ans freq > 10

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

435
435
62.98850574712644


In [185]:
# Crosslingual with 300 backup
# MIN = 10, N = 5 
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
58.66441251056636


In [181]:
# 300
# MIN = 10, N = 5

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
54.7759932375317


In [188]:
# Pretrained
# MIN = 10, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
84.86897717666949


In [192]:
# Crosslingual with 300 backup
# MIN = 50, N = 5 
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
63.13993174061434


In [196]:
# 300
# MIN = 50, N = 5

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
63.822525597269625


In [189]:
# Pretrained
# MIN = 50, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
82.5938566552901


In [203]:
# Crosslingual with 300 backup
# MIN = 50, N = 4
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
67.23549488054607


In [197]:
# 300
# MIN = 50, N = 4

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
63.13993174061434


In [199]:
# Pretrained
# MIN = 50, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
81.22866894197952


In [204]:
# Crosslingual with 300 backup
# MIN = 20, N = 4
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
64.47368421052632


In [207]:
# 300
# MIN = 20, N = 4

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
59.941520467836256


In [209]:
# Pretrained
# MIN = 20, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
87.57309941520468


In [215]:
# Crosslingual with 300 backup
# MIN = 20, N = 5
# always segment , take all morphs

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
59.941520467836256


In [212]:
# 300
# MIN = 20, N = 5

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
53.654970760233915


In [210]:
# Pretrained
# MIN = 20, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
84.50292397660819


In [230]:
# Unsegmented
# MIN = 20, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
48.9766081871345


In [224]:
# Unsegmented
# MIN = 20, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
57.89473684210526


In [228]:
# Unsegmented
# MIN = 10, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
51.90194420963652


In [226]:
# Unsegmented
# MIN = 10, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
51.22569737954353


In [227]:
# Unsegmented
# MIN = 50, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
58.02047781569966


In [233]:
# Crosslingual double data
# MIN = 20, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
58.187134502923975


In [234]:
# Crosslingual double data
# MIN = 20, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

684
684
64.32748538011695


In [237]:
# Crosslingual double data
# MIN = 10, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
61.53846153846154


In [241]:
# Crosslingual double data
# MIN = 10, N = 5
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
56.29754860524091


In [236]:
# Crosslingual double data
# MIN = 50, N = 4
# Direct embedding

questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

293
293
68.9419795221843


In [243]:
# Crosslingual double data
# MIN = 50, N = 4
# Direct embedding
import copy
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
mr_embeddings = copy.copy(iter1)
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
55.790363482671175


In [245]:
# Crosslingual double data
# MIN = 50, N = 4
# Direct embedding
import copy
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
mr_embeddings = copy.copy(unseg)
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

1183
1183
47.08368554522401


In [129]:
#SELF Hindi cc 300
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)


435
435


In [72]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

47.781569965870304


In [75]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

46.58703071672355


In [78]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

46.58703071672355


In [85]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

46.58703071672355


In [110]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

52.493660185967876


In [100]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

55.452240067624686


In [103]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

70.07607776838546


In [118]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

54.69146238377007


In [123]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

62.52873563218391


In [127]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

57.241379310344826


In [130]:
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))

82.75862068965517


In [131]:
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))


811
811
83.23057953144266


In [133]:
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))



811
811
56.473489519112206


In [135]:
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))




811
811
56.59679408138101


In [136]:
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))





494
494
59.716599190283404


In [139]:
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))






494
494
61.336032388663966


In [140]:
questions, answers, options = get_wbst_dataset("evaluation/mr.wbst.json")
model_judgments = get_model_judgments(questions, options)
accuracy = 0
for idx, ans in enumerate(answers):
    if model_judgments[idx]==ans:
        accuracy+=1
print(accuracy*100/len(answers))






94
94
62.765957446808514
