In [34]:
# !pip3 install --upgrade pyiwn

In [1]:
from pyiwn import iwn
import random

In [2]:
#Load wordnet of lang
iwnobj = iwn.IndoWordNet(iwn.Language.KONKANI)

2022-04-13:11:23:41,677 INFO     [iwn.py:43] Loading konkani language synsets...
2022-04-13:11:23:42,105 INFO     [utils.py:157] NumExpr defaulting to 8 threads.


In [3]:
all_verbs = set(iwnobj.all_words(iwn.PosTag.VERB))
all_nouns = set(iwnobj.all_words(iwn.PosTag.NOUN))

In [4]:
list(all_nouns)[:10]

['परपंरावाद',
 'खोलपी',
 'चैतन्य_म्हाप्रभू',
 'पंचायत',
 'संपाती',
 'महापरिमिर्वाण',
 'फ्लावर_फुलावर',
 'दमघोष',
 'निरोगीपण',
 'इंद्रदमन']

In [5]:
len(all_nouns)

26556

In [42]:
# Getting lang freqlist
corpus_words = list([pair.split()[1] for pair in open("data/konkani_data/kon.tok.freqlist.txt", "r").read().split("\n") if len(pair.split())==2])
freqs = list([pair.split()[0] for pair in open("nepali_data/nep.tok.freqlist.txt", "r").read().split("\n") if len(pair.split())==2])

In [43]:
frequency = {pair.split()[1]:int(pair.split()[0]) for pair in open("nepali_data/nep.tok.freqlist.txt", "r").read().split("\n") if (len(pair.split())==2)}

In [47]:


def get_detractors(related_words, n=10):
    
    detractors = list()
    while len(detractors)!=n:
        detractor = random.choice(freq_words)
        if detractor not in related_words and len(detractor)>2 and len(detractor.split())==1:
            detractors.append(detractor)
    
    return detractors
    
    


def get_qa(word, n=10):
    try:
        synsets = iwnobj.synsets(word)
    except KeyError:
        return None, None
    if len(synsets) == 0:
        return None, None
#     print(synsets)
    all_lemmas = [l for syn in synsets for l in syn.lemma_names() if (l!=word and len(l.split(" "))==1 and len(l)>2 and l in frequency and frequency[l]>MIN_FREQ)]
#     print(all_lemmas)
    if not all_lemmas:
        return None, None
#     print(all_lemmas)
    x = random.randint(0,len(all_lemmas)-1)
    answer = all_lemmas[x]
#     print(answer)
    detractors = get_detractors(all_lemmas+[word], n)
    return (answer, detractors)

In [48]:
# Getting query words 

parameters = [(10, 6), (10, 5), (20, 6), (20, 5), (50, 5)]

for MIN_FREQ, N in parameters:

    query_set = list()
    nouns=0
    verbs=0

    all_words = iwnobj.all_words()
    freq_words = list(filter(lambda x: x in frequency and frequency[x]>MIN_FREQ, all_words))


    for idx, word in enumerate(corpus_words):
    #     if len(query_set)==1000:
    #         break

        if int(freqs[idx])<MIN_FREQ:
            break
        if len(word)<=2:
            continue

        if word not in all_nouns and word not in all_verbs:
            continue

        answer, detractors = get_qa(word, n=N)
        if answer is not None:
            query_set.append((word, answer, detractors))
        if word in all_nouns:
            nouns+=1
        else:
            if word in all_verbs:
                verbs+=1
                
    qa_set = {word:{"answer": answer, "detractors": detractors} for word, answer, detractors in query_set}
    print(query_set[:5])
    
    with open("evaluation/nep_wbst/nep.wbst-{}-{}.json".format(MIN_FREQ, N), "w") as f:
        json.dump(qa_set, f, ensure_ascii=False, indent=2)




[('दिन', 'दिवस', ['नाटक', 'चार', 'सतह', 'समर्थ', 'सूचना', 'विराजमान']), ('हुने', 'दुर्घटना', ['टापु', 'प्रवेश', 'जनमत', 'पखेटा', 'प्रकार', 'खाद्यान्न']), ('गरी', 'गुदी', ['टाइप', 'नियोजन', 'पाउ', 'विपरीत', 'स्थान', 'कोङ्कणी']), ('पछि', 'ढिलो', ['गुणा', 'वर्ग', 'संगीतकार', 'विशेषज्ञ', 'ढाँचा', 'ज्योतिष']), ('प्रयोग', 'उपयोग', ['सीमा', 'पूरा', 'हल्ला', 'निशुल्क', 'बहादुर', 'चुच्चो'])]
[('दिन', 'दिवस', ['कर्तव्य', 'जानु', 'गायिका', 'श्रम', 'सहयोगी']), ('हुने', 'दुर्घटना', ['सुचारु', 'साधना', 'पराजित', 'पवित्र', 'वंशावली']), ('गरी', 'गुदी', ['भोटो', 'मणि', 'दृष्टिकोण', 'सार्वजनिक', 'रेकर्ड']), ('पछि', 'पछाडि', ['जीन', 'फ्रान्स', 'अर्चना', 'हस्तक्षेप', 'अतिथि']), ('प्रयोग', 'व्यवहार', ['महँगो', 'समाजवादी', 'सङ्केत', 'बिन्दु', 'सक्नु'])]
[('दिन', 'दिवस', ['श्रोत', 'धार्मिक', 'फारसी', 'ठुलो', 'सम्पूर्ण', 'प्रवृत्ति']), ('हुने', 'दुर्घटना', ['केन्द्र', 'जमिन', 'महाकाव्य', 'स्थायी', 'तीर्थ', 'मनोरञ्जन']), ('पछि', 'ढिलो', ['रमाइलो', 'राजमार्ग', 'उदाहरण', 'नगर', 'रुचि', 'साँचो']), ('प्रयोग', 'खर्

In [17]:
len(qa_set)

1414

[('धेरै', 'असङ्ख्य', ['विवेक', 'दूरसञ्चार', 'देवी', 'सम्भव', 'इन्जिनियर']),
 ('प्राप्त', 'उपलब्ध', ['अभिन्न', 'राष्ट्रीय', 'न्यानो', 'बाकी', 'राजकुमारी']),
 ('कारण', 'मतलब', ['सादा', 'केन्द्र', 'कटहर', 'पुलिस', 'सम्झनु']),
 ('नाम', 'प्रसिद्धि', ['सबैमा', 'क्षेत्र', 'सखरखण्ड', 'पल्ट', 'ढङ्गले']),
 ('जुन', 'चन्द्रमा', ['प्रतिनिधि', 'पुरस्कार', 'अनुबन्धित', 'युद्ध', 'कमल'])]

In [19]:
# Writing dataset
import json
    