In [35]:
import zipfile
import os
from collections import defaultdict

from fasttext_vectorizer import FasttextVectorizer
from ruwordnet.ruwordnet_reader import RuWordnet
from predict_models import BaselineModel, SecondOrderModel, SecondOrderModelTransform
from main import save_to_file

In [6]:
# fastText binary model used by every vectorisation step in this notebook.
fasttext_model_path = "models/cc.ru.300.bin"
ft_vec = FasttextVectorizer(fasttext_model_path)

# RuWordNet is read from the database file; no separate ruwordnet path is given.
ruwordnet = RuWordnet(
    db_path="../data/ruwordnet.db",
    ruwordnet_path=None,
)

Model loaded


In [None]:
# Group lower-cased sense texts by synset id, split by the id's final letter:
# ids ending in "N" go to noun_synsets, ids ending in "V" go to verb_synsets,
# anything else is ignored.
noun_synsets = defaultdict(list)
verb_synsets = defaultdict(list)
synsets_by_suffix = {"N": noun_synsets, "V": verb_synsets}

for sense_id, synset_id, text in ruwordnet.get_all_senses():
    bucket = synsets_by_suffix.get(synset_id[-1:])
    if bucket is not None:
        bucket[synset_id].append(text.lower())

# Write one vectors file per group.
for synsets, vectors_path in (
    (noun_synsets, "models/vectors/ruwordnet_nouns_fasttext.txt"),
    (verb_synsets, "models/vectors/ruwordnet_verbs_fasttext.txt"),
):
    ft_vec.vectorize_ruwordnet(synsets, vectors_path)

Model loaded


In [9]:
def process_data(input_file, output_file):
    """Vectorise the phrases listed in the first column of a TSV file.

    Each non-blank line of ``input_file`` contributes its first tab-separated
    field, stripped and lower-cased (the field may be a single word or a
    multi-word combination). The first two phrases are printed as a preview,
    then the full list is passed to the notebook-level ``ft_vec`` via
    ``vectorize_data`` together with ``output_file``.

    Args:
        input_file: Path to the UTF-8 encoded TSV file to read.
        output_file: Path forwarded to ``ft_vec.vectorize_data``.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        stripped_rows = (row.strip() for row in source)
        first_columns = (
            row.split('\t', 1)[0].strip().lower()
            for row in stripped_rows
            if row  # skip blank lines
        )
        phrases = [column for column in first_columns if column]

    print(f'In process data {phrases[:2]}')
    ft_vec.vectorize_data(phrases, output_file, save_first_word=False)

In [26]:
# (input TSV, output vectors file) pairs, processed in this order.
vectorization_jobs = [
    ("../data/public_test/nouns_public.tsv", "models/vectors/fasttext/nouns_public_fasttext.txt"),
    ("../data/public_test/verbs_public.tsv", "models/vectors/fasttext/verbs_public_fasttext.txt"),
    ("../data/private_test/nouns_private.tsv", "models/vectors/fasttext/nouns_private_fasttext.txt"),
    ("../data/private_test/verbs_private.tsv", "models/vectors/fasttext/verbs_private_fasttext.txt"),
]
for test_tsv, vectors_txt in vectorization_jobs:
    process_data(test_tsv, vectors_txt)


In process data ['абдоминопластика', 'абсорбент']
In process data ['абсолютизировать', 'активировать']
In process data ['абсентеизм', 'абсолютизация']
In process data ['адсорбировать', 'акать']


In [27]:
def main(params):
    """Run the model selected in ``params`` over a test file and save the results.

    The first tab-separated field of every non-blank line in
    ``params['test_path']`` is stripped, lower-cased, then upper-cased with
    spaces replaced by underscores (the field may be a single word or a
    multi-word combination). The model class chosen by ``params['model']`` is
    constructed with ``params``; its ``predict_hypernyms`` output is handed to
    ``save_to_file`` along with ``params['output_path']`` and the model's
    ``ruwordnet`` attribute.

    Args:
        params: Mapping that must contain ``'test_path'``, ``'model'`` and
            ``'output_path'``; the whole mapping is also passed to the model
            constructor.

    Raises:
        KeyError: If ``params['model']`` is not one of ``'baseline'``,
            ``'second_order'`` or ``'second_order_transform'``.
    """
    models = {
        "baseline": BaselineModel,
        "second_order": SecondOrderModel,
        'second_order_transform': SecondOrderModelTransform,
    }

    def normalise(row):
        # Keep only the first column, then convert to UPPER_CASE_WITH_UNDERSCORES.
        first_column = row.split('\t', 1)[0].strip().lower()
        return first_column.upper().replace(" ", "_")

    with open(params['test_path'], 'r', encoding='utf-8') as source:
        stripped_rows = (row.strip() for row in source)
        candidates = (normalise(row) for row in stripped_rows if row)
        test_data = [phrase for phrase in candidates if phrase]

    model_cls = models[params["model"]]
    predictor = model_cls(params)
    print("Model loaded")
    results = predictor.predict_hypernyms(list(test_data))
    save_to_file(results, params['output_path'], predictor.ruwordnet)


In [28]:
# Public test set, nouns: run the "second_order" model and write the predictions.
params1 = {
    "ruwordnet_vectors_path": "models/vectors/ruwordnet_nouns_fasttext.txt",
    "data_vectors_path": "models/vectors/fasttext/nouns_public_fasttext.txt",
    "test_path": "../data/public_test/nouns_public.tsv",
    "output_path": "predictions/predicted_public_nouns.tsv",
    "db_path": "../data/ruwordnet.db",
    "ruwordnet_path": None,
    "model": "second_order",
}
main(params1)

Model loaded


In [29]:
# Public test set, verbs: run the "second_order" model and write the predictions.
params2 = {
    "ruwordnet_vectors_path": "models/vectors/ruwordnet_verbs_fasttext.txt",
    "data_vectors_path": "models/vectors/fasttext/verbs_public_fasttext.txt",
    "test_path": "../data/public_test/verbs_public.tsv",
    "output_path": "predictions/predicted_public_verbs.tsv",
    "db_path": "../data/ruwordnet.db",
    "ruwordnet_path": None,
    "model": "second_order",
}
main(params2)

Model loaded


In [30]:
# Private test set, nouns: run the "second_order" model and write the predictions.
params3 = {
    "ruwordnet_vectors_path": "models/vectors/ruwordnet_nouns_fasttext.txt",
    "data_vectors_path": "models/vectors/fasttext/nouns_private_fasttext.txt",
    "test_path": "../data/private_test/nouns_private.tsv",
    "output_path": "predictions/predicted_private_nouns.tsv",
    "db_path": "../data/ruwordnet.db",
    "ruwordnet_path": None,
    "model": "second_order",
}
main(params3)

Model loaded


In [31]:
# Private test set, verbs: run the "second_order" model and write the predictions.
params4 = {
    "ruwordnet_vectors_path": "models/vectors/ruwordnet_verbs_fasttext.txt",
    "data_vectors_path": "models/vectors/fasttext/verbs_private_fasttext.txt",
    "test_path": "../data/private_test/verbs_private.tsv",
    "output_path": "predictions/predicted_private_verbs.tsv",
    "db_path": "../data/ruwordnet.db",
    "ruwordnet_path": None,
    "model": "second_order",
}
main(params4)

Model loaded


In [None]:
# Prediction files to package; each one is compressed into its own
# "<name>.tsv.zip" archive placed next to the original file.
prediction_files = [
    "predictions/predicted_public_nouns.tsv",
    "predictions/predicted_public_verbs.tsv",
    "predictions/predicted_private_nouns.tsv",
    "predictions/predicted_private_verbs.tsv"
]

for tsv_file in prediction_files:
    if not os.path.exists(tsv_file):
        # Report the gap instead of skipping silently, so an incomplete set of
        # archives is noticed before the results are used.
        print(f"Skipping {tsv_file}: file not found")
        continue
    # Append the extension rather than using str.replace, which would rewrite
    # every ".tsv" occurrence in the path, not only the suffix.
    zip_file = tsv_file + '.zip'
    with zipfile.ZipFile(zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Store the file under its base name so the archive has no directory prefix.
        zipf.write(tsv_file, os.path.basename(tsv_file))