In [1]:
from os.path import join as join_path

from tqdm import tqdm as log_progress

# Datasets

In [2]:
# One-time download of the evaluation datasets. Every line below is
# intentionally commented out so that re-running the notebook does not hit the
# network; uncomment and run the cell once to fetch the files into `dir`.
# The `!wget ... -P {dir}` lines are IPython shell escapes with `{dir}`
# interpolated from the Python variable assigned on the first line.
# dir = 'data/eval'
# !wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv -P {dir}
# !wget https://rusvectores.org/static/testsets/ru_simlex965.tsv -P {dir}

# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv -P {dir}
# !wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv -P {dir}
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv -P {dir}
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv -P {dir}
# !wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv -P {dir}

In [3]:
from navec.eval.dataset import (
    Dataset,
    load_pairs as load_pairs_,

    SIMLEX_965, HJ, RT, AE, AE2,
    CORR, CLF,
    
    noun_tagged,
    get_pos_analyzer,
    pos_tagged
)


def load_pairs(filename, **kwargs):
    """Read an evaluation file from ``data/eval`` and return its pairs as a list.

    ``filename`` is resolved relative to the ``data/eval`` directory; any extra
    keyword arguments are forwarded untouched to ``navec.eval.dataset.load_pairs``
    (imported here as ``load_pairs_``). The result is materialised into a list
    so it can be reused and concatenated by the caller.
    """
    location = join_path('data', 'eval', filename)
    return [pair for pair in load_pairs_(location, **kwargs)]


# Build one Dataset per evaluation set. Each Dataset is constructed from four
# positional arguments: a dataset constant, CORR or CLF, the loaded pairs, and a
# second, tagged version of the pairs.
# NOTE(review): CORR / CLF presumably select correlation vs. classification
# scoring -- confirm against navec.eval.dataset.

# SimLex-965 has a separate pre-tagged file, so both variants are read from disk
# (tab-delimited).
simlex965 = Dataset(
    SIMLEX_965, CORR,
    load_pairs('ru_simlex965.tsv', delimiter='\t'),
    load_pairs('ru_simlex965_tagged.tsv', delimiter='\t')
)

# For hj, rt and ae2 the tagged variant is derived from the loaded pairs with
# noun_tagged(). Note that `pairs` is rebound for every dataset below.
pairs = load_pairs('hj.csv')
hj = Dataset(
    HJ, CORR,
    pairs, list(noun_tagged(pairs))
)

pairs = load_pairs('rt.csv')
rt = Dataset(
    RT, CLF,
    pairs, list(noun_tagged(pairs))
)

# ae is the concatenation of ae-train.csv (loaded with column=3) and
# ae-test.csv; its tagged variant comes from pos_tagged() with the analyzer
# returned by get_pos_analyzer() instead of noun_tagged().
analyzer = get_pos_analyzer()
pairs = (
    load_pairs('ae-train.csv', column=3)
    + load_pairs('ae-test.csv')
)
ae = Dataset(
    AE, CLF,
    pairs, list(pos_tagged(pairs, analyzer))
)

pairs = load_pairs('ae2.csv')
ae2 = Dataset(
    AE2, CLF,
    pairs, list(noun_tagged(pairs))
)
# All datasets, in the order they are passed to eval_schemes() and
# report_table() in the Eval section.
datasets = [simlex965, hj, rt, ae, ae2]

# Rusvectores

In [4]:
# One-time download of the RusVectores models. Every line below is
# intentionally commented out so that re-running the notebook does not
# re-download the archives; uncomment and run the cell once to populate `dir`.
# Each group fetches an archive, unpacks it into a directory named after the
# model, and deletes the archive afterwards (the first three groups also delete
# the extracted model.txt).
# dir = 'data/models/rusvectores'
# !wget http://vectors.nlpl.eu/repository/11/180.zip -O {dir}/ruscorpora_upos_cbow_300_20_2019.zip
# !unzip {dir}/ruscorpora_upos_cbow_300_20_2019.zip -d {dir}/ruscorpora_upos_cbow_300_20_2019
# !rm {dir}/ruscorpora_upos_cbow_300_20_2019.zip {dir}/ruscorpora_upos_cbow_300_20_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/182.zip -O {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip
# !unzip {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip -d {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019
# !rm {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/185.zip -O {dir}/tayga_upos_skipgram_300_2_2019.zip
# !unzip {dir}/tayga_upos_skipgram_300_2_2019.zip -d {dir}/tayga_upos_skipgram_300_2_2019
# !rm {dir}/tayga_upos_skipgram_300_2_2019.zip {dir}/tayga_upos_skipgram_300_2_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/187.zip -O {dir}/tayga_none_fasttextcbow_300_10_2019.zip
# !unzip {dir}/tayga_none_fasttextcbow_300_10_2019.zip -d {dir}/tayga_none_fasttextcbow_300_10_2019
# !rm {dir}/tayga_none_fasttextcbow_300_10_2019.zip

# !wget https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz -O {dir}/araneum_none_fasttextcbow_300_5_2018.tgz
# !mkdir {dir}/araneum_none_fasttextcbow_300_5_2018
# !tar xzvf {dir}/araneum_none_fasttextcbow_300_5_2018.tgz -C {dir}/araneum_none_fasttextcbow_300_5_2018
# !rm {dir}/araneum_none_fasttextcbow_300_5_2018.tgz

In [5]:
from navec.eval.model import (
    RusvectoresScheme,
    RusvectoresFasttextScheme
)


def get_path(dir, filename='model.bin'):
    """Return the path of a model file under ``data/models/rusvectores``.

    ``dir`` is the model's sub-directory and ``filename`` the file inside it
    (``model.bin`` unless overridden).
    """
    components = ('data', 'models', 'rusvectores', dir, filename)
    return join_path(*components)


# One scheme object per RusVectores model. Each is built from the model name
# and the path to its model file; the variable name, the scheme name and the
# directory passed to get_path() are all the same string.
# Models loaded through RusvectoresScheme use get_path()'s default file name
# (model.bin).
ruscorpora_upos_cbow_300_20_2019 = RusvectoresScheme(
    'ruscorpora_upos_cbow_300_20_2019',
    get_path('ruscorpora_upos_cbow_300_20_2019')
)
ruwikiruscorpora_upos_skipgram_300_2_2019 = RusvectoresScheme(
    'ruwikiruscorpora_upos_skipgram_300_2_2019',
    get_path('ruwikiruscorpora_upos_skipgram_300_2_2019')
)
# The fastText models go through RusvectoresFasttextScheme and need an explicit
# file name, since their model files are not called model.bin.
tayga_none_fasttextcbow_300_10_2019 = RusvectoresFasttextScheme(
    'tayga_none_fasttextcbow_300_10_2019',
    get_path(
        'tayga_none_fasttextcbow_300_10_2019',
        'model.model'
    )
)
tayga_upos_skipgram_300_2_2019 = RusvectoresScheme(
    'tayga_upos_skipgram_300_2_2019',
    get_path('tayga_upos_skipgram_300_2_2019')
)
araneum_none_fasttextcbow_300_5_2018 = RusvectoresFasttextScheme(
    'araneum_none_fasttextcbow_300_5_2018',
    get_path(
        'araneum_none_fasttextcbow_300_5_2018',
        'araneum_none_fasttextcbow_300_5_2018.model'
    )
)
# All schemes to evaluate. The order here differs from the definition order
# above (tayga_upos comes before tayga_none) and matches the row order of the
# report table at the end of the notebook.
rusvectores = [
    ruscorpora_upos_cbow_300_20_2019,
    ruwikiruscorpora_upos_skipgram_300_2_2019,
    tayga_upos_skipgram_300_2_2019,
    tayga_none_fasttextcbow_300_10_2019,
    araneum_none_fasttextcbow_300_5_2018
]

# Eval

In [6]:
from navec.eval.metrics import eval_schemes

# Evaluate every scheme against every dataset. The result of eval_schemes() is
# consumed through the tqdm progress bar and materialised into a list so the
# records can be reused by the report below.
records = list(
    log_progress(
        eval_schemes(rusvectores, datasets)
    )
)

5it [02:05, 25.12s/it]


In [7]:
from navec.eval.report import report_table

# Summarise the evaluation records as one table. As the last expression of the
# cell, the value returned by report_table() is what the notebook displays.
report_table(records, rusvectores, datasets)

Unnamed: 0_level_0,"init, s","get, µs","sim, µs","disk, mb","ram, mb",simlex965,hj,rt,ae,ae2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ruscorpora_upos_cbow_300_20_2019,12.79,5.92,41.63,220.63,236.14,0.354,0.613,0.75,0.63,0.824
ruwikiruscorpora_upos_skipgram_300_2_2019,16.39,6.12,42.57,290.02,309.44,0.314,0.619,0.723,0.619,0.797
tayga_upos_skipgram_300_2_2019,16.29,5.82,40.38,290.74,310.89,0.424,0.683,0.779,0.624,0.852
tayga_none_fasttextcbow_300_10_2019,3.28,9.66,76.37,910.59,909.73,0.37,0.643,0.792,0.695,0.809
araneum_none_fasttextcbow_300_5_2018,5.03,8.46,64.07,945.27,926.52,0.349,0.67,0.804,0.717,0.796
