In [1]:
from os.path import join as join_path

from tqdm import tqdm as log_progress

# Datasets

In [2]:
# Directory the evaluation datasets are downloaded into.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because the commented-out shell commands below interpolate it as {dir}.
dir = 'data/eval'

# One-off downloads: uncomment the commands below to fetch the files into {dir}.

# SimLex-965 test sets from rusvectores.org (tagged and untagged variants).
# !wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv -P {dir}
# !wget https://rusvectores.org/static/testsets/ru_simlex965.tsv -P {dir}

# hj / rt / ae / ae2 files from the nlpub/russe-evaluation repository.
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv -P {dir}
# !wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv -P {dir}
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv -P {dir}
# !wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv -P {dir}
# !wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv -P {dir}

# LRWC: download the archive, extract the aggregated TSV to {dir}/lrwc.tsv,
# then delete the archive.
# !wget https://tlk.s3.yandex.net/dataset/LRWC.zip -P {dir}
# !unzip -p {dir}/LRWC.zip LRWC/lrwc-1.1-aggregated.tsv > {dir}/lrwc.tsv
# !rm {dir}/LRWC.zip

In [3]:
from navec.eval.dataset import (
    Dataset,
    load_pairs as load_pairs_,

    SIMLEX_965, HJ, RT, AE, AE2, LRWC,
    CORR, CLF,
    
    noun_tagged,
    get_pos_analyzer,
    pos_tagged
)


def load_pairs(filename, **kwargs):
    """Load the pairs stored in ``data/eval/<filename>`` as a list.

    Args:
        filename: Name of the dataset file inside ``data/eval``.
        **kwargs: Passed through unchanged to ``navec.eval.dataset.load_pairs``
            (this notebook uses ``delimiter`` and ``column``).

    Returns:
        A list holding every item produced by the underlying loader.
    """
    location = join_path('data', 'eval', filename)
    # Materialise the loader's result so it can be reused and concatenated.
    return [*load_pairs_(location, **kwargs)]


# Build one Dataset per evaluation set. Each Dataset receives, in order:
# the dataset name constant, the task-type constant (CORR or CLF; presumably
# correlation vs. classification, confirm in navec.eval.dataset), the raw
# pairs, and a tagged version of the same pairs.

# SimLex-965 ships with its own tagged file, so both variants are read from disk.
simlex965 = Dataset(
    SIMLEX_965, CORR,
    load_pairs('ru_simlex965.tsv', delimiter='\t'),
    load_pairs('ru_simlex965_tagged.tsv', delimiter='\t')
)

# NOTE: `pairs` is rebound for every dataset below; each Dataset call must
# stay directly after the load_pairs call it belongs to.
pairs = load_pairs('hj.csv')
hj = Dataset(
    HJ, CORR,
    # Tagged variant is derived from the raw pairs via noun_tagged.
    pairs, list(noun_tagged(pairs))
)

pairs = load_pairs('rt.csv')
rt = Dataset(
    RT, CLF,
    pairs, list(noun_tagged(pairs))
)

# AE is the only dataset tagged with pos_tagged, which needs an analyzer.
analyzer = get_pos_analyzer()
# The AE pairs are the train file (read with column=3) followed by the test file.
pairs = (
    load_pairs('ae-train.csv', column=3)
    + load_pairs('ae-test.csv')
)
ae = Dataset(
    AE, CLF,
    pairs, list(pos_tagged(pairs, analyzer))
)

pairs = load_pairs('ae2.csv')
ae2 = Dataset(
    AE2, CLF,
    pairs, list(noun_tagged(pairs))
)

# lrwc.tsv is tab-separated and read with column=3.
pairs = load_pairs('lrwc.tsv', delimiter='\t', column=3)
lrwc = Dataset(
    LRWC, CLF,
    pairs, list(noun_tagged(pairs))
)
# All datasets, in the order they are passed to the evaluation and the report.
datasets = [simlex965, hj, rt, ae, ae2, lrwc]

# Models

## Rusvectores

In [4]:
# One-off downloads of the Rusvectores models. Uncomment the `dir` assignment
# and the commands to fetch them. Each archive is unpacked into its own
# subdirectory of {dir} and then deleted; for the first three models the
# extracted model.txt is deleted as well.
# dir = 'data/models/rusvectores'
# !wget http://vectors.nlpl.eu/repository/11/180.zip -O {dir}/ruscorpora_upos_cbow_300_20_2019.zip
# !unzip {dir}/ruscorpora_upos_cbow_300_20_2019.zip -d {dir}/ruscorpora_upos_cbow_300_20_2019
# !rm {dir}/ruscorpora_upos_cbow_300_20_2019.zip {dir}/ruscorpora_upos_cbow_300_20_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/182.zip -O {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip
# !unzip {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip -d {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019
# !rm {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019.zip {dir}/ruwikiruscorpora_upos_skipgram_300_2_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/185.zip -O {dir}/tayga_upos_skipgram_300_2_2019.zip
# !unzip {dir}/tayga_upos_skipgram_300_2_2019.zip -d {dir}/tayga_upos_skipgram_300_2_2019
# !rm {dir}/tayga_upos_skipgram_300_2_2019.zip {dir}/tayga_upos_skipgram_300_2_2019/model.txt

# !wget http://vectors.nlpl.eu/repository/11/187.zip -O {dir}/tayga_none_fasttextcbow_300_10_2019.zip
# !unzip {dir}/tayga_none_fasttextcbow_300_10_2019.zip -d {dir}/tayga_none_fasttextcbow_300_10_2019
# !rm {dir}/tayga_none_fasttextcbow_300_10_2019.zip

# The araneum model is a .tgz, so its target directory is created before extraction.
# !wget https://rusvectores.org/static/models/rusvectores4/fasttext/araneum_none_fasttextcbow_300_5_2018.tgz -O {dir}/araneum_none_fasttextcbow_300_5_2018.tgz
# !mkdir {dir}/araneum_none_fasttextcbow_300_5_2018
# !tar xzvf {dir}/araneum_none_fasttextcbow_300_5_2018.tgz -C {dir}/araneum_none_fasttextcbow_300_5_2018
# !rm {dir}/araneum_none_fasttextcbow_300_5_2018.tgz

In [5]:
from navec.eval.model import (
    RusvectoresScheme,
    RusvectoresFasttextScheme,
)


def get_path(dir, filename='model.bin'):
    """Return the path of a model file below ``data/models/rusvectores``.

    Args:
        dir: Name of the model's subdirectory.
        filename: File name inside that subdirectory; defaults to ``model.bin``.

    Returns:
        The joined path ``data/models/rusvectores/<dir>/<filename>``.
    """
    parts = ('data', 'models', 'rusvectores', dir, filename)
    return join_path(*parts)


# Schemes for the pretrained Rusvectores models. Every scheme receives the
# model name and the path of its model file as built by get_path.

# RusvectoresScheme entries rely on get_path's default file name.
ruscorpora_upos_cbow_300_20_2019 = RusvectoresScheme(
    'ruscorpora_upos_cbow_300_20_2019',
    get_path('ruscorpora_upos_cbow_300_20_2019'),
)
ruwikiruscorpora_upos_skipgram_300_2_2019 = RusvectoresScheme(
    'ruwikiruscorpora_upos_skipgram_300_2_2019',
    get_path('ruwikiruscorpora_upos_skipgram_300_2_2019'),
)
# RusvectoresFasttextScheme entries name their model file explicitly.
tayga_none_fasttextcbow_300_10_2019 = RusvectoresFasttextScheme(
    'tayga_none_fasttextcbow_300_10_2019',
    get_path('tayga_none_fasttextcbow_300_10_2019', filename='model.model'),
)
tayga_upos_skipgram_300_2_2019 = RusvectoresScheme(
    'tayga_upos_skipgram_300_2_2019',
    get_path('tayga_upos_skipgram_300_2_2019'),
)
araneum_none_fasttextcbow_300_5_2018 = RusvectoresFasttextScheme(
    'araneum_none_fasttextcbow_300_5_2018',
    get_path(
        'araneum_none_fasttextcbow_300_5_2018',
        filename='araneum_none_fasttextcbow_300_5_2018.model',
    ),
)

# Order of this list is the order in which the schemes are evaluated.
rusvectores = [
    ruscorpora_upos_cbow_300_20_2019,
    ruwikiruscorpora_upos_skipgram_300_2_2019,
    tayga_upos_skipgram_300_2_2019,
    tayga_none_fasttextcbow_300_10_2019,
    araneum_none_fasttextcbow_300_5_2018,
]

## Navec

In [6]:
from navec.eval.model import NavecScheme

# Path of the Navec model archive (a .tar file under data/models/navec).
path = join_path('data', 'models', 'navec', 'librusec_12B_500k_300d_100q.tar')
# Scheme for the Navec model: display name plus the archive path.
navec = NavecScheme('navec_librusec_12B_500k_300d_100q', path)
# Kept as a list so it can be concatenated with the Rusvectores scheme list.
navecs = [navec]

# Eval

In [7]:
from navec.eval.metrics import eval_schemes

# Rusvectores schemes first, then Navec; this order is also used by the report.
schemes = rusvectores + navecs
# Run eval_schemes over all schemes and datasets and collect every record
# into a list while tqdm reports progress.
records = list(log_progress(eval_schemes(schemes, datasets)))

6it [02:06, 21.17s/it]


In [63]:
# Import report_table in this cell: with the import commented out, nothing in
# the notebook defined the name, so the call below raised NameError after
# Restart Kernel -> Run All.
from navec.eval.report import report_table

# Last expression of the cell, so the resulting table is rendered as cell output.
report_table(records, schemes, datasets)

Unnamed: 0_level_0,"init, s","get, µs","disk, mb","ram, mb",simlex965,hj,rt,ae,ae2,lrwc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ruscorpora_upos_cbow_300_20_2019,12.6,4.6,220.6,236.1,0.359 | 961,0.685 | 378,0.852 | 61428,0.758 | 16213,0.896 | 49895,0.602 | 6234
ruwikiruscorpora_upos_skipgram_300_2_2019,15.9,4.5,290.0,309.4,0.321 | 961,0.723 | 376,0.817 | 66197,0.801 | 17067,0.860 | 54479,0.629 | 6173
tayga_upos_skipgram_300_2_2019,15.5,4.4,290.7,310.9,0.429 | 959,0.749 | 382,0.871 | 65091,0.771 | 17372,0.899 | 54082,0.639 | 6297
tayga_none_fasttextcbow_300_10_2019,3.3,14.3,910.6,909.7,0.370 | 965,0.643 | 398,0.792 | 114066,0.695 | 22907,0.809 | 86772,0.533 | 10596
araneum_none_fasttextcbow_300_5_2018,4.5,11.1,945.3,926.5,0.349 | 965,0.670 | 398,0.804 | 114066,0.717 | 22910,0.796 | 86771,0.578 | 10596
navec_librusec_12B_500k_300d_100q,1.0,64.7,49.5,95.3,0.309 | 958,0.704 | 390,0.842 | 81408,0.932 | 21698,0.923 | 71667,0.604 | 6733
