In [None]:
# Reload modified modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# Execute main.py and bring the names it defines into this notebook's
# namespace; -n keeps __name__ from being '__main__' during that run.
%run -n main.py

# datasets

In [None]:
# path = join_path(DATA_DIR, DATASET)
# !mkdir -p {path}

In [None]:
# for name in DATASETS:
#     paths = (
#         join_path(CORUS_DATA_DIR, _)
#         for _ in CORUS_FILES[name]
#     )
#     records = (
#         record
#         for path in paths
#         for record in load_dataset(path)
#     )
#     records = log_progress(records, desc=name)
#     records = sample(records, 1000)

#     path = join_path(DATA_DIR, DATASET, name + JL + GZ)
#     items = as_jsons(records)
#     lines = format_jl(items)
#     dump_gz_lines(lines, path)

In [None]:
# Read the stored file of every dataset listed in DATASETS and keep the
# parsed Markup records in memory, keyed by dataset name. The lists are
# read again by the annotation and scoring cells further down.
datasets = {}
for name in DATASETS:
    path = join_path(DATA_DIR, DATASET, name + JL + GZ)
    datasets[name] = list(
        from_jsons(parse_jl(load_gz_lines(path)), Markup)
    )

# models

In [None]:
# for name in MODELS:
#     path = join_path(DATA_DIR, name)
#     !mkdir -p {path}

## cpu

In [None]:
# Client handle that is passed to model.start() / model.stop() in the cells below.
docker = docker_client()

In [None]:
# Model annotated in this run; model_name also names the output directory
# used by the next cell.
model_name = SPACY
model = MODELS[model_name]()  # MODELS[name] is called with no arguments to build the model
model.start(docker)
# presumably blocks until the started model is ready to accept input — TODO confirm
model.wait()

In [None]:
# Run the started model over the words of every dataset record and store
# the results under DATA_DIR/<model_name>/<dataset_name> + JL + GZ.
for dataset_name in DATASETS:
    records = log_progress(
        model.map(markup.words for markup in datasets[dataset_name]),
        desc=dataset_name
    )

    path = join_path(DATA_DIR, model_name, dataset_name + JL + GZ)
    dump_gz_lines(format_jl(as_jsons(records)), path)

In [None]:
# Stop the model that was started with model.start(docker) above.
model.stop(docker)

## gpu

In [None]:
# !vast search offers | grep '1 x  GTX 1080 Ti'

In [None]:
# model = DeeppavlovBERTModel()
# model = SlovnetBERTModel()
# model = StanzaModel()

In [None]:
# !vast create instance 498795 --image {model.image} --disk 30

In [None]:
# !vast show instances

In [None]:
# !ssh ssh4.vast.ai -p 20908 -l root -Nf -L {model.port}:localhost:{model.container_port}

In [None]:
# for dataset_name in DATASETS:
#     records = datasets[dataset_name]
#     records = log_progress(records, desc=dataset_name)
#     records = model.map(_.words for _ in records)

#     path = join_path(DATA_DIR, model.name, dataset_name + JL + GZ)
#     items = as_jsons(records)
#     lines = format_jl(items)
#     dump_gz_lines(lines, path)

In [None]:
# !vast destroy instance 500908

# score

In [None]:
# Load the stored predictions of every model in MODELS for every dataset in
# DATASETS, keyed by the (dataset name, model name) pair.
#
# The loop variables are deliberately not called `dataset` / `model`: at
# notebook top level they are globals, and rebinding `model` to a name
# string would replace the model object created in the "cpu" section, so
# any later or re-run cell calling model.map() / model.stop() would fail.
# `model_name` is avoided too, because the annotation cell reads it to
# choose its output directory.
dataset_models = {}
for dataset_name in DATASETS:
    for model_key in MODELS:
        path = join_path(DATA_DIR, model_key, dataset_name + JL + GZ)
        lines = load_gz_lines(path)
        items = parse_jl(lines)
        dataset_models[dataset_name, model_key] = list(from_jsons(items, Markup))

In [None]:
# Score each model's predictions against the reference markups of the same
# dataset; the result is keyed by the (dataset name, model name) pair.
#
# The unpacking targets are deliberately not called `dataset` / `model`:
# they are notebook globals, and rebinding `model` to a name string would
# replace the model object created in the "cpu" section.
scores = {}
for dataset_name, model_key in log_progress(dataset_models):
    preds = dataset_models[dataset_name, model_key]
    targets = datasets[dataset_name]
    scores[dataset_name, model_key] = score_markups(preds, targets)

# report

In [None]:
# Build the score table, render it to HTML, and hand that HTML to
# patch_readme for both README targets under the SYNTAX1 marker.
scores_table = scores_report_table(scores, DATASETS, MODELS)
html = format_scores_report(scores_table)
for readme in (README, SLOVNET_README):
    patch_readme(SYNTAX1, html, readme)
HTML(html)  # last expression, so the rendered table is the cell output

In [None]:
# Hard-coded benchmark figures, one Bench row per model.
# disk and ram are expressed through the MB / GB constants; the units of
# init and speed are not stated in this notebook — presumably seconds and
# items per second, TODO confirm against format_bench_report.
BENCH = [
    Bench(UDPIPE, init=6.91, disk=45 * MB, ram=242 * MB, speed=56.2),
    Bench(SPACY, init=9, disk=140 * MB, ram=579 * MB, speed=41),
    Bench(
        DEEPPAVLOV_BERT, init=34,
        disk=(706 + 721) * MB,  # BERT + model
        ram=8.5 * GB, speed=75, device=GPU,
    ),
    Bench(SLOVNET_BERT, init=5, disk=504 * MB, ram=3427 * MB, speed=200, device=GPU),
    Bench(SLOVNET, init=1, disk=27 * MB, ram=125 * MB, speed=450),
    Bench(STANZA, init=3, disk=591 * MB, ram=890 * MB, speed=12),
]

# Build the benchmark table, render it to HTML, and hand that HTML to
# patch_readme for both README targets under the SYNTAX2 marker.
bench_table = bench_report_table(BENCH, MODELS)
html = format_bench_report(bench_table)
for readme in (README, SLOVNET_README):
    patch_readme(SYNTAX2, html, readme)
HTML(html)  # last expression, so the rendered table is the cell output