In [54]:
from secos import Decomposition
import pandas as pd
import urllib.request
import tarfile
import subprocess
import os
import warnings
import numpy as np

warnings.filterwarnings("ignore", message="Mean of empty slice", category=RuntimeWarning)
warnings.filterwarnings("ignore", message="invalid value encountered in scalar divide", category=RuntimeWarning)

In [11]:
# Fetch the Wiktionary compound evaluation archive. The transfer is skipped
# when the archive is already on disk, so re-running the notebook does not
# download it again.
url = 'http://ltdata1.informatik.uni-hamburg.de/SECOS/datasets/wiktionary_compounds.tar.gz'
archive_path = "evaluation.tar.gz"

if not os.path.exists(archive_path):
    # Download under a temporary name and rename only once the transfer has
    # finished, so an interrupted download never leaves a truncated file that
    # a later run would mistake for the complete archive.
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, archive_path)

In [26]:
# Unpack the downloaded evaluation archive into ./wiktionary.
os.makedirs("wiktionary", exist_ok=True)

with tarfile.open("evaluation.tar.gz", "r:gz") as tar:
    # The archive was fetched over plain HTTP, so treat it as untrusted:
    # a member named with "../" or an absolute path must not be able to write
    # outside the target directory.
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: let the standard "data" filter
        # reject unsafe members.
        tar.extractall("wiktionary", filter="data")
    else:
        # Older Python without extraction filters: check every member's
        # resolved destination before extracting anything.
        dest_root = os.path.realpath("wiktionary")
        for member in tar.getmembers():
            member_path = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, member_path]) != dest_root:
                raise RuntimeError(
                    f"Refusing to extract {member.name!r}: path escapes 'wiktionary'"
                )
        tar.extractall("wiktionary")

In [57]:
# Language code -> English language name, in the order the languages are
# evaluated. The loop further down uses the code as the model identifier and
# the name as the prefix of the gold-standard file in ./wiktionary.
langmap = dict([
    ("da", "Danish"),
    ("de", "German"),
    ("en", "English"),
    ("es", "Spanish"),
    ("et", "Estonian"),
    ("fi", "Finnish"),
    ("hu", "Hungarian"),
    ("la", "Latin"),
    ("lv", "Latvian"),
    ("nl", "Dutch"),
    ("no", "Norwegian"),
    ("sv", "Swedish"),
])

In [59]:
# Print the definitions of the metrics reported in the evaluation output
# (Precision, Recall and F1), one source line per printed line.
metric_definition_lines = [
    "Precision and Recall are defined as usual: Precision is obtained by dividing",
    "the number of correct splits by the number of splits taken by the method, recall is",
    "the number of correct splits divided by the total number of splits in the evaluation",
    "set. We further report results in the F1 measure, which is the harmonic mean of",
    "Precision and Recall.",
]
print("\n".join(metric_definition_lines))

Precision and Recall are defined as usual: Precision is obtained by dividing
the number of correct splits by the number of splits taken by the method, recall is
the number of correct splits divided by the total number of splits in the evaluation
set. We further report results in the F1 measure, which is the harmonic mean of
Precision and Recall.


In [58]:
# Evaluate the decompounder on every language in `langmap`.
#
# For each (code, name) pair the loop:
#   1. downloads and loads the model for that language code,
#   2. reads the gold standard from ./wiktionary/<name>_downloaded
#      (tab-separated; column 1 is the word, column 2 the gold split),
#   3. decomposes every word and joins the parts with "-",
#   4. writes "word<TAB>gold<TAB>prediction" lines to ./output/<code>.txt,
#   5. scores that file with eval_decompounding.py via `conda run -n secos`
#      and prints whatever the evaluation emits.

# Loop-invariant: the output directory only has to be created once.
os.makedirs("./output", exist_ok=True)

for key, value in langmap.items():
    # NOTE(review): overwrite=True presumably forces a fresh model download on
    # every run (the recorded output shows a download per language) - confirm
    # whether caching the model would be acceptable here.
    Decomposition.download_model(key, overwrite=True)
    model = Decomposition.load_model(key)
    secos = Decomposition(model)

    words = []
    gold_standard = []
    # Explicit UTF-8 so non-ASCII words do not depend on the platform's default
    # locale encoding. Assumes the dataset files are UTF-8 - TODO confirm.
    with open(f"./wiktionary/{value}_downloaded", "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at the end of the file)
            # would otherwise break the two-column unpacking below.
            if not line.strip():
                continue
            w, s = line.lower().split("\t")[0:2]
            words.append(w.strip())
            gold_standard.append(s.strip())

    predictions = ['-'.join(secos.decompose(word)) for word in words]

    with open(f"./output/{key}.txt", "w", encoding="utf-8") as f:
        for word, gold, prediction in zip(words, gold_standard, predictions):
            f.write(f"{word}\t{gold}\t{prediction}\n")

    print(key, value)
    print("-"*50)

    # Same argument vector the original whitespace-split command string produced.
    # NOTE(review): the "|" token is handed to `conda run` as a plain argument;
    # the pipeline appears to rely on `conda run` executing the command through
    # a shell - confirm against the installed conda version.
    eval_argv = [
        "conda", "run", "-n", "secos",
        "cat", f"output/{key}.txt", "|",
        "python", "eval_decompounding.py", "1", "2",
    ]
    # Capture stdout and stderr together and print them here, so the scores
    # always appear between the two separator lines instead of arriving
    # asynchronously through an inherited stream (or being discarded).
    result = subprocess.run(
        eval_argv,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        errors="replace",
    )
    if result.stdout:
        print(result.stdout, end="" if result.stdout.endswith("\n") else "\n")
    if result.returncode != 0:
        # Report the failure but keep going, so one broken language does not
        # abort the evaluation of the remaining ones.
        print(f"Evaluation command for {key} exited with status {result.returncode}")

    print("-"*50)

100%|██████████| 16.0M/16.0M [00:00<00:00, 17.9MB/s]


Download completed!
da Danish
--------------------------------------------------


Precision	Recall	F1
0.705882	0.687747	0.696697
    0.7059 &     0.6877&    0.6967
Considered	Correct	Percentage of Correct ones
493.000000	186.000000	0.377282



--------------------------------------------------


100%|██████████| 166M/166M [00:09<00:00, 17.8MB/s] 


Download completed!
de German
--------------------------------------------------


Precision	Recall	F1
0.801512	0.758380	0.779350
    0.8015 &     0.7584&    0.7793
Considered	Correct	Percentage of Correct ones
5688.000000	3108.000000	0.546414



--------------------------------------------------


100%|██████████| 32.8M/32.8M [00:01<00:00, 29.8MB/s]


Download completed!
en English
--------------------------------------------------


Precision	Recall	F1
0.750104	0.732869	0.741386
    0.7501 &     0.7329&    0.7414
Considered	Correct	Percentage of Correct ones
10865.000000	5323.000000	0.489922



--------------------------------------------------


100%|██████████| 37.7M/37.7M [00:01<00:00, 26.6MB/s]


Download completed!
es Spanish
--------------------------------------------------


Precision	Recall	F1
0.688776	0.671642	0.680101
    0.6888 &     0.6716&    0.6801
Considered	Correct	Percentage of Correct ones
98.000000	37.000000	0.377551



--------------------------------------------------


100%|██████████| 18.6M/18.6M [00:01<00:00, 14.0MB/s]


Download completed!
et Estonian
--------------------------------------------------


Precision	Recall	F1
0.653689	0.644444	0.649034
    0.6537 &     0.6444&    0.6490
Considered	Correct	Percentage of Correct ones
244.000000	75.000000	0.307377



--------------------------------------------------


100%|██████████| 42.7M/42.7M [00:01<00:00, 29.3MB/s]


Download completed!
fi Finnish
--------------------------------------------------


Precision	Recall	F1
0.793124	0.739154	0.765189
    0.7931 &     0.7392&    0.7652
Considered	Correct	Percentage of Correct ones
11052.000000	5686.000000	0.514477



--------------------------------------------------


100%|██████████| 52.1M/52.1M [00:03<00:00, 16.9MB/s]


Download completed!
hu Hungarian
--------------------------------------------------


Precision	Recall	F1
0.728800	0.719021	0.723878
    0.7288 &     0.7190&    0.7239
Considered	Correct	Percentage of Correct ones
625.000000	283.000000	0.452800



--------------------------------------------------


100%|██████████| 5.94M/5.94M [00:00<00:00, 15.1MB/s]


Download completed!
la Latin
--------------------------------------------------


Precision	Recall	F1
0.566327	0.563452	0.564885
    0.5663 &     0.5635&    0.5649
Considered	Correct	Percentage of Correct ones
98.000000	13.000000	0.132653



--------------------------------------------------


100%|██████████| 9.36M/9.36M [00:00<00:00, 19.2MB/s]


Download completed!
lv Latvian
--------------------------------------------------


Precision	Recall	F1
0.622024	0.609329	0.615611
    0.6220 &     0.6093&    0.6156
Considered	Correct	Percentage of Correct ones
168.000000	36.000000	0.214286



--------------------------------------------------


100%|██████████| 34.6M/34.6M [00:01<00:00, 26.6MB/s]


Download completed!
nl Dutch
--------------------------------------------------


Precision	Recall	F1
0.788689	0.762115	0.775174
    0.7887 &     0.7621&    0.7752
Considered	Correct	Percentage of Correct ones
4058.000000	2208.000000	0.544110



--------------------------------------------------


100%|██████████| 24.0M/24.0M [00:01<00:00, 18.6MB/s]


Download completed!
no Norwegian
--------------------------------------------------


Precision	Recall	F1
0.621951	0.566667	0.593023
    0.6220 &     0.5667&    0.5930
Considered	Correct	Percentage of Correct ones
41.000000	7.000000	0.170732



--------------------------------------------------


100%|██████████| 32.6M/32.6M [00:01<00:00, 19.7MB/s]


Download completed!
sv Swedish
--------------------------------------------------
--------------------------------------------------


Precision	Recall	F1
0.753323	0.726621	0.739731
    0.7533 &     0.7266&    0.7397
Considered	Correct	Percentage of Correct ones
1279.000000	610.000000	0.476935

