In [93]:
import requests
from bs4 import BeautifulSoup
import re, os
import gzip
import json

In [122]:
# Index page that lists the pre-trained SECOS / JoBimText model archives.
url = 'http://ltdata1.informatik.uni-hamburg.de/SECOS/models_jobimtext/'

resp = requests.get(url)
# Stop here on an HTTP error; otherwise an error page would be parsed and
# simply yield an empty list of links.
resp.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed (and to avoid the bs4 warning).
soup = BeautifulSoup(resp.text, 'html.parser')


def _is_wikipedia_archive(href):
    """Return True for hrefs of gzip archives whose name contains 'wikipedia_'."""
    # bs4 calls attribute filters with None for tags lacking the attribute.
    return bool(href) and href.endswith('.gz') and "wikipedia_" in href


# Absolute download URLs: the hrefs are joined onto the index URL as-is.
links = [url + anchor.get('href') for anchor in soup.find_all('a', href=_is_wikipedia_archive)]

In [123]:
# Collect the distinct two-character codes that sit between "wikipedia_" and
# the following underscore in each archive link.
languages = {
    re.search(r'(?<=wikipedia_)\w{2}(?=\_)', link).group(0)
    for link in links
}

In [32]:
def download_links(language):
    """Download the two model archives for one language into the working directory.

    The archives are looked up in the notebook-level ``links`` list by the
    substring ``wikipedia_{language}_``.

    Args:
        language: Language code as it appears in the archive names.

    Returns:
        Tuple ``(candidates_name, wordcount_name)`` with the local file names
        (last URL path segment) of the downloaded archives.

    Raises:
        ValueError: If ``links`` does not hold exactly two archives for the language.
        requests.HTTPError: If the server answers with an error status.
    """
    # NOTE(review): the unpacking assumes the candidates archive is listed
    # before the word-count archive in `links` -- confirm against the index page.
    candidates_filename, wordcount_filename = [l for l in links if f"wikipedia_{language}_" in l]

    local_names = []
    for link in (candidates_filename, wordcount_filename):
        local_name = link.split('/')[-1]
        # The context manager releases the streamed connection even on error.
        with requests.get(link, stream=True) as r:
            # Do not save an HTML error page under a .gz name.
            r.raise_for_status()
            with open(local_name, 'wb') as f:
                # decode_content=False keeps the bytes exactly as served, so the
                # file on disk is still the gzip archive.
                for chunk in r.raw.stream(1024, decode_content=False):
                    if chunk:
                        f.write(chunk)
        local_names.append(local_name)

    return local_names[0], local_names[1]

In [95]:
def unzip_file(zip_file, writepath='.'):
    """Decompress a gzip archive of UTF-8 text into ``writepath``.

    Args:
        zip_file: Path of the archive to read. The same string is used to build
            the output name, so it is expected to be a bare file name.
        writepath: Existing directory that receives the decompressed file.

    Returns:
        Name of the decompressed file (without ``writepath``): the archive name
        with its trailing ``.gz`` swapped for ``.txt``, or with ``.txt``
        appended when the name does not end in ``.gz``.

    Raises:
        UnicodeDecodeError: If the decompressed data is not valid UTF-8.
    """
    # Only the extension is rewritten; a "gz" elsewhere in the name is kept.
    if zip_file.endswith('.gz'):
        txt_name = zip_file[:-len('.gz')] + '.txt'
    else:
        txt_name = zip_file + '.txt'

    # Stream line by line instead of holding the whole file in memory.
    # Explicit UTF-8 on both sides keeps the output independent of the locale;
    # newline='' passes line endings through untouched.
    with gzip.open(zip_file, 'rt', encoding='utf-8', newline='') as src, \
            open(writepath + '/' + txt_name, 'w', encoding='utf-8', newline='') as dst:
        for line in src:
            dst.write(line)
    return txt_name

In [104]:
# Display the language codes collected from the archive links; the loops
# further down iterate over this set.
languages

{'da', 'de', 'en', 'es', 'et', 'fa', 'fi', 'hu', 'la', 'lv', 'nl', 'no', 'sv'}

In [121]:
# For every language: download and extract both archives, keep the word counts
# that pass the filters, and list the candidate words that survive them.
for language in languages:
    directory = f"./{language}"
    # exist_ok makes the cell safe to re-run and avoids a separate existence check.
    os.makedirs(directory, exist_ok=True)

    candidates_zipfile, wordcount_zipfile = download_links(language)
    candidates_file = unzip_file(candidates_zipfile, directory)
    wordcount_file = unzip_file(wordcount_zipfile, directory)

    # The archives are decoded as UTF-8 by unzip_file, so read the extracted
    # text as UTF-8 rather than with the platform's default encoding.
    wordcount = {}
    with open(os.path.join(directory, wordcount_file), 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Skip lines that are not "<word>\t<count>" (e.g. blank lines)
                # instead of failing on the unpacking.
                continue
            word, cnt = fields
            count = int(cnt)
            if (
                len(word) >= 5  # At least 5 characters long
                and count >= 3  # At least 3 observations
                and not re.search(r"\b[^\u0000-\u05C0\u2100-\u214F]+\b", word)  # Non european characters
                and not re.search(r"[\d]", word)  # No digits
                and not re.search(r"[^\w]", word)  # No non alphanum chars
            ):
                wordcount[word] = count

    # Candidate lines are tab separated; only the first field is used, and a
    # word is kept once for every line on which it appears.
    with open(os.path.join(directory, candidates_file), 'r', encoding='utf-8') as f:
        compounds = [
            candidate
            for candidate in (line.split('\t')[0].strip() for line in f)
            if candidate in wordcount
        ]

    with open(os.path.join(directory, 'wordcounts.json'), 'w', encoding='utf-8') as f:
        json.dump(wordcount, f)

    with open(os.path.join(directory, 'compounds.txt'), 'w', encoding='utf-8') as f:
        for line in compounds:
            f.write(line + '\n')

In [105]:
import subprocess

In [117]:
# Command-line template for decompound_secos.py; `{language}` is filled in via
# str.format by the code that runs it.
# Every trailing backslash inside the string is a line continuation, so the
# pieces below are joined into a single line (only the final newline remains).
# NOTE(review): the line ending in `{language}\` has no space before the
# continuation, so the formatted text reads `<lang>> <lang>/output.txt`. The
# `>` is only a redirection if something passes this text through a shell --
# confirm how the consumer executes it.
# The meaning of the positional values (50, 0 3 3 5 3 lower 0.01) is defined by
# decompound_secos.py and is not visible in this notebook.
cmd = """\
python decompound_secos.py \
{language}/wikipedia_{language}_tokenized_trigram__FreqSigLMI__PruneContext_s_0.0_w_2_f_2_wf_0_wpfmax_1000_wpfmin_2_p_1000__AggrPerFt__SimCount_sc_one_ac_False__SimSortlimit_200_minsim_0_candidates.txt \
{language}/wikipedia_{language}_tokenized_trigram__WordCount.txt \
50 \
{language}/compounds.txt \
0 3 3 5 3 lower 0.01 {language}\
> {language}/output.txt
"""

In [124]:
# Run decompound_secos.py once per language inside the "secos" conda environment.
for language in languages:
    print(language)
    # The template ends with "> <file>". Popen receives an argument list and no
    # shell, so that text would be handed to the program as literal arguments.
    # Split it off here and perform the redirection in Python instead.
    command_text, redirect_op, redirect_target = cmd.format(language=language).partition(">")
    argv = ["conda", "run", "-n", "secos"] + command_text.split()
    redirect_target = redirect_target.strip()

    if redirect_target:
        # Binary mode: the child's stdout bytes are stored unchanged.
        with open(redirect_target, "wb") as sink:
            process = subprocess.Popen(argv, stdout=sink)
            # Both values are None here because nothing is piped back.
            output, error = process.communicate()
    else:
        # No redirection in the template: capture stdout as before.
        process = subprocess.Popen(argv, stdout=subprocess.PIPE)
        output, error = process.communicate()

    # Report failures instead of letting later cells trip over a missing or
    # incomplete output file; the remaining languages are still processed.
    if process.returncode != 0:
        print(f"WARNING: command for {language!r} exited with code {process.returncode}")

fa


read knowledge
extract single words
decompound



la


read knowledge
extract single words
decompound



nl


read knowledge
extract single words
decompound



no


read knowledge
extract single words
decompound



en


read knowledge
extract single words
decompound



fi


read knowledge
extract single words
decompound



sv


read knowledge
extract single words
decompound



et


read knowledge
extract single words
decompound



lv


read knowledge
extract single words
decompound



hu


read knowledge
extract single words
decompound



da


read knowledge
extract single words
decompound



es


read knowledge
extract single words
decompound



de


read knowledge
extract single words
decompound



In [125]:
# Language code -> English language name stored in each generated model.
langmap = {
    "da": "Danish",
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "et": "Estonian",
    "fa": "Farsi",
    "fi": "Finnish",  # was misspelled "Finish"
    "hu": "Hungarian",
    "la": "Latin",
    "lv": "Latvian",
    "nl": "Dutch",
    "no": "Norwegian",
    "sv": "Swedish"
}

In [146]:
# Assemble one JSON model per language from the filtered word counts, the
# generated dictionary and the decompounder output.
for language in languages:

    with open(f'./{language}/wordcounts.json', 'r') as f:
        raw_wordcounts = json.load(f)
    # Several spellings can collapse onto one lowercase key ("Haus"/"haus").
    # Add their counts together rather than letting the last one win, so the
    # per-word frequencies and the total stay consistent.
    wordcounts = {}
    for raw_word, raw_count in raw_wordcounts.items():
        lowered = raw_word.lower()
        wordcounts[lowered] = wordcounts.get(lowered, 0) + raw_count

    with open(f'./{language}/generated_dictionary.txt', 'r') as f:
        # Drop empty entries such as the one produced by a trailing newline.
        generated_dictionary = [entry for entry in f.read().lower().split("\n") if entry]

    with open(f'./{language}/output.txt', 'r') as f:
        data = f.read().lower().split("\n")

    # Each line is tab separated: the second field holds the parts joined by
    # '-', the last field is used as the lookup key.
    precomputed_splits = {}
    for item in data:
        fields = item.split("\t")
        if len(fields) < 2:
            # Blank or malformed line (e.g. after the final newline): no split to record.
            continue
        compounds = fields[1].split('-')
        key = fields[-1]
        precomputed_splits[key] = compounds

    model = dict(
        language=langmap.get(language),  # None when the code has no entry in langmap
        precomputed_splits=precomputed_splits,
        generated_dictionary=generated_dictionary,
        word_frequencies=wordcounts,
        total_wordcount=sum(wordcounts.values()),
        n_words=len(wordcounts)
    )

    with open(f'./{language}/{language}.json', 'w') as f:
        json.dump(model, f)