In [1]:
import re
import string
from gensim import utils
from gensim.parsing.preprocessing import strip_punctuation, strip_short, strip_numeric, strip_multiple_whitespaces, remove_stopwords
import urllib.request
import zipfile
import lzma
import shutil

In [2]:
import logging
# Reconfigure the root logger at INFO level; force=True replaces any handlers
# that were already installed, so re-running this cell takes effect.
logging.basicConfig(force=True, level=logging.INFO)
logger = logging.getLogger()
logger.info("Logging initialized")

INFO:root:Logging initialized


In [3]:
# Link found here: https://metatext.io/datasets/cc100-belarusian
# Download the xz-compressed corpus into the working directory.
urllib.request.urlretrieve(
    url='https://data.statmt.org/cc-100/be.txt.xz',
    filename='be.txt.xz',
)

# Download the GrammarDB release archive. This call is the last expression of
# the cell, so its (filename, headers) result is what the notebook displays.
urllib.request.urlretrieve(
    url='https://github.com/Belarus/GrammarDB/archive/refs/tags/PUBLICATION_2021.zip',
    filename='GrammarDB.zip',
)

('GrammarDB.zip', <http.client.HTTPMessage at 0x174864150>)

In [4]:
# Decompress the corpus by streaming it from the .xz archive into be.txt,
# so the whole file never has to be held in memory at once.
with lzma.open("be.txt.xz", "rb") as compressed_corpus, open("be.txt", "wb") as corpus_file:
    shutil.copyfileobj(compressed_corpus, corpus_file)

# Unpack the GrammarDB archive into the current working directory.
with zipfile.ZipFile('GrammarDB.zip', mode='r') as grammar_archive:
    grammar_archive.extractall(path='.')

In [5]:
import xml.dom.minidom
from itertools import islice

def calculate_mapping_from_forms_to_base(filepath, tag_prefixes=()):
    """Build a dictionary that maps every word form in a GrammarDB XML file to its lemma.

    The file is parsed with ``xml.dom.minidom``. For each ``Paradigm`` element
    (optionally filtered by its ``tag`` attribute) every ``Variant`` contributes
    its ``Form`` texts as keys and its ``lemma`` attribute as the value. Both
    are lower-cased and have ``+`` characters removed.

    When the same form is reachable from several variants, the variant with the
    larger citation count wins. The citation count of a variant is the largest
    number of comma-separated entries found in the ``slouniki`` attribute of its
    forms. If the counts are equal and the lemmas differ, the first mapping is
    kept and the clash is counted as a collision.

    Args:
        filepath: Path to the GrammarDB XML file.
        tag_prefixes: Iterable of prefixes; only paradigms whose ``tag`` starts
            with one of them are used. Empty (the default) or ``None`` means
            "use every paradigm".

    Returns:
        dict mapping lower-cased word form -> lower-cased lemma.

    Side effects:
        Logs the number of collisions and up to five example lemmas through the
        module-level ``logger``. Lemmas listed in the module-level
        ``BASE_FORM_BLACKLIST`` are skipped.
    """
    xml_doc = xml.dom.minidom.parse(filepath)
    # str.startswith accepts a tuple, which replaces the per-prefix any() scan.
    prefixes = tuple(tag_prefixes or ())

    result = {}  # form -> (lemma, citation_count) while building
    collision_count = 0
    collisions = set()

    for paradigm in xml_doc.getElementsByTagName('Paradigm'):
        if prefixes and not paradigm.getAttribute('tag').startswith(prefixes):
            continue

        for variant in paradigm.getElementsByTagName('Variant'):
            base = variant.getAttribute('lemma').replace("+", "").lower()
            if base in BASE_FORM_BLACKLIST:
                continue

            forms = variant.getElementsByTagName('Form')
            if not forms:
                # A variant without forms contributes nothing; skipping it also
                # avoids calling max() on an empty sequence.
                continue

            citation_count = max(form.getAttribute('slouniki').count(',') for form in forms) + 1

            # Unique forms of this variant, in document order. Forms without
            # any child node (no text) are ignored.
            words = dict.fromkeys(
                form.childNodes[0].data.replace("+", "").lower()
                for form in forms
                if form.childNodes
            )

            entry = (base, citation_count)
            for word in words:
                known = result.get(word)
                if known is None:
                    result[word] = entry
                elif known[1] == citation_count and known[0] != base:
                    # Same citation weight but a different lemma: keep the
                    # first mapping and only record the ambiguity.
                    collision_count += 1
                    collisions.add(base)
                    collisions.add(known[0])
                elif known[1] < citation_count:
                    result[word] = entry

    logger.info(f"Collisions (forms leading to different base word, and having same amount of citation): {collision_count}")
    logger.info(f"Examples of collisions: {list(islice(collisions, 5))}")
    # Drop the citation counts: callers only need form -> lemma.
    return {word: entry[0] for word, entry in result.items()}

In [6]:
# Lemmas that must never be used as a mapping target.
BASE_FORM_BLACKLIST = [
    'як' # can mean 'bull', but mostly used as particle
] + (
    # alphabet letters: the contiguous Cyrillic range 'а'..'я' (32 letters) ...
    [chr(code) for code in range(ord('а'), ord('я') + 1)]
    # ... plus the Belarusian letters that lie outside that range
    + ['ё', 'і', 'ў']
)

# Word forms that are removed from the text before the dictionary lookup.
DERIVED_FORM_BLACKLIST = [
    'але', # can mean geographic place 'Ала', but mostly used as particle 'але'
    'калі', # weird form of 'калій' - 'каль', but used as particle 'калі'
    'вось', # can mean 'axis', but mostly used as particle
    'нам', # can mean short form of 'намеснік', but mostly used as pronoun 'мы'
    'наша', # some weird noun 'наша', but mostly used as pronoun 'мы'
    'нашы', # can be used as noun, but mostly used as pronoun 'мы'
    'яму' # can be used as rare noun 'ям', but mostly used as pronoun 'ён'
]

In [7]:
# Build one form -> lemma dictionary per GrammarDB part-of-speech file.

# verbs
v = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/V.xml')

# proper nouns (only paradigms whose tag starts with 'NPII')
nprop = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/NP.xml', tag_prefixes=['NPII'])

# nouns
n1 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/N1.xml')
n2 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/N2.xml')
n3 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/N3.xml')

# adjectives (only paradigms whose tag starts with 'ARP' or 'AQP')
adj1 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/A1.xml', tag_prefixes=['ARP', 'AQP'])
adj2 = calculate_mapping_from_forms_to_base('GrammarDB-PUBLICATION_2021/A2.xml', tag_prefixes=['ARP', 'AQP'])

# Merge everything into a single lookup table. Order matters: when the same
# form occurs in several dictionaries, the one merged last wins.
WORD_MAP = {}
for pos_mapping in (v, nprop, n1, n2, n3, adj1, adj2):
    WORD_MAP.update(pos_mapping)

INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 2597
INFO:root:Examples of collisions: ['абвадніць', 'загануць', 'перапрызначыцца', 'абучыцца', 'прывучыць']
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 33
INFO:root:Examples of collisions: ['герцэгавіна', 'палестына', 'палесціна', 'полацак', 'днепр']
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 1345
INFO:root:Examples of collisions: ['гумоз', 'залогадавец', 'валок', 'забел', 'гурыец']
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 1155
INFO:root:Examples of collisions: ['наркот', 'паўднёвец', 'пыха', 'ляха', 'махор']
INFO:root:Collisions (forms leading to different base word, and having same amount of citation): 954
INFO:root:Examples of collisions: ['ранне', 'учын', 'страха', 'рубка', 'фарманта']
INFO:root:Collisions (forms leading to d

In [8]:
# Sanity check: size of the merged dictionary and the lemmas of two sample forms.
print(len(WORD_MAP))
for sample_form in ('рухам', 'беларусі'):
    print(WORD_MAP[sample_form])

2262675
рух
беларусь


In [9]:
# Manual additions to WORD_MAP for tokens the dictionary does not cover.
# NOTE(review): the keys look like abbreviations and alternative-orthography
# spellings seen in the corpus -- confirm against removed-words.txt.

# Tokens mapped straight to a hand-written lemma.
_MANUAL_LEMMAS = {
    'расея': 'расія',
    'ссср': 'ссср',
    'бсср': 'бсср',
    'бнр': 'бнр',
    'вкл': 'вкл',
    'смі': 'смі',
    'шоў': 'шоў',
    'тыс': 'тысяча',
    'млн': 'мільён',
    'вул': 'вуліца',
    'вобл': 'вобласць',
    'тэл': 'тэлефон',
}

# Tokens that reuse the lemma of a form already present in WORD_MAP
# (token -> known form). A missing known form raises KeyError below.
_MANUAL_ALIASES = {
    'людзмі': 'людзьмі',
    'расеі': 'расіі',
    'расею': 'расію',
    'расеяй': 'расіяй',
    'км': 'кіламетр',
    'навінаў': 'навін',
    'тысячаў': 'тысяч',
    'прэзыдэнта': 'прэзідэнта',
    'прэзыдэнт': 'прэзідэнт',
    'камэнтары': 'каментары',
    'сыстэму': 'сістэму',
    'сытуацыі': 'сітуацыі',
    'сытуацыя': 'сітуацыя',
    'цэнтар': 'цэнтр',
    'вільня': 'вільнюс',
    'вільню': 'вільнюс',
    'сьмерці': 'смерці',
    'грамадзтва': 'грамадства',
    'эўропы': 'еўропы',
    'сябраў': 'сяброў',
    'апазыцыі': 'апазіцыі',
    'міністар': 'міністр',
    'спэцыяльныя': 'спецыяльныя',
    'мэню': 'меню',
    'інтэрвію': "інтэрв'ю",
    'газэты': 'газеты',
    'дакумэнты': 'дакументы',
    'сытуацыю': 'сітуацыю',
    'разьдзел': 'раздзел',
    'сьмерць': 'смерць',
    'грамадзкі': 'грамадскі',
    'калёніі': 'калоніі',
    'газэта': 'газета',
}

# All lookups are resolved before WORD_MAP is modified.
MANUAL_WORD_MAP = dict(_MANUAL_LEMMAS)
MANUAL_WORD_MAP.update(
    {token: WORD_MAP[known_form] for token, known_form in _MANUAL_ALIASES.items()}
)
WORD_MAP.update(MANUAL_WORD_MAP)

In [10]:
def strip_trailing_newline(iterable):
    """Lazily yield each item with its trailing whitespace removed.

    ``rstrip()`` is called without arguments, so besides the newline any other
    trailing whitespace is removed as well.
    """
    yield from (line.rstrip() for line in iterable)

# Adapted from gensim.parsing.preprocessing.strip_punctuation.
# This definition replaces the function of the same name imported from gensim:
# the apostrophe is excluded from the punctuation set so that words such as
# п'еса or кар'ера stay in one piece.
_ESCAPED_PUNCTUATION = re.escape(string.punctuation.replace("'", ""))
RE_PUNCTUATION = re.compile(r'([%s])+' % _ESCAPED_PUNCTUATION, re.UNICODE)


def strip_punctuation(s):
    """Replace every run of ASCII punctuation (apostrophe excluded) with one space."""
    return RE_PUNCTUATION.sub(" ", utils.to_unicode(s))

# Substring replacements applied to every lower-cased line, in this order.
# Order matters: an earlier replacement can create text that a later one
# matches (e.g. 'ý' -> 'ў' runs before ' ў' -> ' у').
CHARACTERS_MAP = {
    # character-level fixes
    '’': "'",
    'ý': 'ў',
    ' ў': ' у',
    'i': 'і',  # presumably Latin "i" -> Cyrillic "і" -- TODO confirm
    # drop the soft sign inside these consonant clusters
    'ньн': 'нн',
    'цьц': 'цц',
    'сьц': 'сц',
    'сьл': 'сл',
    'дзьдз': 'ддз',
    'сьв': 'св',
    'зьв': 'зв',
    'сьп': 'сп',
    'сьс': 'сс',
    'сьн': 'сн',
    'зьм': 'зм',
    'зьн': 'зн',
    'зьл': 'зл',
}


def _apply_characters_map(text):
    """Lower-case ``text`` and apply every CHARACTERS_MAP replacement in order."""
    text = text.lower()
    for old, new in CHARACTERS_MAP.items():
        text = text.replace(old, new)
    return text


def lower_and_replace_characters(iterable):
    """Lazily yield each string lower-cased and normalised through CHARACTERS_MAP."""
    yield from map(_apply_characters_map, iterable)

def split_sentences(iterable):
    """Lazily split every string into sentence-like chunks on dots.

    A run of consecutive dots (e.g. an ellipsis) counts as a single separator.
    Chunks are yielded as they are, so leading or trailing dots produce empty
    strings and the chunks keep their surrounding whitespace.
    """
    for line in iterable:
        # Raw string: "\." in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        yield from re.split(r"\.+", line)

def process_and_filter_word(raw_words):
    """Translate tokens to their lemmas via WORD_MAP and collect the unknown ones.

    Leading and trailing apostrophes are stripped from every token first.

    Returns:
        A tuple ``(valid_words, removed_words)``: the lemmas of the tokens found
        in WORD_MAP, and the (stripped) tokens that were not found.
    """
    lemmas, unknown = [], []
    for token in raw_words:
        token = token.strip("'")
        try:
            lemmas.append(WORD_MAP[token])
        except KeyError:
            unknown.append(token)
    return (lemmas, unknown)

# Tokens dropped because they are missing from WORD_MAP. preprocess_sentences
# appends to this list; the consumer is expected to read and clear it.
global_removed_words = []


def preprocess_sentences(iterable):
    """Lazily clean each sentence and replace its words with their lemmas.

    Steps, in order: strip punctuation (the local apostrophe-preserving
    version), drop short tokens (gensim's default minimum length), drop digits,
    collapse whitespace, delete typographic symbols that are not ASCII
    punctuation, remove the DERIVED_FORM_BLACKLIST words, then map the remaining
    tokens through WORD_MAP.

    Tokens that are not in WORD_MAP are appended to ``global_removed_words``.
    Yields the lemmas joined by single spaces (possibly an empty string).
    """
    for sentence in iterable:
        text = strip_punctuation(sentence)
        text = strip_short(text)
        text = strip_numeric(text)
        text = strip_multiple_whitespaces(text)
        text = re.sub("[«»“”„…—°′²]", "", text)
        text = remove_stopwords(text, stopwords=DERIVED_FORM_BLACKLIST)

        lemmas, unknown = process_and_filter_word(text.split())
        global_removed_words.extend(unknown)
        yield ' '.join(lemmas)

def remove_short_lines(iterable, min_length=20):
    """Lazily yield only the strings that are long enough to be useful.

    Args:
        iterable: Strings to filter.
        min_length: Minimum number of characters a string must have to be kept.
            Defaults to 20.

    A string made up entirely of whitespace is dropped regardless of its length.
    """
    for line in iterable:
        if len(line) >= min_length and not line.isspace():
            yield line


In [11]:
# Run the whole corpus through the cleaning pipeline. Every kept sentence goes
# to processed-corpus.txt; the words dropped since the previous kept sentence
# go to the matching line of removed-words.txt.
#
# The encoding is given explicitly: without it open() uses the platform's
# locale encoding, which is not UTF-8 everywhere and would break on this
# Cyrillic corpus.
with open('be.txt', 'r', encoding='utf-8') as original_file, \
        open('processed-corpus.txt', 'w', encoding='utf-8') as sentences_file, \
        open('removed-words.txt', 'w', encoding='utf-8') as removed_words_file:
    # Start clean so leftovers of an earlier (e.g. interrupted) run of this
    # cell do not leak into the first output line.
    global_removed_words.clear()

    lines = strip_trailing_newline(original_file)
    normalized = lower_and_replace_characters(lines)
    sentences = split_sentences(normalized)
    lemmatized = preprocess_sentences(sentences)

    for s in remove_short_lines(lemmatized):
        sentences_file.write(s + "\n")
        removed_words_file.write(' '.join(global_removed_words) + "\n")
        global_removed_words.clear()

    # Sentences filtered out after the last kept one may still have contributed
    # removed words; write them too (as one extra final line) instead of
    # silently losing them.
    if global_removed_words:
        removed_words_file.write(' '.join(global_removed_words) + "\n")
        global_removed_words.clear()