In [1]:
import urllib.request
import zipfile
import re
import string
import json
import math
from collections import defaultdict
from gensim.models import KeyedVectors
import numpy as np
from compute_accuracy import compute_accuracy

In [2]:
# Print NumPy floats in fixed-point notation rather than scientific notation,
# which keeps the vector slices displayed further down easy to compare by eye.
np.set_printoptions(suppress=True)

In [3]:
# Route INFO-level log records (including gensim's loading messages) to the notebook output.
import logging
# force=True removes any handlers already attached to the root logger, so this
# configuration takes effect even if logging was configured earlier in the kernel.
logging.basicConfig(level=logging.INFO, force = True)
# getLogger() with no name returns the root logger.
logger = logging.getLogger()
logger.info("Logging initialized")

INFO:root:Logging initialized


In [4]:
# Fetch the dictionary archive only when no usable copy is already on disk, so that
# "Restart & Run All" does not hit the network every time.
# zipfile.is_zipfile returns False both for a missing file and for a truncated
# download, so an interrupted earlier fetch is retried automatically.
# Delete the local file to force a fresh download.
DSL_ARCHIVE_URL = 'https://belarus-embedding.s3.eu-central-1.amazonaws.com/tsbm.dsl.zip'
DSL_ARCHIVE_PATH = 'tsbm.dsl.zip'
if not zipfile.is_zipfile(DSL_ARCHIVE_PATH):
    urllib.request.urlretrieve(DSL_ARCHIVE_URL, DSL_ARCHIVE_PATH)

('tsbm.dsl.zip', <http.client.HTTPMessage at 0x136388a10>)

In [5]:
# Unpack every member of the downloaded archive into the current working directory.
# ZipFile opens in read mode by default and extractall() targets the cwd by default.
with zipfile.ZipFile('tsbm.dsl.zip') as archive:
    archive.extractall()

In [6]:
# Adapted from gensim.parsing.preprocessing.strip_punctuation.
# The character class deliberately omits the apostrophe so that words such as
# п'еса or кар'ера stay in one piece; it additionally covers the « » quotes.
RE_PUNCTUATION = re.compile(r'([!"#$%&()*+,-./:;<=>?@[\]^_`{|}~«»\\])+', re.UNICODE)


def strip_punctuation(s):
    """Replace every run of punctuation characters in `s` with a single space.

    Apostrophes are not treated as punctuation and are left untouched.
    """
    return re.sub(RE_PUNCTUATION, " ", s)

In [7]:
# Load the JSON lookup table used below to map a token to the form stored as its value.
# The encoding is pinned to UTF-8: without it open() falls back to the platform's
# locale encoding, which would garble or reject Cyrillic text on non-UTF-8 systems.
with open('word_map.txt', 'r', encoding='utf-8') as f:
    word_map = json.load(f)

In [8]:
def _explanation_lemmas(line, headword, lemma_map):
    """Return the mapped words found in one indented body line of a dictionary entry.

    Args:
        line: A body line (already right-stripped) that starts with a space or tab.
        headword: The headword of the entry the line belongs to, or None.
        lemma_map: Mapping from a token to its mapped form.

    Returns:
        A list of mapped forms, in order of appearance, excluding any that equal
        `headword`. Empty when the line is skipped or contains no mapped token.
    """
    # Keep only the text before the [ex] tag, if the line has one.
    example_start = line.find("[ex]")
    if example_start != -1:
        line = line[:example_start]
    # Lines that still carry an [m1] tag contribute nothing.
    # NOTE(review): presumably these are header/grammar lines rather than
    # definitions — confirm against the DSL file.
    if "[m1]" in line:
        return []
    line = line.lower()
    # Remove a few recurring fragments before tokenising
    # (presumably dictionary boilerplate — confirm).
    line = line.replace("знач.", "").replace("зал.", "").replace("дзеянне", "")
    tokens = strip_punctuation(line).split()
    return [
        lemma_map[token]
        for token in tokens
        if token in lemma_map and lemma_map[token] != headword
    ]


def parse_dsl_entries(lines, lemma_map):
    """Parse dictionary text into a {headword: [mapped explanation words]} dict.

    The input is read as blank-line-separated entries: a line that does not start
    with a space or tab sets the current headword, and indented lines are its body.
    An entry is kept only if its headword is a key of `lemma_map` and at least one
    mapped word was collected for it. A later entry with the same headword
    overwrites an earlier one.

    Args:
        lines: Iterable of text lines (for example an open text file).
        lemma_map: Mapping from a token to its mapped form.

    Returns:
        Dict from headword to the list of mapped words found in its body lines.
    """
    entries = {}
    headword = None
    lemmas = []
    for raw_line in lines:
        line = raw_line.rstrip()
        if not line:
            # A blank line closes the current entry.
            if headword in lemma_map and lemmas:
                entries[headword] = lemmas
            headword = None
            lemmas = []
        elif line[0] not in ' \t':
            headword = line
        else:
            lemmas.extend(_explanation_lemmas(line, headword, lemma_map))
    # Flush the final entry: without this, the last headword is silently lost
    # whenever the input does not end with a blank line.
    if headword in lemma_map and lemmas:
        entries[headword] = lemmas
    return entries


# The encoding is pinned to UTF-8 so the Cyrillic text is decoded the same way on
# every platform instead of depending on the locale default.
with open('tsbm.dsl', encoding='utf-8') as f:
    processed_dict = parse_dsl_entries(f, word_map)

In [9]:
# Spot-check one parsed entry: the words collected for the headword 'удаў'.
processed_dict['удаў']

['вялікі', 'драпежны', 'неядавіты', 'змяя', 'вадзіцца', 'тропік']

In [10]:
# Document frequency: for each explanation word, the number of dictionary entries
# whose explanation contains it (each entry counts a word at most once).
words_frequency = defaultdict(int)
for explanation in processed_dict.values():
    for lemma in set(explanation):
        words_frequency[lemma] += 1

# Inverse document frequency, log(N / df), with N the number of parsed entries.
N = len(processed_dict)
idf = {
    lemma: math.log(1.0 * N / entry_count)
    for lemma, entry_count in words_frequency.items()
}

# Print the weights of two sample words as a sanity check.
for sample_word in ('акула', 'чалавек'):
    print(idf[sample_word])

10.304286234236946
4.0090202327973


In [11]:
# Load the pretrained word vectors from a word2vec text-format file (binary=False).
# The file is expected in the working directory; no visible cell downloads it.
wv = KeyedVectors.load_word2vec_format('word2vec-cc100-cbow-d100-w3-min10.vectors', binary=False)

INFO:gensim.models.keyedvectors:loading projection weights from word2vec-cc100-cbow-d100-w3-min10.vectors
INFO:gensim.utils:KeyedVectors lifecycle event {'msg': 'loaded (79373, 100) matrix of type float32 from word2vec-cc100-cbow-d100-w3-min10.vectors', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-06-04T20:16:09.883258', 'gensim': '4.3.1', 'python': '3.11.3 | packaged by conda-forge | (main, Apr  6 2023, 08:58:31) [Clang 14.0.6 ]', 'platform': 'macOS-12.6-arm64-arm-64bit', 'event': 'load_word2vec_format'}


In [12]:
# Baseline: score the unmodified vectors by passing their pairwise similarity
# function to the project-local compute_accuracy helper.
# NOTE(review): the metric itself is defined in compute_accuracy.py, which is not
# shown here — confirm what the returned number measures.
compute_accuracy(lambda w1, w2: wv.similarity(w1, w2), log_failed_cases=False)

0.8333333333333334

In [13]:
# First 10 components of the 'акула' vector before the update loop below, for comparison.
wv["акула"][:10]

array([-0.2922294 ,  0.40758115, -0.34638226, -0.36782512, -0.2914711 ,
       -0.23186941, -0.08000712,  0.9142046 , -0.42972255,  0.05153808],
      dtype=float32)

In [14]:
# Each pass moves every headword's vector a fraction ALPHA of the way toward the
# IDF-weighted mean of the vectors of the words in its explanation.
FACTOR = 0.6
ITERATIONS = 5
ALPHA = FACTOR / ITERATIONS

# The vectors in `wv` are overwritten in place: headwords processed later in a pass
# already see the updated vectors, and re-running this cell applies further updates.
for iteration in range(ITERATIONS):
    print(iteration)
    for headword, lemmas in processed_dict.items():
        if headword not in wv:
            continue
        weighted_sum = np.zeros_like(wv[headword])
        weight_total = 0.0
        for lemma in lemmas:
            if lemma not in wv:
                continue
            weight = idf[lemma]
            weighted_sum = weighted_sum + weight * wv[lemma]
            weight_total += weight
        # Skip headwords none of whose explanation words have a vector.
        if weight_total > 0.0:
            centroid = weighted_sum / weight_total
            wv[headword] = (1 - ALPHA) * wv[headword] + ALPHA * centroid

0
1
2
3
4


In [15]:
# Re-score after the update loop using the same call as the baseline cell, so the
# two numbers can be compared directly.
compute_accuracy(lambda w1, w2: wv.similarity(w1, w2), log_failed_cases=False)

0.8505747126436781

In [16]:
# Same slice of the 'акула' vector as shown before the update loop, now after it.
wv["акула"][:10]

array([-0.30332246,  0.595187  , -0.26627123, -0.00242245,  0.42038128,
       -0.52545875, -0.36166292,  0.93984926, -0.03500113, -0.2031757 ],
      dtype=float32)

In [17]:
# Qualitative check after the update loop: the 20 nearest neighbours of 'пітон'.
wv.most_similar('пітон', topn=20)

[('удаў', 0.9108102917671204),
 ('ігуана', 0.9065163135528564),
 ('бурундук', 0.8689566254615784),
 ('раскашаваць', 0.8682907223701477),
 ('насарог', 0.8580600023269653),
 ('суслік', 0.8533772230148315),
 ('арангутан', 0.8494911789894104),
 ('шаблязубы', 0.8486717343330383),
 ('глушэц', 0.8426281213760376),
 ('акула', 0.8423356413841248),
 ('пелікан', 0.8389197587966919),
 ('няясыць', 0.8328167796134949),
 ('гаротнік', 0.8257499933242798),
 ('шымпанзэ', 0.8243662714958191),
 ('сурок', 0.8232481479644775),
 ('баклан', 0.8230767846107483),
 ('марал', 0.8194829225540161),
 ('сенбернар', 0.819077730178833),
 ('раскашоўвацца', 0.8165847063064575),
 ('балацянік', 0.8151198625564575)]