In [14]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM


# Hugging Face checkpoint id used by the context-free embedding cells below.
model_name = "flax-community/papuGaPT2"

# Default to the CPU and switch to the GPU only when torch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
device

'cuda'

<!-- @format -->

# Bezkontekstowe osadzenia słów


In [15]:
def get_word_embeddings_papuga(word: str, decay: float = 0.5) -> np.ndarray:
    """Build a context-free embedding for ``word`` from the model's token table.

    The word is tokenized with a leading space, the row of every resulting
    token id is read from ``model.transformer.wte.weight`` and the rows are
    summed with weights ``exp(-decay * i)`` (``i`` = token position), normalised
    to sum to 1, so earlier tokens contribute more than later ones.

    Args:
        word: Word to embed.
        decay: Decay rate of the positional weights. The default ``0.5`` is the
            value that used to be hard-coded; ``0.0`` gives a plain mean.

    Returns:
        1-D float64 array whose length equals the width of the embedding table
        (all zeros when the tokenizer yields no tokens).
    """
    # NOTE(review): the leading space presumably makes the tokenizer treat the
    # input as a word that starts after whitespace - confirm for this tokenizer.
    tokens_ids = tokenizer(" " + word, return_tensors="pt")["input_ids"][0]

    embedding_table = model.transformer.wte.weight
    # Pick the needed rows first and only then move them off the device; this
    # avoids copying the entire table to host memory on every call.
    row_index = tokens_ids.to(embedding_table.device)
    token_vectors = embedding_table.detach()[row_index].cpu().numpy()

    weights = np.exp(-np.arange(len(tokens_ids), dtype=float) * decay)
    weights /= weights.sum()

    # Weighted sum over tokens; the output width follows the table instead of
    # assuming a fixed 768 dimensions.
    return weights @ token_vectors

In [17]:
CLUSTERED_TEXT = "clustered_text.txt"

# Read the whole cluster file up front; it is parsed line by line below.
with open(CLUSTERED_TEXT, "r", encoding="utf-8") as file:
    clusters_txt = file.read()

# The first whitespace-separated field of a line becomes the key and the
# remaining fields its word list; lines with fewer than two fields are ignored.
WORDS = {
    fields[0]: fields[1:]
    for fields in map(str.split, clusters_txt.split("\n"))
    if len(fields) >= 2
}

In [20]:
# Forward slash instead of "\w": the backslash form is an invalid escape
# sequence (SyntaxWarning on Python >= 3.12) and only names a file inside
# "output" on Windows. The "output" directory must already exist.
OUTPUT_FILE = "output/word_embedings_file.txt"


def paste_embedding_into_file(file_name):
    """Write one line per word in ``WORDS``: the word followed by its embedding.

    Every word from every list in the module-level ``WORDS`` dict is embedded
    with ``get_word_embeddings_papuga`` and written as ``"<word> <v1> <v2> ..."``.
    Any previous content of ``file_name`` is overwritten.

    Args:
        file_name: Path of the text file to (re)create.
    """
    words = [w for group in WORDS.values() for w in group]

    # Open the file once for the whole run instead of re-opening it in append
    # mode for every word.
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used (unchanged behaviour); confirm it matches what the consumer expects.
    with open(file_name, "w") as file:
        for w in words:
            w_emb = get_word_embeddings_papuga(w)
            # One join per word rather than rebuilding a growing string for
            # every vector component; each value keeps its leading space.
            word_emb_str = "".join(f" {n}" for n in w_emb)
            file.write(f"{w}{word_emb_str}\n")


# Write the embeddings of every word currently held in WORDS to OUTPUT_FILE.
paste_embedding_into_file(file_name=OUTPUT_FILE)

In [45]:
from word_emb_evaluation import benchmark

# Run the project's benchmark on the cluster file and the embeddings file
# written above; the cell output shows the PROBLEMS and TOTAL SCORE lines it prints.
# NOTE(review): benchmark is defined in word_emb_evaluation, outside this notebook.
benchmark(CLUSTERED_TEXT=CLUSTERED_TEXT, CALCULATED_EMBEDINGS=OUTPUT_FILE)

PROBLEMS: 0.0
Start
TOTAL SCORE: 0.595764


<!-- @format -->

# Kontekstowe osadzenia


In [3]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

model_name = "allegro/herbert-base-cased"
# Everything in this part of the notebook is pinned to the CPU; the CUDA
# auto-detection that used to precede this line was immediately overridden.
device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

text = "owoc pomarańcza"
token_ids = tokenizer(text, return_tensors="pt")["input_ids"][0]
# The first and last positions are dropped both here and in the hidden states
# below (presumably the special tokens the tokenizer adds around the text).
tokens = [tokenizer.decode(idx) for idx in token_ids][1:-1]

# Inference only: skip autograd bookkeeping so the printed tensors carry no
# grad_fn and no computation graph is kept alive.
with torch.no_grad():
    outputs = model(token_ids.unsqueeze(0).to(device))
embeddings = outputs.last_hidden_state[0][1:-1]

# Show the first five components of each token's contextual vector.
for token, embedding in zip(tokens, embeddings):
    print(f"\nToken: '{token}'")
    print(f"Embedding: {embedding[:5]}")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.sso.sso_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Token: 'owo'
Embedding: tensor([-0.0063,  0.1560, -0.1094,  0.4298,  0.0949], grad_fn=<SliceBackward0>)

Token: 'c'
Embedding: tensor([-0.8746,  0.3371, -0.9575,  0.4812,  1.0283], grad_fn=<SliceBackward0>)

Token: 'pomara'
Embedding: tensor([-0.1905,  0.2520, -0.2108,  0.0751,  0.2352], grad_fn=<SliceBackward0>)

Token: 'ńcza'
Embedding: tensor([-0.6236,  0.0452, -0.1211,  0.1778,  0.6638], grad_fn=<SliceBackward0>)


In [4]:
def get_word_embedding(word, model, tokenizer, include_special_tokens=True):
    """Return the mean input-embedding vector of the tokens of ``word``.

    Only the model's input-embedding layer (``model.get_input_embeddings()``)
    is used; the rest of the model is not run, so the vector does not depend on
    any surrounding context.

    Args:
        word: Text to embed.
        model: Model exposing ``get_input_embeddings()``.
        tokenizer: Callable returning ``{"input_ids": tensor}`` for
            ``return_tensors="pt"``.
        include_special_tokens: When True (default, the original behaviour) the
            tokenizer is called exactly as before, so whatever special tokens it
            adds are part of the average. When False the tokenizer is called
            with ``add_special_tokens=False`` and only the word's own tokens
            are averaged.

    Returns:
        1-D numpy array with the mean embedding (NaN-filled if there are no
        tokens to average).
    """
    # Only pass the extra keyword when it is needed so the default call to the
    # tokenizer stays identical to the original one.
    tokenizer_kwargs = {} if include_special_tokens else {"add_special_tokens": False}
    tokens_ids = tokenizer(word, return_tensors="pt", **tokenizer_kwargs)["input_ids"][0]

    embedding_layer = model.get_input_embeddings()
    with torch.no_grad():
        # Move the ids to wherever the embedding weights live so the lookup
        # also works when the model is on a GPU.
        embeddings = embedding_layer(tokens_ids.to(embedding_layer.weight.device))

    # .cpu() is required before .numpy() for tensors that live on a GPU.
    return embeddings.mean(dim=0).cpu().numpy()

<!-- @format -->

## Wyniki


In [5]:
CLUSTERED_TEXT = "clustered_text.txt"

# Read the whole cluster file up front; it is parsed line by line below.
with open(CLUSTERED_TEXT, "r", encoding="utf-8") as file:
    clusters_txt = file.read()

# The first whitespace-separated field of a line becomes the key and the
# remaining fields its word list; lines with fewer than two fields are ignored.
WORDS = {
    fields[0]: fields[1:]
    for fields in map(str.split, clusters_txt.split("\n"))
    if len(fields) >= 2
}

print(WORDS)

{'piśmiennicze:': ['pisak', 'flamaster', 'ołówek', 'długopis', 'pióro'], 'małe_ssaki:': ['mysz', 'szczur', 'chomik', 'łasica', 'kuna', 'bóbr'], 'okręty:': ['niszczyciel', 'lotniskowiec', 'trałowiec', 'krążownik', 'pancernik', 'fregata', 'korweta'], 'lekarze:': ['lekarz', 'pediatra', 'ginekolog', 'kardiolog', 'internista', 'geriatra'], 'zupy:': ['rosół', 'żurek', 'barszcz'], 'uczucia:': ['miłość', 'przyjaźń', 'nienawiść', 'gniew', 'smutek', 'radość', 'strach'], 'działy_matematyki:': ['algebra', 'analiza', 'topologia', 'logika', 'geometria'], 'budynki_sakralne:': ['kościół', 'bazylika', 'kaplica', 'katedra', 'świątynia', 'synagoga', 'zbór'], 'stopień_wojskowy:': ['chorąży', 'podporucznik', 'porucznik', 'kapitan', 'major', 'pułkownik', 'generał', 'podpułkownik'], 'grzyby_jadalne:': ['pieczarka', 'borowik', 'gąska', 'kurka', 'boczniak', 'kania'], 'prądy_filozoficzne:': ['empiryzm', 'stoicyzm', 'racjonalizm', 'egzystencjalizm', 'marksizm', 'romantyzm'], 'religie:': ['chrześcijaństwo', 'budd

In [6]:
# Forward slash instead of "\w": the backslash form is an invalid escape
# sequence (SyntaxWarning on Python >= 3.12) and only names a file inside
# "output" on Windows. The "output" directory must already exist.
OUTPUT_FILE = "output/word_embeddings_context.txt"

# Clear the file (creates it when missing).
with open(OUTPUT_FILE, "w") as file:
    pass


# Save the computed embeddings.
def calulate_context_embedings(file_name):
    """Write ``"<word> <v1> <v2> ..."`` lines for every word in ``WORDS``.

    Each word of each list in the module-level ``WORDS`` dict is embedded with
    ``get_word_embedding`` (using the module-level ``model`` and ``tokenizer``)
    and written to ``file_name``, which is overwritten.
    """
    with open(file_name, "w") as out:
        for cluster_words in WORDS.values():
            for word in cluster_words:
                vector = get_word_embedding(word, model, tokenizer)
                rendered = " ".join(str(value) for value in vector.tolist())
                out.write(f"{word} {rendered}\n")


# Write the embeddings of every word currently held in WORDS to OUTPUT_FILE.
calulate_context_embedings(OUTPUT_FILE)

In [7]:
from word_emb_evaluation import benchmark

# Run the project's benchmark on the cluster file and the embeddings file
# written above; the cell output shows the PROBLEMS and TOTAL SCORE lines it prints.
# NOTE(review): benchmark is defined in word_emb_evaluation, outside this notebook.
benchmark(CLUSTERED_TEXT=CLUSTERED_TEXT, CALCULATED_EMBEDINGS=OUTPUT_FILE)

PROBLEMS: 0.0
Start ABX tests
TOTAL SCORE: 0.63291


<!-- @format -->

# Testy ABX dla osadzeń kontekstowych dla zniekształconych danych


<!-- @format -->

## Pierwszy typ


In [8]:
# Forward slash instead of "\d": the backslash form is an invalid escape
# sequence (SyntaxWarning on Python >= 3.12) and only resolves to a file inside
# the directory on Windows.
CLUSTERED_TEXT = "distored_clusters/distorted_swapped.txt"

with open(CLUSTERED_TEXT, "r", encoding="utf-8") as file:
    clusters_txt = file.read()

# The first whitespace-separated field of a line becomes the key and the
# remaining fields its word list; lines with fewer than two fields are ignored.
WORDS = {
    fields[0]: fields[1:]
    for fields in map(str.split, clusters_txt.split("\n"))
    if len(fields) >= 2
}

print(WORDS)

{'piśmiennicze:': ['sipak', 'flmaaster', 'olowke', 'ilugopds', 'iporo'], 'małe_ssaki:': ['symz', 'szczru', 'cmohik', 'alsica', 'knua', 'brbo'], 'okręty:': ['nyszcziciel', 'lotnickowies', 'ecalowitr', 'koawrznik', 'canpernik', 'rfegata', 'kowreta'], 'lekarze:': ['lezark', 'petiadra', 'gkneiolog', 'kargiolod', 'internsita', 'gertaria'], 'zupy:': ['orsol', 'zuerk', 'bzrsacz'], 'uczucia:': ['molisc', 'pjzynazr', 'niencwisa', 'gniwe', 'skutem', 'cadosr', 'shract'], 'działy_matematyki:': ['blgeara', 'anailza', 'lopoaogit', 'logiak', 'gtomreeia'], 'budynki_sakralne:': ['coskiol', 'bklyziaa', 'kapliac', 'ktaedra', 'awiatynis', 'anyagogs', 'bzor'], 'stopień_wojskowy:': ['ahorczy', 'podporukznic', 'pokucznir', 'kapatin', 'maojr', 'pulkownik', 'genarel', 'ppdouokwlnik'], 'grzyby_jadalne:': ['paeczirka', 'borokiw', 'gaska', 'aurkk', 'obczniak', 'aknia'], 'prądy_filozoficzne:': ['emyirpzm', 'stozcymi', 'racaonjlizm', 'egzystmzcjaline', 'amrksizm', 'rtmmnoyza'], 'religie:': ['chrzescjianstwo', 'bddu

In [9]:
# Forward slash instead of "\w": the backslash form is an invalid escape
# sequence (SyntaxWarning on Python >= 3.12) and only names a file inside
# "output" on Windows. The "output" directory must already exist.
OUTPUT_FILE = "output/word_embeddings_context_swapped.txt"

# No separate clearing step is needed: calulate_context_embedings opens the
# file in "w" mode, which already truncates (or creates) it.
calulate_context_embedings(OUTPUT_FILE)

In [10]:
from word_emb_evaluation import benchmark

# Run the project's benchmark on the distorted cluster file and the embeddings
# file written above; the cell output shows the PROBLEMS and TOTAL SCORE lines it prints.
# NOTE(review): benchmark is defined in word_emb_evaluation, outside this notebook.
benchmark(CLUSTERED_TEXT=CLUSTERED_TEXT, CALCULATED_EMBEDINGS=OUTPUT_FILE)

PROBLEMS: 0.0
Start ABX tests
TOTAL SCORE: 0.539432


<!-- @format -->

## Drugi typ


In [11]:
# Forward slash instead of "\d": the backslash form is an invalid escape
# sequence (SyntaxWarning on Python >= 3.12) and only resolves to a file inside
# the directory on Windows.
CLUSTERED_TEXT = "distored_clusters/distorted_next_letter.txt"

with open(CLUSTERED_TEXT, "r", encoding="utf-8") as file:
    clusters_txt = file.read()

# The first whitespace-separated field of a line becomes the key and the
# remaining fields its word list; lines with fewer than two fields are ignored.
WORDS = {
    fields[0]: fields[1:]
    for fields in map(str.split, clusters_txt.split("\n"))
    if len(fields) >= 2
}

print(WORDS)

{'piśmiennicze:': ['pjsak', 'flamcster', 'omowek', 'dlugoqis', 'pjoro'], 'małe_ssaki:': ['mzsz', 'szdzur', 'chomil', 'lasicb', 'kvna', 'bobs'], 'okręty:': ['njsaczyciel', 'lotnitlpwiec', 'tralpwiec', 'krczownik', 'paoceroik', 'fregaua', 'kprweta'], 'lekarze:': ['lekara', 'pedibtra', 'ginekplog', 'kasdjolog', 'intesnista', 'gfriatrb'], 'zupy:': ['rpsol', 'zurfk', 'barszca'], 'uczucia:': ['mimosc', 'prazjazn', 'nifoawisc', 'gnjew', 'smutfk', 'radosd', 'strbch'], 'działy_matematyki:': ['alhebra', 'anamiza', 'toqolpgia', 'logikb', 'geomftrib'], 'budynki_sakralne:': ['kosciom', 'baazlika', 'kaqlica', 'katedrb', 'swiatyoja', 'syoagoga', 'zcor'], 'stopień_wojskowy:': ['ciorazy', 'podporucznil', 'porucznjk', 'kapiuan', 'majos', 'pvmkownik', 'generam', 'poepumkownik'], 'grzyby_jadalne:': ['piedzarkb', 'borowil', 'gatka', 'kurla', 'bodzniak', 'kaoia'], 'prądy_filozoficzne:': ['empirzzm', 'suoidyzm', 'racjonblizm', 'ehzyttencjalizm', 'marksjzm', 'ronantyzm'], 'religie:': ['chszescikanstwp', 'bued

In [12]:
# Forward slash instead of "\w": the backslash form is an invalid escape
# sequence (SyntaxWarning on Python >= 3.12) and only names a file inside
# "output" on Windows. The "output" directory must already exist.
OUTPUT_FILE = "output/word_embeddings_context_next_letter.txt"
calulate_context_embedings(OUTPUT_FILE)

In [13]:
# Run the project's benchmark on the distorted cluster file and the embeddings
# file written above; the cell output shows the PROBLEMS and TOTAL SCORE lines it prints.
# NOTE(review): benchmark is not imported in this cell; it relies on the import
# done in an earlier cell.
benchmark(CLUSTERED_TEXT=CLUSTERED_TEXT, CALCULATED_EMBEDINGS=OUTPUT_FILE)

PROBLEMS: 0.0
Start ABX tests
TOTAL SCORE: 0.529846
