In [38]:
import json
import os
import numpy as np

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
# Load the pretrained `cointegrated/rubert-tiny2` tokenizer and model.
# NOTE(review): a later cell rebinds these same global names (`tokenizer`,
# `model`) to the LaBSE-en-ru checkpoint, so any cell that reads them depends
# on which of the two load cells was executed last.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def embed_bert_cls(text, model, tokenizer):
    """Embed `text` using the hidden state at sequence position 0.

    Args:
        text: Input passed straight to `tokenizer` (padding and truncation on).
        model: Callable model exposing `.device`; its output must provide
            `last_hidden_state`.
        tokenizer: Callable returning a mapping of tensors (``return_tensors='pt'``).

    Returns:
        A NumPy array: the first row of the batch of position-0 hidden states
        after `torch.nn.functional.normalize` (default arguments). If `text`
        is a batch, only the first item's vector is returned.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Put every input tensor on the same device as the model.
    inputs_on_device = {}
    for field, tensor in encoded.items():
        inputs_on_device[field] = tensor.to(model.device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        outputs = model(**inputs_on_device)

    first_position = outputs.last_hidden_state[:, 0, :]
    normalized = torch.nn.functional.normalize(first_position)
    first_item = normalized[0]
    return first_item.cpu().numpy()

In [211]:
def cos_distance(a, b):
    '''Cosine similarity between two 1-D vectors.

    Despite the name, this returns the similarity ``a.b / (|a| * |b|)``
    (1 for identical directions, -1 for opposite ones), not ``1 - similarity``.

    Args:
        a, b: Vectors supporting the ``@`` operator (e.g. 1-D NumPy arrays).

    Returns:
        The cosine similarity, or the sentinel ``-2`` (outside the valid
        [-1, 1] range) when it cannot be computed: incompatible operands or
        shapes, or a zero / NaN norm.
    '''
    try:
        norm_product = ((a @ a) * (b @ b)) ** 0.5
        # A zero (or NaN) norm would otherwise produce NaN plus a runtime
        # warning rather than the sentinel. Only scalar norms are checked so
        # non-1-D operands keep following the original code path.
        if np.ndim(norm_product) == 0 and not norm_product > 0:
            return -2
        return a @ b / norm_product
    except Exception:
        # Still best-effort, but KeyboardInterrupt / SystemExit now propagate.
        return -2

In [17]:
# News source to process; selects both the input file and the vector cache file.
where_from = 'rbc'

# Read as UTF-8 explicitly, matching the later cell that loads the same
# per-source files; without it the decoding depends on the platform locale.
with open(f'./data/news_{where_from}.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [42]:
# Build or refresh the per-source cache of headline embeddings.
# After this cell `data_vectors` always maps news id -> record with a
# 'news_embedding' list, whether or not the cache file already existed.
vectors_path = f'./data/vectors/vectors_news_{where_from}.json'

# Portable existence check (the previous one hard-coded Windows separators).
if os.path.exists(vectors_path):
    with open(vectors_path, 'r', encoding='utf-8') as f:
        data_vectors = json.load(f)
else:
    data_vectors = {}

# Only embed news items that are not cached yet.
new_ids = [news_id for news_id in data if news_id not in data_vectors]

for news_id in new_ids:
    # Copy the record without the bulky fields instead of mutating `data`,
    # so re-running the cell is idempotent.
    record = {
        key: value
        for key, value in data[news_id].items()
        if key not in ('news_link', 'news_text')
    }
    record['news_embedding'] = embed_bert_cls(
        data[news_id]['news_title'], model, tokenizer
    ).tolist()
    # Create the entry before filling it; assigning into a missing id
    # previously raised KeyError.
    data_vectors[news_id] = record

# Persist the cache whenever something was added (previously new embeddings
# computed against an existing cache were never written back).
if new_ids:
    os.makedirs(os.path.dirname(vectors_path), exist_ok=True)
    with open(vectors_path, 'w', encoding='utf-8') as f:
        json.dump(data_vectors, f)

In [103]:
import torch
from transformers import AutoTokenizer, AutoModel
# Load the pretrained `cointegrated/LaBSE-en-ru` tokenizer and model.
# NOTE(review): this rebinds the global `tokenizer` / `model` names that an
# earlier cell bound to rubert-tiny2; every cell run after this one that reads
# those globals will use LaBSE instead.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")
# Example inputs; not referenced by any other code visible in this notebook.
sentences = ["Hello World", "Привет Мир"]


Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
def LaBSE(text):
    '''Embed `text` with the globally loaded `tokenizer` / `model` pair.

    Uses the model's `pooler_output`, passes it through
    `torch.nn.functional.normalize` (default arguments) and returns the first
    row of the batch as a NumPy array. Inputs are truncated to 64 tokens.

    Args:
        text: Input passed straight to the global `tokenizer`. If it is a
            batch, only the first item's vector is returned.

    Returns:
        A NumPy array holding the normalized pooled embedding.
    '''
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=64, return_tensors='pt')
    # Move inputs to the model's device, as embed_bert_cls does, so the
    # function also works when the model has been moved off the CPU.
    encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    # .cpu() is a no-op on CPU tensors and required before .numpy() otherwise.
    return embeddings[0].cpu().numpy()

In [116]:
# Spot check: score two different Russian headlines against each other using
# the LaBSE embeddings. The value recorded below this cell is ~0.717.
a = 'Охранник отказался покидать свой пост'
b = 'Джонсон не любит министров'

# Last expression of the cell, so the score is displayed as the cell output.
cos_distance(LaBSE(a), LaBSE(b))

0.7168594693891301

In [231]:
import sentencepiece as spm
import json
from gensim.models import Word2Vec
from gensim.models import Phrases
from razdel import tokenize, sentenize

In [168]:
def spm_encode(string):
    '''Encode a string into SentencePiece pieces returned as strings.

    Relies on a module-level processor named `sp` (presumably the
    SentencePieceProcessor from the commented-out training cell; verify).
    NOTE(review): the original docstring described this as a BPE-dropout
    encoding, but the sampling variant of the call is only kept as the
    commented alternative below.
    '''
    pieces = sp.encode(string, nbest_size=-1, out_type='str')
    # Sampling alternative:
    # pieces = sp.encode(string, enable_sampling=True, alpha=0.1, nbest_size=-1, out_type='str')
    return pieces

In [235]:
def get_sent(text):
    '''Return a list with the `.text` of every item `sentenize(text)` yields.'''
    sentence_texts = []
    for span in sentenize(text):
        sentence_texts.append(span.text)
    return sentence_texts

In [200]:
def get_tokens(string):
    '''Return a list with the `.text` of every item `tokenize(string)` yields.'''
    return [token.text for token in tokenize(string)]

In [169]:
# Collect the article bodies of every source into one flat corpus list.
sources = ['rbc', 'lenta', 'ria']

# Initialise up front so the cell no longer depends on 'rbc' being the first
# entry of `sources` (the list used to be created only inside that branch).
# NOTE: despite its name, `all_titles` holds the 'news_text' field.
all_titles = []

# The loop rebinds the globals `where_from` and `data`; after the cell they
# refer to the last source in `sources`.
for where_from in sources:

    with open(f'./data/news_{where_from}.json', 'r', encoding="utf-8") as f:
        data = json.load(f)

    all_titles.extend(data[i]['news_text'] for i in data)

In [242]:
# Flatten the corpus into individual sentences...
all_titles_sent = []

for text in all_titles:
    all_titles_sent += get_sent(text)

# ...and tokenize every sentence (one token list per sentence).
all_titles_sent_tokens = list(map(get_tokens, all_titles_sent))

In [170]:
# with open(f'./data/sentencepeace/titles.txt', 'w', encoding="utf-8") as f:
#     f.write("\n".join(all_titles))

In [189]:
# spm.SentencePieceTrainer.train(input = './data/sentencepeace/titles.txt', model_prefix = 'm', vocab_size = 1000, model_type = 'bpe')
# sp = spm.SentencePieceProcessor(model_file = 'm.model')

In [190]:
# spm_encode_ids = [spm_encode(text) for text in all_titles]

In [249]:
%%time

# Train Word2Vec on the tokenized sentences and save it to the working directory.
model_wv = Word2Vec(
    sentences=all_titles_sent_tokens,
    vector_size=2000,
    window=5,
    min_count=1,
    workers=8,
)
model_wv.save("word2vec.model")

CPU times: total: 20.4 s
Wall time: 6.03 s


In [183]:
def w2v_cast(x):
    '''Sum the Word2Vec vectors of the tokens in `x`.

    Reads the module-level `model_wv`. Tokens missing from the vocabulary are
    skipped.

    Args:
        x: Iterable of tokens.

    Returns:
        The element-wise sum of the vectors of all known tokens, or an
        all-zero vector of the same shape when none of the tokens is known.
        The result is always a new array, never the model's stored vector.
    '''
    vectors = model_wv.wv
    # Zero vector with the shape and dtype of the stored embeddings.
    total = vectors[0] * 0

    for token in x:
        if token in vectors:
            total = total + vectors[token]
    return total

In [250]:
# Spot check of the summed Word2Vec vectors: a full headline against a short fragment.
# NOTE(review): the value recorded below this cell is ~0.999 although the two
# strings share only the word 'В' - worth confirming that these embeddings
# actually discriminate between texts.
text = 'В Госдепе обещали усилить санкции против России'
text_b = 'В США '
cos_distance(w2v_cast(get_tokens(text)), w2v_cast(get_tokens(text_b)))

0.9990397132164921