# Генерация эмбеддингов постов при помощи моделей нейросетей.
Ноутбук выполнялся в Kaggle.

In [1]:
!pip install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.6
[0m

In [2]:
import pandas as pd
import numpy as np

from sqlalchemy import create_engine

import re

import sys
import gc
import os

from dotenv import load_dotenv

from tqdm import tqdm

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

In [3]:
class PostDataset(Dataset):
    """Dataset over texts that are tokenized once, when the instance is created.

    All texts are encoded eagerly by ``tokenizer`` with special tokens added,
    truncation enabled, ``padding=True`` and ``return_tensors='pt'``; token
    type ids are not requested.

    Args:
        texts: the texts to encode (the notebook passes a list of strings).
        tokenizer: a callable tokenizer; it is called as
            ``tokenizer(texts, **options)`` and its result must expose
            ``'input_ids'`` and ``'attention_mask'`` entries.

    Attributes:
        texts: the encoding returned by the tokenizer for all texts.
        tokenizer: the tokenizer that produced ``texts``.
    """

    def __init__(self, texts, tokenizer):
        super().__init__()

        # Call the tokenizer directly: the Transformers documentation marks
        # `batch_encode_plus` as deprecated in favour of `__call__`.
        self.texts = tokenizer(
            texts,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True
        )
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the ``input_ids`` and ``attention_mask`` entries at ``idx`` as a dict."""
        return {'input_ids': self.texts['input_ids'][idx], 'attention_mask': self.texts['attention_mask'][idx]}

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.texts['input_ids'])

In [4]:
def get_model(model_name):
    """Load a pretrained tokenizer and model from Hugging Face.

    Args:
        model_name: one of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the checkpoint that
        is registered for ``model_name``.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    assert model_name in ('bert', 'roberta', 'distilbert')

    # One entry per supported family: (checkpoint id, model class).
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    # The tokenizer is loaded first, then the model, both from the same checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

In [5]:
@torch.inference_mode()
def get_embeddings_labels(model, loader, device):
    """Compute one embedding per tokenized text with ``model``.

    For every batch the ``last_hidden_state`` output is sliced with
    ``[:, 0, :]``, i.e. the vector at the first token position is used as the
    embedding of each sample.

    Args:
        model: callable model with an ``eval()`` method; called with
            ``attention_mask`` and ``input_ids`` keyword arguments and expected
            to return a mapping that has a ``'last_hidden_state'`` entry.
        loader: iterable of batches; each batch must provide
            ``'attention_mask'`` and ``'input_ids'`` tensors.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A CPU tensor with the per-batch embeddings concatenated along dim 0.
    """
    model.eval()

    input_keys = ('attention_mask', 'input_ids')
    cls_chunks = []

    for raw_batch in tqdm(loader):
        # Only the tensors the model needs are moved to the target device.
        model_inputs = {name: raw_batch[name].to(device) for name in input_keys}
        hidden_states = model(**model_inputs)['last_hidden_state']
        # Keep the first-token vector and move it back to the CPU right away.
        cls_chunks.append(hidden_states[:, 0, :].cpu())

    return torch.cat(cls_chunks, dim=0)

In [6]:
# Load the access settings for the Karpov Courses social-network database
# from the env file, so that they can be read from os.environ afterwards.
load_dotenv(r'/kaggle/input/karpov-startml-db/env')

True

In [7]:
# Read the PostgreSQL connection settings from the environment.
# os.environ[...] is indexed directly, so a missing setting raises KeyError here
# instead of surfacing later as a malformed connection string.
POSTGRESUSER = os.environ['POSTGRESUSER']
POSTGRES_PASSWORD = os.environ['POSTGRES_PASSWORD']
POSTGRES_HOST = os.environ['POSTGRES_HOST']
POSTGRES_PORT = os.environ['POSTGRES_PORT']
POSTGRES_DATABASE = os.environ['POSTGRES_DATABASE']

In [8]:
# Assemble the database URL: postgresql://user:password@host:port/database.
# NOTE(review): the values are inserted verbatim; if the user name or password
# can contain characters such as '@', ':' or '/', they presumably need to be
# URL-escaped first - confirm against the stored credentials.
str_con_db = (
    'postgresql://'
    f'{POSTGRESUSER}:{POSTGRES_PASSWORD}'
    f'@{POSTGRES_HOST}:{POSTGRES_PORT}'
    f'/{POSTGRES_DATABASE}'
)

In [9]:
# Create the SQLAlchemy engine; it is passed as `con` to the pandas
# read_sql / to_sql calls in the cells below.
engine = create_engine(str_con_db)

In [10]:
# Load the whole posts table into a DataFrame.
posts_info = pd.read_sql(sql='SELECT * FROM public.post_text_df', con=engine)

In [11]:
# Load the tokenizer and the model (the DistilBERT option of get_model).
tokenizer, model = get_model('distilbert')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Build the dataset from the post texts; they are tokenized right here,
# when the PostDataset instance is created.
dataset = PostDataset(posts_info['text'].values.tolist(), tokenizer)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,  # keep the dataset order so that embeddings line up with the rows of posts_info
    pin_memory=True,
    collate_fn=data_collator,
)

# Take a single batch for the sanity checks in the next cells.
batch_data = next(iter(loader))

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
# Check the batch size: print the shapes of both tensors of the sample batch,
# one per line.
print(batch_data['input_ids'].shape, batch_data['attention_mask'].shape, sep='\n')

torch.Size([32, 512])
torch.Size([32, 512])


In [14]:
# A dataset item consists of input_ids (the tokenized text) and
# attention_mask (the attention mask).
dataset[8]

{'input_ids': tensor([  101,  4439,   118,  3398,  5093,  2239,  1120,  3187,  4439,  1116,
          5151,  1107,  3398,  1116,  5093,  2380,  2736,  1383,  1106,  1129,
         15420,  1170,  1157,  4583,  5093,  3016,  1486,  1157,  5151,  1175,
         26551,  1118, 13904,   119,  3398,  1116,  6254,  4751,  1118,   170,
          1415,  2656,  1106,  2195, 17037,  4661, 22494,  1116,  8219,  1107,
           170,  1207,  5093,  2443,  1121,  3102,   110,  1106,  3927,   110,
           119,  1109,  1815,   117,  1134,  1108, 16489,  1113,  1569,  2699,
          4745,   117,  3226,  1126,  2206,  2992,  1118, 13904,  1106,  1660,
          2310,   170, 27484,  1166,  2880, 12372,   119, 17037,  4661, 22494,
          1163,  1103,  2383,  6986,  1103, 11040,  4309,  1106,  1103,  1933,
           119,  1966,  1103,  1419,  1116,  4195,  1163,  1122,  1156,  2760,
          1106,  8804,  9093,   117, 15793,  1163,  1152,  1354, 17037,  4661,
         22494,  1108,  1383,  1106,  3

In [15]:
# Size of the dataset: print the shapes of the full token-id and attention-mask
# tensors, one per line.
print(dataset.texts['input_ids'].shape, dataset.texts['attention_mask'].shape, sep='\n')

torch.Size([7023, 512])
torch.Size([7023, 512])


In [16]:
# Sanity-check the model on the sample batch.
# The first-token slice of last_hidden_state is what is used as the post
# embedding; each post will get an embedding of size 768.
# The forward pass is only an inspection, so run it without autograd tracking
# to avoid building a computation graph that is never used.
with torch.inference_mode():
    out_batch = model(**batch_data)
print(out_batch['last_hidden_state'][:, 0, :].shape)

torch.Size([32, 768])


In [17]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

# On a GPU, also report which device it is.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())

# Move the model to the selected device before computing the embeddings.
model = model.to(device)

cuda:0
Tesla T4


In [18]:
# Compute the embeddings for every batch of the loader and convert the
# resulting CPU tensor to a NumPy array.
embeddings = get_embeddings_labels(model, loader, device).numpy()

100%|██████████| 220/220 [02:00<00:00,  1.83it/s]


In [19]:
# Build a DataFrame from the embedding matrix and join it column-wise with the
# posts table.
# The join is only meaningful with exactly one embedding row per post, in the
# same order, so check the row counts before combining.
assert len(embeddings) == len(posts_info), 'expected one embedding row per post'

columns = [f'text_feat_{i}' for i in range(embeddings.shape[1])]

# A separate name keeps `embeddings` bound to the NumPy array (the cell can be
# re-run safely); reusing the index of posts_info makes the row alignment of the
# column-wise concat explicit instead of relying on matching default indexes.
embeddings_df = pd.DataFrame(embeddings, columns=columns, index=posts_info.index)
posts_info_data = pd.concat((posts_info, embeddings_df), axis=1)

In [20]:
# Display the combined table: the post columns followed by the text_feat_* columns.
posts_info_data

Unnamed: 0,post_id,text,topic,text_feat_0,text_feat_1,text_feat_2,text_feat_3,text_feat_4,text_feat_5,text_feat_6,...,text_feat_758,text_feat_759,text_feat_760,text_feat_761,text_feat_762,text_feat_763,text_feat_764,text_feat_765,text_feat_766,text_feat_767
0,1,UK economy facing major risks\n\nThe UK manufa...,business,0.363151,0.048938,-0.264081,-0.160672,-0.211844,-0.207537,0.325393,...,0.349080,0.290132,-0.244970,0.078532,0.137399,0.208097,-0.058624,-0.141593,0.015918,0.000092
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,0.236416,-0.159501,-0.327798,-0.372885,-0.292128,-0.019028,0.333677,...,0.311639,0.297819,-0.177003,0.130227,-0.063239,0.190171,-0.018153,-0.289936,0.119365,-0.001623
2,3,Asian quake hits European shares\n\nShares in ...,business,0.375191,-0.113944,-0.240547,-0.282425,-0.264252,0.061839,0.249180,...,0.353615,0.308457,-0.207151,0.056724,0.056596,0.125301,0.021575,-0.338920,0.058694,-0.021266
3,4,India power shares jump on debut\n\nShares in ...,business,0.273770,-0.048748,-0.440433,-0.189999,-0.410856,-0.100587,0.255757,...,0.321182,0.218213,-0.267988,-0.093801,0.176987,0.251617,0.028331,-0.155708,0.136188,0.044055
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,0.297853,-0.073203,-0.146820,-0.127284,-0.133966,0.045766,0.176495,...,0.168949,0.208978,-0.051180,0.045685,0.173985,0.148893,0.097254,-0.239587,0.228066,0.189831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie,0.338545,0.084620,-0.225981,-0.115433,-0.064816,-0.129863,0.358163,...,0.495477,0.203647,-0.138003,0.148754,0.138934,0.199463,0.054088,-0.110224,0.039229,-0.003550
7019,7316,I give this movie 2 stars purely because of it...,movie,0.354057,0.053933,-0.099446,-0.161002,0.009353,-0.190430,0.258928,...,0.322798,0.176826,-0.154205,-0.010798,0.100222,0.093780,0.051934,-0.119376,0.182106,0.072430
7020,7317,I cant believe this film was allowed to be mad...,movie,0.340383,0.066492,-0.163184,-0.115225,-0.102958,-0.181814,0.346562,...,0.372651,0.169435,-0.041522,-0.033723,0.047250,0.173592,-0.027378,-0.086563,0.203404,0.032091
7021,7318,The version I saw of this film was the Blockbu...,movie,0.432092,0.011092,-0.117306,-0.123570,0.066756,-0.103376,0.243298,...,0.468103,0.156609,-0.054083,0.210478,0.116214,0.064117,0.084667,0.075401,0.102739,0.015274


In [21]:
# List the column names of the combined table.
posts_info_data.columns

Index(['post_id', 'text', 'topic', 'text_feat_0', 'text_feat_1', 'text_feat_2',
       'text_feat_3', 'text_feat_4', 'text_feat_5', 'text_feat_6',
       ...
       'text_feat_758', 'text_feat_759', 'text_feat_760', 'text_feat_761',
       'text_feat_762', 'text_feat_763', 'text_feat_764', 'text_feat_765',
       'text_feat_766', 'text_feat_767'],
      dtype='object', length=771)

In [22]:
# Check for NaN values: count them per column, then show only the columns
# that contain at least one.
posts_info_is_na = posts_info_data.isna().sum()
posts_info_is_na.loc[posts_info_is_na > 0]

Series([], dtype: int64)

In [23]:
# Write the post feature table to the database; if_exists='replace' replaces a
# table of the same name if it is already there.
# NOTE(review): `index` is not passed, so the DataFrame index is presumably
# written as an extra column (pandas default) - confirm that consumers of the
# table expect it.
posts_info_data.to_sql("pavel55645_posts_info_features_dl_emb", con=engine, if_exists='replace')

В итоге создана таблица с эмбеддингами постов: каждый пост представлен вектором размерности 768. Далее эти признаки можно дополнительно преобразовать — например, кластеризовать посты и посчитать расстояние от каждого поста до центра каждого кластера.