In [1]:
import numpy as np 
import pandas as pd 

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from transformers import DataCollatorWithPadding

from warnings import filterwarnings
filterwarnings('ignore')
from sqlalchemy import create_engine

In [None]:
import os
from urllib.parse import quote

from dotenv import load_dotenv

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# Connection settings are read from the environment so that no credentials
# are written into the notebook itself.
user = os.getenv("POSTGRES_USER")
password = os.getenv("POSTGRES_PASSWORD")
host = os.getenv("POSTGRES_HOST")
port = os.getenv("POSTGRES_PORT")
db = os.getenv("POSTGRES_DATABASE")

# Fail here, naming the variables, instead of silently formatting the text
# "None" into the URL and getting an opaque connection error later on.
_missing = [
    name
    for name, value in (
        ("POSTGRES_USER", user),
        ("POSTGRES_HOST", host),
        ("POSTGRES_PORT", port),
        ("POSTGRES_DATABASE", db),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing)
    )

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be parsed as URL delimiters.
# NOTE: this assumes the environment holds the raw (not already encoded) values.
_credentials = quote(user, safe="")
if password is not None:
    # The password part is optional; it is only appended when one is configured.
    _credentials += ":" + quote(password, safe="")

db_url = f"postgresql://{_credentials}@{host}:{port}/{db}"

In [None]:
# Create the SQLAlchemy engine for the connection URL held in `db_url`;
# it is passed as `con` to pandas when querying the database.
engine = create_engine(db_url)

In [3]:
# Read every row and column of the post_text_df table into a DataFrame.
post_query = """SELECT * 
                      FROM post_text_df """
df_data_post = pd.read_sql(sql=post_query, con=engine)

In [4]:
# Display the loaded table (columns: post_id, text, topic) as a visual check.
df_data_post

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [13]:
# Collect the post texts into a plain Python list, keeping the row order.
dataset = df_data_post["text"].tolist()

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from tqdm import tqdm


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The tokenizer and the encoder are loaded from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased").to(device)

# Post texts as a plain list, in DataFrame row order.
dataset = df_data_post["text"].tolist()

class TextDataset(Dataset):
    """Dataset that tokenizes a single text each time an item is requested.

    Args:
        texts: Indexable collection of strings with a length.
        tokenizer: Callable applied to one text; it is invoked with
            ``add_special_tokens``, ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments and must
            return a mapping of tensors that carry a leading batch axis.
        max_length: Length every sample is padded or truncated to.
            Defaults to 512, the value previously hard-coded here.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its tensors without the batch axis."""
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            # Fixed-length padding keeps all items the same size, so they can
            # be stacked into a batch without a custom collate function.
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer is called on a single text, so axis 0 has size 1;
        # drop it so that batching adds the batch axis back.
        return {key: val.squeeze(0) for key, val in encoded.items()}

# Batches are produced in the original order (no shuffling), so the rows of
# anything computed from them line up with the input texts.
text_dataset = TextDataset(texts=dataset, tokenizer=tokenizer)
dataloader = DataLoader(
    dataset=text_dataset,
    batch_size=64,
    shuffle=False,
    pin_memory=True,
)

# Function that computes the embeddings
@torch.inference_mode()
def get_embeddings(model, loader, device=None):
    """Encode every batch of ``loader`` and return the first-token embeddings.

    Args:
        model: Encoder called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``"last_hidden_state"``.
        loader: Iterable of batches; each batch is a mapping that contains
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Device the inputs are moved to. When omitted, the device of
            the model's first parameter is used, so the function does not
            depend on a notebook-level ``device`` variable.

    Returns:
        A CPU tensor with one row per sample, concatenated in loader order.
    """
    if device is None:
        # Follow the model: inputs must live on the same device as its weights.
        device = next(model.parameters()).device

    model.eval()
    total_embeddings = []

    for batch in tqdm(loader):
        # Only these two entries are forwarded; any other keys are dropped.
        batch = {key: batch[key].to(device) for key in ("input_ids", "attention_mask")}
        cls_embeddings = model(**batch)["last_hidden_state"][:, 0, :]  # CLS token
        # Move each result to the CPU right away so that only the current
        # batch's output stays on the compute device.
        total_embeddings.append(cls_embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

# Compute the embeddings for all posts and report the resulting shape
embeddings = get_embeddings(model=model, loader=dataloader)
print(embeddings.shape)


100%|██████████| 110/110 [04:19<00:00,  2.36s/it]

torch.Size([7023, 768])





In [29]:
# Tabular view of the embeddings: one row per post, one column per dimension.
data = pd.DataFrame(data=embeddings)

In [32]:
# Preview the first rows of the embedding table.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.140364,-0.140695,-0.575681,-0.118173,-0.315324,-0.114378,0.431977,-0.144285,0.003233,-1.187819,...,0.051768,0.614536,-0.613106,0.131061,0.202857,0.175932,-0.167769,-0.13788,0.04296,0.142284
1,0.15753,-0.097739,-0.230651,-0.364431,-0.242781,0.310065,0.374489,-0.089235,0.202153,-1.130713,...,0.586032,0.652154,-0.112308,-0.085171,-0.051814,0.240048,0.200298,-0.300891,0.190542,0.019753
2,0.314568,-0.115162,-0.181322,-0.274698,-0.357378,0.285228,0.266597,0.002641,-0.033905,-1.092078,...,0.416195,0.641697,-0.326961,-0.042803,-0.07385,0.212023,-0.090192,-0.35405,-0.204323,-0.027026
3,0.415117,-0.241301,-0.260733,-0.436027,-0.194695,0.130077,0.458805,-0.235223,-0.032935,-1.008515,...,0.791115,0.562938,-0.194188,0.022461,0.108904,0.019628,0.362089,-0.150884,-0.048834,0.083071
4,0.614585,-0.235812,-0.047732,-0.406701,-0.284798,0.12415,0.545586,-0.284447,0.047562,-1.139113,...,0.605897,0.518612,0.007371,0.032436,0.015634,0.055695,0.145705,-0.061322,-0.02112,0.121545


In [35]:
from sklearn.decomposition import PCA

# Reduce the embeddings to their first 10 principal components.
# random_state is fixed because PCA may select a randomized SVD solver for a
# matrix of this size; without a seed, re-running the cell could yield
# slightly different components.
pca = PCA(n_components=10, random_state=42)

# Keep the raw ndarray under its own name so that `PCA_dataset` always
# refers to a DataFrame.
pca_components = pca.fit_transform(data)

PCA_dataset = pd.DataFrame(pca_components)

In [36]:
# Display the reduced feature table (10 component columns per post).
PCA_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.954858,-1.755621,0.206967,1.697693,2.381051,0.014757,-0.139394,0.572930,2.037780,-0.975256
1,3.082158,-0.872362,-1.121495,0.695179,-0.051388,0.234773,-0.309023,0.001493,0.246477,0.529407
2,2.298748,-0.771815,-1.484300,0.921830,-0.043763,0.577127,0.102371,0.678363,1.163933,-1.000728
3,3.830453,-0.031839,-1.308001,-2.101629,-0.484519,-0.095330,0.206701,-0.808279,-0.210362,0.079621
4,2.248869,0.231332,-1.637724,-1.685151,-0.175619,-0.363440,0.877840,-0.201483,-0.476351,0.036897
...,...,...,...,...,...,...,...,...,...,...
7018,-2.656976,-1.065925,-1.688350,-0.109286,-0.562750,-0.528290,0.269968,0.190010,-1.075562,-0.433189
7019,-2.547194,-0.549889,-0.247589,0.458581,-0.235310,-0.913798,0.242953,0.429244,0.440683,-0.022328
7020,-2.438390,-0.878835,-1.034499,-0.249474,-0.589288,-0.427734,0.258952,0.983801,-0.138989,-0.731065
7021,-2.499132,-1.253244,-0.025749,-0.992778,0.589764,0.471239,0.410803,-0.300642,-0.129530,-0.345278


In [37]:
# Persist the reduced features; the DataFrame index is not written.
PCA_dataset.to_csv(path_or_buf='PCA_dataset.csv', index=False)