# EMBEDDINGS FOR POST_TEXT_DF USING PYTORCH

___

In [1]:
import pandas as pd
import datetime

import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel

from tqdm import tqdm

from transformers import DataCollatorWithPadding

from torch.utils.data import Subset

from torch.utils.data import DataLoader



___

In [2]:
# Read the post table from the Kaggle input CSV into a DataFrame.
post_text_df = pd.read_csv('/kaggle/input/post-text/post_text_df.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# Re-assigning with errors='ignore' instead of inplace=True keeps this cell
# safe to re-run once the column is already gone.
post_text_df = post_text_df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Display the loaded posts for a quick visual check.
post_text_df

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [5]:
from datasets import Dataset

# Wrap only the 'text' column in a Hugging Face Dataset.
texts_dataset = Dataset.from_pandas(post_text_df[['text']])

In [6]:
# Show the dataset summary (features and row count).
texts_dataset

Dataset({
    features: ['text'],
    num_rows: 7023
})

___

### FUNCTIONS

In [7]:
def get_model(model_name):
    """Load the tokenizer and the pretrained encoder for a supported model family.

    Args:
        model_name: One of ``'bert'``, ``'roberta'`` or ``'distilbert'``.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer comes from
        ``AutoTokenizer.from_pretrained`` and the model from the family's
        model class, both loaded from the same checkpoint name.

    Raises:
        AssertionError: If ``model_name`` is not one of the supported names.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']

    # family name -> (checkpoint name, model class)
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    # Load the tokenizer before the model weights.
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_model = model_cls.from_pretrained(checkpoint)
    return loaded_tokenizer, loaded_model

In [8]:
def tokenization(example):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        example: Mapping whose ``'text'`` entry holds the texts to encode.

    Returns:
        The result of ``tokenizer.batch_encode_plus`` for those texts, called
        with special tokens added, truncation enabled and token type ids
        omitted.
    """
    encode_options = {
        'add_special_tokens': True,
        'return_token_type_ids': False,
        'truncation': True,
    }
    texts = example['text']
    return tokenizer.batch_encode_plus(texts, **encode_options)

In [9]:
@torch.inference_mode()
def get_embeddings(model, loader):
    """Encode every batch in ``loader`` and return the first-token embeddings.

    The model is switched to eval mode. Each batch is moved to the device the
    model's parameters live on, so the function does not rely on a
    module-level ``device`` variable and cannot disagree with the model's
    actual placement.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output can be indexed with ``'last_hidden_state'``.
        loader: Iterable of batches. Each batch must provide
            ``'attention_mask'`` and ``'input_ids'`` tensors; any other keys
            are ignored.

    Returns:
        A CPU tensor with one row per input example, in iteration order,
        holding the hidden state at index 0 of the sequence axis.
    """
    model.eval()

    # Follow the model's own placement; fall back to CPU for a parameter-less module.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        # Forward only the two tensors the encoder needs.
        inputs = {key: batch[key].to(target_device) for key in ('attention_mask', 'input_ids')}

        hidden_states = model(**inputs)['last_hidden_state']

        # Keep the first position of each sequence and move it off the accelerator.
        total_embeddings.append(hidden_states[:, 0, :].cpu())

    return torch.cat(total_embeddings, dim=0)

___

### MODEL

In [11]:
# Use the first CUDA GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

cuda:0


In [12]:
# Load the DistilBERT tokenizer and encoder (the checkpoint is chosen inside get_model).
tokenizer, model = get_model('distilbert')

# Move the model onto the selected device.
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [13]:
# Run the tokenization function over the dataset in batches.
texts_dataset = texts_dataset.map(tokenization, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

In [14]:
# Expose only input_ids and attention_mask, as torch tensors, when rows are read.
texts_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask'])

In [15]:
# Collator that pads each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# shuffle=False keeps batches in dataset order, so embedding row i
# corresponds to text row i; padding is delegated to the collator.
texts_dataset_loader = DataLoader(
    dataset=texts_dataset,
    collate_fn=data_collator,
    batch_size=32,
    shuffle=False,
    pin_memory=True,
)

In [17]:
# Run the encoder over every batch and collect the embeddings into one tensor.
text_embeddings = get_embeddings(model, texts_dataset_loader) 

  0%|          | 0/220 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 220/220 [01:37<00:00,  2.26it/s]


In [18]:
# Display the embedding tensor.
text_embeddings

tensor([[ 3.6315e-01,  4.8937e-02, -2.6408e-01,  ..., -1.4159e-01,
          1.5918e-02,  9.1941e-05],
        [ 2.3642e-01, -1.5950e-01, -3.2780e-01,  ..., -2.8994e-01,
          1.1937e-01, -1.6235e-03],
        [ 3.7519e-01, -1.1394e-01, -2.4055e-01,  ..., -3.3892e-01,
          5.8694e-02, -2.1266e-02],
        ...,
        [ 3.4038e-01,  6.6492e-02, -1.6318e-01,  ..., -8.6563e-02,
          2.0340e-01,  3.2091e-02],
        [ 4.3209e-01,  1.1092e-02, -1.1731e-01,  ...,  7.5402e-02,
          1.0274e-01,  1.5274e-02],
        [ 3.0428e-01, -7.6216e-02, -6.7759e-02,  ..., -5.4349e-02,
          2.4438e-01, -1.4149e-02]])

In [19]:
# Convert the embedding tensor to a NumPy array.
text_embeddings_np = text_embeddings.detach().numpy()

# Wrap the array in a DataFrame: one row per text, one column per embedding dimension.
embeddings_df = pd.DataFrame(text_embeddings_np)

In [20]:
# Display the embeddings table.
embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.363151,0.048937,-0.264081,-0.160672,-0.211843,-0.207537,0.325394,-0.026139,0.087266,-1.041420,...,0.349080,0.290132,-0.244970,0.078532,0.137399,0.208097,-0.058624,-0.141593,0.015918,0.000092
1,0.236416,-0.159501,-0.327798,-0.372885,-0.292128,-0.019028,0.333677,-0.179642,0.004258,-1.026999,...,0.311639,0.297819,-0.177003,0.130227,-0.063239,0.190171,-0.018153,-0.289936,0.119366,-0.001623
2,0.375192,-0.113944,-0.240547,-0.282425,-0.264252,0.061839,0.249180,0.019152,-0.072067,-0.992547,...,0.353615,0.308457,-0.207151,0.056724,0.056596,0.125301,0.021575,-0.338919,0.058694,-0.021266
3,0.273770,-0.048748,-0.440433,-0.189999,-0.410856,-0.100587,0.255757,0.036902,0.044896,-0.798385,...,0.321182,0.218213,-0.267988,-0.093800,0.176987,0.251618,0.028331,-0.155708,0.136188,0.044054
4,0.297853,-0.073203,-0.146820,-0.127284,-0.133966,0.045766,0.176495,-0.218839,0.191196,-1.062085,...,0.168949,0.208978,-0.051180,0.045685,0.173986,0.148893,0.097255,-0.239587,0.228066,0.189832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,0.338545,0.084620,-0.225981,-0.115433,-0.064816,-0.129863,0.358163,-0.178001,0.135029,-1.093267,...,0.495477,0.203647,-0.138003,0.148754,0.138935,0.199463,0.054088,-0.110224,0.039229,-0.003550
7019,0.354057,0.053933,-0.099446,-0.161002,0.009353,-0.190430,0.258928,-0.199819,0.087599,-1.032675,...,0.322798,0.176826,-0.154205,-0.010798,0.100222,0.093780,0.051934,-0.119376,0.182106,0.072430
7020,0.340383,0.066492,-0.163184,-0.115224,-0.102958,-0.181814,0.346562,-0.133692,0.065351,-1.006693,...,0.372651,0.169435,-0.041522,-0.033723,0.047250,0.173592,-0.027378,-0.086563,0.203404,0.032091
7021,0.432092,0.011092,-0.117306,-0.123570,0.066756,-0.103376,0.243299,-0.241603,0.045067,-1.110857,...,0.468103,0.156609,-0.054083,0.210478,0.116214,0.064117,0.084667,0.075402,0.102740,0.015274


___

### PCA (Principal Component Analysis) 

In [23]:
from sklearn.decomposition import PCA

def get_PCA(n_components: int, data: pd.DataFrame, random_state: "int | None" = 42):
    """Project ``data`` onto its first ``n_components`` principal components.

    ``data`` is passed to scikit-learn's ``PCA`` as is: the estimator centers
    every feature itself before the decomposition, so no manual mean
    subtraction is needed. This also avoids subtracting a single global mean
    if an ndarray were passed instead of a DataFrame.

    Args:
        n_components: Number of principal components to keep.
        data: Samples in rows, features in columns.
        random_state: Seed forwarded to ``PCA``. It makes the projection
            reproducible when the ``'auto'`` solver policy selects the
            randomized SVD; pass ``None`` for an unseeded run.

    Returns:
        The transformed samples as returned by ``PCA.fit_transform``, with
        ``n_components`` columns.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(data)

In [25]:
# Reduce the embeddings to their first 5 principal components.
pca_matrix = get_PCA(5, embeddings_df)

In [26]:
# One name per PCA component. The count is read from pca_matrix so it cannot
# drift from the n_components value passed to get_PCA.
col_names = [f'feature_{i + 1}' for i in range(pca_matrix.shape[1])]

In [27]:
# Reuse post_text_df's index so that a column-wise concat with the post
# metadata pairs rows by label, even when that index is not the default 0..n-1.
# A row-count mismatch fails here instead of silently producing NaNs later.
pca_df = pd.DataFrame(pca_matrix, columns=col_names, index=post_text_df.index)

In [28]:
# Keep every column except the raw text.
new_post_text_df = post_text_df.drop(columns=['text'])

In [29]:
# Build the result from post_text_df rather than from new_post_text_df itself,
# so re-running this cell does not append the feature columns a second time.
new_post_text_df = pd.concat([post_text_df.drop(columns='text'), pca_df], axis=1)

In [30]:
# Write the result to the working directory without the index column.
new_post_text_df.to_csv('new_post_text_df.csv', index=False)

In [31]:
# Display the final table: post metadata plus the PCA feature columns.
new_post_text_df

Unnamed: 0,post_id,topic,feature_1,feature_2,feature_3,feature_4,feature_5
0,1,business,0.789076,1.578995,-1.421161,-0.292796,0.196602
1,2,business,0.792763,1.521057,-0.897739,0.228329,0.108478
2,3,business,0.801291,1.223608,-0.669456,1.323434,-0.081123
3,4,business,0.867123,0.971222,-1.577184,0.832963,0.840134
4,5,business,0.411659,0.809848,-0.682368,0.791090,-0.185699
...,...,...,...,...,...,...,...
7018,7315,movie,-0.983508,-0.483879,0.001966,0.066610,0.058621
7019,7316,movie,-0.821776,-0.706065,-0.164486,-0.390919,0.030236
7020,7317,movie,-0.469400,-0.874857,0.482077,-0.138485,-0.257369
7021,7318,movie,-1.579792,-0.448613,0.126693,0.100836,0.284148
