# Word Embeddings Pipeline

* Extract CLS embedding layer of hidden states
    * Run on Apple Metal ('mps')

In [1]:
import torch
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

In [2]:
fold = "DATA/llm-detect-ai-generated-text/"

df = pd.read_csv(f'{fold}final_data_text_ref.csv')
df['text']=df['text'].apply(lambda x : x.strip())

In [3]:
MODEL_NAME = 'allenai/longformer-base-4096'
#MODEL_NAME = 'distilbert-base-uncased'

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [4]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0,l,n):
        yield iterable[ndx:min(ndx+n,l)]

In [5]:
# Batching and Epochs

outer=[]
for i in tqdm(range(0,len(df),32)):
    corpus=df['text'][i:i+32]
    inner=[]
    for epoch in batch(corpus, 16):
        tokenized_train = tokenizer(epoch.values.tolist(), 
                                    padding=True, 
                                    pad_to_multiple_of=512,
                                    truncation = True,
                                    max_length=4096, 
                                    return_tensors='pt')
        tokenized_train = {k:v.detach().clone().to(device) for k,v in tokenized_train.items()}
        with torch.no_grad():
            hidden_train = model(**tokenized_train) #dim : [batch_size(nr_sentences), tokens, emb_dim]


            # get only the [CLS] embeddings
            cls_train = hidden_train.last_hidden_state[:,0,:]
            inner.append(cls_train.to('cpu'))
    outer.append(torch.cat(inner))


100%|██████████| 784/784 [3:20:35<00:00, 15.35s/it]  


In [6]:
CLS_hidden_states=torch.cat(outer)

In [7]:
CLS_hidden_states.shape

torch.Size([25059, 768])