In [1]:
import numpy as np
import pandas as pd
import time
import os
import sys
import config
import torch

from transformers import AutoModel, BertTokenizer, AutoTokenizer # version: 4.5.1, pip install transformers
from transformers import GPT2Tokenizer, GPT2Model, AutoModelForCausalLM



In [2]:
##################### Test Model #####################
ROBERTA_BASE = 'roberta-base'
T5_BASE = 't5-base'
LLAMA2_7B = 'llama-2-7b' # https://huggingface.co/meta-llama/Llama-2-7b
QWEN25_7B = 'qwen2_5-7b' 

In [3]:
model_dir = os.path.join(config.PATH_TO_PRETRAINED_MODELS, QWEN25_7B)

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
if torch.cuda.is_available():
    model = model.to(device)
model.eval()

In [7]:
sentence = "I'll never let go, Jack. I'll never let go."

In [14]:
input_ids = tokenizer.encode(sentence, return_tensors='pt')[0]
input_ids

tensor([  40, 3278, 2581, 1077,  728,   11, 7607,   13,  358, 3278, 2581, 1077,
         728,   13])

In [15]:
output = tokenizer.decode(input_ids)
output

"I'll never let go, Jack. I'll never let go."

In [None]:
start, end = None, None
sentence_no_space = sentence.replace(' ', '')
for start in range(0,3,1):
    output = tokenizer.decode(input_ids[start:]).replace(' ', '')
    if (output == sentence_no_space):
        print(f"0Start index: {start}, End index: {end}")
        break
    if output.startswith(sentence_no_space):
        print(f"1Start index: {start}, End index: {end}")
        break
for end in range(-1,-3,-1):
    output = tokenizer.decode(input_ids[start:end]).replace(' ', '')
    if (output == sentence_no_space):
        break
assert tokenizer.decode(input_ids[start:end]).replace(' ', '') == sentence_no_space
print (f'start: {start};  end: {end}')


In [19]:
sentence = "I'll never let go, Jack. I'll never let go."
inputs = tokenizer(sentence, return_tensors="pt").to('cpu')
outputs = model(**inputs, output_hidden_states=True).hidden_states

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [89]:
transform_outputs = torch.stack(outputs)[[-1,-2,-3,-4]].sum(dim=0)
transform_outputs = transform_outputs.detach().cpu().numpy()
embeddings = transform_outputs[0,start:end]

In [90]:
embeddings = np.array(embeddings).squeeze()
embeddings.shape

(14, 768)

In [91]:
embeddings = np.mean(embeddings, axis=0)
embeddings.shape

(768,)