In [1]:
from transformers import GPT2Model, GPT2Tokenizer
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA A100-SXM4-40GB


In [3]:
df = pd.read_csv('https://www.cs.fsu.edu/~liux/courses/deepRL/assignments/amazon_reviews.csv')
del df[df.columns[0]] # first column is just index so we drop

In [4]:
print(df)

      reviewerName  overall  \
0              NaN      4.0   
1             0mie      5.0   
2              1K3      4.0   
3              1m2      5.0   
4     2&amp;1/2Men      5.0   
...            ...      ...   
4910        ZM "J"      1.0   
4911            Zo      5.0   
4912     Z S Liske      5.0   
4913      Z Taylor      5.0   
4914           Zza      5.0   

                                             reviewText  reviewTime  day_diff  \
0                                            No issues.  2014-07-23       138   
1     Purchased this for my device, it worked as adv...  2013-10-25       409   
2     it works as expected. I should have sprung for...  2012-12-23       715   
3     This think has worked out great.Had a diff. br...  2013-11-21       382   
4     Bought it with Retail Packaging, arrived legit...  2013-07-13       513   
...                                                 ...         ...       ...   
4910  I bought this Sandisk 16GB Class 10 to use wit...  201

In [5]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]

text = df['reviewText'].tolist()
text = [item for item in text if isinstance(item, str)] # removing any reviews that are not strings
dataset = MyDataset(text)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_text = [train_dataset[i] for i in range(len(train_dataset))]
test_text = [test_dataset[i] for i in range(len(test_dataset))]

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2').to(device)

tokenizer.pad_token = tokenizer.eos_token  # Use the end of sequence token as padding token.
encoded_inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")


embeddings = []
# obtain embeddings

chunk = 10
for i in tqdm(range(0,len(encoded_inputs['input_ids']),chunk)):
    with torch.no_grad():
        outputs = model(encoded_inputs['input_ids'][i:i+chunk].to(device), output_hidden_states=True)
        hidden_states = outputs.last_hidden_state
        embeddings.append(torch.mean(hidden_states, dim=1))  # mean pooling

print(embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 492/492 [01:31<00:00,  5.37it/s]


[tensor([[-0.0209,  0.1681, -0.0474,  ..., -0.3096, -0.2179, -0.0158],
        [-0.0191,  0.0903,  0.0546,  ..., -0.1715, -0.1446, -0.0047],
        [-0.0282,  0.0981,  0.0409,  ..., -0.1989, -0.1470, -0.0161],
        ...,
        [ 0.0288,  0.0762,  0.0236,  ..., -0.1835, -0.1366, -0.0473],
        [-0.0145,  0.0905,  0.0178,  ..., -0.1945, -0.1545, -0.0164],
        [-0.0274,  0.1120,  0.0272,  ..., -0.1993, -0.1605, -0.0135]],
       device='cuda:0'), tensor([[ 0.0290,  0.0832, -0.0194,  ..., -0.1706, -0.1196, -0.0389],
        [-0.0016,  0.0953,  0.0174,  ..., -0.2019, -0.1320, -0.0340],
        [-0.0504,  0.1298,  0.0230,  ..., -0.2249, -0.1991,  0.0033],
        ...,
        [ 0.0329,  0.1801, -0.0032,  ..., -0.1702, -0.1401, -0.0482],
        [ 0.0203,  0.1017,  0.0443,  ..., -0.1977, -0.1457, -0.0442],
        [ 0.0036,  0.1241,  0.0412,  ..., -0.2495, -0.1839, -0.0081]],
       device='cuda:0'), tensor([[ 0.0295,  0.0671,  0.0108,  ..., -0.1759, -0.1055, -0.0220],
        [-0

In [10]:
embeddings_cat = torch.cat(embeddings, dim=0)

In [11]:
embeddings_cat

torch.Size([4914, 768])

In [12]:
# save the tensor
torch.save(embeddings_cat, 'embeddings.pt')