In [5]:
from transformers import GPT2Model, GPT2Tokenizer
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('Device name:', device)

Device name: cpu


In [23]:
df = pd.read_csv('C:/Users/farez/amazon_reviews.csv')
del df[df.columns[0]] # first column is just index so we drop

In [41]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]

text = df['reviewText'].tolist()
text = [item for item in text if isinstance(item, str)] # removing any reviews that are not strings
dataset = MyDataset(text)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_text = [train_dataset[i] for i in range(len(train_dataset))]
test_text = [test_dataset[i] for i in range(len(test_dataset))]

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token  # Use the end of sequence token as padding token.
encoded_inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")


embeddings = []
# obtain embeddings

chunk = 10
for i in tqdm(range(0,len(encoded_inputs['input_ids']),chunk)):
    with torch.no_grad():
        outputs = model(encoded_inputs['input_ids'][i:i+chunk])
        hidden_states = outputs.last_hidden_state
        embeddings.append(torch.mean(hidden_states, dim=1))  # mean pooling

# print(embeddings)
embeddings_cat = torch.cat(embeddings, dim=0)
torch.save(embeddings_cat, 'embeddings.pt')

  0%|          | 0/492 [00:00<?, ?it/s]

In [9]:
embeddings = torch.load("embeddings.pt", map_location=device)

In [29]:
from torch import nn

class GPT2ForClassification(nn.Module):
    def __init__(self, num_labels):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.gpt2.config.n_embd, num_labels)

    def forward(self, inputs_embeds, attention_mask=None):
        outputs = self.gpt2(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, -1, :]  # Use the last token's output for classification
        hidden_state = self.dropout(hidden_state)
        logits = self.classifier(hidden_state)
        return logits


In [47]:
# need to remove the same index from the ratings that we removed earlier from the reviews that was "not-a-string"
non_string_review = df['reviewText'].apply(lambda x: not isinstance(x, str))
non_string_indices = non_string_review[non_string_review].index
print("Indices of non-string values in column 'reviewText':", non_string_indices.tolist())

ratings = df['overall'].drop(df['overall'].index[non_string_indices.values[0]]) # removal of problem index from the ratings
print(len(ratings))

Indices of non-string values in column 'reviewText': [125]
4914


In [48]:
from sklearn.model_selection import train_test_split

# Convert tensors to numpy arrays for splitting
embeddings_np = embeddings.numpy()
ratings = ratings.to_numpy().astype(int) # convert to integer numpy array for classification

# Split the data
embeddings_train, embeddings_test, labels_train, labels_test = train_test_split(
    embeddings_np, ratings, test_size=0.20)

# Convert numpy arrays back to tensors
embeddings_train = torch.tensor(embeddings_train)
embeddings_test = torch.tensor(embeddings_test)
labels_train = torch.tensor(labels_train)
labels_test = torch.tensor(labels_test)

In [49]:
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

# Create dataset and dataloader for more efficient training
train_dataset = TensorDataset(embeddings_train, labels_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Initialize the model and optimizer
model = GPT2ForClassification(num_labels=5)
optimizer = Adam(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(3):  # number of epochs
    for embeddings_batch, labels_batch in tqdm(train_loader):
        optimizer.zero_grad()
        logits = model(embeddings_batch)
        loss = loss_fn(logits, labels_batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")



  0%|          | 0/246 [00:00<?, ?it/s]


TypeError: GPT2ForClassification.forward() missing 1 required positional argument: 'attention_mask'

In [32]:
from sklearn.metrics import accuracy_score

# Switch to evaluation mode
model.eval()
with torch.no_grad():
    logits_test = model(embeddings_test)
    predictions = torch.argmax(logits_test, dim=1)
    accuracy = accuracy_score(labels_test, predictions.numpy())
    print(f"Test Accuracy: {accuracy}")


4915

In [33]:
print(df)

      reviewerName  overall  \
0              NaN      4.0   
1             0mie      5.0   
2              1K3      4.0   
3              1m2      5.0   
4     2&amp;1/2Men      5.0   
...            ...      ...   
4910        ZM "J"      1.0   
4911            Zo      5.0   
4912     Z S Liske      5.0   
4913      Z Taylor      5.0   
4914           Zza      5.0   

                                             reviewText  reviewTime  day_diff  \
0                                            No issues.  2014-07-23       138   
1     Purchased this for my device, it worked as adv...  2013-10-25       409   
2     it works as expected. I should have sprung for...  2012-12-23       715   
3     This think has worked out great.Had a diff. br...  2013-11-21       382   
4     Bought it with Retail Packaging, arrived legit...  2013-07-13       513   
...                                                 ...         ...       ...   
4910  I bought this Sandisk 16GB Class 10 to use wit...  201