In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import os

# Make sure the checkpoint and logging folders are present before training starts
for output_folder in ('./results', './logs'):
    os.makedirs(output_folder, exist_ok=True)

# Read the raw reviews, keeping only rows that have both a text and a rating
review_columns = ['reviews.text', 'reviews.rating']
df = pd.read_csv("amazon_reviews_for_products.csv", encoding='utf-8')[review_columns].dropna()

# Normalise column types: text as str, rating as a number.
# Ratings that cannot be parsed become NaN and are removed by the final dropna().
df = df.assign(**{
    'reviews.text': df['reviews.text'].astype(str),
    'reviews.rating': pd.to_numeric(df['reviews.rating'], errors='coerce'),
}).dropna()

# Quick visual sanity check of the cleaned data
print("First few rows of the dataset:")
print(df.head())

# Convert ratings to sentiment (Positive: 4-5, Neutral: 3, Negative: 1-2)
def get_sentiment(rating: float) -> int:
    """Map a numeric star rating to a sentiment class id.

    Thresholds are ranges rather than an exact ``== 3`` check, so a
    fractional rating such as 3.5 is classed as neutral instead of
    falling through to negative. Integer ratings map exactly as before.

    Args:
        rating: Numeric rating (integers 1-5 expected; fractions allowed).

    Returns:
        2 (positive) when ``rating >= 4``, 1 (neutral) when
        ``3 <= rating < 4``, and 0 (negative) otherwise.
    """
    if rating >= 4:
        return 2  # Positive
    if rating >= 3:
        return 1  # Neutral
    return 0  # Negative

# Derive the 3-class sentiment label from each numeric rating
ratings = df['reviews.rating']
df['sentiment'] = ratings.apply(get_sentiment)

# Show how many reviews fall into each sentiment class
print("\nSentiment distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Tokenizer matching the pretrained BERT checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset Class
class ReviewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item access.

    Args:
        texts: Sequence of review strings.
        labels: Sequence of integer class ids aligned with ``texts``.
        text_tokenizer: Optional tokenizer callable. When omitted, the
            module-level ``tokenizer`` is looked up at item-access time,
            which is the original behaviour.
        max_length: Length every encoded item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize review ``idx`` and return the tensors for one example.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension of the tokenizer output squeezed away) and ``labels``
            as a ``torch.long`` scalar tensor.
        """
        # Fall back to the module-level tokenizer only when none was injected.
        tokenize = self.text_tokenizer if self.text_tokenizer is not None else tokenizer
        encoding = tokenize(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" yields a batch of one; drop that batch axis.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Build the training dataset from plain Python lists of texts and labels
review_texts = df['reviews.text'].tolist()
sentiment_labels = df['sentiment'].tolist()
dataset = ReviewsDataset(review_texts, sentiment_labels)

# Sanity check: encode and display the first example
print("\nFirst item in dataset:")
first_item = dataset[0]
print(first_item)

# Load BERT with a 3-way classification head
# (class ids: 0 = negative, 1 = neutral, 2 = positive)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Training Arguments
# NOTE: using the Trainer with PyTorch requires `accelerate>=0.26.0`
# (`pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`);
# without it the run stops with an ImportError.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    save_strategy="epoch",
    # Evaluation stays disabled because no eval dataset is provided. "no" is
    # already the default, so the argument is omitted: newer transformers
    # releases renamed `evaluation_strategy` to `eval_strategy`, and passing
    # the old keyword fails there.
    logging_dir='./logs',
    logging_steps=10,  # More frequent logging for debugging
)

# Wire the model, the training configuration and the training data together
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model first, then the tokenizer, into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained("./bert_sentiment_model")

print("Model and tokenizer saved successfully!")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`