In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
# The dataset read later in this notebook is addressed by a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Single seed shared by the data split and torch's RNG, so the train/test
# partition and torch's random state start out the same on every run.
SEED = 42
torch.manual_seed(SEED)

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/research/dataset_new/Reviews.csv')

# Inspect dataset columns
print(df.columns)

# Define the columns
TEXT_COLUMN = 'Text'  # Review text column
LABEL_COLUMN = 'Score'  # Rating column

# Drop rows with missing values in essential columns
df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

# Define label mapping: star rating (1-5) -> zero-based class id (0-4)
rating_map = {
    1: 0,  # Very Poor
    2: 1,  # Poor
    3: 2,  # Neutral
    4: 3,  # Good
    5: 4   # Very Good
}

# Map numerical ratings to class ids. Series.map returns NaN for any rating
# that is not a key of rating_map, which would silently turn the whole label
# column into floats; such rows are dropped and the labels are kept as int64.
sentiment = df[LABEL_COLUMN].map(rating_map)
valid = sentiment.notna()
if not valid.all():
    print(f"Dropping {int((~valid).sum())} rows whose {LABEL_COLUMN} is not in rating_map")
df = df.loc[valid].assign(Sentiment=sentiment[valid].astype("int64"))

# Split the data into training and test sets (80-20 split).
# The split is stratified on the label so both partitions keep the same class
# proportions; the ratings are heavily skewed toward the top class.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[TEXT_COLUMN].values,
    df["Sentiment"].values,
    test_size=0.2,
    random_state=SEED,
    stratify=df["Sentiment"].values,
)



Using device: cuda
Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')


In [None]:

# Load the DistilBERT tokenizer from the 'distilbert-base-uncased' checkpoint.
# The classification model loaded later in this cell uses the same checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_data(texts, labels, tokenizer, max_length=128):
    """Encode texts with ``tokenizer`` and convert ``labels`` to a long tensor.

    Args:
        texts: Iterable of texts; it is materialised with ``list()`` before
            being handed to the tokenizer.
        labels: Label values accepted by ``torch.tensor``.
        tokenizer: Callable taking the list of texts plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments, and returning an object that exposes ``input_ids``
            and ``attention_mask`` attributes.
        max_length: Value forwarded as the tokenizer's ``max_length``; the
            tokenizer is asked to pad to it and to truncate.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, labels)`` where the first two
        items come straight from the tokenizer output and the third is a
        ``torch.long`` tensor built from ``labels``.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    encoded = tokenizer(list(texts), **encoding_options)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    return encoded.input_ids, encoded.attention_mask, label_tensor

# Tokenize the data.
# NOTE: train_labels / test_labels are rebound here from the arrays produced by
# the train/test split to the torch.long tensors returned by tokenize_data.
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_texts, train_labels, tokenizer)
test_input_ids, test_attention_masks, test_labels = tokenize_data(test_texts, test_labels, tokenizer)

# Load pre-trained DistilBERT model for classification with 5 output labels
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
model.to(device)  # Move model to GPU if available

# Create DataLoaders: only the training loader shuffles, so evaluation order is fixed.
batch_size = 16
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# Define optimizer.
# Uses PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Fine-tuning: one optimizer step per mini-batch, reporting the mean batch loss per epoch.
epochs = 1  # You can increase this for better accuracy
for epoch in range(epochs):
    model.train()
    batch_losses = []

    progress = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        batch_ids, batch_mask, batch_targets = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass: cross-entropy is computed on the raw logits.
        logits = model(batch_ids, attention_mask=batch_mask).logits
        loss = F.cross_entropy(logits, batch_targets)
        batch_losses.append(loss.item())

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses over the number of batches in the loader.
    avg_train_loss = sum(batch_losses) / len(train_dataloader)
    print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

# Evaluation: run the test set through the model and collect predicted / true class ids.
model.eval()
all_preds = []
all_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        b_input_ids, b_attention_masks, b_labels = [x.to(device) for x in batch]

        # Forward pass; the predicted class is the index of the largest logit.
        outputs = model(b_input_ids, attention_mask=b_attention_masks)
        predictions = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(b_labels.cpu().numpy())

# Compute accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {accuracy:.4f}")

# Classification report.
# The class ids are passed explicitly so the five display names always line up
# with ids 0-4; without `labels`, the report raises a length-mismatch error
# whenever some class is absent from both the targets and the predictions.
class_names = ['Very Poor', 'Poor', 'Neutral', 'Good', 'Very Good']
print("\nClassification Report:")
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(class_names))),
    target_names=class_names,
))

# Write the model and the tokenizer into the same local directory.
save_dir = './distilbert_sentiment_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print("\nModel saved successfully!")

def predict_sentiment(text):
    """Return the sentiment name predicted for ``text``.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.
    The model is put in eval mode, ``text`` is tokenized with padding and
    truncation at 128 tokens, and the index of the largest logit is
    looked up in the class-name table.

    Args:
        text: Text to classify. The result is reduced with ``.item()``, so
            exactly one prediction must come out of the model.

    Returns:
        One of "Very Poor", "Poor", "Neutral", "Good", "Very Good".
    """
    sentiment_labels = dict(enumerate(("Very Poor", "Poor", "Neutral", "Good", "Very Good")))

    model.eval()
    encoded = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Inference only: run the forward pass on the model's device without tracking gradients.
    with torch.no_grad():
        logits = model(
            encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        ).logits

    predicted_class = torch.argmax(logits, dim=-1).item()
    return sentiment_labels[predicted_class]

# Smoke-test the fine-tuned model on three hand-written reviews.
sample_texts = [
    "The product is amazing! I love it so much.",
    "It's an okay product, nothing special.",
    "Terrible quality. Completely disappointed.",
]

print("\nSample Predictions:")
for sample in sample_texts:
    sentiment = predict_sentiment(sample)
    print(f"Text: {sample} -> Sentiment: {sentiment}")

Training Epoch 1: 100%|██████████| 28423/28423 [1:22:00<00:00,  5.78it/s]


Epoch 1 - Training Loss: 0.5983


Evaluating: 100%|██████████| 7106/7106 [06:49<00:00, 17.36it/s]



Test Accuracy: 0.8053

Classification Report:
              precision    recall  f1-score   support

   Very Poor       0.68      0.85      0.76     10326
        Poor       0.55      0.35      0.43      5855
     Neutral       0.59      0.49      0.54      8485
        Good       0.61      0.45      0.52     16123
   Very Good       0.89      0.95      0.92     72902

    accuracy                           0.81    113691
   macro avg       0.66      0.62      0.63    113691
weighted avg       0.79      0.81      0.79    113691


Model saved successfully!

Sample Predictions:
Text: The product is amazing! I love it so much. -> Sentiment: Very Good
Text: It's an okay product, nothing special. -> Sentiment: Neutral
Text: Terrible quality. Completely disappointed. -> Sentiment: Very Poor
