In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import xgboost as xgb
from google.colab import drive
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel

# --- Runtime setup ---------------------------------------------------------
# The dataset lives on Google Drive, so the drive has to be mounted first.
drive.mount('/content/drive')

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- Data ------------------------------------------------------------------
# Names of the review-text column and the star-rating column.
TEXT_COLUMN = 'Text'
LABEL_COLUMN = 'Score'

# Star ratings 1..5 are mapped to zero-based class ids 0..4.
rating_map = {stars: stars - 1 for stars in range(1, 6)}

# Read the reviews and discard rows missing either the text or the rating.
df = pd.read_csv('/content/drive/MyDrive/research/dataset_new/Reviews.csv')
df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
df['Sentiment'] = df[LABEL_COLUMN].map(rating_map)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
review_texts = df[TEXT_COLUMN].values
review_labels = df['Sentiment'].values
train_texts, test_texts, train_labels, test_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)

# --- Pretrained encoder ----------------------------------------------------
# Tokenizer and encoder are loaded from the same checkpoint name; the encoder
# is then moved to the device selected above.
PRETRAINED_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_model = DistilBertModel.from_pretrained(PRETRAINED_NAME)
bert_model = bert_model.to(device)

# Tokenization function
def tokenize_data(texts, tokenizer, max_length=128):
    """Tokenize one or more texts into fixed-length model inputs.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as ONE text; without this guard ``list(texts)`` would
            split it into individual characters.
        tokenizer: Callable tokenizer. It is invoked with the list of texts
            plus the ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors`` keyword arguments, and its result must expose
            ``input_ids`` and ``attention_mask`` attributes.
        max_length: Length passed to the tokenizer; with
            ``padding='max_length'`` and ``truncation=True`` every sequence
            is padded or cut to this length.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output
        (PyTorch tensors, as requested via ``return_tensors='pt'``).
    """
    # A str is itself iterable, so wrap it explicitly instead of letting
    # list() explode it into one-character "texts".
    batch = [texts] if isinstance(texts, str) else list(texts)

    encoded = tokenizer(
        batch,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return encoded.input_ids, encoded.attention_mask

# Extract BERT embeddings
def extract_embeddings(texts, tokenizer, model, batch_size=16):
    """Encode texts into one embedding vector each, processing mini-batches.

    The vector for a text is the encoder's last hidden state at sequence
    position 0, i.e. ``last_hidden_state[:, 0, :]``.

    Args:
        texts: Sized, sliceable sequence of strings (list, NumPy array, ...).
        tokenizer: Tokenizer forwarded to ``tokenize_data``.
        model: Encoder whose output exposes ``last_hidden_state``. It is
            switched to eval mode and left in that mode.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        2-D NumPy array with one row per input text. For empty input the
        array has zero rows; its column count is ``model.config.hidden_size``
        when the model exposes that attribute, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()

    # np.vstack rejects an empty list, so answer the empty case explicitly.
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to wherever the given model actually lives rather than to a
    # notebook-level global, so the function works for any model passed in.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    all_embeddings = []
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        for start in tqdm(range(0, len(texts), batch_size), desc="Extracting Embeddings"):
            batch_texts = texts[start:start + batch_size]
            input_ids, attention_masks = tokenize_data(batch_texts, tokenizer)
            input_ids = input_ids.to(target_device)
            attention_masks = attention_masks.to(target_device)

            outputs = model(input_ids, attention_mask=attention_masks)
            # Keep only the first-position vector and move it to host memory
            # so results accumulate as NumPy arrays, not device tensors.
            all_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.vstack(all_embeddings)



Mounted at /content/drive
Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Generate embeddings
import numpy as np
# Encode both splits with the frozen encoder: training split first, then test.
train_embeddings, test_embeddings = (
    extract_embeddings(split_texts, tokenizer, bert_model)
    for split_texts in (train_texts, test_texts)
)

# Fit a gradient-boosted tree classifier on the training embeddings.
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': 5,
    'eval_metric': 'mlogloss',
}
xgb_classifier = xgb.XGBClassifier(**xgb_params)
xgb_classifier.fit(train_embeddings, train_labels)

# Predict class ids for the held-out embeddings.
y_pred = xgb_classifier.predict(test_embeddings)

# Report overall accuracy followed by the per-class breakdown.
accuracy = accuracy_score(test_labels, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
class_names = ['Very Poor', 'Poor', 'Neutral', 'Good', 'Very Good']
report = classification_report(test_labels, y_pred, target_names=class_names)
print(report)

# Sample text prediction
def predict_sentiment(text):
    """Return the sentiment name predicted for a single text.

    Uses the notebook-level ``bert_model``, ``tokenizer``, ``device`` and
    ``xgb_classifier``: the text is tokenized, passed through the encoder,
    and the position-0 hidden state is handed to the classifier.

    Args:
        text: The text to classify.

    Returns:
        One of "Very Poor", "Poor", "Neutral", "Good" or "Very Good".
    """
    # Class id -> display name, ids 0..4 in ascending order.
    label_names = dict(enumerate(("Very Poor", "Poor", "Neutral", "Good", "Very Good")))

    bert_model.eval()

    # Tokenize as a one-item batch and move both tensors to the active device.
    encoded_ids, encoded_mask = tokenize_data([text], tokenizer)
    encoded_ids = encoded_ids.to(device)
    encoded_mask = encoded_mask.to(device)

    with torch.no_grad():
        hidden_states = bert_model(encoded_ids, attention_mask=encoded_mask).last_hidden_state
        cls_vector = hidden_states[:, 0, :].cpu().numpy()

    # The classifier returns one prediction per row; there is exactly one row.
    predicted_class = xgb_classifier.predict(cls_vector)[0]
    return label_names[predicted_class]

# A few hand-written reviews (positive, lukewarm, negative) used as a quick
# sanity check of the whole pipeline.
sample_texts = [
    "The product is amazing! I love it so much.",
    "It's an okay product, nothing special.",
    "Terrible quality. Completely disappointed.",
]

print("\nSample Predictions:")
for text in sample_texts:
    sentiment = predict_sentiment(text)
    print(f"Text: {text} -> Sentiment: {sentiment}")


Extracting Embeddings: 100%|██████████| 28423/28423 [32:07<00:00, 14.74it/s]
Extracting Embeddings: 100%|██████████| 7106/7106 [08:00<00:00, 14.78it/s]


Test Accuracy: 0.7627

Classification Report:
              precision    recall  f1-score   support

   Very Poor       0.66      0.69      0.68     10326
        Poor       0.63      0.34      0.44      5855
     Neutral       0.58      0.36      0.44      8485
        Good       0.62      0.29      0.39     16123
   Very Good       0.80      0.96      0.87     72902

    accuracy                           0.76    113691
   macro avg       0.66      0.53      0.56    113691
weighted avg       0.74      0.76      0.73    113691


Sample Predictions:
Text: The product is amazing! I love it so much. -> Sentiment: Very Good
Text: It's an okay product, nothing special. -> Sentiment: Very Good
Text: Terrible quality. Completely disappointed. -> Sentiment: Very Good
