In [19]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import numpy as np


In [20]:
# 1. Device Configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [21]:
df=pd.read_csv(r"abc.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Index,Review,Cleaned_Review,Cleaned_Name,Severity,Urgency,Customer_Satisfaction,Review_Days,Response_Days,Predicted_Category,Probabilities,Mapped_Category,Hygiene,Food Quality,Atmosphere,Value for Money,Service Issue,Positive Review,Food Options
0,0,0,"Stop eating at this place, I have visited bang...",stop eating place visited bangalores nd punes ...,pramod kumar,High,Urgent,No Response,180,-1,"['Quality, taste, or freshness issues with foo...","[0.9965850710868835, 0.9946374893188477, 0.844...","['Food Quality', 'Hygiene', 'Service Issue']",1,1,0,0,1,0,0
1,1,1,Food 2/5\nService 2/5\nAmbience 2/5 …,food service ambience,abhinav deep,Medium,Non-Urgent,High Satisfaction,365,365,"['Poor customer service or staff behavior', 'Q...","[0.8227755427360535, 0.7571902871131897]","['Service Issue', 'Food Quality']",0,1,0,0,1,0,0
2,2,2,Idiotic varieties for the price they have char...,idiotic varieties price charged varieties boil...,vijay nammi,High,Urgent,No Response,30,-1,"['Quality, taste, or freshness issues with foo...","[0.9860756993293762, 0.98301100730896]","['Food Quality', 'Value for Money']",0,1,0,1,0,0,0
3,3,3,"I am posting this live now, this is one of the...",posting live one worst places dont visit pathe...,surya ajay,High,Urgent,High Satisfaction,365,365,"['Poor customer service or staff behavior', 'Q...","[0.992651641368866, 0.9484805464744568]","['Service Issue', 'Food Quality']",0,1,0,0,1,0,0
4,4,4,"We are pure vegetarians, I ordered veg biryani...",pure vegetarians ordered veg biryani swiggy go...,sai hithesh,Low,Non-Urgent,No Response,180,-1,"['Poor customer service or staff behavior', 'C...","[0.9473494291305542, 0.8612152338027954, 0.833...","['Service Issue', 'Value for Money', 'Food Qua...",0,1,0,1,1,0,0


In [22]:
categories_to_add = ['Service Issue', 'Food Options', 'Food Quality', 'Atmosphere', 'Value for Money', 'Hygiene', 'Positive Review']

In [23]:
# 2. Split Data into Training and Test Sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Cleaned_Review'], df[categories_to_add], test_size=0.2, random_state=42
)

In [24]:
# 3. Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

In [25]:
# 4. Convert Labels to Tensors
train_labels = torch.tensor(train_labels.values).float()
test_labels = torch.tensor(test_labels.values).float()

In [26]:
# 5. Custom Dataset Class
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):  # Corrected constructor method name
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):  # Corrected method name
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):  # Corrected method name
        return len(self.labels)

In [27]:

train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)

In [28]:
# Load BERT model for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7).to(device)
criterion = torch.nn.BCEWithLogitsLoss()  # Standard binary cross-entropy loss
optimizer = AdamW(model.parameters(), lr=2e-5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:

# Training function
def train(model, train_loader, optimizer, criterion, accumulation_steps=4):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)

        outputs = model(**inputs)
        loss = criterion(outputs.logits, labels)
        loss = loss / accumulation_steps
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        
        total_loss += loss.item()

    return total_loss / len(train_loader)


In [None]:
# # Evaluation function
# def evaluate(model, test_loader, threshold=0.5):
#     model.eval()
#     all_preds = []
#     all_labels = []
#     with torch.no_grad():
#         for batch in test_loader:
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].cpu().numpy()
#             outputs = model(**inputs)
#             preds = torch.sigmoid(outputs.logits).cpu().numpy()

#             # Apply threshold for multi-label classification
#             preds = (preds > threshold).astype(int)
#             all_preds.extend(preds)
#             all_labels.extend(labels)

#     return np.array(all_preds), np.array(all_labels)

In [31]:
# Training loop
epochs = 3
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {train_loss:.4f}")


Epoch 1/3, Training Loss: 0.1104
Epoch 2/3, Training Loss: 0.0798
Epoch 3/3, Training Loss: 0.0603


In [None]:
# # Evaluation and metrics
# preds, labels = evaluate(model, test_loader, threshold=0.5)
# predicted_labels = (preds > 0.5).astype(int)
# print(classification_report(labels, predicted_labels, target_names=['Service Issue', 'Food Options', 'Food Quality', 'Atmosphere', 'Value for Money', 'Hygiene', 'Positive Review']))
# print("Accuracy:", accuracy_score(labels, predicted_labels))
# print("F1 Score (Micro):", f1_score(labels, predicted_labels, average='micro'))
# print("F1 Score (Macro):", f1_score(labels, predicted_labels, average='macro'))

                 precision    recall  f1-score   support

  Service Issue       0.82      0.84      0.83       392
   Food Options       0.62      0.51      0.56       278
   Food Quality       0.81      0.86      0.83       465
     Atmosphere       0.65      0.59      0.62       103
Value for Money       0.74      0.51      0.60       184
        Hygiene       0.69      0.62      0.65        58
Positive Review       0.82      0.44      0.57       160

      micro avg       0.76      0.69      0.73      1640
      macro avg       0.74      0.63      0.67      1640
   weighted avg       0.76      0.69      0.72      1640
    samples avg       0.76      0.72      0.71      1640

Accuracy: 0.375743162901308
F1 Score (Micro): 0.7262247838616714
F1 Score (Macro): 0.6687380966557391


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
category_labels = ['Service Issue', "Food Quality", "Atmosphere", "Value for Money", "Hygiene","Food Options","Positive Review"]

In [None]:
# # 8. Evaluation Function with Adjustable Threshold
# def evaluate(model, test_loader):
#     model.eval()
#     all_preds = []
#     all_labels = []
#     with torch.no_grad():
#         for batch in test_loader:
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].cpu().numpy()
#             outputs = model(**inputs)
#             preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Get probabilities

#             # Initialize lists for predicted categories and their probabilities
#             predicted_categories = []
#             probabilities_list = []

#             # Apply Threshold Logic
#             for pred in preds:
#                 predicted_category = []
#                 current_probabilities = []

#                 for label, score in zip(category_labels, pred):
#                     if score > 0.75:
#                         predicted_category.append(label)
#                         current_probabilities.append(score)

#                 # If no categories were found with scores > 0.75, check for scores between 0.2 and 0.75
#                 if not predicted_category:
#                     max_score = -1
#                     max_label = None
#                     positive_review_score = pred[category_labels.index("Positive Review")] if "Positive Review" in category_labels else 0

#                     for label, score in zip(category_labels, pred):
#                         if 0.2 <= score < 0.75:
#                             if score > max_score:
#                                 max_score = score
#                                 max_label = label
                    
#                     # Prioritize "Positive Review" if its score is the highest
#                     if positive_review_score > max_score:
#                         predicted_category = ["Positive Review"]
#                         current_probabilities = [positive_review_score]
#                     elif max_label is not None:
#                         predicted_category.append(max_label)
#                         current_probabilities.append(max_score)

#                 # If no categories were predicted, assign "Positive Review" with its probability
#                 if not predicted_category:
#                     predicted_category = ["Positive Review"]
#                     current_probabilities = [positive_review_score]  # Use the actual probability for "Positive Review"

#                 # Store results
#                 predicted_categories.append(predicted_category)
#                 probabilities_list.append(current_probabilities)

#             all_preds.extend(predicted_categories)
#             all_labels.extend(labels)

#     return np.array(all_preds), np.array(all_labels), np.array(probabilities_list)

In [94]:
def evaluate(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].cpu().numpy()  # True labels
            outputs = model(**inputs)
            preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Get probabilities
            
            # Initialize predictions
            binary_preds = np.zeros((preds.shape[0], preds.shape[1]))

            # Process predictions
            for idx, prob in enumerate(preds):
                predicted_categories = []
                probabilities = []

                # Step 1: Check for categories with probability > 0.75
                for i, score in enumerate(prob):
                    if score > 0.75:
                        predicted_categories.append(category_labels[i])
                        probabilities.append(score)

                # Step 2: If no categories above 0.75, check for max probability in range 0.2-0.75
                if not predicted_categories:
                    max_prob = 0.0
                    max_category = None
                    for i, score in enumerate(prob):
                        if 0.2 <= score < 0.75 and score > max_prob:
                            max_prob = score
                            max_category = category_labels[i]

                    if max_category:
                        predicted_categories.append(max_category)
                        probabilities.append(max_prob)
                    else:
                        # Step 3: Assign positive review with its probability if all are below 0.2
                        positive_review_prob = prob[category_labels.index("Positive Review")]
                        predicted_categories.append("Positive Review")
                        probabilities.append(positive_review_prob)

                # Convert the predicted categories to binary format
                for category in category_labels:
                    if category in predicted_categories:
                        binary_preds[idx, category_labels.index(category)] = 1

            all_preds.append(binary_preds)
            all_labels.append(labels)

    # Convert lists to arrays
    return np.vstack(all_preds), np.vstack(all_labels)


In [95]:
# Sample Usage of the Evaluate Function
predicted_labels, true_labels = evaluate(model, test_loader)


In [96]:
# Check the lengths
print("Length of true labels:", true_labels.shape[0])
print("Length of predicted labels:", predicted_labels.shape[0])

Length of true labels: 841
Length of predicted labels: 841


In [97]:
# Generate the classification report
predicted_labels_binary = (predicted_labels > 0.5).astype(float)  # Binary representation based on probabilities

# Generate the classification report
print(classification_report(true_labels, predicted_labels_binary, target_names=category_labels))
print("Accuracy:", accuracy_score(true_labels, predicted_labels_binary))

                 precision    recall  f1-score   support

  Service Issue       0.89      0.76      0.82       392
   Food Quality       0.68      0.27      0.39       278
     Atmosphere       0.86      0.79      0.82       465
Value for Money       0.78      0.50      0.61       103
        Hygiene       0.79      0.46      0.58       184
   Food Options       0.83      0.50      0.62        58
Positive Review       0.80      0.42      0.56       160

      micro avg       0.83      0.59      0.69      1640
      macro avg       0.80      0.53      0.63      1640
   weighted avg       0.81      0.59      0.67      1640
    samples avg       0.83      0.65      0.69      1640

Accuracy: 0.38882282996432815


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [98]:
import torch

def classify_reviews(model, reviews, category_labels):
    model.eval()
    all_preds = []
    all_probs = []

    # Tokenize the reviews (ensure you have a tokenizer defined)
    tokenized_reviews = tokenizer(reviews, padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**tokenized_reviews)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Get probabilities

        for prob in preds:
            predicted_categories = []
            probabilities = []

            # Step 1: Check for categories with probability > 0.75
            for i, score in enumerate(prob):
                if score > 0.75:
                    predicted_categories.append(category_labels[i])
                    probabilities.append(score)

            # Step 2: If no categories above 0.75, check for max probability in range 0.2-0.75
            if not predicted_categories:
                max_prob = 0.0
                max_category = None
                for i, score in enumerate(prob):
                    if 0.2 <= score < 0.75 and score > max_prob:
                        max_prob = score
                        max_category = category_labels[i]

                if max_category:
                    predicted_categories.append(max_category)
                    probabilities.append(max_prob)
                else:
                    # Step 3: Assign positive review with its probability if all are below 0.2
                    positive_review_prob = prob[category_labels.index("Positive Review")]
                    predicted_categories.append("Positive Review")
                    probabilities.append(positive_review_prob)

            all_preds.append(predicted_categories)
            all_probs.append(prob)

    return all_preds, all_probs

# Sample usage
sample_reviews = [
    "The food was excellent and the service was great!",
    "I had a terrible experience with the hygiene of the restaurant.",
    "The ambiance was lovely, but the food options were limited.",
    "I would recommend this place for its value for money.",
    "Overall, a great place to dine with family."
]

predicted_categories, predicted_probabilities = classify_reviews(model, sample_reviews, category_labels)

# Display results
for review, categories, probs in zip(sample_reviews, predicted_categories, predicted_probabilities):
    print(f"Review: {review}")
    print(f"Predicted Categories: {categories}")
    print(f"Probabilities: {probs}")
    print("-----")


Review: The food was excellent and the service was great!
Predicted Categories: ['Positive Review']
Probabilities: [0.04665793 0.06056271 0.1329418  0.01477121 0.02874581 0.01821567
 0.8305358 ]
-----
Review: I had a terrible experience with the hygiene of the restaurant.
Predicted Categories: ['Atmosphere', 'Food Options']
Probabilities: [0.59470195 0.33350614 0.9323903  0.35559288 0.15185976 0.9710812
 0.05553968]
-----
Review: The ambiance was lovely, but the food options were limited.
Predicted Categories: ['Food Quality']
Probabilities: [0.02972805 0.7892593  0.28067574 0.05442077 0.25652114 0.00831567
 0.3068922 ]
-----
Review: I would recommend this place for its value for money.
Predicted Categories: ['Hygiene']
Probabilities: [0.06078795 0.29377833 0.41276404 0.06392154 0.9379374  0.02645649
 0.21547987]
-----
Review: Overall, a great place to dine with family.
Predicted Categories: ['Positive Review']
Probabilities: [0.01040346 0.10566269 0.04997773 0.07108229 0.05596929 0.02

In [99]:
def classify_reviews(model, reviews, category_labels, threshold=0.5):
    model.eval()
    results = []

    with torch.no_grad():
        for review in reviews:
            # Prepare the input for the model
            inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Get predictions from the model
            outputs = model(**inputs)
            probs = torch.sigmoid(outputs.logits).cpu().numpy()[0]  # Get probabilities
            
            predicted_categories = []
            probabilities = []

            # Check for categories with probability > 0.5
            for i, score in enumerate(probs):
                if score > threshold:
                    predicted_categories.append(category_labels[i])
                    probabilities.append(score)

            # If no categories were predicted above the threshold, handle that case
            if not predicted_categories:
                # If no category has a score above the threshold, consider it as "No Category"
                predicted_categories.append("No Category")
                probabilities.append(0.0)  # Assign a default for no category

            results.append({
                "review": review,
                "predicted_categories": predicted_categories,
                "probabilities": probabilities
            })

    return results

# Sample reviews to classify
sample_reviews = [
    "The food was excellent and the service was great!",
    "I had a terrible experience with the hygiene of the restaurant.",
    "The ambiance was lovely, but the food options were limited.",
    "I would recommend this place for its value for money.",
    "Overall, a great place to dine with family."
]

# Classify the sample reviews
classified_results = classify_reviews(model, sample_reviews, category_labels)

# Display the classification results
for result in classified_results:
    print(f"Review: {result['review']}")
    print(f"Predicted Categories: {result['predicted_categories']}")
    print(f"Probabilities: {result['probabilities']}")
    print("-----")


Review: The food was excellent and the service was great!
Predicted Categories: ['Positive Review']
Probabilities: [0.83053577]
-----
Review: I had a terrible experience with the hygiene of the restaurant.
Predicted Categories: ['Service Issue', 'Atmosphere', 'Food Options']
Probabilities: [0.5947018, 0.93239015, 0.9710811]
-----
Review: The ambiance was lovely, but the food options were limited.
Predicted Categories: ['Food Quality']
Probabilities: [0.78925955]
-----
Review: I would recommend this place for its value for money.
Predicted Categories: ['Hygiene']
Probabilities: [0.9379374]
-----
Review: Overall, a great place to dine with family.
Predicted Categories: ['Positive Review']
Probabilities: [0.85693985]
-----


In [None]:
import os

# Create a directory to save the model and tokenizer
model_save_path = 'r'
os.makedirs(model_save_path, exist_ok=True)

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

print(f'Model and tokenizer saved to {model_save_path}')