# ABC X1 Smartwatch: Sentiment Analysis (BERT Upgrade)

## Objective
Achieve >80% accuracy using a pre-trained Transformer model (`nlptown/bert-base-multilingual-uncased-sentiment`).

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from sklearn.metrics import accuracy_score, classification_report
import torch
from tqdm import tqdm

# Load Dataset
# NOTE(review): the path below contains a space after "data" -- confirm that
# this matches the directory name on disk.
try:
    df = pd.read_csv('data /smart_watch_review.csv')
except FileNotFoundError:
    # `df` is not assigned on this path; later cells that read it need the
    # file to be present.
    print("Error: File not found.")
else:
    print("Dataset loaded successfully.")

In [None]:
# Generate Ground Truth Labels
# Select the first column whose lower-cased name contains "rating" or "star";
# rating_col stays None when no column matches.
rating_col = next(
    (
        name
        for name in df.columns
        if any(keyword in name.lower() for keyword in ('rating', 'star'))
    ),
    None,
)

if rating_col:
    def extract_rating(val):
        """Return the leading number of a rating cell as a float.

        The value is converted with ``str()`` and its first
        whitespace-separated token is parsed, so ``"4.0 out of 5"`` yields
        ``4.0``. Values that cannot be parsed (empty/blank text or a
        non-numeric first token) fall back to ``3.0``, which ``get_label``
        maps to 'Neutral'.
        """
        try:
            return float(str(val).split()[0])
        except (ValueError, IndexError):
            # ValueError: first token is not numeric.
            # IndexError: the cell is empty or whitespace only.
            return 3.0

    df['numeric_rating'] = df[rating_col].apply(extract_rating)

    def get_label(rating):
        """Map a numeric rating to a sentiment label.

        Above 3 is 'Positive', below 3 is 'Negative'; everything else
        (exactly 3, and NaN, which fails both comparisons) is 'Neutral'.
        """
        if rating > 3:
            return 'Positive'
        if rating < 3:
            return 'Negative'
        return 'Neutral'

    df['Sentiment'] = df['numeric_rating'].apply(get_label)
    print("Labels generated.")
else:
    print("Warning: No rating column found.")

In [None]:
# Load Pre-trained Model (Product Reviews 1-5 stars)
# The same identifier is passed to both loaders so the tokenizer and the
# sequence-classification model come from the same checkpoint.
MODEL = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
print("BERT Model Loaded.")

# Prediction Function
def predict_sentiment(text):
    """Classify a review as 'Negative', 'Neutral' or 'Positive'.

    The text is tokenized (truncated to 512 tokens) and scored by the
    module-level ``model``. The index of the highest logit plus one is
    treated as a 1-5 star rating: 1-2 stars -> 'Negative', 3 stars ->
    'Neutral', 4-5 stars -> 'Positive'.

    Args:
        text: Review text. Non-string values (e.g. ``None`` or NaN from a
            missing cell) are not scored.

    Returns:
        One of 'Negative', 'Neutral' or 'Positive'. 'Neutral' is also the
        fallback for non-string input and for any failure during
        tokenization or inference.
    """
    if not isinstance(text, str):
        return 'Neutral'

    try:
        encoded_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output = model(**encoded_text)
        # Labels: 0->1 star, 1->2 stars, 2->3 stars, 3->4 stars, 4->5 stars.
        # Softmax is monotonic, so the argmax of the raw logits selects the
        # same class as the argmax of the probabilities.
        star_rating = int(output.logits[0].argmax()) + 1
    except Exception:
        # Deliberate best-effort: one bad row must not abort a whole
        # DataFrame apply. KeyboardInterrupt/SystemExit still propagate.
        return 'Neutral'

    if star_rating <= 2:
        return 'Negative'
    if star_rating == 3:
        return 'Neutral'
    return 'Positive'

In [None]:
# Debug Data Extraction
for caption, column in (
    ("Unique numeric ratings:", 'numeric_rating'),
    ("Unique sentiments:", 'Sentiment'),
):
    print(caption, df[column].unique())

# Run Predictions (on a sample to save time, or full dataset)
print("Running predictions on first 500 rows for verification...")
tqdm.pandas()  # enables .progress_apply on pandas objects
sample_df = df.iloc[:500].copy()
sample_df['Predicted'] = sample_df['review'].progress_apply(predict_sentiment)

# Evaluate: compare rating-derived labels with the model's predictions
y_true, y_pred = sample_df['Sentiment'], sample_df['Predicted']
acc = accuracy_score(y_true, y_pred)
print(f"\nAccuracy on Sample: {acc*100:.2f}%")
report = classification_report(y_true, y_pred)
print("\nClassification Report:\n", report)

# Show Misclassified Examples
# Prints up to 10 sample rows where the predicted label differs from the
# rating-derived label, with the first 100 characters of the review.
print("\nMisclassified Examples:")
errors = sample_df[sample_df['Sentiment'] != sample_df['Predicted']].head(10)
for _, row in errors.iterrows():
    # str() guards against non-string reviews (e.g. NaN for a missing cell),
    # which cannot be sliced and would raise TypeError here.
    print(f"Text: {str(row['review'])[:100]}...")
    print(f"True: {row['Sentiment']} (Rating: {row['numeric_rating']}) | Pred: {row['Predicted']}")
    print("-" * 50)