## Import Required Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from tqdm import tqdm
from transformers import pipeline

## Load Dataset

In [None]:
# Read the raw review data; the path is resolved relative to the working directory.
df = pd.read_csv('../data/raw/car_reviews.csv')
# Print the preview explicitly: a bare `df.head()` that is not the last
# expression of a cell is evaluated and then discarded, so nothing is shown.
print(df.head())
# Column names, dtypes and non-null counts.
df.info()

# Zero-shot Classification

Model Name: facebook/bart-large-mnli

Why?

    It's a popular zero-shot text classification model.

    Based on the BART transformer and trained on the MNLI (Multi-Genre Natural Language Inference) dataset.

    Performs well on zero-shot inference when the candidate labels are written as short natural-language descriptions.

    Supports both single-label and multi-label classification without any retraining.

In [None]:
# Set up the zero-shot classifier.
# Use the first CUDA GPU when one is visible; otherwise fall back to the CPU
# (device=-1) so this cell does not fail on machines without a GPU.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
)

# Candidate labels: each review is assigned the one the model ranks highest.
labels = [
    "talks about driving experience",
    "talks about features",
    "talks about value for money",
    "talks about issues",
    "other",
]

# Batch classification function
def classify_in_batches(texts, labels, batch_size=32):
    """Return the top-ranked candidate label for each text.

    The texts are sent to the module-level ``classifier`` pipeline in
    consecutive slices of at most ``batch_size`` items.

    Args:
        texts: Sequence of review strings (must support ``len`` and slicing).
        labels: Candidate labels handed to the zero-shot pipeline.
        batch_size: Maximum number of texts per pipeline call; must be >= 1.

    Returns:
        A list with one label per input text, in input order. Empty input
        yields an empty list without calling the pipeline.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return [] for
    # non-empty input, so reject invalid sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        outputs = classifier(batch, labels)
        # NOTE(review): some transformers releases appear to return a bare
        # dict (not a one-element list) for a single sequence, which can
        # happen on the final batch; normalise so the loop below is safe.
        if isinstance(outputs, dict):
            outputs = [outputs]
        # Keep only the first entry of each result's 'labels' list (the top
        # predicted label).
        results.extend(output['labels'][0] for output in outputs)
    return results

# Classify every review in the dataset and store the winning label per row
review_texts = df['Review'].tolist()
df['talks_about'] = classify_in_batches(review_texts, labels, batch_size=32)

# Show the first rows of the review text next to its assigned category
preview_columns = ['Review', 'talks_about']
print(df[preview_columns].head())


# Sentiment Analysis 

Model Name: distilbert/distilbert-base-uncased-finetuned-sst-2-english

Why?

    We selected DistilBERT fine-tuned for sentiment analysis. It is designed for binary classification tasks (POSITIVE or NEGATIVE).

    DistilBERT is a smaller and faster version of BERT. It is ideal for working with large datasets like our 6000 reviews.

    The model offers a good balance between speed and accuracy, making it efficient for large-scale text processing.

    It is a trusted and widely used model on Hugging Face, known for handling general English text like car reviews effectively.

In [None]:
# Load the sentiment-analysis pipeline with an explicit model.
# truncation=True is passed so over-long reviews are truncated by the tokenizer.
# The GPU is used only when CUDA is actually available; otherwise the pipeline
# runs on the CPU (device=-1) instead of failing on a missing device.
sentiment_model = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    device=0 if torch.cuda.is_available() else -1,
)

# Batch prediction function
def get_sentiment_in_batches(texts, batch_size=32):
    """Return the lower-cased sentiment label for each text.

    The texts are sent to the module-level ``sentiment_model`` pipeline in
    consecutive slices of at most ``batch_size`` items.

    Args:
        texts: Sequence of review strings (must support ``len`` and slicing).
        batch_size: Maximum number of texts per pipeline call; must be >= 1.

    Returns:
        A list with one lower-cased label per input text, in input order.
        Empty input yields an empty list without calling the pipeline.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return [] for
    # non-empty input, so reject invalid sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    results = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        outputs = sentiment_model(batch)
        # Labels are stored lower-cased (e.g. POSITIVE -> "positive").
        results.extend(output['label'].lower() for output in outputs)
    return results

# Score every review and attach the result as a new 'sentiment' column
all_reviews = df['Review'].tolist()
df['sentiment'] = get_sentiment_in_batches(all_reviews)

# Visualizations 

In [None]:
# Apply the shared seaborn theme before any figure is created
sns.set(style="whitegrid")

# Visualization 1: number of reviews per sentiment label
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='sentiment', palette='coolwarm', ax=ax)
ax.set_title("Sentiment Spread of Car Reviews")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Number of Reviews")
plt.show()

# Visualization 2: number of reviews per category, ordered by value_counts()
category_order = df['talks_about'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='talks_about', palette='Set3', order=category_order, ax=ax)
ax.set_title("Distribution of Review Categories")
ax.set_xlabel("Number of Reviews")
ax.set_ylabel("Review Category")
plt.show()

### Save final CSV

In [None]:
# Write the annotated DataFrame to disk, leaving out the index column
output_csv = "car_reviews_with_classification.csv"
df.to_csv(output_csv, index=False)