# Product Review Analysis and RAG Simulation

In [None]:
# Product Review Analysis and RAG Simulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
import os # Added for directory creation

# Silence every warning category for the rest of the run. This also hides
# deprecation and numerical warnings raised by pandas/sklearn/matplotlib below.
warnings.filterwarnings('ignore')

# Download NLTK resources
# NOTE(review): the visible code never calls the NLTK tokenizer, stopword list
# or lemmatizer (preprocess_text is regex-only), so these downloads may be
# unnecessary network work - confirm before removing.
print("Downloading NLTK resources (if needed)...")
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
print("NLTK resources checked/downloaded.")

# Set style for visualizations (global matplotlib/seaborn state shared by every
# figure created later in the script)
plt.style.use('ggplot')
sns.set_palette("Set2")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Define output directories. Both paths are relative, so they resolve against
# the current working directory; they are created up front so that the later
# savefig()/to_csv() calls do not fail on a missing folder.
plot_output_dir = 'notebooks/output_plots_rag'
data_output_dir = 'data/processed'
os.makedirs(plot_output_dir, exist_ok=True)
os.makedirs(data_output_dir, exist_ok=True)

# Load the product reviews dataset (path is relative to the current working directory)
try:
    reviews_df = pd.read_csv('data/raw/product_reviews.csv')
    print("Loaded data/raw/product_reviews.csv")
except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure 'data/raw/product_reviews.csv' exists.")
    # Nothing below can run without the data. Raise SystemExit directly instead
    # of calling exit(): that helper is only injected by the `site` module (it
    # is missing under `python -S` and in frozen builds), and calling it with
    # no argument reports success (status 0) to the calling shell.
    raise SystemExit(1)

# Convert date column to datetime so the temporal analysis can use the .dt accessor
reviews_df['date'] = pd.to_datetime(reviews_df['date'])

# High-level overview of the dataset
print("\n=== Product Review Analysis ===")
print(f"Total reviews: {len(reviews_df)}")
print(f"Unique products: {reviews_df['product'].nunique()}")
print(f"Categories: {reviews_df['category'].nunique()}")
print(f"Rating distribution: \n{reviews_df['rating'].value_counts().sort_index()}")
print(f"Sentiment distribution: \n{reviews_df['sentiment'].value_counts()}")

# Regex-only text normalisation (no NLTK involved)
def preprocess_text(text):
    """Return *text* lower-cased, stripped of non-letters, with whitespace collapsed.

    Anything that is not a ``str`` (for example a NaN from a missing CSV cell)
    yields an empty string. Characters other than ASCII letters and whitespace
    are deleted outright, so "battery-life" becomes "batterylife".
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Store a cleaned copy of each review next to the raw text; the TF-IDF step
# later in the script vectorises this column.
print("\nPreprocessing review text...")
reviews_df['processed_text'] = reviews_df['review_text'].map(preprocess_text)
print("Preprocessing complete.")

# Feature-level and attribute-level sentiment analysis.
# Both passes run the same computation on a different grouping column, so the
# logic lives in two small helpers instead of being written out twice.
def _sentiment_scores(df, group_col):
    """Return per-group sentiment counts with ``total`` and ``sentiment_score``.

    Args:
        df: DataFrame holding ``group_col`` and a ``sentiment`` column.
        group_col: Column whose distinct values become the rows of the result.

    Returns:
        DataFrame indexed by ``group_col`` with one count column per sentiment
        label, a ``total`` column, and ``sentiment_score`` computed as
        (positive - negative) / total, i.e. a value between -1 and 1. Rows are
        sorted by score, best first.
    """
    counts = df.groupby([group_col, 'sentiment']).size().unstack(fill_value=0)
    total = counts.sum(axis=1)

    # A sentiment label may be entirely absent from the data, in which case it
    # simply contributes nothing to the net score.
    net = pd.Series(0, index=counts.index)
    if 'positive' in counts.columns:
        net = net + counts['positive']
    if 'negative' in counts.columns:
        net = net - counts['negative']

    counts['total'] = total
    # Vectorised division; a zero total is masked to NaN first so that such a
    # row scores 0 rather than NaN/inf.
    counts['sentiment_score'] = (net / total.where(total > 0)).fillna(0)
    return counts.sort_values('sentiment_score', ascending=False)


def _save_score_plot(scores, title, xlabel, filename, top_n=15):
    """Save a bar chart of the first ``top_n`` sentiment scores and return its path."""
    plt.figure(figsize=(12, 8))
    scores['sentiment_score'].head(top_n).plot(kind='bar')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Sentiment Score (-1 to 1)')
    plt.tight_layout()
    path = os.path.join(plot_output_dir, filename)
    plt.savefig(path)
    plt.close()
    print(f"Saved plot: {path}")
    return path


print("\nPerforming feature-level sentiment analysis...")
feature_sentiment = _sentiment_scores(reviews_df, 'feature_mentioned')
plot_path = _save_score_plot(
    feature_sentiment,
    'Top 15 Features by Sentiment Score',
    'Feature',
    'feature_sentiment_score.png',
)

print("\nPerforming attribute-level sentiment analysis...")
attribute_sentiment = _sentiment_scores(reviews_df, 'attribute_mentioned')
plot_path = _save_score_plot(
    attribute_sentiment,
    'Top 15 Attributes by Sentiment Score',
    'Attribute',
    'attribute_sentiment_score.png',
)


# Product-level analysis
print("\nPerforming product-level analysis...")

# One row per product: mean rating and number of (non-null) review ids
product_analysis = reviews_df.groupby('product').agg(
    rating=('rating', 'mean'),
    review_count=('review_id', 'count'),
)

# Attach raw per-sentiment review counts; products missing from the crosstab
# end up with zeros instead of NaN
sentiment_counts = pd.crosstab(reviews_df['product'], reviews_df['sentiment'])
product_analysis = product_analysis.join(sentiment_counts, how='left').fillna(0)

# Share of a product's reviews that are positive
if 'positive' not in product_analysis.columns:
    product_analysis['positive_ratio'] = 0
else:
    product_analysis['positive_ratio'] = product_analysis.apply(
        lambda product_row: (
            product_row['positive'] / product_row['review_count']
            if product_row['review_count'] > 0
            else 0
        ),
        axis=1,
    )

# Best-rated products first
product_analysis = product_analysis.sort_values('rating', ascending=False)

# Bar chart of average rating per product, with the mean of those averages
# drawn as a dashed reference line
fig = plt.figure(figsize=(14, 8))
ax = product_analysis['rating'].plot(kind='bar')
ax.set_title('Average Rating by Product')
ax.set_xlabel('Product')
ax.set_ylabel('Average Rating')
ax.axhline(y=product_analysis['rating'].mean(), color='r', linestyle='--', label='Average')
ax.legend()
plt.xticks(rotation=90)
fig.tight_layout()
plot_path = os.path.join(plot_output_dir, 'product_ratings.png')
fig.savefig(plot_path)
plt.close(fig)
print(f"Saved plot: {plot_path}")

# Product strengths and weaknesses analysis
print("\nIdentifying product strengths and weaknesses...")


def get_product_insights(product_name, sentiment_type='positive'):
    """Return the most-mentioned features and attributes for one product.

    Only reviews of ``product_name`` whose sentiment equals ``sentiment_type``
    are considered; missing feature/attribute values are ignored.

    Returns:
        dict with keys ``'features'`` and ``'attributes'``, each a Series of
        mention counts holding at most the three most frequent values.
    """
    is_match = (
        (reviews_df['product'] == product_name)
        & (reviews_df['sentiment'] == sentiment_type)
    )
    matching_reviews = reviews_df[is_match]

    def top_three(column):
        return matching_reviews[column].dropna().value_counts().head(3)

    return {
        'features': top_three('feature_mentioned'),
        'attributes': top_three('attribute_mentioned'),
    }

# Collect and print strengths/weaknesses for the five best-rated products
top_products = product_analysis.index[:5].tolist()
product_insights_summary = {}

print("\nProduct Insights (Strengths and Weaknesses):")
for product in top_products:
    strengths = get_product_insights(product, 'positive')
    weaknesses = get_product_insights(product, 'negative')
    product_insights_summary[product] = {'strengths': strengths, 'weaknesses': weaknesses}

    print(f"\n--- {product} ---")
    for heading, insights in (("  Strengths:", strengths), ("  Weaknesses:", weaknesses)):
        print(heading)
        # A line is printed only when there is something to list
        for prefix, key in (("    Features: ", 'features'), ("    Attributes: ", 'attributes')):
            mentioned = insights[key].index
            if len(mentioned) > 0:
                print(prefix + ', '.join(mentioned))


# Generate embeddings for vector search
print("\nGenerating embeddings for vector search...")

# TF-IDF term weights act as the document vectors. The vocabulary is capped at
# 100 terms to keep the later PCA step fast.
tfidf = TfidfVectorizer(max_features=100)
review_vectors = tfidf.fit_transform(reviews_df['processed_text'])
print("TF-IDF vectors generated.")


def search_similar_reviews(query, top_n=5):
    """Return the ``top_n`` rows of ``reviews_df`` most similar to ``query``.

    The query goes through the same ``preprocess_text`` cleaning and the fitted
    ``tfidf`` vectorizer as the reviews, and rows are ranked by cosine
    similarity, best match first. Rows are selected by position, so row *i* of
    ``review_vectors`` must correspond to row *i* of ``reviews_df``.
    """
    query_vector = tfidf.transform([preprocess_text(query)])
    scores = cosine_similarity(query_vector, review_vectors).ravel()
    # Ascending argsort reversed gives best-first positions
    best_first = scores.argsort()[::-1]
    return reviews_df.iloc[best_first[:top_n]]

# Example search
print("\nExample vector search:")
search_query = "battery life problems"
similar_reviews = search_similar_reviews(search_query)
print(f"Query: '{search_query}'")
print("Top 5 similar reviews:")
# Only product, rating and sentiment are shown; the review text itself is left
# out to keep the script output concise.
ranked_fields = zip(
    similar_reviews['product'],
    similar_reviews['rating'],
    similar_reviews['sentiment'],
)
for position, (product_name, rating, sentiment_label) in enumerate(ranked_fields, start=1):
    print(f"{position}. Product: {product_name}, Rating: {rating}, Sentiment: {sentiment_label}")

# Visualize embeddings with PCA: project the TF-IDF vectors onto two principal
# components and save two scatter plots, one coloured by sentiment and one by
# product category.
print("\nVisualizing review embeddings...")
# NOTE(review): the broad `except Exception` at the bottom turns any failure in
# this section into a printed message, so the script continues without the
# plots. Consider narrowing it once the expected failure modes are known.
try:
    pca = PCA(n_components=2)
    # PCA is fitted on a dense array, so the TF-IDF matrix is densified first
    review_vectors_dense = review_vectors.toarray()
    embeddings_2d = pca.fit_transform(review_vectors_dense)

    # The x/y arrays are positional while the three Series carry reviews_df's
    # index - assumes reviews_df still has its default 0..n-1 index; TODO confirm.
    embedding_df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1],
        'sentiment': reviews_df['sentiment'],
        'product': reviews_df['product'],
        'category': reviews_df['category']
    })

    # Plot embeddings by sentiment
    plt.figure(figsize=(12, 10))
    # Only these three labels are drawn; rows with any other sentiment value
    # do not appear in this plot.
    sentiment_colors = {'positive': 'green', 'neutral': 'blue', 'negative': 'red'}
    for sentiment, color in sentiment_colors.items():
        mask = embedding_df['sentiment'] == sentiment
        plt.scatter(embedding_df.loc[mask, 'x'], embedding_df.loc[mask, 'y'], c=color, label=sentiment, alpha=0.7)
    plt.title('Review Embeddings by Sentiment (PCA)')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.tight_layout()
    plot_path = os.path.join(plot_output_dir, 'review_embeddings_sentiment.png')
    plt.savefig(plot_path)
    plt.close()
    print(f"Saved plot: {plot_path}")

    # Plot embeddings by category
    plt.figure(figsize=(12, 10))
    categories = embedding_df['category'].unique()
    # One evenly spaced viridis colour per distinct category
    colors = plt.cm.viridis(np.linspace(0, 1, len(categories)))
    for i, category in enumerate(categories):
        mask = embedding_df['category'] == category
        plt.scatter(embedding_df.loc[mask, 'x'], embedding_df.loc[mask, 'y'], color=colors[i], label=category, alpha=0.7)
    plt.title('Review Embeddings by Product Category (PCA)')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend()
    plt.tight_layout()
    plot_path = os.path.join(plot_output_dir, 'review_embeddings_category.png')
    plt.savefig(plot_path)
    plt.close()
    print(f"Saved plot: {plot_path}")

except Exception as e:
    print(f"Error during PCA visualization: {e}")


# Temporal analysis of reviews
print("\nPerforming temporal analysis of reviews...")

# Bucket every review into its calendar month, then aggregate per month
reviews_df['month_year'] = reviews_df['date'].dt.to_period('M')
monthly_ratings = reviews_df.groupby('month_year').agg(
    rating=('rating', 'mean'),
    review_count=('review_id', 'count'),
)

# The two monthly charts differ only in column, plot style, labels and file
# name, so they are described as data and drawn by a single loop.
monthly_plot_specs = (
    ('rating', {'kind': 'line', 'marker': 'o'},
     'Average Rating by Month', 'Average Rating', 'monthly_ratings.png'),
    ('review_count', {'kind': 'bar'},
     'Review Count by Month', 'Number of Reviews', 'monthly_review_counts.png'),
)
for column, plot_options, chart_title, y_label, file_name in monthly_plot_specs:
    plt.figure(figsize=(14, 6))
    monthly_ratings[column].plot(**plot_options)
    plt.title(chart_title)
    plt.xlabel('Month')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.tight_layout()
    plot_path = os.path.join(plot_output_dir, file_name)
    plt.savefig(plot_path)
    plt.close()
    print(f"Saved plot: {plot_path}")

# Feature-attribute correlation analysis
# NOTE(review): despite the "correlation" wording, the values plotted below are
# row-normalised co-occurrence shares (each feature's row sums to 1), not
# correlation coefficients.
print("\nAnalyzing feature-attribute correlations...")

# NOTE(review): the broad `except Exception` only prints the error, so a
# failure here is reported but does not stop the script.
try:
    # Create a cross-tabulation of features and attributes
    # Drop NaN values before crosstab
    temp_df = reviews_df.dropna(subset=['feature_mentioned', 'attribute_mentioned'])
    if not temp_df.empty:
        # Rows are features, columns are attributes, cells are the number of
        # reviews mentioning both
        feature_attribute = pd.crosstab(
            temp_df['feature_mentioned'],
            temp_df['attribute_mentioned']
        )

        # Normalize by row (feature): each cell becomes the share of that
        # feature's reviews that mention the attribute
        feature_attribute_norm = feature_attribute.div(feature_attribute.sum(axis=1), axis=0).fillna(0)

        # Plot heatmap of top features and attributes (the ten most frequently
        # mentioned of each)
        top_features = temp_df['feature_mentioned'].value_counts().head(10).index
        top_attributes = temp_df['attribute_mentioned'].value_counts().head(10).index

        # Ensure top features/attributes exist in the crosstab index/columns
        # (defensive: both are derived from the same temp_df as the crosstab)
        valid_features = [f for f in top_features if f in feature_attribute_norm.index]
        valid_attributes = [a for a in top_attributes if a in feature_attribute_norm.columns]

        if valid_features and valid_attributes:
            plt.figure(figsize=(14, 10))
            sns.heatmap(
                feature_attribute_norm.loc[valid_features, valid_attributes],
                annot=True,
                cmap='YlGnBu',
                fmt='.2f'
            )
            plt.title('Feature-Attribute Correlation Heatmap (Top 10)')
            plt.tight_layout()
            plot_path = os.path.join(plot_output_dir, 'feature_attribute_heatmap.png')
            plt.savefig(plot_path)
            plt.close()
            print(f"Saved plot: {plot_path}")
        else:
            print("Skipped heatmap: Not enough valid top features/attributes found in data.")
    else:
        print("Skipped heatmap: No valid feature/attribute data after dropping NaNs.")
except Exception as e:
    print(f"Error during feature-attribute correlation: {e}")


# RAG Implementation Simulation
# The functions in this section build templated text from pandas aggregates
# over reviews_df; no retriever or language model is called.
print("\n=== RAG Implementation Simulation ===")
print("Note: This is a simplified simulation of how RAG would work")


def simulate_rag_product_insights(product_name):
    """Return a plain-text insight summary for one product.

    Args:
        product_name: Value to match exactly against ``reviews_df['product']``.

    Returns:
        A newline-terminated, multi-line summary (average rating, sentiment
        mix, most-mentioned features/attributes, strengths, weaknesses and one
        sample review per polarity), or ``"No reviews found for <name>"`` when
        the product has no reviews. Lines with nothing to report are omitted.
    """
    product_reviews = reviews_df[reviews_df['product'] == product_name]
    if product_reviews.empty:
        return f"No reviews found for {product_name}"

    def top_values(frame, column, limit):
        # Most frequent non-missing values of *column*, at most *limit* of them
        return frame[column].dropna().value_counts().head(limit)

    def sample_text(frame, max_chars=80):
        # First review whose text is really a string. A missing text is read
        # from CSV as a float NaN, and slicing that would raise TypeError.
        for text in frame['review_text']:
            if isinstance(text, str):
                # Mark truncation only when something was actually cut off
                if len(text) > max_chars:
                    return text[:max_chars] + "..."
                return text
        return None

    avg_rating = product_reviews['rating'].mean()
    sentiment_dist = product_reviews['sentiment'].value_counts(normalize=True)
    positive_reviews = product_reviews[product_reviews['sentiment'] == 'positive']
    negative_reviews = product_reviews[product_reviews['sentiment'] == 'negative']

    lines = [
        f"Product Insights for {product_name}:",
        f"- Average Rating: {avg_rating:.2f}/5 ({len(product_reviews)} reviews)",
        "- Sentiment: " + ", ".join(f"{s} ({p:.1%})" for s, p in sentiment_dist.items()),
    ]

    ranked_sections = (
        ("- Top Features: ", top_values(product_reviews, 'feature_mentioned', 5)),
        ("- Top Attributes: ", top_values(product_reviews, 'attribute_mentioned', 5)),
        ("- Strengths (Features): ", top_values(positive_reviews, 'feature_mentioned', 3)),
        ("- Weaknesses (Features): ", top_values(negative_reviews, 'feature_mentioned', 3)),
    )
    for prefix, counts in ranked_sections:
        if not counts.empty:
            lines.append(prefix + ", ".join(counts.index))

    # Add sample reviews if available
    sample_sections = (
        ("- Sample Positive: ", positive_reviews),
        ("- Sample Negative: ", negative_reviews),
    )
    for prefix, frame in sample_sections:
        sample = sample_text(frame)
        if sample is not None:
            lines.append(prefix + sample)

    return "\n".join(lines) + "\n"

def simulate_rag_compare_products(product1, product2):
    """Return a plain-text comparison of two products' reviews.

    Args:
        product1: First product name, matched exactly against
            ``reviews_df['product']``.
        product2: Second product name, matched the same way.

    Returns:
        A newline-terminated, multi-line comparison (average rating, share of
        positive reviews, top three features per product and a verdict), or a
        one-line "Cannot compare" message when either product has no reviews.
    """
    product1_reviews = reviews_df[reviews_df['product'] == product1]
    product2_reviews = reviews_df[reviews_df['product'] == product2]
    if product1_reviews.empty or product2_reviews.empty:
        return "Cannot compare: Reviews missing for one or both products."

    avg_rating1 = product1_reviews['rating'].mean()
    avg_rating2 = product2_reviews['rating'].mean()
    sentiment_dist1 = product1_reviews['sentiment'].value_counts(normalize=True)
    sentiment_dist2 = product2_reviews['sentiment'].value_counts(normalize=True)
    top_features1 = product1_reviews['feature_mentioned'].dropna().value_counts().head(3)
    top_features2 = product2_reviews['feature_mentioned'].dropna().value_counts().head(3)

    response = f"Comparison: {product1} vs {product2}\n"
    response += f"- Rating: {avg_rating1:.2f} vs {avg_rating2:.2f}\n"
    # A product may have no positive reviews at all, hence the default of 0
    pos1 = sentiment_dist1.get('positive', 0)
    pos2 = sentiment_dist2.get('positive', 0)
    response += f"- Positive Sentiment: {pos1:.1%} vs {pos2:.1%}\n"
    if not top_features1.empty:
        response += f"- Top Features ({product1}): {', '.join(top_features1.index)}\n"
    if not top_features2.empty:
        response += f"- Top Features ({product2}): {', '.join(top_features2.index)}\n"

    margin = abs(avg_rating1 - avg_rating2)
    if margin < 0.005:
        # The gap would print as "0.00", so naming a winner would be
        # misleading: report a tie instead of defaulting to product2.
        response += "- Verdict: Both products have the same average rating.\n"
    else:
        winner = product1 if avg_rating1 > avg_rating2 else product2
        response += f"- Verdict: {winner} rated higher by {margin:.2f} points.\n"
    return response

# Example RAG queries
# The product names below are hard-coded. When one is absent from the dataset,
# the called function returns its "No reviews found" / "Cannot compare"
# message instead of a report.
print("\nExample RAG Query 1: Product Insights")
product_insights_text = simulate_rag_product_insights("DevBook 13")
print(product_insights_text)

print("\nExample RAG Query 2: Product Comparison")
product_comparison_text = simulate_rag_compare_products("TechPro X20", "GalaxyWave S5")
print(product_comparison_text)

# Create a dashboard-ready summary of product reviews
print("\nGenerating dashboard-ready summary...")

# Product category summary: mean rating, review count and the share of each
# sentiment label (shares are normalised per category, so they sum to 1)
category_summary = reviews_df.groupby('category').agg({
    'rating': 'mean',
    'review_id': 'count'
}).rename(columns={'review_id': 'review_count'})
sentiment_by_cat = pd.crosstab(reviews_df['category'], reviews_df['sentiment'], normalize='index')
category_summary = category_summary.join(sentiment_by_cat, how='left').fillna(0)

# Product summary: the same metrics per product, plus the product's category
product_summary = reviews_df.groupby('product').agg({
    'rating': 'mean',
    'review_id': 'count'
}).rename(columns={'review_id': 'review_count'})
# Keep exactly one category per product (the first non-missing one seen).
# De-duplicating on the (product, category) pair would leave several rows for a
# product that appears with more than one category value, and the join below
# would then repeat that product's row in the summary.
product_category = (
    reviews_df[['product', 'category']]
    .dropna(subset=['category'])
    .drop_duplicates(subset='product')
    .set_index('product')
)
product_summary = product_summary.join(product_category, how='left')
sentiment_by_prod = pd.crosstab(reviews_df['product'], reviews_df['sentiment'], normalize='index')
product_summary = product_summary.join(sentiment_by_prod, how='left').fillna(0)
product_summary = product_summary.sort_values('rating', ascending=False)

# Save summaries to CSV for dashboard
cat_summary_path = os.path.join(data_output_dir, 'category_summary.csv')
prod_summary_path = os.path.join(data_output_dir, 'product_summary.csv')
category_summary.to_csv(cat_summary_path)
product_summary.to_csv(prod_summary_path)
print(f"Saved category summary: {cat_summary_path}")
print(f"Saved product summary: {prod_summary_path}")

print("\nCategory Summary:")
print(category_summary)
print("\nTop 5 Products by Rating:")
print(product_summary.head())

print("\n--- Script Finished ---")
print("Product Review Analysis and RAG Implementation Completed")
print(f"Visualizations saved to: {plot_output_dir}")
print(f"Summary files saved to: {data_output_dir}")
