# Product Review Analysis and RAG Simulation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
warnings.filterwarnings('ignore')

# Make sure the NLTK data packages are available locally (quiet mode keeps
# the download chatter out of the notebook output)
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Shared look for every figure produced further down
plt.style.use('ggplot')
sns.set_palette("Set2")
for rc_key, rc_value in {'figure.figsize': (12, 8), 'font.size': 12}.items():
    plt.rcParams[rc_key] = rc_value

# Read the raw reviews and turn the date column into real timestamps
reviews_df = pd.read_csv('product_reviews.csv')
reviews_df['date'] = pd.to_datetime(reviews_df['date'])

# Quick overview of what was loaded
print("=== Product Review Analysis ===")
print(f"Total reviews: {len(reviews_df)}")
print(f"Unique products: {reviews_df['product'].nunique()}")
print(f"Categories: {reviews_df['category'].nunique()}")
rating_column = reviews_df['rating']
print(f"Rating distribution: \n{rating_column.value_counts().sort_index()}")
sentiment_column = reviews_df['sentiment']
print(f"Sentiment distribution: \n{sentiment_column.value_counts()}")

# Simple text preprocessing function without NLTK
def preprocess_text(text):
    """Normalize a raw review string for TF-IDF vectorization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is dropped, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace removed.

    Args:
        text: The raw review text. Anything that is not a ``str`` (for
            example the ``None``/NaN that pandas yields for a missing cell)
            is treated as empty text.

    Returns:
        The cleaned string; ``''`` for empty or non-string input.
    """
    # Missing CSV cells reach this function as float NaN (or None), which has
    # no .lower(); treat them as "no text" instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers (they are deleted, not replaced
    # by a space, so "don't" becomes "dont")
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Preprocess review text
print("\nPreprocessing review text...")
reviews_df['processed_text'] = reviews_df['review_text'].apply(preprocess_text)

# Feature-level sentiment analysis
print("\nPerforming feature-level sentiment analysis...")


def _sentiment_score_table(reviews, group_column):
    """Count reviews per (group, sentiment) and add a net sentiment score.

    Args:
        reviews: DataFrame holding ``group_column`` and a ``sentiment`` column.
        group_column: Name of the column to group by.

    Returns:
        DataFrame indexed by ``group_column`` with one count column per
        sentiment label found in the data plus ``sentiment_score``, sorted by
        that score in descending order. The score is
        ``(positive - negative) / (positive + neutral + negative)``; a label
        that never occurs counts as zero. A group with none of the three
        labels gets NaN.
    """
    counts = reviews.groupby([group_column, 'sentiment']).size().unstack(fill_value=0)

    # Work on a copy that always has the three expected label columns, so a
    # label that never occurs in the data is a column of zeros rather than a
    # KeyError. The denominator is taken from this copy, before the score
    # column exists, so the score can never leak into its own total.
    labelled = counts.reindex(columns=['positive', 'neutral', 'negative'], fill_value=0)
    counts['sentiment_score'] = (
        (labelled['positive'] - labelled['negative']) / labelled.sum(axis=1)
    )

    return counts.sort_values('sentiment_score', ascending=False)


def _plot_top_sentiment_scores(score_table, title, xlabel, output_path):
    """Save a bar chart of the first 15 rows of ``sentiment_score``.

    ``score_table`` is expected to be sorted already; the figure is written
    to ``output_path`` and then closed.
    """
    plt.figure(figsize=(12, 8))
    score_table['sentiment_score'].head(15).plot(kind='bar')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel('Sentiment Score (-1 to 1)')
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()


# Sentiment score per feature (-1 all negative ... 1 all positive), best first
feature_sentiment = _sentiment_score_table(reviews_df, 'feature_mentioned')

# Visualize feature sentiment
_plot_top_sentiment_scores(
    feature_sentiment,
    'Top 15 Features by Sentiment Score',
    'Feature',
    'feature_sentiment_score.png',
)

# Attribute-level sentiment analysis: same scoring, grouped by attribute
attribute_sentiment = _sentiment_score_table(reviews_df, 'attribute_mentioned')

# Visualize attribute sentiment
_plot_top_sentiment_scores(
    attribute_sentiment,
    'Top 15 Attributes by Sentiment Score',
    'Attribute',
    'attribute_sentiment_score.png',
)

# Product-level analysis
print("\nPerforming product-level analysis...")

# One row per product: mean rating and number of reviews (non-null review ids)
product_analysis = reviews_df.groupby('product').agg(
    rating=('rating', 'mean'),
    review_count=('review_id', 'count'),
)

# Append the raw number of reviews per sentiment label as extra columns
sentiment_counts = pd.crosstab(reviews_df['product'], reviews_df['sentiment'])
product_analysis = product_analysis.join(sentiment_counts)

# Fraction of a product's reviews that are positive; 0 for every product
# when the data contains no positive review at all
product_analysis['positive_ratio'] = (
    product_analysis['positive'] / product_analysis['review_count']
    if 'positive' in product_analysis.columns
    else 0
)

# Best-rated products first
product_analysis = product_analysis.sort_values('rating', ascending=False)

# Bar chart of the average rating per product with the mean of those
# per-product averages drawn as a dashed reference line
plt.figure(figsize=(14, 8))
rating_axes = product_analysis['rating'].plot(kind='bar')
rating_axes.set_title('Average Rating by Product')
rating_axes.set_xlabel('Product')
rating_axes.set_ylabel('Average Rating')
rating_axes.axhline(
    y=product_analysis['rating'].mean(),
    color='r',
    linestyle='--',
    label='Average',
)
rating_axes.legend()
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('product_ratings.png')
plt.close()

# Product strengths and weaknesses analysis
print("\nIdentifying product strengths and weaknesses...")

# Function to get top features and attributes for a product by sentiment
def get_product_insights(product_name, sentiment_type='positive'):
    """Return the most-mentioned features and attributes for one product.

    Only rows of the module-level ``reviews_df`` whose ``product`` equals
    ``product_name`` and whose ``sentiment`` equals ``sentiment_type`` are
    considered.

    Returns:
        A dict with keys ``'features'`` and ``'attributes'``; each value is
        the ``value_counts()`` Series of the corresponding ``*_mentioned``
        column, limited to its first three entries. Both Series are empty
        when no review matches.
    """
    is_match = (reviews_df['product'] == product_name) & (
        reviews_df['sentiment'] == sentiment_type
    )
    selected = reviews_df[is_match]

    source_columns = {
        'features': 'feature_mentioned',
        'attributes': 'attribute_mentioned',
    }
    return {
        key: selected[column].value_counts().head(3)
        for key, column in source_columns.items()
    }

# Collect strengths (positive reviews) and weaknesses (negative reviews) for
# the five products at the top of product_analysis
top_products = product_analysis.head(5).index.tolist()
product_insights = {
    product: {
        'strengths': get_product_insights(product, 'positive'),
        'weaknesses': get_product_insights(product, 'negative'),
    }
    for product in top_products
}

# Print insights for top products. The layout is table-driven: each side
# (strengths / weaknesses) lists its features and then its attributes, and a
# sub-heading is skipped entirely when there is nothing to list under it.
side_headings = (
    ("  Strengths:", 'strengths'),
    ("  Weaknesses:", 'weaknesses'),
)
group_headings = (
    ("    Top Features:", 'features'),
    ("    Top Attributes:", 'attributes'),
)

print("\nProduct Insights (Strengths and Weaknesses):")
for product, insights in product_insights.items():
    print(f"\n{product}:")

    for side_heading, side_key in side_headings:
        print(side_heading)
        for group_heading, group_key in group_headings:
            mention_counts = insights[side_key][group_key]
            if mention_counts.empty:
                continue
            print(group_heading)
            for mentioned, count in mention_counts.items():
                print(f"      - {mentioned}: {count} mentions")

# Generate embeddings for vector search
print("\nGenerating embeddings for vector search...")

# Use TF-IDF to create document vectors
# The vectorizer is fitted on the cleaned review text with its vocabulary
# limited via max_features=100. Both `tfidf` and `review_vectors` stay at
# module level: search_similar_reviews() transforms queries with the same
# fitted vectorizer and compares them against `review_vectors`, and the PCA
# visualization further down also reads `review_vectors`.
tfidf = TfidfVectorizer(max_features=100)
review_vectors = tfidf.fit_transform(reviews_df['processed_text'])

# Function to search for similar reviews
def search_similar_reviews(query, top_n=5):
    """Return the rows of ``reviews_df`` most similar to a free-text query.

    The query is cleaned with ``preprocess_text`` and projected with the
    already-fitted module-level ``tfidf`` vectorizer; similarity is the
    cosine similarity against ``review_vectors``.

    Args:
        query: Free-text search string.
        top_n: Number of rows to return.

    Returns:
        A slice of ``reviews_df`` with the best match first. Rows are
        returned even when their similarity to the query is zero.
    """
    query_vector = tfidf.transform([preprocess_text(query)])
    scores = cosine_similarity(query_vector, review_vectors).flatten()

    # argsort is ascending, so read it backwards to get best-first positions
    # and keep the first top_n of them.
    best_first = scores.argsort()[::-1]
    return reviews_df.iloc[best_first[:top_n]]

# Example search
print("\nExample vector search:")
search_query = "battery life problems"
# Called with the default top_n=5, which is what the hard-coded "Top 5"
# label printed below relies on.
similar_reviews = search_similar_reviews(search_query)
print(f"Query: '{search_query}'")
print("Top 5 similar reviews:")
# iterrows() yields (index label, row); the label is discarded and
# enumerate(..., 1) supplies a 1-based rank for display.
for i, (_, review) in enumerate(similar_reviews.iterrows(), 1):
    print(f"{i}. Product: {review['product']}, Rating: {review['rating']}, Sentiment: {review['sentiment']}")
    # Only the first 100 characters of the original (unprocessed) text are shown
    print(f"   Review: {review['review_text'][:100]}...")

# Visualize embeddings with PCA
print("\nVisualizing review embeddings...")
# Reduce the TF-IDF review vectors to two components so every review can be
# drawn as a point in a 2-D scatter plot.
pca = PCA(n_components=2)
# The vectors are converted with .toarray() before fitting (review_vectors is
# presumably a scipy sparse matrix returned by TfidfVectorizer — confirm).
review_vectors_dense = review_vectors.toarray()
embeddings_2d = pca.fit_transform(review_vectors_dense)

# Create DataFrame with embeddings
# Column 0 / column 1 of the projection become the x / y coordinates; the
# review metadata is attached so the points can be grouped by it.
# NOTE(review): x/y are plain arrays while the metadata columns are Series
# taken from reviews_df; this assumes reviews_df still has the default
# 0..n-1 index from read_csv — confirm if the frame is ever filtered/reindexed.
embedding_df = pd.DataFrame({
    'x': embeddings_2d[:, 0],
    'y': embeddings_2d[:, 1],
    'sentiment': reviews_df['sentiment'],
    'product': reviews_df['product'],
    'category': reviews_df['category']
})

# Plot embeddings by sentiment
# One scatter call per label so each label gets its own colour and legend
# entry. Only the three labels listed in sentiment_colors are drawn; rows
# with any other sentiment value do not appear in this figure.
plt.figure(figsize=(12, 10))
sentiment_colors = {'positive': 'green', 'neutral': 'blue', 'negative': 'red'}
for sentiment, color in sentiment_colors.items():
    mask = embedding_df['sentiment'] == sentiment
    plt.scatter(
        embedding_df.loc[mask, 'x'], 
        embedding_df.loc[mask, 'y'],
        c=color,
        label=sentiment,
        alpha=0.7
    )
plt.title('Review Embeddings by Sentiment')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.tight_layout()
plt.savefig('review_embeddings_sentiment.png')
plt.close()

# Plot embeddings by category
# Same layout, one scatter call per distinct category value. No colour is
# passed here, so matplotlib chooses one for each call. The enumerate index
# `i` is not used inside the loop.
plt.figure(figsize=(12, 10))
categories = embedding_df['category'].unique()
for i, category in enumerate(categories):
    mask = embedding_df['category'] == category
    plt.scatter(
        embedding_df.loc[mask, 'x'], 
        embedding_df.loc[mask, 'y'],
        label=category,
        alpha=0.7
    )
plt.title('Review Embeddings by Product Category')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()
plt.tight_layout()
plt.savefig('review_embeddings_category.png')
plt.close()

# Temporal analysis of reviews
print("\nPerforming temporal analysis of reviews...")

# Group by month and calculate average rating
# NOTE: this adds a 'month_year' column (monthly period derived from 'date')
# to the shared reviews_df, so it is visible to all later code.
reviews_df['month_year'] = reviews_df['date'].dt.to_period('M')
# Per month: mean rating and number of reviews (count of non-null review ids)
monthly_ratings = reviews_df.groupby('month_year').agg({
    'rating': 'mean',
    'review_id': 'count'
}).rename(columns={'review_id': 'review_count'})

# Plot monthly average ratings
# Line chart with a marker on every month
plt.figure(figsize=(14, 6))
monthly_ratings['rating'].plot(kind='line', marker='o')
plt.title('Average Rating by Month')
plt.xlabel('Month')
plt.ylabel('Average Rating')
plt.grid(True)
plt.tight_layout()
plt.savefig('monthly_ratings.png')
plt.close()

# Plot monthly review counts
# Bar chart over the same monthly index; only months that have at least one
# review appear, because the index comes from the groupby above.
plt.figure(figsize=(14, 6))
monthly_ratings['review_count'].plot(kind='bar')
plt.title('Review Count by Month')
plt.xlabel('Month')
plt.ylabel('Number of Reviews')
plt.grid(True)
plt.tight_layout()
plt.savefig('monthly_review_counts.png')
plt.close()

# Feature-attribute correlation analysis
print("\nAnalyzing feature-attribute correlations...")

# Create a cross-tabulation of features and attributes
# Each cell counts the reviews that mention that feature together with that
# attribute.
feature_attribute = pd.crosstab(
    reviews_df['feature_mentioned'], 
    reviews_df['attribute_mentioned']
)

# Normalize by row (feature)
# Every count is divided by its row total, so a cell is the share of a
# feature's reviews that mention the attribute. Despite the "correlation"
# wording in the title below, this is a co-occurrence share, not a
# correlation coefficient.
feature_attribute_norm = feature_attribute.div(feature_attribute.sum(axis=1), axis=0)

# Plot heatmap of top features and attributes
# Restrict the heatmap to the 10 most frequently mentioned features and the
# 10 most frequently mentioned attributes.
top_features = reviews_df['feature_mentioned'].value_counts().head(10).index
top_attributes = reviews_df['attribute_mentioned'].value_counts().head(10).index

plt.figure(figsize=(14, 10))
sns.heatmap(
    feature_attribute_norm.loc[top_features, top_attributes],
    annot=True,
    cmap='YlGnBu',
    fmt='.2f'
)
plt.title('Feature-Attribute Correlation Heatmap')
plt.tight_layout()
plt.savefig('feature_attribute_heatmap.png')
plt.close()

# RAG Implementation Simulation
# Banner for the simulated-RAG part of the script; the printed note states
# that what follows is a simplified simulation.
print("\n=== RAG Implementation Simulation ===")
print("Note: This is a simplified simulation of how RAG would work with Google Gemini")

# Function to simulate RAG for product insights
def simulate_rag_product_insights(product_name):
    """Build a plain-text insight report for one product.

    The report is assembled purely from the module-level ``reviews_df``:
    average rating, sentiment shares, the five most-mentioned features and
    attributes, the three most-mentioned features among positive and among
    negative reviews, and the first positive / negative review as samples.

    Args:
        product_name: Value to match exactly against the ``product`` column.

    Returns:
        The report as a single string. When no review matches
        ``product_name`` a short report saying so is returned instead of one
        containing "nan/5" and empty sections.
    """
    product_reviews = reviews_df[reviews_df['product'] == product_name]

    # An unknown or misspelled product has no rows: the mean would be NaN and
    # every section empty, so say so explicitly.
    if product_reviews.empty:
        return (
            f"Product Insights for {product_name}:\n\n"
            "No reviews found for this product.\n"
        )

    avg_rating = product_reviews['rating'].mean()
    sentiment_dist = product_reviews['sentiment'].value_counts(normalize=True)

    positive_reviews = product_reviews[product_reviews['sentiment'] == 'positive']
    negative_reviews = product_reviews[product_reviews['sentiment'] == 'negative']

    # Heading -> mention counts, in the order the sections appear in the report
    mention_sections = (
        ("\nTop Features:\n",
         product_reviews['feature_mentioned'].value_counts().head(5)),
        ("\nTop Attributes:\n",
         product_reviews['attribute_mentioned'].value_counts().head(5)),
        ("\nStrengths (Positive Features):\n",
         positive_reviews['feature_mentioned'].value_counts().head(3)),
        ("\nWeaknesses (Negative Features):\n",
         negative_reviews['feature_mentioned'].value_counts().head(3)),
    )

    # Collect the pieces and join once instead of growing a string with +=
    parts = [
        f"Product Insights for {product_name}:\n\n",
        f"Average Rating: {avg_rating:.2f}/5\n\n",
        "Sentiment Distribution:\n",
    ]
    parts.extend(
        f"- {sentiment}: {percentage:.1%}\n"
        for sentiment, percentage in sentiment_dist.items()
    )

    for heading, counts in mention_sections:
        parts.append(heading)
        parts.extend(f"- {name}: {count} mentions\n" for name, count in counts.items())

    parts.append("\nSample Reviews:\n")
    # The first matching row is used as the sample; either may be absent
    if not positive_reviews.empty:
        pos_review = positive_reviews.iloc[0]
        parts.append(f"Positive ({pos_review['rating']}/5): {pos_review['review_text']}\n\n")

    if not negative_reviews.empty:
        neg_review = negative_reviews.iloc[0]
        parts.append(f"Negative ({neg_review['rating']}/5): {neg_review['review_text']}\n")

    return "".join(parts)

# Function to simulate RAG for comparative analysis
def simulate_rag_compare_products(product1, product2):
    """Build a plain-text comparison of two products.

    Uses the module-level ``reviews_df`` to compare the average rating, the
    share of positive reviews and the three most-mentioned features of both
    products, and closes with a verdict based on the average rating.

    Args:
        product1: First product name (exact match on the ``product`` column).
        product2: Second product name.

    Returns:
        The comparison as a single string. If a product has no rated review
        the text says that the comparison is not possible. If both averages
        are equal at the two-decimal precision shown, the verdict reports a
        tie instead of naming a winner "by 0.00 points".
    """
    product1_reviews = reviews_df[reviews_df['product'] == product1]
    product2_reviews = reviews_df[reviews_df['product'] == product2]

    avg_rating1 = product1_reviews['rating'].mean()
    avg_rating2 = product2_reviews['rating'].mean()

    header = f"Comparison: {product1} vs {product2}\n\n"

    # mean() is NaN when a product has no (rated) reviews; every comparison
    # with NaN is False, which used to crown product2 with a "nan" margin.
    unrated = [
        name
        for name, average in ((product1, avg_rating1), (product2, avg_rating2))
        if pd.isna(average)
    ]
    if unrated:
        names = ", ".join(dict.fromkeys(unrated))  # de-duplicate, keep order
        return header + f"Cannot compare: no rated reviews found for {names}."

    sentiment_dist1 = product1_reviews['sentiment'].value_counts(normalize=True)
    sentiment_dist2 = product2_reviews['sentiment'].value_counts(normalize=True)
    pos1 = sentiment_dist1.get('positive', 0)
    pos2 = sentiment_dist2.get('positive', 0)

    top_features1 = product1_reviews['feature_mentioned'].value_counts().head(3)
    top_features2 = product2_reviews['feature_mentioned'].value_counts().head(3)

    parts = [
        header,
        "Average Rating:\n",
        f"- {product1}: {avg_rating1:.2f}/5\n",
        f"- {product2}: {avg_rating2:.2f}/5\n\n",
        "Positive Sentiment:\n",
        f"- {product1}: {pos1:.1%}\n",
        f"- {product2}: {pos2:.1%}\n\n",
        f"Top Features for {product1}:\n",
    ]
    parts.extend(f"- {feature}: {count} mentions\n" for feature, count in top_features1.items())
    parts.append(f"\nTop Features for {product2}:\n")
    parts.extend(f"- {feature}: {count} mentions\n" for feature, count in top_features2.items())

    # Verdict. A gap below 0.005 would be printed as "0.00 points", so it is
    # reported as a tie rather than as a win.
    margin = abs(avg_rating1 - avg_rating2)
    if margin < 0.005:
        parts.append(
            f"\nVerdict: {product1} and {product2} are rated equally "
            f"({avg_rating1:.2f}/5)."
        )
    else:
        winner = product1 if avg_rating1 > avg_rating2 else product2
        parts.append(f"\nVerdict: {winner} is rated higher by {margin:.2f} points.")

    return "".join(parts)

# Example RAG queries
print("\nExample RAG Query 1: Product Insights")
# NOTE: this rebinds `product_insights` — until here the dict of
# strengths/weaknesses for the top products — to the report string.
# The product names used in these examples are hard-coded; whether they occur
# in product_reviews.csv cannot be seen from this script — TODO confirm.
product_insights = simulate_rag_product_insights("DevBook 13")
print(product_insights)

print("\nExample RAG Query 2: Product Comparison")
product_comparison = simulate_rag_compare_products("TechPro X20", "GalaxyWave S5")
print(product_comparison)

# Create a dashboard-ready summary of product reviews
print("\nGenerating dashboard-ready summary...")

# Share of each sentiment label within a category (normalised per row)
sentiment_by_cat = pd.crosstab(
    index=reviews_df['category'],
    columns=reviews_df['sentiment'],
    normalize='index',
)

# Category table: mean rating and review count, then the sentiment shares
category_summary = (
    reviews_df.groupby('category')
    .agg(rating=('rating', 'mean'), review_count=('review_id', 'count'))
    .join(sentiment_by_cat)
)

# Lookup of the category each product is listed under
product_category = (
    reviews_df[['product', 'category']]
    .drop_duplicates()
    .set_index('product')
)

# Share of each sentiment label within a product (normalised per row)
sentiment_by_prod = pd.crosstab(
    index=reviews_df['product'],
    columns=reviews_df['sentiment'],
    normalize='index',
)

# Product table: mean rating and review count, the category, the sentiment
# shares, ordered from best to worst average rating
product_summary = (
    reviews_df.groupby('product')
    .agg(rating=('rating', 'mean'), review_count=('review_id', 'count'))
    .join(product_category)
    .join(sentiment_by_prod)
    .sort_values('rating', ascending=False)
)

# Persist both tables as CSV files for the dashboard
category_summary.to_csv('category_summary.csv')
product_summary.to_csv('product_summary.csv')

print("\nCategory Summary:")
print(category_summary)

print("\nTop 5 Products by Rating:")
print(product_summary.head())

print("\nProduct Review Analysis and RAG Implementation Completed")
print("Visualizations and summary files saved")

# Last expression of the cell: shown as the notebook cell output
category_summary, product_summary.head(10)