In [3]:
pip install gensim 

Note: you may need to restart the kernel to use updated packages.


In [2]:
conda activate base


CondaError: Run 'conda init' before 'conda activate'


Note: you may need to restart the kernel to use updated packages.


In [8]:
# Starbucks Review Analysis Python Script

import sys
import os
import pandas as pd
import numpy as np

# Specify the path to reviews_data.csv here
CSV_PATH = '/Users/mo/Downloads/reviews_data.csv'  # Update if CSV is elsewhere

# Check for required modules
required_modules = ['pandas', 'nltk', 'textblob', 'matplotlib', 'seaborn', 'wordcloud', 'reportlab']
missing_modules = []
for module in required_modules:
    try:
        __import__(module)
    except ImportError:
        missing_modules.append(module)

if missing_modules:
    print(f"Error: The following modules are missing: {', '.join(missing_modules)}")
    print("Please install them using: conda install " + ' '.join(missing_modules))
    sys.exit(1)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
import re

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception as e:
    print(f"Error downloading NLTK data: {e}")
    sys.exit(1)

# Load and preprocess data
def load_and_preprocess_data(file_path):
    if not os.path.exists(file_path):
        print(f"Error: '{file_path}' not found at the specified path.")
        sys.exit(1)
    
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"Error loading CSV file: {e}")
        sys.exit(1)
    
    # Verify required columns
    if not {'Review', 'Rating'}.issubset(df.columns):
        print(f"Error: CSV file must contain 'Review' and 'Rating' columns. Found: {list(df.columns)}")
        sys.exit(1)
    
    # Handle missing or invalid Review data
    df = df[df['Review'].notna() & (df['Review'] != 'No Review Text') & (df['Review'].str.strip() != '')]
    
    # Handle Rating: convert to numeric, drop non-numeric or missing values
    df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
    df = df[df['Rating'].notna() & df['Rating'].apply(lambda x: isinstance(x, (int, float)))]
    df['Rating'] = df['Rating'].astype(int)
    
    # Validate Rating values (e.g., 1-5)
    df = df[df['Rating'].between(1, 5)]
    
    if df.empty:
        print("Error: No valid data remains after preprocessing. Check 'Review' and 'Rating' columns.")
        sys.exit(1)
    
    # Clean review text
    stop_words = set(stopwords.words('english'))
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    
    df['Cleaned_Review'] = df['Review'].apply(clean_text)
    df['Review_Length'] = df['Cleaned_Review'].apply(lambda x: len(x.split()))
    
    return df

# Sentiment analysis
def perform_sentiment_analysis(df):
    def get_sentiment(text):
        try:
            analysis = TextBlob(text)
            polarity = analysis.sentiment.polarity
            if polarity > 0:
                return 'Positive'
            elif polarity < 0:
                return 'Negative'
            else:
                return 'Neutral'
        except Exception:
            return 'Neutral'
    
    df['Sentiment'] = df['Cleaned_Review'].apply(get_sentiment)
    return df

# Generate visualizations
def generate_visualizations(df):
    os.makedirs('plots', exist_ok=True)
    
    try:
        # Rating distribution
        plt.figure(figsize=(10, 6))
        sns.countplot(x='Rating', data=df, palette='viridis')
        plt.title('Distribution of Star Ratings')
        plt.xlabel('Star Rating')
        plt.ylabel('Number of Reviews')
        plt.savefig('plots/rating_distribution.png')
        plt.close()
        
        # Sentiment distribution
        plt.figure(figsize=(10, 6))
        sentiment_counts = df['Sentiment'].value_counts()
        plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=['#4CAF50', '#F44336', '#FFCA28'])
        plt.title('Sentiment Distribution')
        plt.savefig('plots/sentiment_distribution.png')
        plt.close()
        
        # Review length vs. rating
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='Rating', y='Review_Length', data=df, alpha=0.5)
        plt.title('Review Length vs. Rating')
        plt.xlabel('Star Rating')
        plt.ylabel('Review Length (Words)')
        plt.savefig('plots/length_vs_rating.png')
        plt.close()
        
        # Word clouds for positive and negative reviews
        positive_reviews = ' '.join(df[df['Sentiment'] == 'Positive']['Cleaned_Review'])
        negative_reviews = ' '.join(df[df['Sentiment'] == 'Negative']['Cleaned_Review'])
        
        if positive_reviews:
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_reviews)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud - Positive Reviews')
            plt.savefig('plots/positive_wordcloud.png')
            plt.close()
        
        if negative_reviews:
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate(negative_reviews)
            plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud - Negative Reviews')
            plt.savefig('plots/negative_wordcloud.png')
            plt.close()
        
        print("Visualizations saved in 'plots/' directory")
    except Exception as e:
        print(f"Error generating visualizations: {e}")

# Generate PDF report
def generate_pdf_report(df):
    pdf_file = 'Starbucks_Review_Analysis_Report.pdf'
    doc = SimpleDocTemplate(pdf_file, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []
    
    # Title
    story.append(Paragraph('Starbucks Review Analysis Report', styles['Title']))
    story.append(Spacer(1, 12))
    
    # Summary
    story.append(Paragraph('Summary', styles['Heading2']))
    summary = f"""
    This report analyzes {len(df)} Starbucks customer reviews from the Consumer Affairs website. 
    The analysis reveals polarized customer sentiment, with significant positive and negative feedback. 
    Common complaints include incorrect orders, poor customer service, and loyalty program issues. 
    Positive reviews highlight friendly staff and exceptional service at specific locations.
    """
    story.append(Paragraph(summary, styles['BodyText']))
    story.append(Spacer(1, 12))
    
    # Visualizations
    story.append(Paragraph('Visualizations', styles['Heading2']))
    for plot in ['rating_distribution.png', 'sentiment_distribution.png', 'length_vs_rating.png', 
                 'positive_wordcloud.png', 'negative_wordcloud.png']:
        if os.path.exists(f'plots/{plot}'):
            img = Image(f'plots/{plot}', width=400, height=200)
            story.append(img)
            story.append(Spacer(1, 12))
    
    # Consumer Insights
    story.append(Paragraph('Consumer Insights', styles['Heading2']))
    insights = """
    - **Common Complaints**: Incorrect orders (e.g., wrong drink size or ingredients), poor customer service (e.g., rude staff, long wait times), and loyalty program issues (e.g., expired rewards, increased star requirements).
    - **Positive Experiences**: Exceptional service from specific employees (e.g., Amber, LaDonna, Billy) and accommodating gestures (e.g., free hot water, paying for a customer's drink).
    - **Areas for Improvement**: Consistency in order accuracy, staff training for better customer interaction, and addressing loyalty program dissatisfaction.
    - **Geographic Trends**: Negative reviews are spread across various locations, with no single location dominating complaints.
    """
    story.append(Paragraph(insights, styles['BodyText']))
    story.append(Spacer(1, 12))
    
    # Interesting Fact
    story.append(Paragraph('Interesting Fact', styles['Heading2']))
    fact = """
    Approximately 15% of negative reviews mention dissatisfaction with the Starbucks loyalty program, particularly the change from 150 to 200 stars for a free drink. This has driven some loyal customers to competitors like 7 Brew.
    """
    story.append(Paragraph(fact, styles['BodyText']))
    story.append(Spacer(1, 12))
    
    # Recommendations
    story.append(Paragraph('Recommendations', styles['Heading2']))
    recommendations = """
    - **Enhance Staff Training**: Implement consistent training programs to improve customer service and order accuracy.
    - **Improve Loyalty Program**: Re-evaluate the star requirement increase and enhance reward accessibility.
    - **Quality Control**: Establish stricter quality checks to ensure product consistency.
    - **Customer Feedback System**: Create a responsive feedback mechanism to address complaints promptly.
    """
    story.append(Paragraph(recommendations, styles['BodyText']))
    story.append(Spacer(1, 12))
    
    # Conclusion
    story.append(Paragraph('Conclusion', styles['Heading2']))
    conclusion = """
    The analysis reveals a polarized customer base, with significant opportunities to enhance satisfaction through improved service, order accuracy, and loyalty program adjustments. By addressing these areas, Starbucks can strengthen customer loyalty and improve business performance.
    """
    story.append(Paragraph(conclusion, styles['BodyText']))
    
    try:
        doc.build(story)
        print(f"PDF report generated: {pdf_file}")
    except Exception as e:
        print(f"Error generating PDF report: {e}")

# Main execution
def main():
    # Load and preprocess data
    df = load_and_preprocess_data(CSV_PATH)
    
    # Save preprocessed dataset
    try:
        df.to_csv('preprocessed_reviews.csv', index=False)
        print("Preprocessed dataset saved as 'preprocessed_reviews.csv'")
    except Exception as e:
        print(f"Error saving preprocessed dataset: {e}")
    
    # Perform sentiment analysis
    df = perform_sentiment_analysis(df)
    
    # Generate visualizations
    generate_visualizations(df)
    
    # Generate PDF report
    generate_pdf_report(df)
    
    # Print summary
    print("\nAnalysis Summary:")
    print(f"Total Reviews: {len(df)}")
    print("Rating Distribution:")
    print(df['Rating'].value_counts().sort_index())
    print("\nSentiment Distribution:")
    print(df['Sentiment'].value_counts())

if __name__ == "__main__":
    main()

Preprocessed dataset saved as 'preprocessed_reviews.csv'



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='Rating', data=df, palette='viridis')


Visualizations saved in 'plots/' directory
PDF report generated: Starbucks_Review_Analysis_Report.pdf

Analysis Summary:
Total Reviews: 703
Rating Distribution:
Rating
1    450
2     98
3     33
4     39
5     83
Name: count, dtype: int64

Sentiment Distribution:
Sentiment
Positive    380
Negative    295
Neutral      28
Name: count, dtype: int64
