# Coffee Sentiment Analysis

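This notebook implements a sentiment analysis system for coffee reviews using Natural Language Processing (NLP) techniques. It includes:
1. Web scraping from coffeereview.com
2. Text preprocessing and NLP feature extraction
3. Model training on sentiment labels derived from the review ratings
4. Sentiment analysis and visualization
5. Interactive analysis of individual reviews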

## Setup and Dependencies

In [None]:
# Install required packages
!pip install beautifulsoup4 nltk numpy pandas plotly requests scikit-learn streamlit

In [None]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment import vader
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import plotly.express as px
import plotly.graph_objects as go
import time
import pickle

## 1. Web Scraping

First, we'll implement the web scraping functionality to collect coffee reviews.

In [None]:
def _element_text(parent, tag, css_class=None):
    """Return the stripped text of the first matching child, or '' if absent."""
    if css_class is None:
        node = parent.find(tag)
    else:
        node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else ''


def scrape_coffee_reviews(num_pages=5, timeout=10):
    """Scrape coffee reviews from the coffeereview.com review listing.

    Listing pages ``1..num_pages`` are fetched in order. A page that cannot
    be downloaded, or that answers with an HTTP error status, is reported on
    stdout and skipped so that one bad page does not abort the whole run.

    Args:
        num_pages: Number of listing pages to fetch, starting at page 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame with one row per review and the string columns
        ``title``, ``rating`` and ``text``. A field that is missing from a
        review is stored as ``''``. The three columns are present even when
        no review was collected.
    """
    columns = ['title', 'rating', 'text']
    base_url = "https://www.coffeereview.com/review/page/{}/"
    reviews = []

    for page in range(1, num_pages + 1):
        try:
            response = requests.get(base_url.format(page), timeout=timeout)
            # Treat 4xx/5xx answers as failures instead of parsing error pages.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Each review is rendered as an <article class="review"> container.
            for review in soup.find_all('article', class_='review'):
                reviews.append({
                    'title': _element_text(review, 'h2'),
                    'rating': _element_text(review, 'div', 'rating'),
                    'text': _element_text(review, 'div', 'entry-content'),
                })

            print(f"Scraped page {page}")
        except Exception as e:
            # Deliberate best-effort boundary: report the page and move on.
            print(f"Error on page {page}: {str(e)}")
        finally:
            # Be respectful with scraping: pause after failed pages as well.
            time.sleep(1)

    # Explicit columns keep the frame usable downstream when nothing was scraped.
    return pd.DataFrame(reviews, columns=columns)

## 2. NLP Processing

Now we'll implement the sentiment analysis and NLP processing functionality.

In [None]:
# Download required NLTK data
# Recent NLTK releases load the 'punkt_tab' resource for word_tokenize, while
# older releases load 'punkt'; fetching both keeps the notebook working on either.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
    nltk.download(resource)

In [None]:
class CoffeeSentimentAnalyzer:
    """Classify coffee review text into rating-derived sentiment labels.

    Each review is described by its length, its word count, the VADER
    polarity scores of the raw text and the TF-IDF weights of the
    preprocessed text. A random forest is fit on those features, using labels
    obtained by bucketing the numeric review rating.
    """

    # Rating bucket edges (upper edge inclusive) and the label of each bucket.
    RATING_BINS = (0, 60, 80, 100)
    SENTIMENT_LABELS = ('negative', 'neutral', 'positive')

    def __init__(self):
        """Create the NLP helpers and an untrained model.

        Loading the stop-word list and the VADER analyzer requires the
        corresponding NLTK resources to be available.
        """
        self.stemmer = PorterStemmer()
        self.stop_words = set(stopwords.words('english'))
        self.sia = vader.SentimentIntensityAnalyzer()
        self.vectorizer = TfidfVectorizer()
        self.model = RandomForestClassifier()

    def preprocess_text(self, text):
        """Return ``text`` lower-cased, filtered and stemmed.

        Only alphanumeric tokens that are not English stop words are kept;
        the surviving tokens are stemmed and joined with single spaces.
        """
        tokens = word_tokenize(text.lower())
        stems = [self.stemmer.stem(token) for token in tokens
                 if token.isalnum() and token not in self.stop_words]
        return ' '.join(stems)

    def extract_features(self, text):
        """Return a dict of hand-crafted features for the raw ``text``.

        The dict holds ``text_length`` (characters), ``word_count``
        (whitespace-separated words) and every score returned by VADER's
        ``polarity_scores``.
        """
        features = {
            'text_length': len(text),
            'word_count': len(text.split()),
        }
        features.update(self.sia.polarity_scores(text))
        return features

    @classmethod
    def _ratings_to_sentiment(cls, ratings):
        """Bucket a Series of raw ratings into sentiment labels.

        Everything except digits and dots is stripped before the numeric
        conversion, so values such as ``'94 points'`` are understood.
        Ratings that cannot be parsed, or that fall outside the 0-100 range,
        yield NaN.
        """
        # regex=True is required: pandas >= 2.0 treats the pattern literally
        # by default, which would leave the ratings untouched.
        cleaned = ratings.astype(str).str.replace(r'[^\d.]', '', regex=True)
        numeric = pd.to_numeric(cleaned, errors='coerce')
        return pd.cut(numeric,
                      bins=list(cls.RATING_BINS),
                      labels=list(cls.SENTIMENT_LABELS),
                      include_lowest=True)  # a rating of exactly 0 is negative

    def _build_features(self, raw_texts, processed_texts, fit=False):
        """Assemble the model input: hand-crafted features, then TF-IDF columns.

        With ``fit=True`` the vectorizer vocabulary is (re)learned from
        ``processed_texts``; otherwise the existing vocabulary is reused so
        the columns match those seen during training.
        """
        features_df = pd.DataFrame([self.extract_features(text)
                                    for text in raw_texts])
        if fit:
            tfidf_matrix = self.vectorizer.fit_transform(processed_texts)
        else:
            tfidf_matrix = self.vectorizer.transform(processed_texts)
        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                                columns=self.vectorizer.get_feature_names_out())
        return pd.concat([features_df, tfidf_df], axis=1)

    def prepare_data(self, df):
        """Build the feature matrix and sentiment labels for ``df``.

        ``df`` must provide ``text`` and ``rating`` columns. It is modified
        in place: ``processed_text`` and ``sentiment`` columns are added.
        Fitting the TF-IDF vocabulary happens here.

        Returns:
            ``(X, y)`` where ``X`` is the combined feature DataFrame (one row
            per review, in ``df`` order) and ``y`` is ``df['sentiment']``.
            ``y`` is NaN for ratings that could not be parsed.
        """
        # Missing review text is treated as an empty review rather than
        # crashing the tokenizer.
        texts = df['text'].fillna('').astype(str)
        df['processed_text'] = texts.apply(self.preprocess_text)

        X = self._build_features(texts, df['processed_text'], fit=True)

        # Create labels (assuming ratings are 0-100)
        df['sentiment'] = self._ratings_to_sentiment(df['rating'])

        return X, df['sentiment']

    def train(self, X, y):
        """Fit the model on ``X`` / ``y``, ignoring rows without a label."""
        labelled = np.asarray(pd.notna(y))
        if not labelled.all():
            # The classifier rejects NaN targets; boolean-array indexing
            # selects the labelled rows positionally in X and y alike.
            X, y = X[labelled], y[labelled]
        self.model.fit(X, y)

    def predict(self, text):
        """Return the predicted sentiment label for a single review ``text``.

        The analyzer must have been prepared and trained first, so that the
        vectorizer and the model are fitted.
        """
        processed_text = self.preprocess_text(text)
        X = self._build_features([text], [processed_text])
        return self.model.predict(X)[0]

## 3. Data Collection and Model Training

In [None]:
# Scrape reviews
# Uses scrape_coffee_reviews with its default page count and saves the raw
# result to coffee_reviews.csv (index column omitted). The global `df` is
# reused by the training and plotting cells that follow.
print("Scraping coffee reviews...")
df = scrape_coffee_reviews()
df.to_csv("coffee_reviews.csv", index=False)
print(f"Scraped {len(df)} reviews")

In [None]:
# Initialize and train the model
# prepare_data also adds 'processed_text' and 'sentiment' columns to df; the
# sentiment plot further down relies on the latter.
# NOTE(review): the model is fit on every row and never evaluated, although
# train_test_split and classification_report are imported — consider a
# held-out split.
analyzer = CoffeeSentimentAnalyzer()
X, y = analyzer.prepare_data(df)
analyzer.train(X, y)
print("Model trained successfully!")

## 4. Analysis and Visualization

In [None]:
# Sentiment Distribution
# Requires the 'sentiment' column that prepare_data adds to df. Counts the
# reviews per label and shows each label's share as a pie slice.
sentiment_dist = df['sentiment'].value_counts()
fig = px.pie(values=sentiment_dist.values, names=sentiment_dist.index, title="Review Sentiments")
fig.show()

In [None]:
# Rating Distribution
# Scraped ratings are raw strings that may carry non-numeric text. Convert
# them the same way the sentiment labels are derived, so the histogram bins
# by numeric value; unparseable ratings become NaN and are left out.
numeric_rating = pd.to_numeric(
    df['rating'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)
# assign() plots a converted copy; df itself keeps the raw rating strings.
fig = px.histogram(df.assign(rating=numeric_rating), x='rating', title="Rating Distribution")
fig.show()

## 5. Interactive Review Analysis

In [None]:
def analyze_review(text):
    """Print the predicted sentiment and the feature values of one review.

    Uses the module-level ``analyzer``. Every extracted feature except
    ``compound`` is printed with three decimals.
    """
    sentiment = analyzer.predict(text)
    feature_scores = analyzer.extract_features(text)

    print(f"Predicted Sentiment: {sentiment}")
    print("\nFeature Analysis:")
    for feature, value in feature_scores.items():
        if feature == 'compound':
            # The compound score is left out of the listing.
            continue
        print(f"{feature}: {value:.3f}")

# Example usage
# Needs the trained global `analyzer` created in the training cell.
sample_review = "This coffee has an amazing aroma with hints of chocolate and a smooth finish."
analyze_review(sample_review)