In [None]:
# Advanced Sentiment Analysis with Gradio

# STEP 1: Install dependencies
!pip install -q nltk wordcloud matplotlib seaborn scikit-learn gradio textblob

# STEP 2: Import libraries
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from textblob import TextBlob
import gradio as gr
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data only if not already present
# Each key is the path probed with nltk.data.find; each value is the package
# name handed to nltk.download when that probe raises LookupError.
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
    'corpora/omw-1.4': 'omw-1.4',
    'tokenizers/punkt_tab': 'punkt_tab',
}

for _resource_path, _package_name in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)


# STEP 3: Enhanced dataset with more diverse examples
# Every review is written next to its label; the column-oriented dict that
# pandas consumes is derived from these pairs.
_LABELLED_REVIEWS = [
    ("This movie was absolutely wonderful and breathtaking!", "positive"),
    ("Terrible film, complete waste of time and money.", "negative"),
    ("Great acting, compelling storyline, and excellent direction.", "positive"),
    ("Boring, predictable plot with poor character development.", "negative"),
    ("Amazing cinematography and outstanding direction throughout.", "positive"),
    ("The script was a disaster and the acting was equally terrible.", "negative"),
    ("An excellent thriller with stunning visuals and great suspense!", "positive"),
    ("Completely underwhelming and overhyped movie with no substance.", "negative"),
    ("A beautiful, emotional rollercoaster of a film with depth.", "positive"),
    ("Worst acting I've ever seen in a film, truly awful.", "negative"),
    ("Mediocre film with some good moments but overall disappointing.", "negative"),
    ("Spectacular performances and incredible storytelling.", "positive"),
    ("Not the best movie but has its entertaining moments.", "neutral"),
    ("Absolutely fantastic! One of the best films I've ever seen.", "positive"),
    ("The movie was okay, nothing special but watchable.", "neutral"),
    ("Brilliant cinematography and powerful performances.", "positive"),
    ("Dull and uninspiring with a confusing plot.", "negative"),
    ("Outstanding film with excellent character development.", "positive"),
    ("Waste of time, couldn't even finish watching it.", "negative"),
    ("Decent movie with good acting and reasonable plot.", "positive"),
]

sample_data = {
    'review': [review_text for review_text, _ in _LABELLED_REVIEWS],
    'sentiment': [label for _, label in _LABELLED_REVIEWS],
}

df = pd.DataFrame(sample_data)

# STEP 4: Enhanced Text Preprocessing Class
class TextPreprocessor:
    """Turn raw review text into a cleaned, lemmatized, space-joined token string.

    Attributes:
        stop_words: NLTK English stop words minus the negation words, so that
            "not", "no" and "nor" survive filtering.
        lemmatizer: WordNetLemmatizer applied to every kept token.
        contractions: Ordered mapping of contraction fragments to expansions.
    """

    # Whole-word forms come first so they are rewritten before the generic
    # "n't" suffix would turn "won't" into "wo not".
    _CONTRACTIONS = {
        "won't": "will not", "can't": "cannot", "n't": " not",
        "'re": " are", "'ve": " have", "'ll": " will",
        "'d": " would", "'m": " am",
    }
    # Negations flip sentiment, so they are kept even though NLTK lists them
    # as stop words and "no" is shorter than the length cutoff.
    _NEGATIONS = frozenset({"no", "not", "nor"})
    # Typographic apostrophes would otherwise bypass contraction expansion.
    _CURLY_APOSTROPHES = str.maketrans({"\u2018": "'", "\u2019": "'"})

    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _URL_RE = re.compile(r'http\S+|www\S+')
    _NON_LETTER_RE = re.compile(r'[^a-z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        self.stop_words = set(stopwords.words("english")) - self._NEGATIONS
        self.lemmatizer = WordNetLemmatizer()
        # Per-instance copy so callers may extend it without affecting others.
        self.contractions = dict(self._CONTRACTIONS)

    def expand_contractions(self, text):
        """Expand contractions in text.

        Curly apostrophes are normalized to "'" first. Matching is
        case-sensitive against the lowercase keys of ``self.contractions``.
        """
        text = text.translate(self._CURLY_APOSTROPHES)
        for contraction, expansion in self.contractions.items():
            text = text.replace(contraction, expansion)
        return text

    def clean_text(self, text):
        """Clean and normalize text.

        Lowercases, expands contractions, strips HTML tags and URLs, and keeps
        only the letters a-z separated by single spaces.

        Args:
            text: Any object; it is converted with ``str()`` first.

        Returns:
            The normalized string (possibly empty).
        """
        text = str(text).lower()
        text = self.expand_contractions(text)
        # Tags, URLs and punctuation become a space rather than nothing so the
        # words on either side are not fused ("great<br>movie", "well-made").
        text = self._HTML_TAG_RE.sub(' ', text)
        text = self._URL_RE.sub(' ', text)
        # Leftover apostrophes are dropped in place so "film's" stays one token.
        text = text.replace("'", '')
        text = self._NON_LETTER_RE.sub(' ', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def _is_informative(self, word):
        """Return True when a token should be kept as a feature."""
        if word in self.stop_words:
            return False
        return len(word) > 2 or word in self._NEGATIONS

    def preprocess_text(self, text):
        """Complete text preprocessing pipeline.

        Cleans the text, tokenizes it with ``word_tokenize``, drops stop words
        and tokens of two characters or fewer (negations excepted), and
        lemmatizes what remains.

        Returns:
            The kept lemmas joined by single spaces.
        """
        tokens = word_tokenize(self.clean_text(text))
        return ' '.join(
            self.lemmatizer.lemmatize(word)
            for word in tokens
            if self._is_informative(word)
        )

# STEP 5: Enhanced EDA with better visualizations
def perform_eda(df):
    """Print summary statistics and plot sentiment counts and review lengths.

    Args:
        df: DataFrame with a 'review' text column and a 'sentiment' label
            column. It is read only; no columns are added to it.

    Returns:
        None. Output goes to stdout and to a two-panel matplotlib figure.
    """
    # Character count per review, computed once for both the stats and the plot.
    review_lengths = df['review'].str.len()

    print("📊 Dataset Statistics:")
    print(f"Dataset shape: {df.shape}")
    print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}")
    print(f"Average review length: {review_lengths.mean():.2f} characters")

    # Plot from a derived frame so the helper column never leaks into the
    # caller's DataFrame.
    plot_df = df.assign(review_length=review_lengths)

    # Enhanced sentiment distribution plot
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    sns.countplot(data=plot_df, x='sentiment', palette='viridis')
    plt.title("Sentiment Distribution")
    plt.ylabel("Count")

    plt.subplot(1, 2, 2)
    sns.boxplot(data=plot_df, x='sentiment', y='review_length', palette='viridis')
    plt.title("Review Length by Sentiment")
    plt.ylabel("Review Length (characters)")

    plt.tight_layout()
    plt.show()

def create_enhanced_wordcloud(text_freq, title, colormap='viridis'):
    """Draw a word cloud figure from a word-to-frequency mapping.

    Args:
        text_freq: Mapping of word to frequency. When it is empty (or
            otherwise falsy) a notice is printed and nothing is drawn.
        title: Figure title, also used in the "no text" notice.
        colormap: Matplotlib colormap name passed to WordCloud.

    Returns:
        None.
    """
    if text_freq:
        cloud_options = dict(
            background_color='white',
            max_words=100,
            colormap=colormap,
            width=800,
            height=400,
            relative_scaling=0.5,
            min_font_size=10,
        )
        cloud_image = WordCloud(**cloud_options).generate_from_frequencies(text_freq)

        plt.figure(figsize=(12, 6))
        plt.imshow(cloud_image, interpolation='bilinear')
        plt.axis('off')
        plt.title(title, fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()
    else:
        print(f"No text available for {title}")

# STEP 6: Enhanced Model Training with Hyperparameter Tuning
class SentimentAnalyzer:
    """Train several TF-IDF text classifiers and predict with the best one.

    Attributes:
        preprocessor: TextPreprocessor applied to training reviews and to
            every text passed to predict_sentiment.
        vectorizer: Initialized to None and not used by this class; each
            pipeline carries its own TfidfVectorizer.
        best_model: Fitted pipeline with the highest mean cross-validation
            accuracy, or None until train_models has run.
        models: Mapping of model name to fitted pipeline, filled by
            train_models.
    """

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.vectorizer = None
        self.best_model = None
        self.models = {}

    def prepare_data(self, df):
        """Prepare and split data.

        Adds a 'processed_review' column to ``df`` in place, then performs a
        stratified 80/20 split with a fixed random seed.

        Args:
            df: DataFrame with 'review' and 'sentiment' columns.

        Returns:
            The four-way split from train_test_split, in the order
            X_train, X_test, y_train, y_test.
        """
        df['processed_review'] = df['review'].apply(self.preprocessor.preprocess_text)
        X = df['processed_review']
        y = df['sentiment']

        return train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    def create_pipelines(self):
        """Create the candidate ML pipelines.

        Returns:
            Dict mapping a display name to an unfitted Pipeline made of a
            TfidfVectorizer (uni- and bigrams) followed by a classifier.
        """
        pipelines = {
            'Logistic Regression': Pipeline([
                ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
                ('clf', LogisticRegression(max_iter=1000))
            ]),
            'Naive Bayes': Pipeline([
                ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1,2))),
                ('clf', MultinomialNB())
            ]),
            'SVM': Pipeline([
                ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
                # probability=True so predict_proba is available at inference.
                ('clf', SVC(kernel='linear', probability=True))
            ]),
            'Random Forest': Pipeline([
                ('tfidf', TfidfVectorizer(max_features=8000, ngram_range=(1,2))),
                ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
            ])
        }
        return pipelines

    def train_models(self, X_train, y_train, X_test, y_test):
        """Train multiple models and select the best one.

        Every pipeline is fitted on the training split, scored on the test
        split, and scored again with 5-fold cross-validation on the training
        split. The pipeline with the highest mean CV accuracy becomes
        ``self.best_model``.

        Returns:
            Tuple ``(results, best_model_name)`` where ``results`` maps each
            model name to a dict with keys 'model', 'accuracy', 'cv_mean',
            'cv_std' and 'predictions'.
        """
        pipelines = self.create_pipelines()
        results = {}

        print("🚀 Training Models...")
        for name, pipeline in pipelines.items():
            print(f"\n🔍 Training {name}...")

            # Train model
            pipeline.fit(X_train, y_train)

            # Make predictions
            y_pred = pipeline.predict(X_test)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

            print(f"{name} - Test Accuracy: {accuracy:.4f} | CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})")

            results[name] = {
                'model': pipeline,
                'accuracy': accuracy,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'predictions': y_pred
            }

        # Keep every fitted pipeline reachable, not only the winner.
        self.models = {name: entry['model'] for name, entry in results.items()}

        # Select best model
        best_model_name = max(results, key=lambda x: results[x]['cv_mean'])
        self.best_model = results[best_model_name]['model']

        print(f"\n🏆 Best Model: {best_model_name}")
        print(f"Best CV Score: {results[best_model_name]['cv_mean']:.4f}")

        return results, best_model_name

    def plot_results(self, results, y_test, best_model_name):
        """Plot model comparison and confusion matrix.

        Draws a grouped bar chart of test vs. CV accuracy, the confusion
        matrix of the best model, and, when the best classifier exposes
        ``feature_importances_``, its ten most important features.

        Args:
            results: The results dict returned by train_models.
            y_test: True labels of the test split.
            best_model_name: Key of the best model inside ``results``.
        """
        # Model comparison
        model_names = list(results.keys())
        accuracies = [results[name]['accuracy'] for name in model_names]
        cv_means = [results[name]['cv_mean'] for name in model_names]

        plt.figure(figsize=(15, 5))

        plt.subplot(1, 3, 1)
        x = np.arange(len(model_names))
        width = 0.35
        plt.bar(x - width/2, accuracies, width, label='Test Accuracy', alpha=0.8)
        plt.bar(x + width/2, cv_means, width, label='CV Accuracy', alpha=0.8)
        plt.xlabel('Models')
        plt.ylabel('Accuracy')
        plt.title('Model Performance Comparison')
        plt.xticks(x, model_names, rotation=45)
        plt.legend()

        # Confusion matrix
        plt.subplot(1, 3, 2)
        best_predictions = results[best_model_name]['predictions']
        class_labels = self.best_model.classes_
        # Pin the matrix to the model's classes: without labels= a class that
        # is missing from the test split shrinks the matrix and the tick
        # labels below no longer line up with its rows/columns.
        cm = confusion_matrix(y_test, best_predictions, labels=class_labels)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=class_labels,
                   yticklabels=class_labels)
        plt.title(f'Confusion Matrix - {best_model_name}')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')

        # Feature importance (for tree-based models)
        if hasattr(self.best_model.named_steps['clf'], 'feature_importances_'):
            plt.subplot(1, 3, 3)
            feature_names = self.best_model.named_steps['tfidf'].get_feature_names_out()
            importances = self.best_model.named_steps['clf'].feature_importances_
            top_indices = np.argsort(importances)[-10:]
            plt.barh(range(len(top_indices)), importances[top_indices])
            plt.yticks(range(len(top_indices)), [feature_names[i] for i in top_indices])
            plt.title('Top 10 Feature Importances')

        plt.tight_layout()
        plt.show()

    def predict_sentiment(self, text):
        """Predict sentiment with confidence and additional features.

        Args:
            text: Raw input text.

        Returns:
            A multi-line report string with the predicted label, the highest
            class probability as a percentage, and the TextBlob polarity of
            the raw text; or "Model not trained yet!" when no model has been
            selected.
        """
        if self.best_model is None:
            return "Model not trained yet!"

        # The pipelines were fitted on preprocessed reviews, so the input has
        # to pass through the same preprocessing before it is vectorized.
        processed_text = self.preprocessor.preprocess_text(text)

        # Get prediction
        prediction = self.best_model.predict([processed_text])[0]
        probabilities = self.best_model.predict_proba([processed_text])[0]
        confidence = max(probabilities) * 100

        # Additional features using TextBlob (scored on the raw text)
        blob = TextBlob(text)
        textblob_sentiment = blob.sentiment.polarity

        # Create detailed response
        result = f"🎯 *Predicted Sentiment:* {prediction.capitalize()}\n"
        result += f"📊 *Confidence:* {confidence:.2f}%\n"
        result += f"🔍 *TextBlob Polarity:* {textblob_sentiment:.3f}\n"

        if confidence < 60:
            result += "⚠ *Note:* Low confidence prediction. The text might be ambiguous."

        return result

# STEP 7: Main execution
def main():
    """Run the whole workflow on the module-level ``df``.

    Steps, in order: EDA, train/test split, one word cloud per sentiment,
    model training, result plots, and the classification report of the best
    model.

    Returns:
        The SentimentAnalyzer instance after training.
    """
    from collections import Counter

    sentiment_analyzer = SentimentAnalyzer()

    # Exploratory statistics and plots
    perform_eda(df)

    # Stratified train/test split of the preprocessed reviews
    X_train, X_test, y_train, y_test = sentiment_analyzer.prepare_data(df)

    # Word clouds are built from a separate copy with freshly processed text
    cloud_df = df.copy()
    cloud_df['processed_review'] = cloud_df['review'].apply(
        sentiment_analyzer.preprocessor.preprocess_text
    )

    for label in df['sentiment'].unique():
        reviews_for_label = cloud_df.loc[cloud_df['sentiment'] == label, 'processed_review']
        # Word frequencies across every review carrying this label
        frequencies = Counter(' '.join(reviews_for_label).split())
        create_enhanced_wordcloud(frequencies, f'{label.capitalize()} Reviews WordCloud')

    # Fit all candidates and pick the winner
    results, best_model_name = sentiment_analyzer.train_models(X_train, y_train, X_test, y_test)

    # Comparison chart, confusion matrix, optional feature importances
    sentiment_analyzer.plot_results(results, y_test, best_model_name)

    # Per-class metrics for the winning model
    print(f"\n📈 Classification Report for {best_model_name}:")
    print(classification_report(y_test, results[best_model_name]['predictions']))

    return sentiment_analyzer

# Run main function
# Executes the full workflow at module level; the analyzer it returns is the
# one handed to create_gradio_interface further down.
analyzer = main()

# STEP 8: Enhanced Gradio Interface
def create_gradio_interface(analyzer):
    """Build the Gradio UI that wraps ``analyzer.predict_sentiment``.

    Args:
        analyzer: Object exposing ``predict_sentiment(text)``.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """

    def analyze_text(text):
        # Whitespace-only input gets a prompt instead of a prediction.
        if text.strip():
            return analyzer.predict_sentiment(text)
        return "Please enter some text to analyze."

    # Custom CSS for better styling
    custom_css = """
    .gradio-container {
        max-width: 800px;
        margin: auto;
    }
    .output-text {
        font-family: 'Courier New', monospace;
    }
    """

    review_input = gr.Textbox(
        lines=5,
        placeholder="Enter a movie review or any text to analyze sentiment...",
        label="📝 Input Text"
    )
    result_output = gr.Textbox(
        label="📊 Sentiment Analysis Results",
        lines=6
    )

    # One single-element row per clickable example
    example_rows = [
        ["This movie was absolutely amazing! The acting was superb and the plot was engaging."],
        ["I didn't like this film at all. It was boring and predictable."],
        ["The movie was okay, nothing special but watchable."],
        ["Terrible acting and poor storyline. Complete waste of time."],
        ["Outstanding performances and incredible cinematography made this a masterpiece."]
    ]

    return gr.Interface(
        fn=analyze_text,
        inputs=[review_input],
        outputs=[result_output],
        title="🎬 Advanced Sentiment Analysis Tool",
        description="Enter any text to get detailed sentiment analysis with confidence scores and additional insights.",
        examples=example_rows,
        css=custom_css,
        theme=gr.themes.Soft()
    )

# Create and launch the interface
# NOTE(review): share=True presumably requests a public Gradio share link and
# debug=True presumably keeps the cell attached for error output — confirm
# both are wanted outside a notebook demo.
interface = create_gradio_interface(analyzer)
interface.launch(share=True, debug=True)