# LLM-Based Sentiment Analysis with IndoBERT

This notebook uses the `SentimentAnalyzer` class (imported from the `sentiment_analyzer` module in the parent directory), which provides:
- External data files for slang, emoji, and stopwords
- IndoBERT transformer model for sentiment analysis
- LLM prompting approach as fallback
- Advanced preprocessing and duplicate detection

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import os
import json
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import sys

# Add parent directory to path to import sentiment_analyzer
sys.path.append('..')
from sentiment_analyzer import SentimentAnalyzer

# Plot styling: matplotlib's seaborn-v0_8 style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display options: no cap on the number of columns shown, no fixed
# display width, and cell text limited to 100 characters
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

Libraries imported successfully!


## 1. Initialize Sentiment Analyzer

In [2]:
# Construct the analyzer, pointing it at the ../data directory
print("Initializing SentimentAnalyzer...")
analyzer = SentimentAnalyzer(data_dir="../data")

# Print every key/value pair the analyzer reports about its model
model_info = analyzer.get_model_info()
print("\nModel Information:")
for info_key, info_value in model_info.items():
    print(f"  {info_key}: {info_value}")

print("\nSentiment analyzer ready!")

Initializing SentimentAnalyzer...
Loaded 263 slang mappings
Loaded 678 emoji mappings
Loaded 753 stopwords
Initializing IndoBERT model...
Trying to load model: indobenchmark/indobert-base-p1
Failed to load indobenchmark/indobert-base-p1: There was a specific connection error when trying to load indobenchmark/indobert-base-p1:
401 Client Error: Unauthorized for url: https://huggingface.co/indobenchmark/indobert-base-p1/resolve/main/config.json (Request ID: Root=1-68a7ca6a-2661d7e543641d6158e5e566;f626a2ef-6606-4c32-afba-7d8c28d3888d)

Invalid credentials in Authorization header
Trying to load model: cahya/bert-base-indonesian-522M
Failed to load cahya/bert-base-indonesian-522M: There was a specific connection error when trying to load cahya/bert-base-indonesian-522M:
401 Client Error: Unauthorized for url: https://huggingface.co/cahya/bert-base-indonesian-522M/resolve/main/config.json (Request ID: Root=1-68a7ca6a-0b2ec3dd1d5caf8b5f0cc636;f7c4a6ac-9dcd-4786-80dd-d6d9ea4bff17)

Invalid cr

## 2. Data Loading and Preprocessing

In [3]:
def load_and_clean_data():
    """Load the scraped TikTok and Instagram comments and strip out junk rows.

    Each CSV that exists is read, tagged with a ``platform`` label, and its
    source columns are copied onto the shared ``text`` / ``post_id`` /
    ``user_id`` columns (a missing source column becomes ``''``).  The
    combined frame is then cleaned in three passes:

    1. rows whose text is missing or blank are dropped;
    2. repeat comments by the same user on the same post of the same
       platform are dropped, keeping the first one;
    3. rows whose text is an exact copy of an earlier row are dropped.

    Progress and counts are printed at every step.

    Returns:
        pandas.DataFrame: the cleaned comments as an independent frame with a
        fresh 0..n-1 index, or an empty DataFrame when neither CSV exists.
    """
    def _has_value(column):
        """Boolean mask that is True where an entry is neither NaN nor blank."""
        return column.notna() & (column.astype(str).str.strip() != '')

    # (platform label, CSV path, text column, post-id column, user column)
    sources = [
        ('TikTok', '../data/tiktok/all_tiktok_comments.csv',
         'comment', 'aweme_id', 'username'),
        ('Instagram', '../data/instagram/all_comments.csv',
         'comment_text', 'post_id', 'username'),
    ]

    data_frames = []
    for platform, csv_path, text_col, post_col, user_col in sources:
        if not os.path.exists(csv_path):
            print(f"{platform} data not found")
            continue
        frame = pd.read_csv(csv_path)
        frame['platform'] = platform
        frame['text'] = frame.get(text_col, '')
        frame['post_id'] = frame.get(post_col, '')
        frame['user_id'] = frame.get(user_col, '')
        data_frames.append(frame)
        print(f"Loaded {len(frame)} {platform} comments")

    if not data_frames:
        print("No data found. Please run the scrapers first.")
        return pd.DataFrame()

    # Combine dataframes
    combined_df = pd.concat(data_frames, ignore_index=True)
    print(f"\nTotal comments before cleaning: {len(combined_df)}")

    # Remove empty comments. Text is normalised to str first so that
    # non-string values (e.g. a purely numeric comment) cannot slip past the
    # blank check or break later string operations.
    combined_df = combined_df[combined_df['text'].notna()].copy()
    combined_df['text'] = combined_df['text'].astype(str)
    combined_df = combined_df[combined_df['text'].str.strip() != '']
    print(f"After removing empty comments: {len(combined_df)}")

    # Remove duplicates based on username and post_id (potential spam).
    # Rows without a usable user or post id are never flagged: pandas treats
    # NaN/blank keys as equal, which would otherwise collapse every anonymous
    # comment on a post into one row.  The platform is part of the key because
    # user and post ids are only meaningful within a single platform.
    print(f"\nDuplicate detection:")
    identifiable = _has_value(combined_df['user_id']) & _has_value(combined_df['post_id'])
    duplicates = identifiable & combined_df.duplicated(
        subset=['platform', 'user_id', 'post_id'], keep='first'
    )
    print(f"Found {duplicates.sum()} potential spam comments (same user, same post)")

    # Show some examples of duplicates before removing
    if duplicates.sum() > 0:
        print("\nExamples of potential spam:")
        spam_examples = combined_df[duplicates][['user_id', 'post_id', 'text', 'platform']].head()
        print(spam_examples)

    # Remove duplicates
    combined_df = combined_df[~duplicates]
    print(f"\nAfter removing duplicates: {len(combined_df)}")

    # Also remove exact text duplicates (copy-paste spam)
    text_duplicates = combined_df.duplicated(subset=['text'], keep='first')
    print(f"Found {text_duplicates.sum()} exact text duplicates")
    combined_df = combined_df[~text_duplicates]
    print(f"After removing text duplicates: {len(combined_df)}")

    # Hand back an independent frame (not a filtered slice) so callers can add
    # columns without triggering pandas' SettingWithCopyWarning.
    return combined_df.reset_index(drop=True)

# Run the loader; `df` is the working frame used by the rest of the notebook
df = load_and_clean_data()

# Summarise the dataset only when something was actually loaded
if not df.empty:
    print("\nFinal dataset:")
    comment_total = len(df)
    print(f"Total comments: {comment_total}")
    platform_breakdown = df['platform'].value_counts().to_dict()
    print(f"Platforms: {platform_breakdown}")
    distinct_users = df['user_id'].nunique()
    print(f"Unique users: {distinct_users}")
    distinct_posts = df['post_id'].nunique()
    print(f"Unique posts: {distinct_posts}")

TikTok data not found
Instagram data not found
No data found. Please run the scrapers first.


## 3. Advanced Text Preprocessing

In [4]:
if not df.empty:
    print("Applying advanced preprocessing...")

    # Apply preprocessing. .assign() builds a new frame instead of writing into
    # `df` in place, so this is safe even if `df` is a filtered slice of a
    # larger frame (no SettingWithCopyWarning).
    df = df.assign(cleaned_text=df['text'].apply(analyzer.preprocess_text))

    # Remove comments that became empty after cleaning. The explicit copy makes
    # the filtered frame independent, so the column added below is written to
    # a real frame rather than to a slice.
    df = df[df['cleaned_text'].str.len() > 0].copy()
    print(f"After advanced cleaning: {len(df)} comments")

    # Show examples of preprocessing (first 10 rows, 100 characters each)
    print("\nPreprocessing examples:")
    examples = df[['text', 'cleaned_text']].head(10)
    for original_text, cleaned_text in zip(examples['text'], examples['cleaned_text']):
        # str() guards against a non-string value in the raw text column
        print(f"Original: {str(original_text)[:100]}...")
        print(f"Cleaned:  {str(cleaned_text)[:100]}...")
        print("-" * 50)

    # Also create version without stopwords for word clouds
    df['cleaned_no_stopwords'] = df['cleaned_text'].apply(
        lambda x: analyzer.preprocess_text(x, remove_stopwords=True)
    )

    # Character counts per variant, computed once and reused for both statistics
    original_lengths = df['text'].str.len()
    cleaned_lengths = df['cleaned_text'].str.len()
    no_stopword_lengths = df['cleaned_no_stopwords'].str.len()

    print(f"\nText length statistics:")
    print(f"Original text - Mean: {original_lengths.mean():.1f}, Median: {original_lengths.median():.1f}")
    print(f"Cleaned text - Mean: {cleaned_lengths.mean():.1f}, Median: {cleaned_lengths.median():.1f}")
    print(f"No stopwords - Mean: {no_stopword_lengths.mean():.1f}, Median: {no_stopword_lengths.median():.1f}")

## 4. LLM-Based Sentiment Analysis

In [5]:
if not df.empty:
    print("Performing LLM-based sentiment analysis...")

    # Score each cleaned comment with one analyzer call per row, announcing
    # progress on every tenth row (starting with row 0)
    row_total = len(df)
    sentiment_results = []
    for position, comment in enumerate(df['cleaned_text']):
        if not position % 10:
            print(f"Processing {position}/{row_total} comments...")
        sentiment_results.append(analyzer.analyze_sentiment(comment))

    # Spread the result dicts over columns: 'sentiment' and 'confidence' are
    # required keys, 'method' falls back to 'unknown' when absent
    for required_key in ('sentiment', 'confidence'):
        df[required_key] = [outcome[required_key] for outcome in sentiment_results]
    df['method'] = [outcome.get('method', 'unknown') for outcome in sentiment_results]

    print("\nSentiment analysis completed!")

    print("\nSentiment distribution:")
    print(df['sentiment'].value_counts())

    print("\nMethod distribution:")
    print(df['method'].value_counts())

    print(f"\nAverage confidence: {df['confidence'].mean():.3f}")
    print("Confidence by sentiment:")
    confidence_by_label = df.groupby('sentiment')['confidence']
    print(confidence_by_label.agg(['mean', 'std', 'count']))

## 5. Enhanced Visualizations

In [6]:
if not df.empty:
    # Create comprehensive visualization: a 2x3 grid of summary panels
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Sentiment distribution
    sentiment_counts = df['sentiment'].value_counts()
    axes[0, 0].pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90)
    axes[0, 0].set_title('Sentiment Distribution (LLM-based)')

    # 2. Platform-wise sentiment
    platform_sentiment = pd.crosstab(df['platform'], df['sentiment'])
    platform_sentiment.plot(kind='bar', ax=axes[0, 1], rot=45)
    axes[0, 1].set_title('Sentiment Distribution by Platform')
    axes[0, 1].legend(title='Sentiment')

    # 3. Confidence distribution (mean computed once, used for line and label)
    mean_confidence = df['confidence'].mean()
    axes[0, 2].hist(df['confidence'], bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 2].axvline(mean_confidence, color='red', linestyle='--',
                      label=f'Mean: {mean_confidence:.3f}')
    axes[0, 2].set_title('Confidence Distribution')
    axes[0, 2].set_xlabel('Confidence Score')
    axes[0, 2].set_ylabel('Frequency')
    axes[0, 2].legend()

    # 4. Method distribution
    method_counts = df['method'].value_counts()
    axes[1, 0].bar(method_counts.index, method_counts.values)
    axes[1, 0].set_title('Analysis Method Distribution')
    axes[1, 0].set_xlabel('Method')
    axes[1, 0].set_ylabel('Count')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # 5. Confidence by sentiment
    df.boxplot(column='confidence', by='sentiment', ax=axes[1, 1])
    # boxplot(by=...) sets a figure-level "Boxplot grouped by sentiment"
    # suptitle, which would sit above all six panels; clear it so only the
    # per-panel titles remain.
    fig.suptitle('')
    axes[1, 1].set_title('Confidence by Sentiment')
    axes[1, 1].set_xlabel('Sentiment')
    axes[1, 1].set_ylabel('Confidence')

    # 6. Text length vs confidence (the length column is stored on df)
    df['text_length'] = df['cleaned_text'].str.len()
    axes[1, 2].scatter(df['text_length'], df['confidence'], alpha=0.6)
    axes[1, 2].set_title('Text Length vs Confidence')
    axes[1, 2].set_xlabel('Text Length (characters)')
    axes[1, 2].set_ylabel('Confidence')

    plt.tight_layout()
    plt.show()

## 6. Word Clouds

In [7]:
def create_wordcloud(text_data, title="Word Cloud", max_words=100):
    """Render a word cloud for a collection of comment strings.

    Args:
        text_data: iterable of strings (list, Series, generator, ...).  ``None``
            and empty collections are accepted; entries that are not strings
            or are blank are ignored.
        title: heading drawn above the plot and used in diagnostic messages.
        max_words: maximum number of words passed to ``WordCloud``.

    Returns:
        None.  The figure is shown with ``plt.show()``; when there is nothing
        to plot a message is printed instead and no figure is created.
    """
    # Materialise first so arrays/Series/generators all behave like a list
    # (truth-testing a Series or ndarray directly raises ValueError).
    entries = [] if text_data is None else list(text_data)
    if not entries:
        print("No text data available for word cloud")
        return

    # Keep only real, non-blank strings; None/NaN entries would make
    # str.join raise TypeError.
    usable_texts = [entry for entry in entries if isinstance(entry, str) and entry.strip()]
    if not usable_texts:
        print(f"No meaningful text for {title}")
        return

    # Combine all text
    all_text = ' '.join(usable_texts)

    # Create word cloud. generate() raises ValueError when no words survive
    # its own tokenisation/stopword filtering; report that and return rather
    # than aborting the caller's loop over sentiments/platforms.
    try:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            max_words=max_words,
            colormap='viridis'
        ).generate(all_text)
    except ValueError as error:
        print(f"No meaningful text for {title}: {error}")
        return

    # Display
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

if not df.empty:
    # One cloud for the whole corpus, built from the stopword-free text
    create_wordcloud(df['cleaned_no_stopwords'].tolist(), "Overall Comments Word Cloud")

    # One cloud per sentiment label; labels with no comments are skipped
    for sentiment in ('positif', 'negatif', 'netral'):
        label_mask = df['sentiment'] == sentiment
        sentiment_texts = df.loc[label_mask, 'cleaned_no_stopwords'].tolist()
        if not sentiment_texts:
            continue
        create_wordcloud(sentiment_texts, f"{sentiment.title()} Comments Word Cloud")

    # One cloud per platform present in the data
    for platform in df['platform'].unique():
        platform_mask = df['platform'] == platform
        platform_texts = df.loc[platform_mask, 'cleaned_no_stopwords'].tolist()
        if not platform_texts:
            continue
        create_wordcloud(platform_texts, f"{platform} Comments Word Cloud")

## 7. Save Results and Model Summary

In [8]:
if not df.empty:
    # Save the processed dataset
    output_path = '../data/llm_sentiment_analysis_results.csv'
    df.to_csv(output_path, index=False)
    print(f"Processed data saved to: {output_path}")

    # Aggregates reused by both the JSON summary and the printed report
    comment_total = len(df)
    sentiment_totals = df['sentiment'].value_counts()
    method_totals = df['method'].value_counts()
    mean_confidence = df['confidence'].mean()
    model_details = analyzer.get_model_info()

    # Create comprehensive summary
    summary_stats = {
        'total_comments': comment_total,
        'platform_distribution': df['platform'].value_counts().to_dict(),
        'sentiment_distribution': sentiment_totals.to_dict(),
        'method_distribution': method_totals.to_dict(),
        'average_confidence': float(mean_confidence),
        'confidence_by_sentiment': df.groupby('sentiment')['confidence'].mean().to_dict(),
        'model_info': model_details,
        'preprocessing_stats': {
            'original_avg_length': float(df['text'].str.len().mean()),
            'cleaned_avg_length': float(df['cleaned_text'].str.len().mean()),
            'no_stopwords_avg_length': float(df['cleaned_no_stopwords'].str.len().mean())
        }
    }

    # Serialise before opening the file: opening with 'w' truncates it, so a
    # failure part-way through json.dump would leave a corrupt summary behind.
    # default=str stringifies any value json cannot encode natively.
    summary_json = json.dumps(summary_stats, indent=2, ensure_ascii=False, default=str)
    with open('../data/llm_model_summary.json', 'w', encoding='utf-8') as f:
        f.write(summary_json)

    print("Model summary saved for Flask app")

    # Display final statistics
    print("\n=== LLM SENTIMENT ANALYSIS SUMMARY ===")
    print(f"Total comments analyzed: {comment_total}")
    print(f"\nSentiment Distribution:")
    for sentiment, count in sentiment_totals.items():
        percentage = (count / comment_total) * 100
        print(f"  {sentiment}: {count} ({percentage:.1f}%)")

    print(f"\nMethod Distribution:")
    for method, count in method_totals.items():
        percentage = (count / comment_total) * 100
        print(f"  {method}: {count} ({percentage:.1f}%)")

    print(f"\nAverage Confidence: {mean_confidence:.3f}")
    print(f"\nModel Information:")
    for key, value in model_details.items():
        print(f"  {key}: {value}")

    print("\nLLM-based sentiment analysis complete!")