# 📊 CoreTax Sentiment Analysis - Complete Pipeline
## Analisis Sentimen Data Combined dari Semua Platform

**Data Source:** `CoreTax Combined Data Clean.csv`

**Pipeline:**
1. ✅ Load & Explore Data
2. ✅ Text Preprocessing (with visualization)
3. ✅ Sentiment Labeling (RoBERTa Model)
4. ✅ Sentiment Analysis & Visualization
5. ✅ Keyword Extraction (TF-IDF & IndoBERT)
6. ✅ WordCloud Generation

## 1️⃣ Setup & Import Libraries

In [None]:
# Colab-only setup: attach Google Drive so the dataset can be read from it
import os
from google.colab import drive

print("Mounting Google Drive...")
drive.mount('/content/drive/')
print("‚úì Google Drive mounted!")

# Folder (with trailing slash) and file name of the input CSV; later cells
# build the full path by concatenating the two.
DATA_PATH = '/content/drive/MyDrive/Hackathon/data-actual/'
FILE_NAME = 'CoreTax Combined Data Clean.csv'

print(f"\nData path: {DATA_PATH}\nFile name: {FILE_NAME}")

In [None]:
# Install required packages
!pip install -q transformers torch Sastrawi wordcloud scikit-learn matplotlib seaborn pandas numpy tqdm sentence-transformers

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re
import warnings
import torch
warnings.filterwarnings('ignore')

# NLP Libraries
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import pipeline, AutoTokenizer, AutoModel

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# TF-IDF & Clustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Progress bar
from tqdm import tqdm
tqdm.pandas()

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("‚úì All libraries imported successfully!")

## 2️⃣ Load & Explore Data

In [None]:
# Read the input CSV located at DATA_PATH + FILE_NAME and print a quick overview
banner_rule = "=" * 80
print(banner_rule)
print("LOADING DATA")
print(banner_rule)

csv_path = DATA_PATH + FILE_NAME
df = pd.read_csv(csv_path, encoding='utf-8')

loaded_rows = len(df)
column_names = df.columns.tolist()
print("\n‚úì Data loaded successfully!")
print(f"Total rows: {loaded_rows:,}")
print(f"\nColumns: {column_names}")
print(f"\nData shape: {df.shape}")

# Show the first ten records (display() is provided by the notebook runtime)
print("\n" + banner_rule)
print("PREVIEW DATA")
print(banner_rule)
display(df.head(10))

In [None]:
# Structural overview of the frame: dtypes, missing values, rows per source
banner_rule = "=" * 80

print(banner_rule)
print("DATA INFORMATION")
print(banner_rule)
df.info()

print("\n" + banner_rule)
print("MISSING VALUES")
print(banner_rule)
print(df.isnull().sum())

# Counted once and reused for both the printout and the bar chart
source_counts = df['source'].value_counts()

print("\n" + banner_rule)
print("DISTRIBUTION BY SOURCE")
print(banner_rule)
print(source_counts)

# Bar chart of the number of rows per source
plt.figure(figsize=(10, 6))
source_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribution of Data by Source', fontsize=16, fontweight='bold')
plt.xlabel('Source', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 3️⃣ Text Preprocessing

### Preprocessing Steps:
1. Lowercase conversion
2. Remove URLs, mentions, emails
3. Strip the `#` symbol from hashtags (the hashtag word itself is kept)
4. Remove special characters & numbers
5. Normalize slang words
6. Remove stopwords
7. Stemming

In [None]:
# Slang dictionary (Indonesian): informal token -> normalized replacement.
# An empty replacement means "drop this token" (fillers, laughter).
#
# Every target is already in its final form. The normalizer does a single
# lookup per token, so a target that is itself a key (e.g. 'keren' -> 'bagus'
# while 'bagus' -> 'baik') would never be resolved and the same concept would
# end up under two different words in the keyword statistics.
slang_dict = {
    'gak': 'tidak', 'ga': 'tidak', 'gk': 'tidak', 'ngga': 'tidak', 'nggak': 'tidak',
    'udah': 'sudah', 'udh': 'sudah', 'dah': 'sudah',
    'aja': 'saja', 'aj': 'saja',
    'banget': 'sangat', 'bgt': 'sangat',
    'emang': 'memang', 'emg': 'memang',
    'gimana': 'bagaimana', 'gmn': 'bagaimana', 'gmana': 'bagaimana',
    'kenapa': 'mengapa', 'knp': 'mengapa', 'knapa': 'mengapa',
    'kok': 'mengapa',
    'dong': '', 'sih': '',
    'nih': 'ini', 'tuh': 'itu',
    'yg': 'yang', 'dgn': 'dengan', 'utk': 'untuk', 'krn': 'karena',
    'klo': 'kalau', 'kalo': 'kalau',
    'tp': 'tapi', 'tpi': 'tapi',
    'jd': 'jadi', 'jdi': 'jadi',
    'lg': 'lagi', 'lgi': 'lagi',
    'sm': 'sama', 'dr': 'dari', 'dri': 'dari',
    'sdh': 'sudah', 'blm': 'belum', 'blum': 'belum',
    'hrs': 'harus', 'trs': 'terus', 'trus': 'terus',
    'bisa': 'bisa', 'bs': 'bisa',
    'cuma': 'hanya', 'cm': 'hanya',
    'org': 'orang',
    'skrg': 'sekarang', 'skr': 'sekarang',
    'bgm': 'bagaimana', 'dmn': 'dimana',
    'mksh': 'terima kasih', 'makasih': 'terima kasih', 'thx': 'terima kasih',
    'pls': 'tolong', 'plz': 'tolong', 'plis': 'tolong',
    'wkwk': '', 'wkwkwk': '', 'haha': '', 'hehe': '', 'hihi': '',
    # Profanity / insults collapse to the same canonical word as 'jelek' below
    'anjir': 'buruk', 'anjing': 'buruk', 'anj': 'buruk',
    'bangsat': 'buruk', 'bngst': 'buruk',
    'tai': 'buruk', 'taik': 'buruk',
    'ampas': 'buruk', 'sampah': 'buruk',
    'eror': 'error', 'erorr': 'error', 'erorrr': 'error',
    'lemot': 'lambat', 'lelet': 'lambat',
    'ribet': 'rumit', 'ruwet': 'rumit',
    'susah': 'sulit', 'gampang': 'mudah',
    # Praise collapses to the same canonical word as 'bagus' below
    'keren': 'baik', 'mantap': 'baik', 'mantul': 'baik',
    'jelek': 'buruk', 'parah': 'buruk', 'payah': 'buruk',
    'bagus': 'baik', 'oke': 'baik', 'ok': 'baik',
}

# Custom stopwords: domain terms, forms of address and particles that are
# added on top of the Sastrawi stopword list.
custom_stopwords = [
    'coretax', 'djp', 'pajak', 'npwp', 'kpp', 'aplikasi', 'website', 'web', 'app',
    'bang', 'kak', 'min', 'pak', 'bu', 'mas', 'mbak',
    'yg', 'nya', 'nih', 'tuh', 'dong', 'sih', 'deh', 'lah', 'kok',
]

print("‚úì Slang dictionary and custom stopwords loaded!")
print(f"  - Slang words: {len(slang_dict)}")
print(f"  - Custom stopwords: {len(custom_stopwords)}")

In [None]:
# Initialize Sastrawi: build the stemmer and the combined stopword collection
print("Initializing Sastrawi...")
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
# Stored as a set: remove_stopwords() tests membership once per token, which
# is O(1) on a set but a linear scan on a list. The set also drops duplicate
# entries, so the count printed below is the number of distinct stopwords.
stopwords = set(stopword_factory.get_stop_words())
stopwords.update(custom_stopwords)

print(f"‚úì Sastrawi initialized!")
print(f"  - Total stopwords: {len(stopwords)}")

In [None]:
def clean_text(text):
    """Lowercase *text* and strip everything except plain ASCII letters.

    Steps, in order: lowercase; remove URLs; remove e-mail addresses; remove
    @mentions; drop the '#' of hashtags (the word is kept); delete digits;
    replace every remaining non-letter character with a space; collapse
    whitespace.

    Args:
        text: Raw value from the data frame. Non-string values are converted
            with ``str()``.

    Returns:
        The cleaned string, or ``""`` when *text* is missing (NaN/None) or
        empty.
    """
    if pd.isna(text) or text == '':
        return ""
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    # E-mail addresses must go BEFORE mentions: otherwise the mention rule
    # eats "@example" out of "user@example.com" and leaves "user" and "com"
    # behind as junk tokens. A dotted domain is required so that a mention
    # glued to a word ("halo,@djp") is not mistaken for an address.
    text = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    # Digits are deleted (not replaced by a space) so "co2re" becomes "core"
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_slang(text, mapping=None):
    """Replace slang tokens in *text* with their normalized form.

    The text is split on whitespace and every token is looked up once in
    *mapping*; tokens without an entry are kept unchanged. Tokens whose
    replacement is the empty string are dropped, so the result never
    contains doubled spaces.

    Args:
        text: Whitespace-separated text.
        mapping: Token -> replacement dictionary. Defaults to the
            module-level ``slang_dict``.

    Returns:
        The normalized text, tokens joined by single spaces.
    """
    if mapping is None:
        mapping = slang_dict
    replaced = (mapping.get(word, word) for word in text.split())
    # Skip empty replacements instead of joining them in as "" (which would
    # leave two consecutive spaces in the output).
    return ' '.join(word for word in replaced if word)

def remove_stopwords(text):
    """Return *text* without stopwords and without tokens of length <= 2.

    Tokens are obtained by whitespace splitting and compared against the
    module-level ``stopwords`` collection; survivors are re-joined with
    single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stopwords or len(token) <= 2:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def stem_text(text):
    """Run *text* through the module-level ``stemmer`` and return the result."""
    stemmed = stemmer.stem(text)
    return stemmed

print("‚úì Preprocessing functions defined!")

In [None]:
# Run the four preprocessing stages over the chosen text column
banner_rule = "=" * 80
print(banner_rule)
print("APPLYING PREPROCESSING")
print(banner_rule)

# Prefer 'cleaned_text' when the column exists and holds at least one value;
# otherwise fall back to the raw 'text' column.
has_cleaned_text = 'cleaned_text' in df.columns and not df['cleaned_text'].isna().all()
text_column = 'cleaned_text' if has_cleaned_text else 'text'

print(f"\nUsing column: {text_column}")
df['original_text'] = df[text_column].copy()

# (progress message, transformation) pairs, executed in this order
preprocessing_steps = (
    ("\n1. Cleaning text...", clean_text),
    ("\n2. Normalizing slang...", normalize_slang),
    ("\n3. Removing stopwords...", remove_stopwords),
    ("\n4. Applying stemming...", stem_text),
)

stage_input = df[text_column]
for step_message, step_function in preprocessing_steps:
    print(step_message)
    df['preprocessed'] = stage_input.progress_apply(step_function)
    stage_input = df['preprocessed']

# Discard rows whose text became empty during preprocessing
has_content = df['preprocessed'].str.len() > 0
df = df.loc[has_content].reset_index(drop=True)

print(f"\n‚úì Preprocessing complete!")
print(f"Total data after preprocessing: {len(df):,} rows")

### 📊 Visualization: Before vs After Preprocessing

In [None]:
# Sample comparison: print up to ten random rows before and after preprocessing
print("=" * 80)
print("BEFORE vs AFTER PREPROCESSING - SAMPLE COMPARISON")
print("=" * 80)

# Cap the sample at the number of available rows: df.sample(n=10) raises
# ValueError when fewer than ten rows survived preprocessing.
sample_count = min(10, len(df))
sample_df = df.sample(n=sample_count, random_state=42)[['original_text', 'preprocessed', 'source']]
# Number the samples 1..sample_count for display
sample_df.index = range(1, sample_count + 1)

for idx, row in sample_df.iterrows():
    # str() guards the slice below against non-string cells (e.g. numbers)
    before_text = str(row['original_text'])
    after_text = str(row['preprocessed'])
    print(f"\n{'='*80}")
    print(f"Sample #{idx} | Source: {row['source']}")
    print(f"{'='*80}")
    print(f"BEFORE: {before_text[:200]}...")
    print(f"\nAFTER:  {after_text[:200]}...")

In [None]:
# Character-length distributions before and after preprocessing
df['original_length'] = df['original_text'].str.len()
df['preprocessed_length'] = df['preprocessed'].str.len()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# One entry per panel: (length column, title, histogram color, mean-line color)
length_panels = (
    ('original_length', 'Text Length Distribution - BEFORE Preprocessing', 'coral', 'red'),
    ('preprocessed_length', 'Text Length Distribution - AFTER Preprocessing', 'lightgreen', 'green'),
)

for panel_ax, (length_column, panel_title, fill_color, mean_color) in zip(axes, length_panels):
    lengths = df[length_column]
    mean_length = lengths.mean()
    panel_ax.hist(lengths, bins=50, color=fill_color, edgecolor='black', alpha=0.7)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Character Count', fontsize=12)
    panel_ax.set_ylabel('Frequency', fontsize=12)
    # Dashed vertical line marks the mean length
    panel_ax.axvline(mean_length, color=mean_color, linestyle='--', linewidth=2, label=f'Mean: {mean_length:.0f}')
    panel_ax.legend()
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

mean_before = df['original_length'].mean()
mean_after = df['preprocessed_length'].mean()
median_before = df['original_length'].median()
median_after = df['preprocessed_length'].median()

print(f"\nStatistics:")
print(f"  BEFORE - Mean: {mean_before:.2f}, Median: {median_before:.2f}")
print(f"  AFTER  - Mean: {mean_after:.2f}, Median: {median_after:.2f}")
# Relative drop of the mean length, in percent
print(f"  Reduction: {((mean_before - mean_after) / mean_before * 100):.2f}%")

## 4️⃣ Sentiment Labeling with RoBERTa

Using: `w11wo/indonesian-roberta-base-sentiment-classifier`

In [None]:
# Build the Hugging Face sentiment pipeline used for labeling
banner_rule = "=" * 80
print(banner_rule)
print("INITIALIZING ROBERTA SENTIMENT CLASSIFIER")
print(banner_rule)

print("\nLoading model: w11wo/indonesian-roberta-base-sentiment-classifier")
print("This may take a few minutes...\n")

# Device index 0 selects the first CUDA GPU, -1 selects the CPU
gpu_available = torch.cuda.is_available()
pipeline_device = 0 if gpu_available else -1

sentiment_classifier = pipeline(
    "sentiment-analysis",
    model="w11wo/indonesian-roberta-base-sentiment-classifier",
    device=pipeline_device
)

print("\n‚úì RoBERTa model loaded successfully!")
print(f"  - Device: {'GPU' if gpu_available else 'CPU'}")

In [None]:
# Maps the classifier's raw label to the three names used in this notebook.
# NOTE(review): the 'LABEL_n' fallbacks assume 0=negative, 1=neutral,
# 2=positive - confirm against the model config's id2label before relying
# on them.
_SENTIMENT_LABEL_MAP = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive',
    'negative': 'negative',
    'neutral': 'neutral',
    'positive': 'positive',
}


def get_sentiment_roberta(text):
    """Classify one text with the module-level ``sentiment_classifier``.

    Args:
        text: The text to classify. Missing values (NaN/None) and strings
            that are empty or whitespace-only are not sent to the model.

    Returns:
        A ``(label, score)`` tuple. ``label`` is 'positive', 'neutral' or
        'negative'; ``score`` is the value under the result's 'score' key.
        ``('neutral', 0.0)`` is returned for missing/blank input and when
        the classifier raises.
    """
    if pd.isna(text):
        return 'neutral', 0.0

    text = str(text).strip()
    if not text:
        return 'neutral', 0.0

    try:
        # Truncate by tokens inside the tokenizer rather than by slicing 512
        # characters: a character slice can still exceed the token limit and
        # needlessly discards most of a long text.
        result = sentiment_classifier(text, truncation=True, max_length=512)[0]
        sentiment = _SENTIMENT_LABEL_MAP.get(result['label'], 'neutral')
        return sentiment, result['score']
    except Exception as e:
        # Deliberately best-effort: one bad row must not abort the whole run
        print(f"Error processing text: {e}")
        return 'neutral', 0.0

print("‚úì Sentiment function defined!")

In [None]:
# Apply sentiment labeling to the original (unprocessed) text of every row
print("=" * 80)
print("APPLYING SENTIMENT LABELING")
print("=" * 80)

print(f"\nProcessing {len(df):,} texts...")
print("This may take a while depending on your hardware...\n")

# Collect plain (label, score) tuples and assign the two columns afterwards.
# Building a pd.Series per row inside apply() is far slower and yields the
# same values.
sentiment_results = [
    get_sentiment_roberta(text)
    for text in tqdm(df['original_text'], total=len(df), desc="Analyzing sentiment")
]
df['sentiment_roberta'] = [label for label, _ in sentiment_results]
df['sentiment_score_roberta'] = [score for _, score in sentiment_results]

print(f"\n‚úì Sentiment labeling complete!")
print(f"\nSentiment distribution:")
print(df['sentiment_roberta'].value_counts())
print(f"\nAverage confidence score: {df['sentiment_score_roberta'].mean():.4f}")

## 5️⃣ Sentiment Analysis & Visualization

In [None]:
# Overall sentiment distribution: bar chart (left) and pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Palette shared by the sentiment plots in the following cells
colors = {'positive': '#2ecc71', 'neutral': '#f39c12', 'negative': '#e74c3c'}
sentiment_counts = df['sentiment_roberta'].value_counts()
bar_colors = [colors.get(label, 'gray') for label in sentiment_counts.index]
total_rows = len(df)

bar_ax.bar(sentiment_counts.index, sentiment_counts.values, color=bar_colors, edgecolor='black', alpha=0.8)
bar_ax.set_title('Overall Sentiment Distribution', fontsize=16, fontweight='bold')
bar_ax.set_xlabel('Sentiment', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.grid(axis='y', alpha=0.3)

# Write the absolute count and its share of all rows above each bar
for bar_position, count in enumerate(sentiment_counts.values):
    share = count / total_rows * 100
    bar_ax.text(bar_position, count, f'{count:,}\n({share:.1f}%)',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

pie_ax.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%',
           colors=bar_colors,
           startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
pie_ax.set_title('Sentiment Proportion', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Sentiment by source: row-normalized percentages as a grouped bar chart
source_column = df['source']
label_column = df['sentiment_roberta']
sentiment_by_source = pd.crosstab(source_column, label_column, normalize='index') * 100

column_colors = [colors.get(label, 'gray') for label in sentiment_by_source.columns]

fig, ax = plt.subplots(figsize=(14, 8))
sentiment_by_source.plot(kind='bar', ax=ax, color=column_colors,
                         edgecolor='black', alpha=0.8)
ax.set_title('Sentiment Distribution by Source (%)', fontsize=16, fontweight='bold')
ax.set_xlabel('Source', fontsize=12)
ax.set_ylabel('Percentage (%)', fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Sentiment', title_fontsize=12, fontsize=11)
ax.grid(axis='y', alpha=0.3)

# Annotate every bar with its percentage
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%.1f%%', fontsize=9)

plt.tight_layout()
plt.show()

# Same breakdown as absolute counts
print("\nSentiment distribution by source (count):")
print(pd.crosstab(source_column, label_column))

In [None]:
# Confidence score distribution: one histogram per sentiment class
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for panel_ax, sentiment in zip(axes, ['positive', 'neutral', 'negative']):
    class_mask = df['sentiment_roberta'] == sentiment
    data = df.loc[class_mask, 'sentiment_score_roberta']
    mean_score = data.mean()

    panel_ax.hist(data, bins=30, color=colors[sentiment], edgecolor='black', alpha=0.7)
    panel_ax.set_title(f'{sentiment.capitalize()} - Confidence Score Distribution',
                       fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Confidence Score', fontsize=11)
    panel_ax.set_ylabel('Frequency', fontsize=11)
    # Dashed red line marks the mean score of the class
    panel_ax.axvline(mean_score, color='red', linestyle='--', linewidth=2,
                     label=f'Mean: {mean_score:.3f}')
    panel_ax.legend()
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 6️⃣ Keyword Extraction

### Method 1: TF-IDF (Statistical Approach)

In [None]:
print("=" * 80)
print("KEYWORD EXTRACTION USING TF-IDF")
print("=" * 80)

def extract_keywords_tfidf(texts, top_n=20):
    """Rank unigrams and bigrams of *texts* by their summed TF-IDF weight.

    Args:
        texts: Iterable of documents (strings).
        top_n: Number of highest-scoring terms to return.

    Returns:
        DataFrame with columns 'keyword' and 'tfidf_score', sorted by score
        in descending order and limited to *top_n* rows.
    """
    # Vocabulary is capped at 1000 features
    vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
    weights = vectorizer.fit_transform(texts)

    # Sum each term's weight over all documents; .A1 flattens the 1 x n
    # matrix returned by sum() into a 1-D array.
    summed_weights = weights.sum(axis=0).A1
    terms = vectorizer.get_feature_names_out()

    score_table = pd.DataFrame({'keyword': terms, 'tfidf_score': summed_weights})
    ranked = score_table.sort_values('tfidf_score', ascending=False)
    return ranked.head(top_n)

# Print and plot the top TF-IDF keywords of every sentiment class
for sentiment in ['positive', 'neutral', 'negative']:
    section_rule = '=' * 80
    print(f"\n{section_rule}")
    print(f"Top Keywords (TF-IDF) - {sentiment.upper()}")
    print(f"{section_rule}")

    class_mask = df['sentiment_roberta'] == sentiment
    texts = df.loc[class_mask, 'preprocessed'].tolist()

    # Nothing to extract when no row carries this label
    if not texts:
        print(f"No data for {sentiment} sentiment")
        continue

    keywords = extract_keywords_tfidf(texts, top_n=20)
    print(keywords.to_string(index=False))

    plt.figure(figsize=(12, 6))
    plt.barh(keywords['keyword'], keywords['tfidf_score'], color=colors[sentiment], edgecolor='black', alpha=0.8)
    plt.xlabel('TF-IDF Score', fontsize=12)
    plt.ylabel('Keyword', fontsize=12)
    plt.title(f'Top 20 Keywords (TF-IDF) - {sentiment.capitalize()} Sentiment', fontsize=14, fontweight='bold')
    # Highest-scoring keyword at the top of the chart
    plt.gca().invert_yaxis()
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

### Method 2: IndoBERT (Contextual Approach)

Using: `indobenchmark/indobert-base-p1`

In [None]:
# Load the IndoBERT tokenizer and encoder used for embedding-based keywords
banner_rule = "=" * 80
print(banner_rule)
print("INITIALIZING INDOBERT FOR KEYWORD EXTRACTION")
print(banner_rule)

print("\nLoading model: indobenchmark/indobert-base-p1")
print("This may take a few minutes...\n")

indobert_checkpoint = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(indobert_checkpoint)
model = AutoModel.from_pretrained(indobert_checkpoint)

# Run on the GPU when one is available; eval() switches the model to
# inference mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model.eval()

print("\n‚úì IndoBERT model loaded successfully!")
print(f"  - Device: {device}")

In [None]:
def get_bert_embeddings(texts, batch_size=32):
    """Embed *texts* with the module-level ``tokenizer`` and ``model``.

    The texts are processed in slices of *batch_size*. For every text the
    vector at sequence position 0 of the model's last hidden state (the
    [CLS] token) is used as its embedding.

    Args:
        texts: Sliceable sequence of strings.
        batch_size: Number of texts per forward pass.

    Returns:
        NumPy array built from the collected per-text vectors.
    """
    cls_vectors = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts, desc="Generating embeddings"):
        chunk = texts[start:start + batch_size]

        # Pad to the longest text in the chunk, truncate at 512 tokens
        tokens = tokenizer(chunk, padding=True, truncation=True,
                           max_length=512, return_tensors='pt')
        # Inputs must live on the same device as the model
        tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

        # Inference only: no gradients needed
        with torch.no_grad():
            hidden_states = model(**tokens).last_hidden_state
            chunk_vectors = hidden_states[:, 0, :].cpu().numpy()
            cls_vectors.extend(chunk_vectors)

    return np.array(cls_vectors)

def extract_keywords_bert(texts, top_n=20, sample_size=1000, random_state=42,
                          n_candidates=100):
    """Rank frequent words by embedding similarity to the average document.

    The most frequent words of (a sample of) *texts* are taken as keyword
    candidates. Documents and candidates are embedded with
    ``get_bert_embeddings`` and each candidate is scored by its cosine
    similarity to the mean document embedding.

    Args:
        texts: List of whitespace-tokenizable documents.
        top_n: Number of keywords to return.
        sample_size: Maximum number of documents to embed; larger inputs are
            randomly sampled down to this size.
        random_state: Seed for the sampling so repeated runs give the same
            keywords. Pass ``None`` for a different sample on every call.
        n_candidates: Number of most frequent words considered as candidates.

    Returns:
        DataFrame with columns 'keyword', 'bert_score' and 'frequency',
        sorted by 'bert_score' descending and limited to *top_n* rows. The
        frame is empty when *texts* contains no words.
    """
    from collections import Counter
    import random

    # Sample texts if too many (seeded, so the result is reproducible)
    if len(texts) > sample_size:
        sample_texts = random.Random(random_state).sample(list(texts), sample_size)
    else:
        sample_texts = list(texts)

    # Word frequencies over the sampled documents
    word_freq = Counter(word for text in sample_texts for word in text.split())

    # The most frequent words are the keyword candidates
    top_words = [word for word, _ in word_freq.most_common(n_candidates)]

    # Without any candidate there is nothing to embed; running the model on
    # empty input would fail further down in the similarity computation.
    if not top_words:
        return pd.DataFrame(columns=['keyword', 'bert_score', 'frequency'])

    # Get embeddings for documents and words
    print(f"  Processing {len(sample_texts)} documents...")
    doc_embeddings = get_bert_embeddings(sample_texts)

    print(f"  Processing {len(top_words)} candidate keywords...")
    word_embeddings = get_bert_embeddings(top_words)

    # Mean document embedding, reshaped to a single row for cosine_similarity
    avg_doc_embedding = doc_embeddings.mean(axis=0).reshape(1, -1)

    # One similarity value per candidate word
    similarities = cosine_similarity(word_embeddings, avg_doc_embedding).flatten()

    keywords_df = pd.DataFrame({
        'keyword': top_words,
        'bert_score': similarities,
        'frequency': [word_freq[word] for word in top_words]
    }).sort_values('bert_score', ascending=False).head(top_n)

    return keywords_df

print("‚úì IndoBERT keyword extraction functions defined!")

In [None]:
# Print and plot the IndoBERT-ranked keywords of every sentiment class
banner_rule = "=" * 80
print(banner_rule)
print("KEYWORD EXTRACTION USING INDOBERT")
print(banner_rule)

for sentiment in ['positive', 'neutral', 'negative']:
    print(f"\n{banner_rule}")
    print(f"Top Keywords (IndoBERT) - {sentiment.upper()}")
    print(f"{banner_rule}")

    class_mask = df['sentiment_roberta'] == sentiment
    texts = df.loc[class_mask, 'preprocessed'].tolist()

    # Nothing to extract when no row carries this label
    if not texts:
        print(f"No data for {sentiment} sentiment")
        continue

    keywords = extract_keywords_bert(texts, top_n=20, sample_size=1000)
    print(keywords.to_string(index=False))

    # Left panel: similarity score; right panel: raw frequency of the same words
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    class_name = sentiment.capitalize()
    keyword_panels = (
        ('bert_score', 'BERT Similarity Score', f'Top 20 Keywords (IndoBERT) - {class_name}', 0.8),
        ('frequency', 'Frequency', f'Keyword Frequency - {class_name}', 0.6),
    )

    for panel_ax, (value_column, x_label, panel_title, bar_alpha) in zip(axes, keyword_panels):
        panel_ax.barh(keywords['keyword'], keywords[value_column],
                      color=colors[sentiment], edgecolor='black', alpha=bar_alpha)
        panel_ax.set_xlabel(x_label, fontsize=12)
        panel_ax.set_ylabel('Keyword', fontsize=12)
        panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
        # First row of the table at the top of the chart
        panel_ax.invert_yaxis()
        panel_ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

## 7️⃣ WordCloud Generation

In [None]:
# One word cloud per sentiment class, drawn side by side
banner_rule = "=" * 80
print(banner_rule)
print("GENERATING WORDCLOUDS")
print(banner_rule)

fig, axes = plt.subplots(1, 3, figsize=(20, 6))

# Colormap per class; any label not listed here falls back to 'Reds'
cloud_colormaps = {'positive': 'Greens', 'neutral': 'Oranges'}

for panel_ax, sentiment in zip(axes, ['positive', 'neutral', 'negative']):
    print(f"\nGenerating WordCloud for {sentiment}...")

    class_mask = df['sentiment_roberta'] == sentiment
    texts = ' '.join(df.loc[class_mask, 'preprocessed'].tolist())

    if texts:
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap=cloud_colormaps.get(sentiment, 'Reds'),
            max_words=100,
            relative_scaling=0.5,
            min_font_size=10
        ).generate(texts)

        panel_ax.imshow(wordcloud, interpolation='bilinear')
        panel_ax.set_title(f'{sentiment.capitalize()} Sentiment', fontsize=16, fontweight='bold')
    else:
        # Placeholder text when the class has no rows
        panel_ax.text(0.5, 0.5, f'No data for {sentiment}',
                      ha='center', va='center', fontsize=14)
    panel_ax.axis('off')

plt.tight_layout()
plt.show()

print("\n‚úì WordClouds generated!")

## 8️⃣ Export Results

In [None]:
# Write the labeled data to a CSV next to the input file and offer a download
print("=" * 80)
print("EXPORTING RESULTS")
print("=" * 80)

# Select columns to export
export_df = df[[
    'original_text',
    'preprocessed',
    'sentiment_roberta',
    'sentiment_score_roberta',
    'source'
]].copy()

# Rename columns (same order as the selection above)
export_df.columns = ['text', 'cleaned_text', 'sentiment', 'sentiment_score', 'source']

# Export to CSV
output_file = DATA_PATH + 'CoreTax_Sentiment_Analysis_Results.csv'
export_df.to_csv(output_file, index=False, encoding='utf-8')

print(f"\n‚úì Results exported to: {output_file}")
print(f"Total rows exported: {len(export_df):,}")

# Download file (for Google Colab). The download is best-effort: outside
# Colab the import fails and the file simply stays where it was written.
# `except Exception` (instead of a bare `except:`) keeps KeyboardInterrupt
# and SystemExit from being swallowed.
try:
    from google.colab import files
    print(f"\nDownloading file...")
    files.download(output_file)
    print(f"‚úì File downloaded!")
except Exception:
    print(f"\n(File saved to Google Drive: {output_file})")

## 9️⃣ Summary Statistics

In [None]:
# Final textual summary of the whole analysis
banner_rule = "=" * 80
print(banner_rule)
print("SUMMARY STATISTICS")
print(banner_rule)

total_rows = len(df)

print(f"\nüìä OVERALL STATISTICS")
print(f"  Total data analyzed: {total_rows:,} rows")
print(f"  Data sources: {df['source'].nunique()}")
print(f"  Average text length (original): {df['original_length'].mean():.2f} characters")
print(f"  Average text length (preprocessed): {df['preprocessed_length'].mean():.2f} characters")

# Per class: row count, share of all rows, mean classifier score
print(f"\nüòä SENTIMENT DISTRIBUTION")
for sentiment in ['positive', 'neutral', 'negative']:
    class_rows = df[df['sentiment_roberta'] == sentiment]
    count = len(class_rows)
    percentage = (count / total_rows) * 100
    avg_score = class_rows['sentiment_score_roberta'].mean()
    print(f"  {sentiment.capitalize():8s}: {count:6,} ({percentage:5.2f}%) | Avg confidence: {avg_score:.4f}")

# Per source: row count and share of all rows
print(f"\nüì± DISTRIBUTION BY SOURCE")
for source in df['source'].unique():
    count = int((df['source'] == source).sum())
    percentage = (count / total_rows) * 100
    print(f"  {source:12s}: {count:6,} ({percentage:5.2f}%)")

# Cross table with row/column totals
print(f"\nüéØ SENTIMENT BY SOURCE")
print(pd.crosstab(df['source'], df['sentiment_roberta'], margins=True))

print(f"\n‚úì Analysis complete!")