# Advanced Sentiment Analysis for Social Media Comments

This notebook performs comprehensive sentiment analysis with advanced preprocessing:
- Emoji to text conversion
- Slang normalization
- Stopword removal
- Duplicate detection and removal
- Word cloud generation
- Sentiment distribution visualization

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import re
import os
import json
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import pickle

# Download required NLTK data (only when it is not already installed locally).
# 'punkt_tab' is the tokenizer resource that recent NLTK releases load for
# word_tokenize; 'punkt' is kept so older NLTK installs keep working.
NLTK_RESOURCES = [
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
]

for resource_path, package_name in NLTK_RESOURCES:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)

# Set style for matplotlib
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Configure pandas display: show every column, and truncate long comment text
# at 100 characters so printed frames stay readable.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 100)

## 1. Advanced Preprocessing Functions

In [None]:
# Indonesian slang dictionary: informal spelling -> normalized word.
# Every key appears exactly once. The original literal listed 'bgt' twice and
# 'tp' twice ('tapi', then 'tetapi'); Python keeps the last value, so the
# effective mapping 'tp' -> 'tetapi' is what is retained here.
# A few entries map a word to itself ('nanti', 'besok'); they are kept so the
# exported configuration stays identical.
SLANG_DICT = {
    'gw': 'saya', 'gue': 'saya', 'w': 'saya', 'aku': 'saya',
    'lu': 'kamu', 'lo': 'kamu', 'u': 'kamu', 'km': 'kamu',
    'bgt': 'banget', 'bgtt': 'banget',
    'yg': 'yang', 'yng': 'yang', 'yank': 'yang',
    'dgn': 'dengan', 'dg': 'dengan', 'sm': 'sama',
    'tp': 'tetapi', 'tpi': 'tapi',
    'ga': 'tidak', 'gak': 'tidak', 'g': 'tidak', 'tdk': 'tidak',
    'udh': 'sudah', 'udah': 'sudah', 'dah': 'sudah',
    'blm': 'belum', 'blom': 'belum',
    'krn': 'karena', 'krna': 'karena', 'soalnya': 'karena',
    'jd': 'jadi', 'jdi': 'jadi', 'jadinya': 'jadi',
    'kl': 'kalau', 'klo': 'kalau', 'kalo': 'kalau',
    'gmn': 'gimana', 'gmna': 'gimana', 'bagaimana': 'gimana',
    'knp': 'kenapa', 'knpa': 'kenapa', 'mengapa': 'kenapa',
    'emg': 'memang', 'emang': 'memang', 'mmg': 'memang',
    'bkn': 'bukan', 'bukn': 'bukan',
    'hrs': 'harus', 'hrus': 'harus', 'mesti': 'harus',
    'bs': 'bisa', 'bsa': 'bisa', 'dapat': 'bisa',
    'org': 'orang', 'orng': 'orang', 'people': 'orang',
    'skrg': 'sekarang', 'skrang': 'sekarang', 'now': 'sekarang',
    'ntr': 'nanti', 'nanti': 'nanti', 'later': 'nanti',
    'kmrn': 'kemarin', 'kemaren': 'kemarin', 'yesterday': 'kemarin',
    'bsok': 'besok', 'besok': 'besok', 'tomorrow': 'besok',
    'mantap': 'bagus', 'mantul': 'bagus', 'keren': 'bagus',
    'jelek': 'buruk', 'ancur': 'buruk', 'parah': 'buruk',
    'wkwk': 'haha', 'wkwkwk': 'haha', 'hehe': 'haha',
    'anjay': 'wow', 'anjir': 'wow', 'astaga': 'wow',
    'bro': 'saudara', 'sis': 'saudara', 'guys': 'teman'
}

# Emoji -> Indonesian word lookup (common emojis only).
# Declared as (meaning, emojis) groups; the groups are listed in the order the
# keys should appear in the resulting dictionary.
EMOJI_DICT = {
    symbol: meaning
    for meaning, symbols in (
        ('senang', ['😀', '😃', '😄', '😁']),
        ('tertawa', ['😆', '😅', '🤣', '😂']),
        ('senyum', ['🙂', '😊', '😇']),
        ('cinta', ['😍', '🥰', '😘', '💕', '❤️']),
        ('sedih', ['😢', '😭', '😿', '💔']),
        ('marah', ['😠', '😡', '🤬', '💢']),
        ('kaget', ['😱']),
        ('takut', ['😨']),
        ('cemas', ['😰']),
        ('bingung', ['🤔', '😕']),
        ('biasa', ['😐']),
        ('bagus', ['👍']),
        ('jelek', ['👎']),
        ('tepuk tangan', ['👏']),
        ('keren', ['🔥']),
        ('sempurna', ['💯']),
        ('bagus', ['✨']),
        ('terima kasih', ['🙏']),
        ('kuat', ['💪']),
        ('oke', ['👌']),
    )
    for symbol in symbols
}

def convert_emojis_to_text(text):
    """Replace every emoji listed in EMOJI_DICT with its word equivalent.

    Args:
        text: Raw comment. Missing values (None/NaN) are accepted.

    Returns:
        The text as a string with each known emoji swapped for its meaning,
        padded with one space on both sides; '' for missing input.
    """
    if pd.isna(text):
        return ''

    converted = str(text)
    for symbol in EMOJI_DICT:
        # Surrounding spaces keep the inserted word from fusing with neighbours.
        converted = converted.replace(symbol, ' ' + EMOJI_DICT[symbol] + ' ')
    return converted

def normalize_slang(text, slang_dict=None):
    """Lower-case the text and replace known slang words with their normal form.

    Words are split on whitespace. Punctuation attached to a word is ignored
    for the dictionary lookup but kept in the output, so "bgt!" becomes
    "banget!" rather than losing the exclamation mark.

    Args:
        text: Comment text. Missing values (None/NaN) are accepted.
        slang_dict: Optional mapping of slang word -> replacement. Defaults to
            the notebook-level SLANG_DICT.

    Returns:
        The normalized, lower-cased text with words joined by single spaces;
        '' for missing input.
    """
    if pd.isna(text):
        return ''

    if slang_dict is None:
        slang_dict = SLANG_DICT

    normalized_words = []
    for word in str(text).lower().split():
        # Look the word up without its leading/trailing punctuation.
        core = word.strip(string.punctuation)
        if core in slang_dict:
            # Re-attach whatever punctuation surrounded the slang word.
            lead = len(word) - len(word.lstrip(string.punctuation))
            word = word[:lead] + slang_dict[core] + word[lead + len(core):]
        normalized_words.append(word)

    return ' '.join(normalized_words)

# Indonesian stopwords added on top of whichever NLTK list gets loaded.
_EXTRA_INDONESIAN_STOPWORDS = frozenset({
    'dan', 'atau', 'tetapi', 'namun', 'karena', 'sebab', 'oleh', 'untuk',
    'dari', 'ke', 'di', 'pada', 'dalam', 'dengan', 'tanpa', 'seperti',
    'ini', 'itu', 'tersebut', 'yang', 'adalah', 'akan', 'telah', 'sudah',
    'belum', 'masih', 'juga', 'hanya', 'saja', 'pun', 'lah', 'kah'
})

# Stopword sets already built, keyed by language, so the list is loaded once
# instead of once per comment.
_STOPWORD_CACHE = {}

_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _get_stop_words(language):
    """Return the stopword set for `language`, building and caching it on first use."""
    if language not in _STOPWORD_CACHE:
        try:
            words = set(stopwords.words(language))
        except (LookupError, OSError):
            # Requested language list is not available: fall back to English.
            words = set(stopwords.words('english'))
        words.update(_EXTRA_INDONESIAN_STOPWORDS)
        _STOPWORD_CACHE[language] = frozenset(words)
    return _STOPWORD_CACHE[language]


def remove_stopwords(text, language='indonesian'):
    """Tokenize the text and drop stopwords and punctuation-only tokens.

    Args:
        text: Comment text. Missing values (None/NaN) are accepted.
        language: Name of the NLTK stopword list to use. If that list cannot
            be loaded, the English list is used instead. A fixed set of extra
            Indonesian stopwords is always added.

    Returns:
        The remaining lower-cased tokens joined by single spaces; '' for
        missing input.
    """
    if pd.isna(text):
        return ''

    stop_words = _get_stop_words(language)
    tokens = word_tokenize(str(text).lower())

    # A token is punctuation when every character is punctuation; this also
    # catches multi-character tokens such as '...'.
    filtered_words = [
        token for token in tokens
        if token not in stop_words and not _PUNCTUATION_CHARS.issuperset(token)
    ]

    return ' '.join(filtered_words)

def advanced_clean_text(text):
    """Run the full cleaning pipeline on one comment.

    Steps, in order: emoji -> word conversion, removal of URLs, mentions,
    hashtags, digits and symbols other than . , ! ?, slang normalization
    (which also lower-cases), and whitespace collapsing.

    Args:
        text: Raw comment. Missing values and '' are accepted.

    Returns:
        The cleaned text, or '' when the input is missing or empty.
    """
    if pd.isna(text) or text == '':
        return ''

    # (pattern, flags) pairs applied in sequence; every match is deleted.
    removal_steps = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'@\w+|#\w+', 0),                           # mentions and hashtags
        (r'\d+', 0),                                 # numbers
        (r'[^\w\s.,!?]', 0),                         # symbols except . , ! ?
    )

    cleaned = convert_emojis_to_text(str(text))
    for pattern, flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    cleaned = normalize_slang(cleaned)

    # Collapse runs of whitespace left behind by the removals.
    return re.sub(r'\s+', ' ', cleaned).strip()

print("Preprocessing functions loaded successfully!")

## 2. Data Loading and Duplicate Removal

In [None]:
def _load_platform_comments(path, platform, text_col, post_col, user_col):
    """Read one platform's CSV and add the shared platform/text/post_id/user_id columns.

    Returns None (after printing a notice) when the file does not exist.
    """
    if not os.path.exists(path):
        print(f"{platform} data not found")
        return None

    frame = pd.read_csv(path)
    frame['platform'] = platform
    # A source column that is absent is filled with '' for every row.
    frame['text'] = frame.get(text_col, '')
    frame['post_id'] = frame.get(post_col, '')
    frame['user_id'] = frame.get(user_col, '')
    print(f"Loaded {len(frame)} {platform} comments")
    return frame


def _has_value(series):
    """Boolean mask: True where the value is neither missing nor a blank string."""
    return series.notna() & (series.astype(str).str.strip() != '')


def load_and_clean_data(tiktok_path='../data/tiktok/all_tiktok_comments.csv',
                        instagram_path='../data/instagram/all_comments.csv'):
    """Load comments from both platforms and remove empty and duplicate rows.

    Cleaning steps, in order:
      1. Drop rows whose text is missing or blank.
      2. Keep only the first comment per (platform, user, post). Rows whose
         user_id or post_id is missing/blank are never treated as duplicates
         of each other, because their author cannot be identified.
      3. Keep only the first occurrence of each exact text.

    Progress and counts are printed along the way.

    Args:
        tiktok_path: CSV with TikTok comments (columns read: 'comment',
            'aweme_id', 'username').
        instagram_path: CSV with Instagram comments (columns read:
            'comment_text', 'post_id', 'username').

    Returns:
        A DataFrame with the original columns plus 'platform', 'text',
        'post_id' and 'user_id'. An empty DataFrame when neither file exists.
    """
    sources = [
        (tiktok_path, 'TikTok', 'comment', 'aweme_id', 'username'),
        (instagram_path, 'Instagram', 'comment_text', 'post_id', 'username'),
    ]

    data_frames = []
    for source in sources:
        frame = _load_platform_comments(*source)
        if frame is not None:
            data_frames.append(frame)

    if not data_frames:
        print("No data found. Please run the scrapers first.")
        return pd.DataFrame()

    # Combine dataframes
    combined_df = pd.concat(data_frames, ignore_index=True)
    print(f"\nTotal comments before cleaning: {len(combined_df)}")

    # Remove empty comments. Text is cast to str so purely numeric comments
    # do not break the string operations used here and downstream.
    combined_df = combined_df[combined_df['text'].notna()].copy()
    combined_df['text'] = combined_df['text'].astype(str)
    combined_df = combined_df[combined_df['text'].str.strip() != '']
    print(f"After removing empty comments: {len(combined_df)}")

    # Remove duplicates based on username and post_id (potential spam).
    # Only rows with a usable user_id AND post_id qualify; without this guard
    # every comment lacking a username would collapse into a single row.
    print(f"\nDuplicate detection:")
    has_ids = _has_value(combined_df['user_id']) & _has_value(combined_df['post_id'])
    duplicates = has_ids & combined_df.duplicated(
        subset=['platform', 'user_id', 'post_id'], keep='first'
    )
    print(f"Found {duplicates.sum()} potential spam comments (same user, same post)")

    # Show some examples of duplicates before removing
    if duplicates.sum() > 0:
        print("\nExamples of potential spam:")
        spam_examples = combined_df[duplicates][['user_id', 'post_id', 'text', 'platform']].head()
        print(spam_examples)

    # Remove duplicates
    combined_df = combined_df[~duplicates]
    print(f"\nAfter removing duplicates: {len(combined_df)}")

    # Also remove exact text duplicates (copy-paste spam)
    text_duplicates = combined_df.duplicated(subset=['text'], keep='first')
    print(f"Found {text_duplicates.sum()} exact text duplicates")
    combined_df = combined_df[~text_duplicates]
    print(f"After removing text duplicates: {len(combined_df)}")

    # Return an independent frame so callers can add columns without
    # triggering pandas' SettingWithCopyWarning.
    return combined_df.copy()

# Build the working dataset used by every later cell
df = load_and_clean_data()

if not df.empty:
    # One line per fact; printed in the order listed.
    dataset_summary = (
        "\nFinal dataset:",
        f"Total comments: {len(df)}",
        f"Platforms: {df['platform'].value_counts().to_dict()}",
        f"Unique users: {df['user_id'].nunique()}",
        f"Unique posts: {df['post_id'].nunique()}",
    )
    print(*dataset_summary, sep="\n")

## 3. Advanced Text Preprocessing

In [None]:
if not df.empty:
    print("Applying advanced preprocessing...")

    # Apply advanced cleaning. assign() builds a new frame instead of writing
    # into what may be a filtered view of the loaded data.
    df = df.assign(cleaned_text=df['text'].apply(advanced_clean_text))

    # Remove comments that became empty after cleaning. The explicit copy
    # keeps the column assignment below from raising SettingWithCopyWarning.
    df = df[df['cleaned_text'].str.len() > 0].copy()
    print(f"After advanced cleaning: {len(df)} comments")

    # Show examples of preprocessing (first 10 rows, truncated to 100 chars)
    print("\nPreprocessing examples:")
    examples = df[['text', 'cleaned_text']].head(10)
    for _, row in examples.iterrows():
        print(f"Original: {row['text'][:100]}...")
        print(f"Cleaned:  {row['cleaned_text'][:100]}...")
        print("-" * 50)

    # Apply stopword removal (optional - both versions are kept side by side)
    df['cleaned_no_stopwords'] = df['cleaned_text'].apply(remove_stopwords)

    # Character-length statistics for each processing stage
    print(f"\nText length statistics:")
    length_report = [
        ('Original text', 'text'),
        ('Cleaned text', 'cleaned_text'),
        ('No stopwords', 'cleaned_no_stopwords'),
    ]
    for label, column in length_report:
        lengths = df[column].str.len()
        print(f"{label} - Mean: {lengths.mean():.1f}, Median: {lengths.median():.1f}")

## 4. Sentiment Analysis with Preprocessing

In [None]:
def analyze_sentiment_advanced(text):
    """Score a comment with TextBlob and bucket its polarity into five classes.

    Classes by polarity: > 0.3 'very_positive', > 0.1 'positive',
    > -0.1 'neutral', > -0.3 'negative', otherwise 'very_negative'.

    NOTE(review): TextBlob's default analyzer appears to rely on an English
    lexicon, so Indonesian text may mostly score 0 - confirm this is
    acceptable for the dataset.

    Args:
        text: Comment text. Missing values (None/NaN), other non-string
            values, and blank strings are treated as having no sentiment.

    Returns:
        Tuple (polarity, subjectivity, sentiment_label). Input without usable
        text yields (0, 0, 'neutral').
    """
    # Guard non-text input the same way the preprocessing helpers guard
    # missing values; calling .strip() on NaN would raise AttributeError.
    if not isinstance(text, str) or not text.strip():
        return 0, 0, 'neutral'

    scores = TextBlob(text).sentiment
    polarity = scores.polarity
    subjectivity = scores.subjectivity

    # Lower bounds checked from most to least positive; first match wins.
    polarity_classes = (
        (0.3, 'very_positive'),
        (0.1, 'positive'),
        (-0.1, 'neutral'),
        (-0.3, 'negative'),
    )
    for lower_bound, label in polarity_classes:
        if polarity > lower_bound:
            return polarity, subjectivity, label

    return polarity, subjectivity, 'very_negative'

if not df.empty:
    print("Performing advanced sentiment analysis...")

    # Score the cleaned text; each result tuple is spread over three columns.
    cleaned_score_columns = ['polarity', 'subjectivity', 'sentiment']
    sentiment_results = df['cleaned_text'].apply(analyze_sentiment_advanced)
    df[cleaned_score_columns] = pd.DataFrame(sentiment_results.tolist(), index=df.index)

    # Score the untouched text as well, as a baseline for comparison.
    original_score_columns = ['original_polarity', 'original_subjectivity', 'original_sentiment']
    original_sentiment = df['text'].apply(analyze_sentiment_advanced)
    df[original_score_columns] = pd.DataFrame(original_sentiment.tolist(), index=df.index)

    print("Sentiment analysis completed!")

    # Class counts for both variants
    for variant, label_column in (('cleaned text', 'sentiment'),
                                  ('original text', 'original_sentiment')):
        print(f"\nSentiment distribution ({variant}):")
        print(df[label_column].value_counts())

    # Mean polarity for both variants
    print("\nComparison of preprocessing impact:")
    for variant, polarity_column in (('Cleaned', 'polarity'),
                                     ('Original', 'original_polarity')):
        print(f"{variant} - Average polarity: {df[polarity_column].mean():.3f}")

    # Comments whose class differs between original and cleaned text
    sentiment_changed = df[df['sentiment'].ne(df['original_sentiment'])]
    changed_count = len(sentiment_changed)
    if changed_count > 0:
        print(f"\nPreprocessing changed sentiment for {changed_count} comments:")
        for _, changed_row in sentiment_changed.head(5).iterrows():
            print(f"Text: {changed_row['text'][:80]}...")
            print(f"Original: {changed_row['original_sentiment']} -> Cleaned: {changed_row['sentiment']}")
            print("-" * 50)

## 5. Enhanced Visualizations

In [None]:
if not df.empty:
    # Sentiment classes in ordinal order, so bars read from most negative to
    # most positive instead of alphabetically.
    sentiment_order = ['very_negative', 'negative', 'neutral', 'positive', 'very_positive']

    # Create comprehensive visualization: 3 x 2 grid of panels
    fig, axes = plt.subplots(3, 2, figsize=(16, 18))

    # 1. Sentiment distribution comparison
    sentiment_comparison = pd.DataFrame({
        'Original': df['original_sentiment'].value_counts(),
        'Cleaned': df['sentiment'].value_counts()
    }).reindex(sentiment_order).fillna(0)

    sentiment_comparison.plot(kind='bar', ax=axes[0, 0], rot=45)
    axes[0, 0].set_title('Sentiment Distribution: Original vs Cleaned Text')
    axes[0, 0].set_xlabel('Sentiment')
    axes[0, 0].set_ylabel('Number of Comments')
    axes[0, 0].legend()

    # 2. Platform-wise sentiment
    platform_sentiment = pd.crosstab(df['platform'], df['sentiment'])
    platform_sentiment = platform_sentiment.reindex(columns=sentiment_order, fill_value=0)
    platform_sentiment.plot(kind='bar', ax=axes[0, 1], rot=45)
    axes[0, 1].set_title('Sentiment Distribution by Platform')
    axes[0, 1].set_xlabel('Platform')
    axes[0, 1].set_ylabel('Number of Comments')
    axes[0, 1].legend(title='Sentiment')

    # 3. Polarity distribution
    axes[1, 0].hist([df['original_polarity'], df['polarity']], bins=30, alpha=0.7,
                   label=['Original', 'Cleaned'], color=['red', 'blue'])
    axes[1, 0].set_title('Polarity Distribution Comparison')
    axes[1, 0].set_xlabel('Polarity Score')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].legend()

    # 4. Subjectivity distribution
    axes[1, 1].hist([df['original_subjectivity'], df['subjectivity']], bins=30, alpha=0.7,
                   label=['Original', 'Cleaned'], color=['orange', 'green'])
    axes[1, 1].set_title('Subjectivity Distribution Comparison')
    axes[1, 1].set_xlabel('Subjectivity Score')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].legend()

    # 5. Text length distribution
    text_lengths = pd.DataFrame({
        'Original': df['text'].str.len(),
        'Cleaned': df['cleaned_text'].str.len(),
        'No Stopwords': df['cleaned_no_stopwords'].str.len()
    })

    text_lengths.boxplot(ax=axes[2, 0])
    axes[2, 0].set_title('Text Length Distribution by Processing Stage')
    axes[2, 0].set_ylabel('Character Count')

    # 6. Sentiment by text length.
    # Bins are right-inclusive character counts of the cleaned text:
    # (0, 20], (20, 50], (50, 100], (100, inf). The category is stored on df,
    # so it is also part of whatever is exported afterwards.
    df['text_length_category'] = pd.cut(df['cleaned_text'].str.len(),
                                       bins=[0, 20, 50, 100, float('inf')],
                                       labels=['Very Short', 'Short', 'Medium', 'Long'])

    length_sentiment = pd.crosstab(df['text_length_category'], df['sentiment'])
    length_sentiment = length_sentiment.reindex(columns=sentiment_order, fill_value=0)
    length_sentiment.plot(kind='bar', ax=axes[2, 1], rot=45)
    axes[2, 1].set_title('Sentiment by Text Length Category')
    axes[2, 1].set_xlabel('Text Length Category')
    axes[2, 1].set_ylabel('Number of Comments')
    axes[2, 1].legend(title='Sentiment')

    plt.tight_layout()
    plt.show()

## 6. Save Preprocessed Data and Model

In [None]:
if not df.empty:
    # Save the processed dataset
    output_path = '../data/processed_sentiment_data.csv'
    # Make sure the target directory exists before any file is written.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Processed data saved to: {output_path}")

    # Save the preprocessing lookup tables for the Flask app
    preprocessing_functions = {
        'slang_dict': SLANG_DICT,
        'emoji_dict': EMOJI_DICT
    }

    # ensure_ascii=False keeps the emoji keys readable in the JSON file,
    # which is why the file must be opened as UTF-8.
    with open('../data/preprocessing_config.json', 'w', encoding='utf-8') as f:
        json.dump(preprocessing_functions, f, ensure_ascii=False, indent=2)

    print("Preprocessing configuration saved for Flask app")

    # Create summary for Flask app
    sentiment_changes = int(df['sentiment'].ne(df['original_sentiment']).sum())
    summary_stats = {
        'total_comments': len(df),
        'platform_distribution': df['platform'].value_counts().to_dict(),
        'sentiment_distribution': df['sentiment'].value_counts().to_dict(),
        'average_polarity': float(df['polarity'].mean()),
        'average_subjectivity': float(df['subjectivity'].mean()),
        'preprocessing_impact': {
            'original_avg_polarity': float(df['original_polarity'].mean()),
            'cleaned_avg_polarity': float(df['polarity'].mean()),
            'sentiment_changes': sentiment_changes
        }
    }

    # Written as UTF-8 like the config file above, rather than with the
    # platform's default encoding.
    with open('../data/model_summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary_stats, f, indent=2)

    print("Model summary saved for Flask app")
    print("\nAdvanced preprocessing and analysis complete!")