In [8]:
import os
import pandas as pd
import numpy as np
import re
import json
import logging
import matplotlib.pyplot as plt
import seaborn as sns

# Attempt to import Sastrawi with error handling.
# On failure the two factory names are bound to None instead of aborting the
# import of this module; IndonesianSentimentAnalyzer.__init__ checks for None
# and raises ImportError at construction time.
try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
except ImportError:
    print("Sastrawi library not found. Please install it using: pip install Sastrawi")
    StemmerFactory = None
    StopWordRemoverFactory = None

# Setup logging.
# Records at INFO level and above are written to the named file (a relative
# path, so it is resolved against the current working directory) rather than
# to the console.
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s: %(message)s',
                    filename='indonesian_sentiment_analysis.log')

class IndonesianSentimentAnalyzer:
    """Lexicon-based sentiment scorer for Indonesian text.

    Text is lower-cased, reduced to the letters a-z, tokenized on whitespace,
    filtered against the Sastrawi stop-word list and stemmed with the Sastrawi
    stemmer. The resulting tokens are counted against a positive and a
    negative word set loaded from files under ``lexicon_path``.

    Booster and negation word lists are loaded into ``booster_words`` and
    ``negation_words`` but are not consulted by ``analyze_sentiment``.
    """

    # A score must be strictly above/below these bounds to leave 'Neutral'.
    POSITIVE_THRESHOLD = 0.1
    NEGATIVE_THRESHOLD = -0.1

    # Apostrophe-like marks are deleted so a word written with one stays whole.
    _APOSTROPHES = re.compile("['\u2019`]")
    # Anything else that is not a-z or whitespace separates words.
    _NON_LETTERS = re.compile(r'[^a-z\s]')

    def __init__(self, lexicon_path='leksikon'):
        """
        Initialize sentiment analyzer with Indonesian lexicon data and Sastrawi preprocessors

        Args:
            lexicon_path: Directory expected to contain the ``inset`` and
                ``sentistrength_id`` sub-folders read by ``load_lexicons``.

        Raises:
            ImportError: If the Sastrawi factories could not be imported.
        """
        # Check if Sastrawi is available
        if StemmerFactory is None or StopWordRemoverFactory is None:
            raise ImportError("Sastrawi library is required but not installed.")

        self.lexicon_path = lexicon_path
        self.positive_words = set()
        self.negative_words = set()
        self.booster_words = set()
        self.negation_words = set()

        # Initialize Sastrawi preprocessors
        stop_word_factory = StopWordRemoverFactory()
        self.stop_words = set(stop_word_factory.get_stop_words())

        stemmer_factory = StemmerFactory()
        self.stemmer = stemmer_factory.create_stemmer()

        self.load_lexicons()

    def load_lexicons(self):
        """
        Load various Indonesian lexicon files

        Files that do not exist are skipped. Any error raised while reading is
        logged and loading stops at that point (best effort, never raises).
        A warning is logged when no positive or negative entries were loaded,
        because every text is then classified as 'Neutral'.
        """
        inset_dir = os.path.join(self.lexicon_path, 'inset')
        senti_dir = os.path.join(self.lexicon_path, 'sentistrength_id')

        try:
            # Load positive words
            for name in ('positive.tsv', '_json_inset-pos.txt'):
                self.positive_words.update(
                    self._load_lexicon_path(os.path.join(inset_dir, name)))

            # Load negative words
            for name in ('negative.tsv', '_json_inset-neg.txt'):
                self.negative_words.update(
                    self._load_lexicon_path(os.path.join(inset_dir, name)))

            # Load booster and negation words (one entry per line)
            booster_file = os.path.join(senti_dir, 'boosterwords_id.txt')
            negation_file = os.path.join(senti_dir, 'negatingword.txt')

            if os.path.exists(booster_file):
                self.booster_words = self._read_word_list(booster_file)

            if os.path.exists(negation_file):
                self.negation_words = self._read_word_list(negation_file)

            logging.info(f"Loaded lexicons: {len(self.positive_words)} positive, {len(self.negative_words)} negative words")

        except Exception as e:
            # Deliberate best effort: a broken lexicon file must not prevent
            # the analyzer from being constructed.
            logging.error(f"Error loading lexicons: {e}")

        if not self.positive_words and not self.negative_words:
            logging.warning(
                f"No sentiment lexicon entries loaded from '{self.lexicon_path}'; "
                "every text will be scored as Neutral.")

    def _load_lexicon_path(self, path):
        """Return the words parsed from ``path``, or an empty set if it is missing."""
        if not os.path.exists(path):
            return set()
        with open(path, 'r', encoding='utf-8') as f:
            return self._read_lexicon_file(f)

    @staticmethod
    def _read_word_list(path):
        """Return the stripped, lower-cased non-blank lines of ``path`` as a set."""
        with open(path, 'r', encoding='utf-8') as f:
            return {line.strip().lower() for line in f if line.strip()}

    def _read_lexicon_file(self, file):
        """
        Read lexicon file with various formats

        Each line is stripped and lower-cased, then interpreted as:
          * tab-separated: the first field is the word;
          * containing ':': parsed as a JSON object (single quotes are
            replaced by double quotes first) whose 'word' key is taken; if
            that fails, the text before the first ':' is used;
          * otherwise: the whole line is the word.
        Empty results are dropped.

        Args:
            file: An iterable of text lines (e.g. an open text file).

        Returns:
            A set of lower-cased words.
        """
        words = set()
        for raw_line in file:
            line = raw_line.strip().lower()
            # Handle TSV and JSON-like formats
            if '\t' in line:
                word = line.split('\t')[0]
            elif ':' in line:
                try:
                    word = json.loads(line.replace("'", '"')).get('word', '').lower()
                except (ValueError, AttributeError, TypeError):
                    # Not valid JSON, not a JSON object, or a non-string
                    # 'word' value: fall back to "word: value" parsing.
                    word = line.split(':')[0]
            else:
                word = line

            if word:
                words.add(word)
        return words

    def preprocess_text(self, text):
        """
        Preprocess Indonesian text using Sastrawi

        Args:
            text: The raw text; anything that is not a ``str`` yields ``[]``.

        Returns:
            A list of stemmed tokens with stop words removed.
        """
        if not isinstance(text, str):
            return []

        # Convert to lowercase
        text = text.lower()

        # Drop apostrophes inside words, then turn every other special
        # character or digit into a space. Deleting them outright would fuse
        # neighbouring words ("cinta,bahagia" -> "cintabahagia").
        text = self._APOSTROPHES.sub('', text)
        text = self._NON_LETTERS.sub(' ', text)

        # Tokenize by splitting on whitespace
        words = text.split()

        # Remove stop words, then stem what is left.
        # NOTE(review): negators may be part of the stop-word list and so be
        # removed here - confirm against the Sastrawi list if negation
        # handling is added later.
        return [self.stemmer.stem(word)
                for word in words if word not in self.stop_words]

    def analyze_sentiment(self, text):
        """
        Perform sentiment analysis on Indonesian text

        Args:
            text: The text to score. Non-strings and blank strings are
                reported as 'Neutral' with all counters at zero.

        Returns:
            A dict with the keys 'sentiment' ('Positive', 'Negative' or
            'Neutral'), 'sentiment_score', 'positive_count', 'negative_count'
            and 'total_words' (token count after preprocessing).
        """
        if not isinstance(text, str) or text.strip() == '':
            return {
                'sentiment': 'Neutral',
                'sentiment_score': 0,
                'positive_count': 0,
                'negative_count': 0,
                'total_words': 0
            }

        # Preprocess text
        words = self.preprocess_text(text)

        # Count sentiments
        positive_count = sum(1 for word in words if word in self.positive_words)
        negative_count = sum(1 for word in words if word in self.negative_words)

        # Total words
        total_words = len(words)

        # The extra 1 in the denominator avoids division by zero when every
        # token was filtered out; it also slightly dampens every score.
        sentiment_score = (positive_count - negative_count) / (total_words + 1)

        # Categorize sentiment
        if sentiment_score > self.POSITIVE_THRESHOLD:
            sentiment = 'Positive'
        elif sentiment_score < self.NEGATIVE_THRESHOLD:
            sentiment = 'Negative'
        else:
            sentiment = 'Neutral'

        return {
            'sentiment': sentiment,
            'sentiment_score': sentiment_score,
            'positive_count': positive_count,
            'negative_count': negative_count,
            'total_words': total_words
        }


def process_lyrics_file(file_path, sentiment_analyzer):
    """
    Process lyrics file, analyzing entire lyrics and each line

    The file name is expected to look like ``<artist>-<title>.txt``; it is
    split at the last hyphen. Without a hyphen the artist is reported as
    "Unknown Artist" and the whole stem becomes the title.

    Args:
        file_path: Path of the lyrics text file (read as UTF-8; undecodable
            bytes are ignored).
        sentiment_analyzer: Object exposing ``analyze_sentiment(text)`` that
            returns a dict with the keys 'sentiment', 'sentiment_score',
            'positive_count', 'negative_count' and 'total_words'.

    Returns:
        A dict with 'line_df' (one row per non-blank line, numbered by its
        1-based position in the file) and 'overall_df' (a single row for the
        whole text), or ``None`` if anything went wrong (the error is logged).
    """
    line_columns = [
        'artist', 'title', 'line_number', 'lyrics_line', 'sentiment',
        'sentiment_score', 'positive_words', 'negative_words', 'total_words'
    ]

    try:
        # Extract artist and title from filename
        filename = os.path.basename(file_path)

        # Strip only the trailing extension; a blanket replace would also
        # remove '.txt' occurring in the middle of a title.
        suffix = '.txt'
        stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename

        parts = stem.rsplit('-', 1)
        if len(parts) < 2:
            artist = "Unknown Artist"
            title = stem
        else:
            artist, title = parts

        # Read lyrics once; the per-line view is derived from the same text.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lyrics_text = f.read()
        lyrics_lines = lyrics_text.split('\n')

        # Analyze entire lyrics
        overall_sentiment = sentiment_analyzer.analyze_sentiment(lyrics_text)

        # Analyze each line
        line_results = []
        for line_number, raw_line in enumerate(lyrics_lines, 1):
            line = raw_line.strip()
            if not line:
                continue
            line_sentiment = sentiment_analyzer.analyze_sentiment(line)
            line_results.append({
                'artist': artist,
                'title': title,
                'line_number': line_number,
                'lyrics_line': line,
                'sentiment': line_sentiment['sentiment'],
                'sentiment_score': line_sentiment['sentiment_score'],
                'positive_words': line_sentiment['positive_count'],
                'negative_words': line_sentiment['negative_count'],
                'total_words': line_sentiment['total_words']
            })

        # Passing the columns keeps the schema stable even with no rows.
        line_df = pd.DataFrame(line_results, columns=line_columns)

        # Overall lyrics sentiment
        overall_df = pd.DataFrame([{
            'artist': artist,
            'title': title,
            'overall_sentiment': overall_sentiment['sentiment'],
            'overall_sentiment_score': overall_sentiment['sentiment_score'],
            'total_positive_words': overall_sentiment['positive_count'],
            'total_negative_words': overall_sentiment['negative_count'],
            'total_words': overall_sentiment['total_words']
        }])

        return {
            'line_df': line_df,
            'overall_df': overall_df
        }

    except Exception as e:
        # Best effort per file: the caller skips files that return None.
        logging.error(f"Error processing {file_path}: {e}")
        return None

def perform_bulk_sentiment_analysis(input_folder='input', output_folder='output'):
    """
    Perform sentiment analysis on all text files in input folder

    For every ``.txt`` file in ``input_folder`` an Excel workbook (sheets
    'Line Results', 'Overall Sentiment' and 'Sentiment Summary') and a PDF of
    charts are written to ``output_folder``. A failure while exporting one
    song is logged and printed, and the remaining songs are still processed.

    Args:
        input_folder: Directory scanned for lyrics files; created if missing.
        output_folder: Directory receiving the exports; created when there is
            at least one result to write.
    """
    # Initialize sentiment analyzer
    sentiment_analyzer = IndonesianSentimentAnalyzer()

    # Find text files in the input folder
    os.makedirs(input_folder, exist_ok=True)  # Create input folder if it doesn't exist

    input_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    if not input_files:
        logging.warning("No text files found in the input folder.")
        return

    # Line-level and overall frames are stored together as one pair per song.
    # Keeping them in two separately filtered lists would let a song without
    # any non-blank line shift the pairing of every song after it.
    paired_results = []

    # Process each input file
    for input_file in input_files:
        full_path = os.path.join(input_folder, input_file)
        print(f"Processing file: {input_file}")

        # Process individual lyrics file
        result = process_lyrics_file(full_path, sentiment_analyzer)
        if result is None:
            continue

        line_df = result['line_df']
        overall_df = result['overall_df']
        if line_df.empty or overall_df.empty:
            logging.warning(f"No analyzable lyrics lines in {input_file}; skipping export.")
            continue

        paired_results.append((line_df, overall_df))

    if not paired_results:
        logging.warning("No results to export.")
        return

    # Create output directory
    os.makedirs(output_folder, exist_ok=True)

    # Process each lyric separately
    for i, (line_df, overall_df) in enumerate(paired_results, 1):
        # Create output file for each lyric
        try:
            artist = line_df['artist'].iloc[0]
            title = line_df['title'].iloc[0]
            output_file = os.path.join(output_folder, f'{artist}-{title}_sentiment_analysis.xlsx')

            # Export to Excel
            with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
                # Line-by-Line Results Sheet
                line_df.to_excel(writer, sheet_name='Line Results', index=False)

                # Overall Sentiment Sheet
                overall_df.to_excel(writer, sheet_name='Overall Sentiment', index=False)

                # Sentiment Summary: number and share of lines per label
                summary_df = line_df.groupby('sentiment').size().reset_index(name='Count')
                summary_df['Percentage'] = (summary_df['Count'] / len(line_df) * 100).round(2)
                summary_df.to_excel(writer, sheet_name='Sentiment Summary', index=False)

            # Create visualizations for each lyric
            create_visualizations(line_df, artist, title, output_folder)

            print(f"Analysis complete for {artist} - {title}. Results exported to {output_file}")

        except Exception as e:
            # One failing export must not abort the remaining songs.
            logging.error(f"Error processing dataframe {i}: {e}")
            print(f"Error processing dataframe {i}: {e}")

def create_visualizations(df, artist, title, output_folder):
    """
    Create comprehensive visualizations for each lyric

    Writes a two-page PDF named ``<artist>-<title>_sentiment_visualizations.pdf``
    into ``output_folder``: page one holds a pie chart of sentiment labels, a
    box plot of scores per label and a bar chart of positive vs negative word
    totals; page two holds a violin plot of scores per label.

    Args:
        df: Line-level results with the columns 'sentiment',
            'sentiment_score', 'positive_words' and 'negative_words'.
            An empty frame only logs a warning.
        artist: Artist name used in chart titles and the file name.
        title: Song title used in chart titles and the file name.
        output_folder: Existing directory the PDF is written to.
    """
    # Protect against empty DataFrame
    if df.empty:
        logging.warning(f"Cannot create visualizations for empty DataFrame: {artist} - {title}")
        return

    # Create a PDF with multiple visualizations
    from matplotlib.backends.backend_pdf import PdfPages

    # Output filename
    output_filename = os.path.join(output_folder, f'{artist}-{title}_sentiment_visualizations.pdf')

    with PdfPages(output_filename) as pdf:
        # Page 1: three panels side by side. Each figure is closed in a
        # ``finally`` so a plotting error does not leave it open while the
        # caller moves on to the next song.
        fig, (ax_pie, ax_box, ax_bar) = plt.subplots(1, 3, figsize=(15, 5))
        try:
            # 1. Overall Sentiment Distribution
            sentiment_counts = df['sentiment'].value_counts()
            ax_pie.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%')
            ax_pie.set_title(f'Sentiment Distribution\n{artist} - {title}')

            # 2. Sentiment Score Distribution
            sns.boxplot(x='sentiment', y='sentiment_score', data=df, ax=ax_box)
            ax_box.set_title('Sentiment Scores Distribution')

            # 3. Word Distribution
            ax_bar.bar(['Positive Words', 'Negative Words'],
                       [df['positive_words'].sum(), df['negative_words'].sum()])
            ax_bar.set_title('Positive vs Negative Words')

            fig.tight_layout()
            pdf.savefig(fig)
        finally:
            plt.close(fig)

        # Page 2: Sentiment Score Violin Plot
        fig, ax_violin = plt.subplots(figsize=(10, 6))
        try:
            sns.violinplot(x='sentiment', y='sentiment_score', data=df, ax=ax_violin)
            ax_violin.set_title(f'Sentiment Score Distribution\n{artist} - {title}')
            pdf.savefig(fig)
        finally:
            plt.close(fig)

def main():
    """Script entry point: run the bulk analysis with its default settings."""
    # Perform bulk sentiment analysis
    perform_bulk_sentiment_analysis()

# Only run the analysis when this module is executed as a script.
if __name__ == '__main__':
    main()

Processing file: bartjhe-satudalamcinta.txt
Processing file: Febri Putri-Runtuh.txt
Processing file: Jrock-Ceria.txt
Processing file: lastchild-duka.txt
Processing file: Project Pop-Ingatlah Hari Ini.txt
Analysis complete for bartjhe - satudalamcinta. Results exported to output\bartjhe-satudalamcinta_sentiment_analysis.xlsx
Analysis complete for Febri Putri - Runtuh. Results exported to output\Febri Putri-Runtuh_sentiment_analysis.xlsx
Analysis complete for Jrock - Ceria. Results exported to output\Jrock-Ceria_sentiment_analysis.xlsx
Analysis complete for lastchild - duka. Results exported to output\lastchild-duka_sentiment_analysis.xlsx
Analysis complete for Project Pop - Ingatlah Hari Ini. Results exported to output\Project Pop-Ingatlah Hari Ini_sentiment_analysis.xlsx
