In [8]:
import os
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import spacy
import logging
from nltk.sentiment import SentimentIntensityAnalyzer
import xlsxwriter

# Setup logging
# Log records are written to a file in the working directory rather than to
# the notebook output.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s',
                    filename='sentiment_analysis.log')

# Download necessary NLTK resources.
# nltk.download() normally reports a failed download by returning False
# instead of raising, so the return value is checked as well as exceptions.
# Each resource is handled on its own so one failure does not skip the other.
for resource in ('punkt', 'vader_lexicon'):
    try:
        if not nltk.download(resource, quiet=True):
            logging.error(f"Error downloading NLTK resources: download of '{resource}' failed")
    except Exception as e:
        logging.error(f"Error downloading NLTK resources: {e}")

# Load spaCy English model.
# `nlp` is left as None when the model cannot be loaded; code that uses it
# must check for that.
try:
    nlp = spacy.load('en_core_web_sm')
except Exception as e:
    logging.error(f"Error loading spaCy model: {e}")
    print("Please install spaCy English model with: python -m spacy download en_core_web_sm")
    nlp = None

# Ensure input and output folders exist
os.makedirs('input', exist_ok=True)
os.makedirs('output', exist_ok=True)


In [2]:
# Load data from CSV
def load_lyrics_data(input_file):
    """
    Load lyrics data from a CSV file, trying several text encodings.

    Parameters:
        input_file: path (or anything pandas.read_csv accepts) of the CSV.

    Returns:
        The loaded DataFrame, or None when the file could not be read.
        Failures are reported with print() rather than raised.
    """
    # 'iso-8859-1' is an alias of 'latin-1' in Python, so listing both only
    # repeated the same attempt; 'latin-1' maps every byte and is the fallback.
    encodings = ['utf-8', 'latin-1']

    for encoding in encodings:
        try:
            df = pd.read_csv(input_file, encoding=encoding)
        except UnicodeError as e:
            # Only a decoding problem justifies retrying with another encoding.
            print(f"Failed to read with {encoding} encoding: {e}")
            continue
        except Exception as e:
            # Missing file, parse error, empty file, ...: another encoding
            # cannot help, so report the real cause once and stop.
            print(f"Error reading file: {e}")
            return None

        print(f"Successfully read file using {encoding} encoding")
        return df

    print("Error reading file: Could not read the file with any of the attempted encodings")
    return None

In [3]:
_vader_analyzer = None  # shared SentimentIntensityAnalyzer, created on first use


def _get_vader_analyzer():
    """Return the shared VADER analyzer, building it the first time it is needed."""
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    return _vader_analyzer


def advanced_sentiment_analysis(text):
    """
    Score the sentiment of a text by combining several analyzers.

    Parameters:
        text: the text to analyse. Non-string or blank input is treated as
            empty.

    Returns:
        A dict with the keys:
            'sentiment'       - 'Positive', 'Negative' or 'Neutral'
            'sentiment_score' - mean of the individual scores
            'positive_prob'   - the score clipped below at 0
            'negative_prob'   - magnitude of the score when it is negative
            'debug_info'      - per-analyzer details, or 'Empty text'
        Analyzer failures are logged as warnings and that analyzer is skipped.
    """
    if not isinstance(text, str) or text.strip() == '':
        return {
            'sentiment': 'Neutral',
            'sentiment_score': 0,
            'positive_prob': 0,
            'negative_prob': 0,
            'debug_info': 'Empty text'
        }

    # Clean text
    # NOTE(review): advanced_clean_text is not defined in the cells visible
    # here; it must already exist in the kernel - confirm where it comes from.
    cleaned_text = advanced_clean_text(text)

    # Scores from the text-based analyzers that ran successfully
    results = []
    debug_info = {}

    # 1. TextBlob polarity
    try:
        text_blob_score = TextBlob(cleaned_text).sentiment.polarity
        results.append(text_blob_score)
        debug_info['textblob_score'] = text_blob_score
    except Exception as e:
        logging.warning(f"TextBlob sentiment error: {e}")

    # 2. NLTK VADER compound score.
    # The analyzer is shared across calls so its lexicon is not reloaded for
    # every text.
    try:
        vader_scores = _get_vader_analyzer().polarity_scores(cleaned_text)
        results.append(vader_scores['compound'])
        debug_info['vader_scores'] = vader_scores
    except Exception as e:
        logging.warning(f"VADER sentiment error: {e}")

    # 3. spaCy entity heuristic (only if the model was loaded): 0.1 per
    # entity labelled ORG or PERSON. It stays 0 when spaCy is unavailable or
    # fails, and is always included in the mean below.
    # NOTE(review): entity counts are not an obvious sentiment signal and can
    # only push the score upwards - confirm this term is intended.
    spacy_score = 0
    if nlp:
        try:
            doc = nlp(cleaned_text)
            entity_count = sum(1 for ent in doc.ents if ent.label_ in ('ORG', 'PERSON'))
            spacy_score = entity_count * 0.1
            debug_info['spacy_entities'] = [ent.text for ent in doc.ents]
        except Exception as e:
            logging.warning(f"SpaCy analysis error: {e}")

    # Plain (unweighted) mean of the available scores plus the spaCy term;
    # 0 when neither TextBlob nor VADER produced a score.
    combined_score = np.mean(results + [spacy_score]) if results else 0

    # Categorise with a +/-0.2 dead band around zero
    if combined_score > 0.2:
        sentiment = 'Positive'
    elif combined_score < -0.2:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    return {
        'sentiment': sentiment,
        'sentiment_score': combined_score,
        'positive_prob': max(combined_score, 0),
        'negative_prob': abs(min(combined_score, 0)),
        'debug_info': debug_info
    }


In [4]:
def process_lyrics_file(file_path):
    """
    Analyse one lyrics file named '<artist>-<title>.txt'.

    Parameters:
        file_path: path of the lyrics text file.

    Returns:
        (result_df, debug_info): a one-row DataFrame with the columns
        Artist, Title, Lyrics, Sentiment, Sentiment Score, Positive
        Probability and Negative Probability, plus the analyzer debug info.
        Returns (None, None) when anything fails; the error is logged.
    """
    try:
        # Extract artist and title from filename
        filename = os.path.basename(file_path)
        # Drop only the real extension; str.replace('.txt', '') would also
        # remove '.txt' occurring inside the title.
        stem, _extension = os.path.splitext(filename)

        if '-' in stem:
            # Split on the last hyphen and trim padding, so that
            # 'Artist - Song.txt' yields 'Artist' / 'Song'.
            artist, title = (part.strip() for part in stem.rsplit('-', 1))
        else:
            # No separator: keep the file instead of dropping it on an
            # unpacking error.
            artist, title = 'Unknown', stem.strip()

        # Read lyrics; undecodable bytes are skipped rather than failing
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lyrics = f.read()

        # Perform sentiment analysis
        sentiment_result = advanced_sentiment_analysis(lyrics)

        # Create a DataFrame with the results
        result_df = pd.DataFrame({
            'Artist': [artist],
            'Title': [title],
            'Lyrics': [lyrics],
            'Sentiment': [sentiment_result['sentiment']],
            'Sentiment Score': [sentiment_result['sentiment_score']],
            'Positive Probability': [sentiment_result['positive_prob']],
            'Negative Probability': [sentiment_result['negative_prob']]
        })

        logging.info(f"Processed {filename}: {sentiment_result['sentiment']} sentiment")

        return result_df, sentiment_result['debug_info']

    except Exception as e:
        logging.error(f"Error processing {file_path}: {e}")
        return None, None


In [5]:
def _unique_sheet_name(raw_name, used_names):
    """Return an Excel-safe sheet name (<= 31 chars) not yet in used_names, and record it."""
    # Excel rejects [ ] : * ? / \ in sheet names and names that start or end
    # with an apostrophe.
    cleaned = re.sub(r'[\[\]:*?/\\]', '_', str(raw_name)).strip("'") or 'Sheet'
    candidate = cleaned[:31].rstrip("'")
    counter = 1
    # Sheet names are compared case-insensitively, hence the lower-casing.
    while candidate.lower() in used_names:
        counter += 1
        tag = f"_{counter}"
        candidate = cleaned[:31 - len(tag)] + tag
    used_names.add(candidate.lower())
    return candidate


def perform_bulk_sentiment_analysis(input_folder='input', output_folder='output'):
    """
    Perform sentiment analysis on all text files in the input folder.

    Parameters:
        input_folder: folder scanned for '*.txt' lyrics files.
        output_folder: folder receiving 'lyrics_sentiment_analysis.xlsx' and
            'debug_logs.txt' (created if missing). The charts are produced by
            create_visualizations(), which decides its own output location.

    Returns:
        None. When no text files are found, or none could be processed, a
        warning is logged and nothing is written.
    """
    # Find text files in the input folder (sorted for a reproducible order)
    input_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.txt'))

    if not input_files:
        logging.warning("No text files found in the input folder.")
        return

    # Prepare containers for results
    all_results = []
    debug_logs = {}

    # Process each input file; files that fail are skipped
    for input_file in input_files:
        full_path = os.path.join(input_folder, input_file)
        print(f"Processing file: {input_file}")

        result_df, debug_info = process_lyrics_file(full_path)

        if result_df is not None:
            all_results.append(result_df)
            debug_logs[input_file] = debug_info

    if not all_results:
        logging.warning("No results to export.")
        return

    # Combine all results
    final_df = pd.concat(all_results, ignore_index=True)

    # Export to Excel with multiple sheets
    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, 'lyrics_sentiment_analysis.xlsx')

    # Reserve the fixed sheet names so a song sheet cannot collide with them
    used_sheet_names = {'full results', 'sentiment summary'}

    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        # Full Results Sheet
        final_df.to_excel(writer, sheet_name='Full Results', index=False)

        # Individual Sheets for each song
        for position in range(len(final_df)):
            song = final_df.iloc[[position]]
            raw_name = f"{song['Artist'].iloc[0]}_{song['Title'].iloc[0]}"
            sheet_name = _unique_sheet_name(raw_name, used_sheet_names)
            song.to_excel(writer, sheet_name=sheet_name, index=False)

        # Sentiment Summary Sheet (share of each label, in percent)
        sentiment_summary = final_df['Sentiment'].value_counts(normalize=True) * 100
        sentiment_summary.to_excel(writer, sheet_name='Sentiment Summary')

    # Create visualizations
    create_visualizations(final_df)

    # Save debug information; explicit UTF-8 so non-ASCII text in the debug
    # info does not depend on the platform's default encoding
    debug_path = os.path.join(output_folder, 'debug_logs.txt')
    with open(debug_path, 'w', encoding='utf-8') as f:
        for filename, debug_info in debug_logs.items():
            f.write(f"Filename: {filename}\n")
            f.write(f"Debug Info: {debug_info}\n\n")

    print(f"Analysis complete. Results exported to {output_file}")


In [6]:
def create_visualizations(df):
    """
    Render the sentiment charts into a one-page PDF.

    Writes 'sentiment_visualizations.pdf' into the 'output' folder (created
    if missing). The page shows a pie chart of the 'Sentiment' value counts
    next to a box plot of 'Sentiment Score' grouped by 'Sentiment'.
    """
    report_dir = 'output'
    os.makedirs(report_dir, exist_ok=True)

    # Imported here, as in the rest of this cell, rather than at file level
    from matplotlib.backends.backend_pdf import PdfPages

    report_path = os.path.join(report_dir, 'sentiment_visualizations.pdf')

    with PdfPages(report_path) as pdf:
        fig, (pie_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

        # Left panel: share of each sentiment label
        label_counts = df['Sentiment'].value_counts()
        pie_ax.pie(label_counts, labels=label_counts.index, autopct='%1.1f%%')
        pie_ax.set_title('Sentiment Distribution')

        # Right panel: score spread per sentiment label
        sns.boxplot(x='Sentiment', y='Sentiment Score', data=df, ax=box_ax)
        box_ax.set_title('Sentiment Scores Distribution')

        fig.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)


In [7]:
def main():
    """Entry point: run the bulk sentiment analysis."""
    perform_bulk_sentiment_analysis()


if __name__ == '__main__':
    main()

Processing file: bartjhe-satudalamcinta.txt
Processing file: lastchild-duka.txt
Analysis complete. Results exported to output\lyrics_sentiment_analysis.xlsx


In [37]:
def export_results(df, output_folder='output'):
    """
    Write the analysis results to disk.

    Creates output_folder if needed, then writes:
      * 'lyrics_sentiment_analysis.csv' - the full DataFrame, without index
      * 'sentiment_summary.txt' - counts and percentages per 'sentiment'
        value, followed by the mean 'sentiment_score' per 'artist', highest
        first

    The DataFrame must provide the 'sentiment', 'artist' and
    'sentiment_score' columns.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Full table as CSV
    csv_path = os.path.join(output_folder, 'lyrics_sentiment_analysis.csv')
    df.to_csv(csv_path, index=False)
    print(f"Full results exported to {csv_path}")

    # Plain-text summary
    summary_path = os.path.join(output_folder, 'sentiment_summary.txt')
    with open(summary_path, 'w', encoding='utf-8') as report:
        # Section 1: how often each sentiment label occurs
        label_counts = df['sentiment'].value_counts()
        report.write("Sentiment Distribution:\n")
        report.writelines(
            f"{label}: {n} ({n/len(df)*100:.2f}%)\n"
            for label, n in label_counts.items()
        )

        # Section 2: mean score per artist, best first
        report.write("\nAverage Sentiment by Artist:\n")
        mean_by_artist = (
            df.groupby('artist')['sentiment_score']
            .mean()
            .sort_values(ascending=False)
        )
        report.writelines(
            f"{name}: {avg:.4f}\n" for name, avg in mean_by_artist.items()
        )

    print(f"Summary exported to {summary_path}")

In [38]:
# Main Workflow
def main():
    """
    Run the load -> clean -> label -> train -> visualise -> export workflow.

    Returns:
        (labeled_df, model, vectorizer) on success, or None when loading the
        lyrics raises ValueError (the error message is printed).

    NOTE(review): this redefines the main() from the earlier cell, and the
    helpers it calls (load_lyrics_from_files, clean_lyrics, label_sentiment,
    train_sentiment_model) are not defined in the cells visible here -
    confirm they exist on a fresh kernel.
    """
    # 1. Load Lyrics from Files
    try:
        lyrics_df = load_lyrics_from_files()
        print(f"Loaded {len(lyrics_df)} lyrics")
    except ValueError as e:
        print(e)
        return None

    # 2. Clean Data
    cleaned_df = clean_lyrics(lyrics_df)

    # 3. Label Sentiment
    labeled_df = label_sentiment(cleaned_df)

    # 4. Train Model
    model, vectorizer = train_sentiment_model(labeled_df)

    # 5. Create Visualizations
    # NOTE(review): the recorded output of this cell shows this call failing
    # with "missing 1 required positional argument: 'text_columns'", so the
    # create_visualizations in the kernel at that time took a second
    # parameter - confirm which definition this workflow is meant to use.
    create_visualizations(labeled_df)

    # 6. Export Results
    export_results(labeled_df)

    return labeled_df, model, vectorizer

# Run the main workflow.
# main() returns None when loading fails; unpacking that directly would raise
# a TypeError, so the three names are set to None in that case.
workflow_output = main()
if workflow_output is None:
    results_df = sentiment_model = lyrics_vectorizer = None
else:
    results_df, sentiment_model, lyrics_vectorizer = workflow_output

Loaded 2 lyrics
Model Evaluation Report:
              precision    recall  f1-score   support

     Neutral       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



TypeError: create_visualizations() missing 1 required positional argument: 'text_columns'

In [26]:
# Function to analyze a single lyric file
def analyze_single_lyric(file_path):
    """
    Classify one lyrics file and print the outcome.

    Reads the UTF-8 file at file_path, strips surrounding whitespace, and
    passes the text to analyze_lyrics_sentiment together with the
    notebook-level sentiment_model and lyrics_vectorizer. Prints the first
    100 characters of the lyrics, the predicted sentiment and the
    probabilities. Returns nothing.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    lyrics = raw_text.strip()

    prediction = analyze_lyrics_sentiment(lyrics, sentiment_model, lyrics_vectorizer)
    sentiment, probabilities = prediction

    preview = lyrics[:100]  # First 100 characters
    print(f"Lyrics: {preview}...")
    print(f"Predicted Sentiment: {sentiment}")
    print(f"Sentiment Probabilities: {probabilities}")

# Uncomment and modify to test a specific lyric file
# analyze_single_lyric('input/YourArtist - YourSong.txt')