# This script processes a dataset of song lyrics: it cleans the lyrics, assigns categories based on keywords, and saves the processed data to a new CSV file.
# It also merges two datasets into a single input file before processing.

In [None]:
import pandas as pd
import re
import json
from category import categories, multi_word_keywords


def clean_lyrics(lyrics, phrases=None):
    """Normalise raw lyrics into a lowercase, punctuation-free string.

    Steps, in order:
      1. Missing values (``None`` / float NaN) become an empty string.
      2. The text is lowercased.
      3. Possessive ``'s`` is dropped and apostrophes inside a word are
         removed (``cat's`` -> ``cat``, ``don't`` -> ``dont``).
      4. Every multi-word phrase is joined with underscores so it survives
         whitespace tokenisation as one token (``broken heart`` ->
         ``broken_heart``).
      5. Every remaining character that is neither a word character nor
         whitespace is deleted. Hyphens are removed, not turned into spaces,
         so ``heart-break`` becomes ``heartbreak``.

    Args:
        lyrics: Raw lyrics. Non-string values are converted with ``str()``.
        phrases: Iterable of multi-word phrases to keep together. Defaults to
            the module-level ``multi_word_keywords``.

    Returns:
        The cleaned lyrics string (possibly empty).
    """
    # pandas represents a missing cell as float NaN; str() would turn it into
    # the literal word "nan". NaN is the only float not equal to itself.
    if lyrics is None or (isinstance(lyrics, float) and lyrics != lyrics):
        return ""
    if phrases is None:
        phrases = multi_word_keywords

    text = str(lyrics).lower()
    text = re.sub(r"\b(\w+)'s\b", r'\1', text)  # possessive: cat's -> cat
    text = re.sub(r"\b(\w+)'\b", r'\1', text)   # in-word apostrophe: don't -> dont

    # Only phrases containing a space need joining. Longest-first ordering lets
    # "new york city" win over "new york" and makes the result independent of
    # the iteration order of the phrase collection.
    multi_word = sorted(
        {phrase for phrase in phrases if ' ' in phrase},
        key=lambda phrase: (-len(phrase), phrase),
    )
    if multi_word:
        # The lookarounds require the phrase to start and end on a word
        # boundary, so "in love" no longer matches inside "fallin lovers".
        pattern = (
            r'(?<!\w)(?:'
            + '|'.join(re.escape(phrase) for phrase in multi_word)
            + r')(?!\w)'
        )
        text = re.sub(pattern, lambda m: m.group(0).replace(' ', '_'), text)

    # Underscores are word characters, so joined phrases survive this pass.
    return re.sub(r'[^\w\s]', '', text)

def assign_categories_and_words(lyrics, category_keywords=None, min_matches=1):
    """Assign categories to cleaned lyrics and report the keywords that matched.

    The lyrics are split on whitespace and de-duplicated, so repeating a word
    does not count more than once. A category is assigned when at least
    ``min_matches`` of its keywords occur among those unique tokens.

    Tokens never contain whitespace, so a multi-word keyword can only match
    when it is written in the underscore-joined form (e.g. ``broken_heart``).
    Matched words are returned exactly as they appear in the lyrics tokens.

    Args:
        lyrics: Cleaned lyrics string.
        category_keywords: Mapping of category name to an iterable of keywords.
            Defaults to the module-level ``categories``.
        min_matches: Minimum number of distinct keywords required to assign a
            category. Defaults to 1.

    Returns:
        A tuple ``(assigned_categories, category_words)``. The first item is
        the list of category names in mapping order; the second is a parallel
        list holding the alphabetically sorted matched keywords per category.
        When nothing matches, ``(["None"], ["None"])`` is returned.
    """
    if category_keywords is None:
        category_keywords = categories

    unique_words = set(lyrics.split())
    assigned_categories = []
    category_words = []
    for category, keywords in category_keywords.items():
        # Sorted so the output does not depend on set iteration order, which
        # varies between interpreter runs for strings.
        matched_words = sorted(unique_words.intersection(keywords))
        if len(matched_words) >= min_matches:
            assigned_categories.append(category)
            category_words.append(matched_words)

    if not assigned_categories:
        # Sentinel pair that callers compare against to detect "no category".
        return ["None"], ["None"]
    return assigned_categories, category_words


def preprocess_dataset(input_file, output_file, lyrics_column='Lyrics'):
    """Read a lyrics CSV, clean and categorise each row, and write a new CSV.

    The output keeps the columns ``Song``, ``Artist``, ``Genre``,
    ``categories``, ``category_words`` and ``cleaned_lyrics``.
    ``category_words`` is stored as a JSON string (``"[]"`` when no category
    matched). ``categories`` is left as a Python list and is serialised by
    pandas when the CSV is written.

    Args:
        input_file: Path of the CSV to read.
        output_file: Path of the CSV to write.
        lyrics_column: Name of the column holding the raw lyrics.

    Returns:
        The processed DataFrame, or ``None`` when the input file is missing,
        a required column is absent, or any other error occurs. Errors are
        reported on stdout and never raised to the caller.
    """
    try:
        df = pd.read_csv(input_file)

        # Validate the schema up front and report what is actually available.
        required_columns = [lyrics_column, 'Song', 'Artist', 'Genre']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Error: Missing columns {missing_columns} in {input_file}")
            print("Available columns:", df.columns.tolist())
            return None

        df['cleaned_lyrics'] = df[lyrics_column].apply(clean_lyrics)

        # Categorise each row once and build the two columns from plain lists.
        # This also works for a zero-row frame, where apply() returning
        # pd.Series objects does not expand into two columns.
        results = [assign_categories_and_words(text) for text in df['cleaned_lyrics']]
        df['categories'] = pd.Series(
            [assigned for assigned, _ in results], index=df.index, dtype=object
        )
        # JSON keeps the nested word lists parseable after the CSV round trip;
        # the ["None"] sentinel means no category matched.
        df['category_words'] = pd.Series(
            [json.dumps(words) if words != ["None"] else "[]" for _, words in results],
            index=df.index,
            dtype=object,
        )

        output_columns = ['Song', 'Artist', 'Genre', 'categories', 'category_words', 'cleaned_lyrics']
        df = df[output_columns]

        df.to_csv(output_file, index=False)
        print(f"Processed dataset saved to {output_file}")
        return df

    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure with None instead of propagating.
        print(f"An error occurred: {str(e)}")
        return None

if __name__ == "__main__":
    # Destination paths: the merged CSV doubles as the preprocessing input.
    input_file = "../../data/test_with_additional.csv"
    output_file = "../../data/processed_test_dataset.csv"

    # Load the original test split and the extra songs, in that order.
    original_test, additional = (
        pd.read_csv(csv_path)
        for csv_path in ("../../data/test.csv", "../../data/More_song_test.csv")
    )

    # Stack the two frames and renumber the rows so the index is contiguous.
    merged = pd.concat([original_test, additional], ignore_index=True)

    # Write the merged data to disk, then preprocess that file.
    merged.to_csv(input_file, index=False)
    preprocess_dataset(input_file, output_file, lyrics_column='Lyrics')

Processed dataset saved to ../data/processed_test_dataset.csv
