### This script filters songs based on their categories and category words.
It processes the lyrics to find lines that match the category words and saves the results to a new CSV file.


In [None]:
import pandas as pd
import ast
import re

# Columns written to the output CSV, in order.
_OUTPUT_COLUMNS = [
    'Song', 'Artist', 'Genre', 'categories', 'category_words', 'cleaned_lyrics',
]


def _filter_song(row):
    """Build the filtered output record for one song.

    Args:
        row: Mapping with the keys 'Song', 'Artist', 'Genre', 'categories',
            'category_words' and 'cleaned_lyrics'. 'categories' and
            'category_words' hold Python-literal strings (a list and a list
            of lists, paired by position).

    Returns:
        A dict keyed by ``_OUTPUT_COLUMNS``, or None when the song must be
        skipped: unparseable literals, missing lyrics, no category with more
        than two words, or no lyric line matching any category word.

    Raises:
        KeyError: If ``row`` lacks one of the required keys.
    """
    try:
        categories = ast.literal_eval(row['categories'])
        category_words = ast.literal_eval(row['category_words'])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval raises on malformed input
        # (including a NaN cell); such rows are skipped on purpose.
        return None

    lyrics = row['cleaned_lyrics']
    if not isinstance(lyrics, str):
        # An empty CSV cell is read back as NaN (a float): nothing to match.
        return None

    # Keep only categories with more than two words (i.e. at least three),
    # replacing underscores with spaces in the words that are kept.
    # NOTE(review): an earlier comment said "at least 2 words" while the code
    # tested `> 2`; the code's threshold is preserved - confirm it is intended.
    kept = [
        (category, [word.replace('_', ' ') for word in words])
        for category, words in zip(categories, category_words)
        if len(words) > 2
    ]
    if not kept:
        return None

    new_categories = [category for category, _ in kept]
    new_category_words = [words for _, words in kept]

    lyrics_lines = lyrics.splitlines()
    # Lyrics are matched case-insensitively with underscores read as spaces.
    normalized_lines = [line.lower().replace('_', ' ') for line in lyrics_lines]

    matched_indexes = set()
    for words in new_category_words:
        for word in words:
            # Whole-word match; compiled once per word instead of per line.
            pattern = re.compile(r'\b{}\b'.format(re.escape(word.lower())))
            for index, text in enumerate(normalized_lines):
                if pattern.search(text):
                    matched_indexes.add(index)
                    break  # Only the first matching line per word

    if not matched_indexes:
        return None

    # Emit matched lines in lyric order, without duplicate text, so the
    # output is deterministic from run to run.
    matched_lines = dict.fromkeys(
        lyrics_lines[index].replace('_', ' ') for index in sorted(matched_indexes)
    )

    return {
        'Song': row['Song'],
        'Artist': row['Artist'],
        'Genre': row['Genre'],
        'categories': str(new_categories),
        'category_words': str(new_category_words),
        'cleaned_lyrics': ''.join(line + '\n' for line in matched_lines),
    }


def filter_songs_with_pandas(input_file, output_file):
    """Filter songs by category words and write the matching lyric lines.

    Reads ``input_file`` as CSV, keeps for every song only the categories
    that have more than two category words, and reduces the lyrics to the
    first line matching each of those words. Songs that cannot be parsed,
    have no qualifying category, or have no matching line are left out.

    Args:
        input_file: Path of the CSV to read. It must provide the columns
            'Song', 'Artist', 'Genre', 'categories', 'category_words' and
            'cleaned_lyrics'.
        output_file: Path of the CSV to write (without the index column).

    Raises:
        KeyError: If a required column is missing from ``input_file``.
    """
    df = pd.read_csv(input_file)

    # Build the records explicitly instead of using DataFrame.apply: apply
    # infers the result shape from the first row, so a skipped (None) first
    # row would yield a Series of objects instead of a table.
    records = [
        record
        for record in map(_filter_song, df.to_dict('records'))
        if record is not None
    ]

    # Fixed columns keep the header correct even when no song qualifies.
    filtered_df = pd.DataFrame(records, columns=_OUTPUT_COLUMNS)

    # Songs with a missing Song/Artist/Genre value are dropped as well.
    filtered_df = filtered_df.dropna()

    # Save the result
    filtered_df.to_csv(output_file, index=False)

# Example usage: read the disambiguated song list and write the filtered CSV.
filter_songs_with_pandas(
    input_file='../data/filtered_songs_disambiguated2.csv',
    output_file='../data/oran_filtered.csv',
)
