# In [5]:
import pandas as pd
import ast

def _first_matching_line_indices(lyrics_lines, word_groups):
    """Return the indices of the first lyric line containing each word.

    Matching is a case-insensitive substring test. Each word contributes at
    most one line (the earliest one that contains it).
    """
    # Lower-case every line once instead of once per word.
    lowered_lines = [line.lower() for line in lyrics_lines]
    indices = set()
    for words in word_groups:
        for word in words:
            needle = word.lower()
            for idx, lowered in enumerate(lowered_lines):
                if needle in lowered:
                    indices.add(idx)
                    break  # Only one line per word
    return indices


def _filter_song_row(row):
    """Return the filtered output record for one input row, or None to skip it.

    A row is skipped when its ``categories`` / ``category_words`` cells cannot
    be parsed as Python literals, when no category has at least two words,
    when ``cleaned_lyrics`` is not a string, or when no lyric line contains
    any of the remaining words.
    """
    try:
        categories = ast.literal_eval(row['categories'])
        category_words = ast.literal_eval(row['category_words'])
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors ast.literal_eval raises for malformed input
        # (a missing cell read as NaN ends up here too).
        return None

    # Keep only the categories backed by at least 2 words.
    kept = [
        (category, words)
        for category, words in zip(categories, category_words)
        if len(words) > 1
    ]
    if not kept:
        return None

    new_categories = [category for category, _ in kept]
    new_category_words = [words for _, words in kept]

    lyrics = row['cleaned_lyrics']
    if not isinstance(lyrics, str):
        # An empty CSV cell is read back as NaN (a float), which has no lines.
        return None
    lyrics_lines = lyrics.splitlines()

    matched = _first_matching_line_indices(lyrics_lines, new_category_words)
    if not matched:
        return None

    # Emit the matched lines in their original lyric order so the output is
    # the same on every run.
    kept_lines = [lyrics_lines[idx] for idx in sorted(matched)]

    return {
        'Song': row['Song'],
        'Artist': row['Artist'],
        'Genre': row['Genre'],
        'categories': str(new_categories),
        'category_words': str(new_category_words),
        'cleaned_lyrics': ''.join(line + '\n' for line in kept_lines),
    }


def filter_songs_with_pandas(input_file, output_file):
    """Filter a song CSV down to well-supported categories and matching lyrics.

    Reads ``input_file`` with ``pandas.read_csv`` and, for every row:

    * parses the ``categories`` and ``category_words`` cells as Python
      literals (they are paired up positionally);
    * keeps only the categories whose word list has at least two entries;
    * for each remaining word, keeps the first line of ``cleaned_lyrics``
      that contains it (case-insensitive substring match).

    Rows that cannot be parsed, keep no category, have no lyrics, or match no
    lyric line are dropped. Rows with a missing ``Song``, ``Artist`` or
    ``Genre`` value are dropped as well by the final ``dropna()``.

    The result is written to ``output_file`` as CSV (no index column) with the
    columns ``Song, Artist, Genre, categories, category_words,
    cleaned_lyrics``. ``categories`` and ``category_words`` are stored as the
    ``str()`` of the filtered lists; ``cleaned_lyrics`` holds the matched
    lines in lyric order, each terminated by a newline. When no row survives,
    a header-only file is written.

    Args:
        input_file: Path (or anything ``pandas.read_csv`` accepts) of the
            source CSV.
        output_file: Path (or anything ``DataFrame.to_csv`` accepts) for the
            filtered CSV.

    Raises:
        KeyError: If a required column is absent from the input.
    """
    output_columns = [
        'Song', 'Artist', 'Genre',
        'categories', 'category_words', 'cleaned_lyrics',
    ]

    df = pd.read_csv(input_file)

    # Collect the rows explicitly instead of using DataFrame.apply(axis=1):
    # apply infers its result type from the returned values, so mixing None
    # and per-row results can produce an object Series rather than a frame.
    records = []
    for _, row in df.iterrows():
        record = _filter_song_row(row)
        if record is not None:
            records.append(record)

    # Fixed columns guarantee a proper header even when nothing survives.
    filtered_df = pd.DataFrame(records, columns=output_columns).dropna()

    # Save the result
    filtered_df.to_csv(output_file, index=False)

# Example usage. Guarded so the file I/O runs when this is executed as a
# script or notebook cell, but not when the module is merely imported.
if __name__ == "__main__":
    filter_songs_with_pandas('../data/processed_train_dataset.csv', '../data/filtered_songs.csv')
