## This script removes duplicate songs from a CSV file based on the song title and artist.
It ignores parenthesized text in song titles when comparing them, and filters out songs with 'cover' in the title.
It also filters out songs by the artist 'Glee'.
The script keeps the first occurrence of each song and saves the cleaned data to a new CSV file.
The script uses the pandas library for data manipulation and regular expressions for cleaning song titles.

In [None]:
import pandas as pd

def clean_song_title(title):
    """Remove parentheses and their contents from song titles.

    Whitespace left behind by a removed parenthetical is collapsed to a
    single space, so "Hello (Remix) World" becomes "Hello World" rather
    than "Hello  World" and compares equal to the plain title.

    Args:
        title: The song title. Non-string values (for example the NaN
            pandas uses for a missing cell) are returned unchanged.

    Returns:
        The title without parenthesized text, stripped and with runs of
        whitespace collapsed; or ``title`` itself if it is not a string.
    """
    import re

    # Missing cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(title, str):
        return title

    without_parens = re.sub(r'\([^)]*\)', '', title)
    # split()/join trims both ends and squeezes the gap a mid-title
    # parenthetical leaves behind.
    return ' '.join(without_parens.split())

def remove_duplicates(input_csv, output_csv):
    """
    Remove duplicate songs based on Song and Artist, ignoring parentheses in song titles.
    Keeps the first occurrence.

    Before deduplicating, rows whose Song contains 'cover' (case insensitive)
    and rows whose Artist is 'Glee' (whole value, case insensitive) are dropped.
    The result is written to ``output_csv`` without the index column and a
    one-line summary is printed.

    Args:
        input_csv: Path of the CSV to read; it must have 'Song' and 'Artist' columns.
        output_csv: Path the cleaned CSV is written to.
    """
    # Read the CSV file
    df = pd.read_csv(input_csv)

    # Filter out songs with 'cover' in title (case insensitive).
    # na=False: a missing title yields NaN here, and negating a mask that
    # contains NaN fails; treat missing titles as "no cover".
    has_cover = df['Song'].str.lower().str.contains('cover', regex=False, na=False)

    # Filter out songs by Artist 'Glee' (whole-value match). The comparison is
    # lower-cased on both sides; comparing the raw column against 'glee' never
    # matched the capitalised name. astype(str) keeps this working for
    # non-string columns (NaN becomes 'nan', which is not 'glee').
    is_glee = df['Artist'].astype(str).str.lower() == 'glee'

    df = df[~(has_cover | is_glee)]

    # Build the comparison key in a separate frame instead of adding temporary
    # columns to the filtered slice (which triggers SettingWithCopyWarning).
    # na_action='ignore' leaves missing titles as NaN without calling the cleaner.
    dedup_key = pd.DataFrame({
        'cleaned_song': df['Song'].map(clean_song_title, na_action='ignore'),
        'Artist': df['Artist'],
    })

    # duplicated() treats NaN as equal to NaN; a row without a title cannot be
    # shown to duplicate anything, so never drop it.
    is_duplicate = dedup_key.duplicated(keep='first') & dedup_key['cleaned_song'].notna()

    # Keep only the first occurrence (non-duplicates)
    df_cleaned = df[~is_duplicate]

    # Save to output CSV
    df_cleaned.to_csv(output_csv, index=False)
    print(f"Processed {len(df)} songs, removed {len(df) - len(df_cleaned)} duplicates. Output saved to {output_csv}")

# Script entry point: deduplicate the filtered song list when run directly.
if __name__ == "__main__":
    input_file, output_file = (
        "../../data/filtered_songs_three.csv",
        "../../data/deduplicated_songs.csv",
    )
    remove_duplicates(input_csv=input_file, output_csv=output_file)

Processed 96043 songs, removed 17226 duplicates. Output saved to ../data/deduplicated_songs.csv
