In [61]:
import pandas as pd
file_path = './Cleaned_Data/Artist_Genre_List.csv'
artist_genre_data = pd.read_csv(file_path)


# Mapping from fine-grained genre labels to the canonical genre they are
# collapsed into. Keys are regular expressions wrapped in \b (word
# boundaries) so that only whole labels are replaced, never fragments of a
# longer word.
genre_mapping = {
    # --> mandopop
    r'\bc-pop\b': 'mandopop',
    r'\bclassic mandopop\b': 'mandopop',
    r'\bmando pop\b': 'mandopop',
    r'\bmandarin pop\b': 'mandopop',
    r'\bchinese pop\b': 'mandopop',
    r'\btaiwanese pop\b': 'mandopop',
    r'\bmainland chinese pop\b': 'mandopop',
    r'\btaiwan pop\b': 'mandopop',
    r'\bpop mainland mandopop\b': 'mandopop',
    r'\bmandopop, taiwan singer-songwriter\b': 'mandopop',
    r'\bmalaysian mandopop, mandopop\b': 'mandopop',
    r'\bmalaysian mandopop\b': 'mandopop',
    r'\bsingaporean mandopop\b': 'mandopop',
    r'\bzhongguo feng\b': 'mandopop',
    r'\bmainland mandopop\b': 'mandopop',
    # --> pop
    r'\bdance pop\b': 'pop',
    r'\bviral pop\b': 'pop',
    r'\bpost-teen pop\b': 'pop',
    # --> reggae
    r'\bchinese reggae\b': 'reggae',
    r'\breggae flow\b': 'reggae',
    r'\breggae colombiano\b': 'reggae',
    r'\breggae rock\b': 'reggae',
    # --> k-pop
    r'\bkorean pop\b': 'k-pop',
    # --> r&b
    r'\bpop r&b\b': 'r&b',
    r'\bcontemporary r&b\b': 'r&b',
    r'\bcanadian contemporary r&b\b': 'r&b',
    r'\bkorean r&b\b': 'r&b',
    r'\bchinese r&b\b': 'r&b',
    r'\bchill r&b\b': 'r&b',
    r'\buk contemporary r&b\b': 'r&b',
    r'\bdark r&b\b': 'r&b',
    r'\bboy band,contemporary r&b\b': 'r&b',
    r'\bindie r&b\b': 'r&b',
    r'\bbedroom r&b\b': 'r&b',
    r'\bcanadian r&b\b': 'r&b',
    r'\buk r&b\b': 'r&b',
    # --> edm
    r'\bhouse\b': 'edm',
    r'\bprogressive house\b': 'edm',
    r'\bfilter house\b': 'edm',
    r'\bswedish tropical house\b': 'edm',
    r'\bchill house\b': 'edm',
    r'\btropical house\b': 'edm',
    r'\bdutch house\b': 'edm',
    r'\bjazz house\b': 'edm',
    r'\bnordic house\b': 'edm',
    r'\bstutter house\b': 'edm',
    r'\btech house\b': 'edm',
    r'\bgerman house\b': 'edm',
    r'\bg-house\b': 'edm',
    # --> rap / hip hop
    r'\bpop rap\b': 'rap',
    r'\bmelodic rap\b': 'rap',
    r'\btrap\b': 'hip hop',
}


# Apply genre replacements.
# The patterns are all lowercase, so lowercase the column first; otherwise
# labels such as "Mandarin Pop" would slip through unmapped.
genres = artist_genre_data['genres'].str.lower()

# Replace the longest patterns first. A short pattern such as r'\bhouse\b'
# would otherwise rewrite "progressive house" to "progressive edm" before
# the more specific rule gets a chance to match. sorted() is stable, so
# patterns of equal length keep their order from the dict.
longest_first = sorted(
    genre_mapping.items(),
    key=lambda item: len(item[0]),
    reverse=True,
)
for old_genre, new_genre in longest_first:
    genres = genres.str.replace(old_genre, new_genre, regex=True)

artist_genre_data['genres'] = genres

In [62]:
# Normalize the 'genres' column: lowercase and trim the whole string.
artist_genre_data['genres'] = artist_genre_data['genres'].str.lower().str.strip()


def _dedupe_genres(tokens):
    """Return a sorted, comma-joined string of the unique genres in *tokens*.

    *tokens* is the list produced by splitting a genre string on ','.
    Each token is stripped before de-duplication so that "pop" and " pop"
    count as the same genre, and blank tokens (e.g. from a trailing comma)
    are dropped. Anything that is not a list (a missing/NaN value) is
    returned unchanged instead of raising.
    """
    if not isinstance(tokens, list):
        return tokens
    unique_genres = {token.strip() for token in tokens}
    unique_genres.discard('')
    return ','.join(sorted(unique_genres))


# Remove duplicates in genres
artist_genre_data['genres'] = (
    artist_genre_data['genres'].str.split(',').apply(_dedupe_genres)
)

In [63]:
# Split each comma-separated genre string into a list of genres.
artist_genre_data['genres'] = artist_genre_data['genres'].str.split(',')

# Explode the DataFrame to one genre per row. The index is reset right away
# because explode() repeats the original index label for every genre.
exploded_data = artist_genre_data.explode('genres').reset_index(drop=True)

# Trim whitespace around each genre BEFORE de-duplicating; otherwise
# "pop" and " pop" are treated as different genres and both rows survive.
exploded_data['genres'] = exploded_data['genres'].str.strip()

# Drop duplicate (artist, genre) rows, keeping the first occurrence.
exploded_data = exploded_data.drop_duplicates(subset=['artist_name', 'genres'])

# Renumber the rows after the duplicates were removed.
formatted_data = exploded_data.reset_index(drop=True)

# Save the updated data. NOTE: this is the same path the notebook loaded the
# data from, so the input file is overwritten.
formatted_data.to_csv('./Cleaned_Data/Artist_Genre_List.csv', index=False, encoding='utf-8-sig')