In [4]:
from keras import layers, models, callbacks
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [52]:
df = pd.read_csv('albums_genres_cleaned.csv')

In [55]:
GENRE_KEYWORDS = [
    "pop", "rock", "rap", "hip hop", "trap", "country", "metal", "r&b", 
    "jazz", "indie", "electronic", "house", "techno", "reggae", "folk", 
    "funk", "soul", "punk", "blues", "alternative", "classical", "ambient", 
    "k-pop", "kids", "afrobeats","relaxed","latin"
]

# Mapping of leftover genre tags to canonical genres
MANUAL_GENRE_MAP = {
    # rock
    "psychedelic": "rock", "post-rock": "rock", "post-grunge": "rock", "grunge": "rock",
    "shoegaze": "rock", "canadian": "rock",

    # pop
    "female vocalist": "pop", "female vocalists": "pop", "5 seconds of summer": "pop",
    "synthpop": "pop", "tatemcrae": "pop", "my top songs": "pop", "sexy": "pop",
    "bee gees": "pop", "hyperpop": "pop", "digital tendencies": "pop", "2-step": "pop",
    "acoustic": "pop", "comedy": "pop", "singer-songwriter": "pop", "3": "pop",
    "disco": "pop", "wedding": "pop", "30": "pop", "peter": "pop", "midnights": "pop",
    "poptron": "pop", "doo-wop": "pop", "male vocals": "pop", "new wave": "pop",
    "later": "pop", "joaoaksnes": "pop", "madonna": "pop", "portals": "pop",
    "aoty": "pop", "mayhem": "pop", "guts": "pop", "short n' sweet": "pop",
    "soty": "pop", "cypriot": "pop", "breakbeat": "pop", "chinese": "pop",
    "german": "pop", "absolute bangers": "pop", "danish": "pop", "azerbaijan": "pop",
    "hardstyle": "pop", "japanese": "pop", "argentina": "pop", "indian": "pop",
    "india": "pop", "bhangra": "pop", "nigeria": "pop", "bollywood": "pop",
    "italian": "pop", "morocco": "pop", "traditional": "pop","ukrainian":"pop",

    # latin
    "puerto rico": "latin", "mexico": "latin", "sad sierreno": "latin",
    "lada del 602": "latin", "corridos tumbados": "latin", "seen live": "latin",
    "ramito de violeta": "latin", "chickencore": "latin", "corona": "latin",
    "cuck": "latin", "luar la l": "latin", "peso pluma": "latin", "spanish": "latin",
    "bachata": "latin", "salsa": "latin", "mierda": "latin", "banda": "latin",
    "duranguense": "latin", "romantico grupero": "latin", "colombia": "latin",
    "drum and bass": "latin", "sertanejo": "latin", "pagode": "latin", "samba": "latin","corrido tumbado":"latin","mexican":"latin","spain":"latin",

    # relaxed
    "ambient": "relaxed", "sleep": "relaxed", "reiki": "relaxed", "instrumental": "relaxed",
    "lo-fi": "relaxed", "rain": "relaxed", "nature sounds": "relaxed", "chill": "relaxed",
    "nature": "relaxed", "noise": "relaxed", "eargasm": "relaxed", "chillout": "relaxed",
    "white noise": "relaxed", "piano": "relaxed", "hindi": "relaxed","ambient":"relaxed",

    # rap
    "harder than diamonds": "rap", "peak": "rap", "drill": "rap", "heavy": "rap",
    "my scribbled": "rap", "drake": "rap", "baby keem": "rap", "g59": "rap", "j cole": "rap",
    "tag lil tecca-lot of me": "rap", "kanye wes": "rap", "bronx drill": "rap",
    "transitions": "rap", "auto-tagged": "rap", "gunna": "rap", "mumble crap": "rap",
    "boom bap": "rap", "quirky": "rap", "underrated": "rap", "phonk": "rap",
    "chipmunk soul": "rap", "diss": "rap", "worst album ever": "rap", "nitrous": "rap",
    "juggin": "rap", "4 out of 5": "rap", "czech": "rap",
    "sematary grave man from the haunted mound real nazgul skincarver keeper of da trees haunted mound lord": "rap",
    "grime": "rap","juice wrld":"rap","polo g":"rap","lil uzi vert":"rap","kanye west":"rap",

    # reggae
    "reggaeton": "reggae", "party": "reggae", "dancehall": "reggae", "love": "reggae",

    # hip hop
    "linedance": "hip hop", "nice": "hip hop", "florida": "hip hop", "don toliver": "hip hop",
    "southern hip-hop": "hip hop", "dr congo": "hip hop", "mother": "hip hop",
    "sampling": "hip hop", "plugg": "hip hop", "ebm": "hip hop", "egyptian": "hip hop",
    "a cappella": "hip hop", "ghana": "hip hop", "cumbia 420": "hip hop", "hip-hop": "hip hop","melodic hip-hop":"hip hop",

    # metal
    "metalcore": "metal", "progressive metalcore": "metal", "rage": "metal",
    "post-hardcore": "metal", "demonic": "metal",

    # r&b
    "rnb": "r&b", "aggressive": "r&b", "king billionheir": "r&b", "love at first listen": "r&b",
    "personal favourites": "r&b", "sza": "r&b", "british": "r&b", "3 out of 5": "r&b","steve lacy":"r&b",

    # k-pop
    "bts": "k-pop", "kpop": "k-pop", "korean": "k-pop", "jersey club": "k-pop", "it boy global": "k-pop",

    # religious
    "christian": "religious", "ccm": "religious", "worship": "religious", "gospel": "religious",
    "musiclist": "religious", "experimental": "religious", "hariharan": "religious", "thai": "religious",

    # electronic
    "depressive": "electronic", "indietronica": "electronic", "synthwave": "electronic",
    "childish gamblingo": "electronic", "featuring": "electronic", "trance": "electronic",
    "ass": "electronic", "downtempo": "electronic", "dance": "electronic", "60s": "electronic",
    "polish": "electronic", "russian": "electronic", "norway": "electronic", "eurodance": "electronic",
    "remix": "electronic", "frenchcore": "electronic", "uk garage": "electronic",

    # indie
    "songs i crank my hog to": "indie", "bossa nova": "indie", "wsum 91.7 fm madison": "indie",
    "songs i like to play whilst walking down the street at night music": "indie",
    "gambling addiction": "indie", "emo": "indie", "darkwave": "indie", "slowcore": "indie",
    "stolen": "indie", "life changing": "indie", "vinyl": "indie", "gothangelz": "indie",
    "ai": "indie", "ukranian": "indie", "opm": "indie",

    # afrobeats
    "afrobeats": "afrobeats", "kenyan": "afrobeats",

    # country
    "usa": "country", "fearless": "country", "linedance 2021": "country", "texas": "country",
    "feel good": "country", "furry": "country", "linedance catalan": "country", "american": "country", "haunted": "country","linedance 2022":"country",

    # kids
    "soundtrack": "kids", "video game music": "kids", "australian": "kids", "disney": "kids",
    "musical": "kids", "infantil": "kids",

    # trap
    "detroit trap": "trap",

    # funk
    "brazil": "funk",

    # folk
    "arabic": "folk"
}

def map_to_final_genre(genre_str):
    genre_str_lower = str(genre_str).strip().lower()
    
    # Check canonical match
    for genre in GENRE_KEYWORDS:
        if genre in genre_str_lower:
            return genre

    # Check exact match in manual mapping
    if genre_str_lower in MANUAL_GENRE_MAP:
        return MANUAL_GENRE_MAP[genre_str_lower]

    # Return as-is if not matched
    return genre_str_lower

# Apply to your DataFrame
df['genre_cleaned'] = df['genre'].apply(map_to_final_genre)


In [56]:
df["genre_cleaned"].unique()

array(['ambient', 'folk', 'country', 'rap', 'pop', 'r&b', 'hip hop',
       'alternative', 'rock', 'indie', 'relaxed', 'reggae', 'unknown',
       'latin', 'electronic', 'soul', 'afrobeats', 'metal', 'jazz',
       'religious', 'k-pop', 'funk', 'house', 'kids', 'afrobeat', 'punk',
       'blues', 'classic', 'classical', 'techno', 'eletronic', 'lofi'],
      dtype=object)

In [57]:
df.to_csv('albums_genres_grouped.csv')