### Scrape Chosic Sub-Genres
This notebook scrapes sub-genres from a site called Chosic, which analyzed all of Spotify's sub-genres and mapped each one to a parent genre.

In [1]:
# import libraries
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver

#### Init Driver

In [2]:
# Chosic page that lists the parent genres and their sub-genres
url = "https://www.chosic.com/list-of-music-genres/"

# start a Chrome session and load the page in it
driver = webdriver.Chrome()
driver.get(url)

In [3]:
# Snapshot the HTML currently held by the browser session
page_source = driver.page_source

# Build the Beautiful Soup tree with Python's built-in HTML parser
soup = BeautifulSoup(markup=page_source, features="html.parser")

#### Parse Sub-Genres

In [7]:
# Parent genres are the <li class="genre-term-basic"> items, kept in page order.
# The mapping keeps its "genre0", "genre1", ... keys so its shape is unchanged.
parent_items = soup.find_all("li", {"class": "genre-term-basic"})
parent_genres = {f"genre{i}": item.text for i, item in enumerate(parent_items)}

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: concatenating onto a frame inside the loop re-copies all earlier rows
# on every pass and emits a FutureWarning for the empty seed frame in recent
# pandas versions.
genre_rows = []
# Parent genres for which no sub-genre <ul> was found; inspect after running.
parents_without_subgenres = []

# The n-th parent genre (counting from 1) is paired with the <ul> whose
# data-parent attribute equals n.
for data_parent, genre_name in enumerate(parent_genres.values(), start=1):
    # HTML attribute values are strings, so match against the string form.
    subgenre_ul = soup.find(
        "ul", {"class": "ul-inside expandible", "data-parent": str(data_parent)}
    )
    if subgenre_ul is None:
        # No matching list on the page: record it instead of failing with an
        # AttributeError on None, and move on to the next parent genre.
        parents_without_subgenres.append(genre_name)
        continue
    # Each sub-genre is an <li class="capital-letter genre-term"> inside the list.
    for subgenre in subgenre_ul.find_all("li", {"class": "capital-letter genre-term"}):
        genre_rows.append({"sub_genre": subgenre.text, "parent_genre": genre_name})

# Explicit columns keep the expected schema even when nothing was scraped.
all_genres = pd.DataFrame(genre_rows, columns=["sub_genre", "parent_genre"])

#### Write Data

In [5]:
# Shut down the browser. quit() closes every window and ends the WebDriver
# session (including the chromedriver process), whereas close() only closes
# the current window and can leave the driver process running.
driver.quit()

In [8]:
# Save the sub-genre / parent-genre table as CSV, leaving out the index column
output_path = "../../data/chosic_genres.csv"
all_genres.to_csv(output_path, index=False)