### Data Scraping and Selenium (Max Vo)

#### Data Scraping: https://grizzlyrose.com/most-popular-music-genre-by-state/

* Related Pages:
* https://wisevoter.com/report/favorite-music-genre-by-state/
* https://www.wideopencountry.com/most-popular-music-genre-by-state/

*** For the most part, this form of data is not accessible via the website, as it is in image form

In [1]:
import requests
import lxml.html as lx
import re

In [2]:
endpoint = "https://grizzlyrose.com/most-popular-music-genre-by-state/"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would hang the whole notebook run.
response = requests.get(endpoint, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page further down.
response.raise_for_status()
# no payload for particular site

In [3]:
# Probe whether the endpoint serves JSON; `data` stays None when it does not.
try:
    data = response.json()
except requests.exceptions.JSONDecodeError:
    data = None
    print("Response content is not valid JSON")

# Nothing is done with a successful JSON payload yet; only the empty case is reported.
if not data:
    print("No data to process")

Response content is not valid JSON
No data to process


In [4]:
# Parse the raw HTML text into an lxml element tree so it can be queried with XPath.
html = lx.fromstring(response.text)
html # displaying the element confirms that parsing produced a root node

# Example absolute XPaths for the two pieces of data wanted from the page:
#   state: //*[@id="main"]/div[1]/div/main/article/div[2]/div[2]/h3[1]
#   genre: //*[@id="main"]/div[1]/div/main/article/div[2]/div[2]/ul[1]/li[1]

<Element html at 0x257aefd5bc0>

In [5]:
# Collect the text of every <h3> under the article body; on this page those are the state names.
states = html.xpath('//div/main/article/div/div/h3/text()')
states

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [6]:
# First attempt: take the text node that directly follows each <strong> label inside every <li>.
elements = html.xpath('//div/main/article/div/div/ul/li/strong/following-sibling::text()[1]')
# Issues encountered:
# - working out how to select the text after the <strong> tag inside an <li> was difficult
# - this page-wide query selects more data than expected

In [7]:
# Build one genre per <h3> heading so the result lines up with the headings in order.
genres = []
for heading in html.xpath('//div/main/article/div/div/h3'):
    # The closest <ul> that follows the heading holds that heading's genre entries.
    sibling_lists = heading.xpath('following-sibling::ul[1]')
    # Text that comes right after each <strong> label inside that list (empty if no list).
    texts_after_label = (
        sibling_lists[0].xpath('.//li/strong/following-sibling::text()[1]')
        if sibling_lists
        else []
    )
    # Keep only the first entry; fall back to '' so every heading still gets a slot.
    genres.append(texts_after_label[0].strip() if texts_after_label else '')

# Print the list of genres
print(genres)

['Country', 'Country', 'Metal', 'Country', 'Pop', 'Indie', 'Metal', 'Classic Rock', 'Alternative', 'Classic Rock', 'Rhythm and Blue', 'Country', 'Classic Rock', 'Classic Rock', 'Indie', 'Metal', 'Classic Rock', 'Classic Rock', 'Alternative', 'Indie', 'Indie', 'Classic Rock', 'Indie', 'Classic Rock', 'Classic Rock', 'Alternative', 'Country', 'Electronic', 'Country', 'Classic Rock', 'Alternative', 'Classic Rock', 'Indie', 'Metal', 'Country', 'Metal', 'Alternative', 'Country', 'Indie', 'Classic Rock', 'Metal', 'Indie', 'Latin', 'Country', 'Indie', 'Country', 'Indie', 'Country', 'Country', 'Indie']


In [8]:
# zip() silently stops at the shorter input, which would pair states with the
# wrong genres if the two scrapes ever returned different numbers of items.
assert len(states) == len(genres), (
    f"scraped {len(states)} states but {len(genres)} genres; the lists would be misaligned"
)

# One record per state, in page order.
genre_by_state = [{'state': state.strip(), 'genre': genre.strip()} for state, genre in zip(states, genres)]
genre_by_state

[{'state': 'Alabama', 'genre': 'Country'},
 {'state': 'Alaska', 'genre': 'Country'},
 {'state': 'Arizona', 'genre': 'Metal'},
 {'state': 'Arkansas', 'genre': 'Country'},
 {'state': 'California', 'genre': 'Pop'},
 {'state': 'Colorado', 'genre': 'Indie'},
 {'state': 'Connecticut', 'genre': 'Metal'},
 {'state': 'Delaware', 'genre': 'Classic Rock'},
 {'state': 'Florida', 'genre': 'Alternative'},
 {'state': 'Georgia', 'genre': 'Classic Rock'},
 {'state': 'Hawaii', 'genre': 'Rhythm and Blue'},
 {'state': 'Idaho', 'genre': 'Country'},
 {'state': 'Illinois', 'genre': 'Classic Rock'},
 {'state': 'Indiana', 'genre': 'Classic Rock'},
 {'state': 'Iowa', 'genre': 'Indie'},
 {'state': 'Kansas', 'genre': 'Metal'},
 {'state': 'Kentucky', 'genre': 'Classic Rock'},
 {'state': 'Louisiana', 'genre': 'Classic Rock'},
 {'state': 'Maine', 'genre': 'Alternative'},
 {'state': 'Maryland', 'genre': 'Indie'},
 {'state': 'Massachusetts', 'genre': 'Indie'},
 {'state': 'Michigan', 'genre': 'Classic Rock'},
 {'state'

In [73]:
import pandas as pd

# Tabulate the records, labelling the rows 1..N instead of the default 0..N-1.
genre_by_state_df = pd.DataFrame(
    genre_by_state,
    index=pd.RangeIndex(1, len(genre_by_state) + 1),
)

# Display the DataFrame
genre_by_state_df

Unnamed: 0,state,genre
1,Alabama,Country
2,Alaska,Country
3,Arizona,Metal
4,Arkansas,Country
5,California,Pop
6,Colorado,Indie
7,Connecticut,Metal
8,Delaware,Classic Rock
9,Florida,Alternative
10,Georgia,Classic Rock


In [None]:
# For Use in State-Level Aggregation
import pickle

# Serialize the state/genre DataFrame to genre_state.pkl in the working directory
# so another notebook can load it without re-scraping.
with open('genre_state.pkl', 'wb') as f:
    pickle.dump(genre_by_state_df, f)

### Also want to Scrape: https://chartmasters.org/most-streamed-tracks-on-spotify/

In [19]:
import requests
from bs4 import BeautifulSoup

def get_top_10_songs():
    """
    Scrape the first 10 rows of ChartMasters' Most Streamed Tracks on Spotify table.

    The page is fetched with a plain HTTP request, so the result is the overall
    chart as served in the static HTML; no artist filter is applied.

    Parameters:
    None

    Returns:
    list of dict: One {'Rank': str, 'Title': str} entry per table row read
    (at most 10). Empty if the page contains no <table>.

    Raises:
    requests.exceptions.RequestException: If the request times out or the
    server answers with an HTTP error status.
    """
    url = "https://chartmasters.org/most-streamed-tracks-on-spotify/"
    # A timeout prevents an unresponsive server from hanging the notebook.
    response = requests.get(url, timeout=30)
    # Stop on 4xx/5xx rather than silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # SCRAPING LOGIC #
    top_10_songs = []
    table = soup.find('table')  # Adjust this selector based on the actual HTML structure
    if table is None:
        return top_10_songs

    # Skip the header row and get the top 10 rows
    for row in table.find_all('tr')[1:11]:
        cols = row.find_all('td')
        # Rank and title are read from the 2nd and 3rd cells; rows without
        # them (e.g. spacer or placeholder rows) are skipped instead of
        # raising IndexError.
        if len(cols) < 3:
            continue
        rank = cols[1].text.strip()
        title = cols[2].text.strip()
        top_10_songs.append({'Rank': rank, 'Title': title})

    return top_10_songs

# Example: fetch the chart (requires network access) and display the parsed rows
top_10 = get_top_10_songs()
top_10

[{'Rank': '1', 'Title': 'Blinding Lights'},
 {'Rank': '2', 'Title': 'Shape of You'},
 {'Rank': '3', 'Title': 'Someone You Loved'},
 {'Rank': '4', 'Title': 'As It Was'},
 {'Rank': '5', 'Title': 'Sunflower'},
 {'Rank': '6', 'Title': 'Starboy'},
 {'Rank': '7', 'Title': 'One Dance'},
 {'Rank': '8', 'Title': 'Sweater Weather'},
 {'Rank': '9', 'Title': 'Stay'},
 {'Rank': '10', 'Title': 'Believer'}]

### Cont. Selenium | Require a Web Driver

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import time

In [4]:
# Configure Selenium WebDriver: start from empty options, then add the flags in order.
chrome_options = Options()
for chrome_flag in (
    "--headless",     # Run in headless mode
    "--disable-gpu",
    "--no-sandbox",
):
    chrome_options.add_argument(chrome_flag)

In [43]:
def dynamic_top10(artist_name):
    """
    Scrape the top 10 songs for a given artist from ChartMasters' Most Streamed Tracks on Spotify page.

    A Chrome session (configured by the module-level `chrome_options`) opens the
    page, types the artist into the table's "Artist" filter input, and the
    rendered HTML is then parsed with BeautifulSoup.

    Parameters:
    artist_name (str): The name of the artist to search for.

    Returns:
    list of dict: Up to 10 entries of the form
    {'Rank': str, 'Title': str, 'Streams': str}. Empty if no table is found or
    no row carries all three cells.

    Raises:
    selenium.common.exceptions.TimeoutException: If the "Artist" filter input
    does not appear within 10 seconds.
    """
    # Initialize the WebDriver
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )

    # Everything that talks to the browser sits inside try/finally so the
    # Chrome process is always shut down, even when a step raises.
    try:
        # Open the URL
        url = "https://chartmasters.org/most-streamed-tracks-on-spotify/"
        driver.get(url)
        time.sleep(5)  # Allow page to fully load (adjust this if needed)

        # ELEMENT: <input type="text" class="form-control wdt-filter-control text_filter" placeholder="Artist" aria-label="Filter input for Artist">

        # Find the search input and enter the artist's name
        search_input = WebDriverWait(driver, 10).until(  # waiting for javascript to load
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Artist"]'))
        )

        search_input.send_keys(artist_name)
        # search_input.send_keys(Keys.RETURN) ~ pressing return key causes different formatting

        time.sleep(5)  # Wait for the search results to load

        # Snapshot the rendered HTML; parsing does not need the browser any more.
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Scrape the top 10 songs
    top_10_songs = []
    table = soup.find('table')  # Adjust this selector based on the actual HTML structure
    if table is None:
        return top_10_songs

    # Skip the header row and get the top 10 rows
    for row in table.find_all('tr')[1:11]:
        cols = row.find_all('td')
        # Rows lacking the rank/title/streams cells are skipped rather than
        # raising IndexError (presumably the table's placeholder row when the
        # filter matches nothing -- TODO confirm against the live page).
        if len(cols) < 3:
            continue

        rank = cols[0].text.strip()

        # Extract only the bolded text for the song title; the cell also
        # holds the artist credits, which are not wanted here.
        bold = cols[1].find('b')
        title = bold.text.strip() if bold is not None else cols[1].text.strip()
        streams = cols[2].text.strip()

        top_10_songs.append({'Rank': rank, 'Title': title, 'Streams': streams})

    return top_10_songs

In [44]:
# Issues encountered while getting dynamic_top10 to work:
# - the ChromeDriverManager
# - JavaScript not loading
# - the search_input not being found
# - the text extracted for the 'Title' key

# Example: run the Selenium scrape for a single artist
artist_name = "Lil Wayne"
top_10 = dynamic_top10(artist_name)

In [45]:
# Display the rows returned by the dynamic_top10 example call
top_10

[{'Rank': '1', 'Title': "I'm the One", 'Streams': '1,295,402,255'},
 {'Rank': '2', 'Title': 'Sucker For Pain', 'Streams': '1,182,483,591'},
 {'Rank': '3', 'Title': 'Love Me', 'Streams': '911,167,799'},
 {'Rank': '4', 'Title': 'Down', 'Streams': '783,464,098'},
 {'Rank': '5', 'Title': 'Forever', 'Streams': '638,299,492'},
 {'Rank': '6', 'Title': 'Loyal', 'Streams': '608,000,200'},
 {'Rank': '7', 'Title': 'A Milli', 'Streams': '605,886,979'},
 {'Rank': '8', 'Title': 'The Motto', 'Streams': '585,233,304'},
 {'Rank': '9', 'Title': 'Lollipop', 'Streams': '581,375,940'},
 {'Rank': '10', 'Title': 'Only', 'Streams': '580,817,938'}]

In [48]:
import pickle

# Load the artist list that was pickled elsewhere (the file must already exist in the working directory).
# NOTE: pickle.load can execute arbitrary code -- only load pickle files you created or trust.
with open('top_10_artists.pkl', 'rb') as f: # serialized format
    top_10_artists = pickle.load(f)
    
top_10_artists

['Lil Wayne',
 'Future',
 'Tyler, The Creator',
 'Drake',
 'Kendrick Lamar',
 'Kanye West',
 'Metro Boomin',
 '21 Savage',
 'Morgan Wallen',
 'GloRilla']

In [None]:
artists_top_10_songs = []

# Loop through each artist and call the dynamic_top10 function
for artist in top_10_artists:
    try:
        top_10_songs = dynamic_top10(artist)
    except IndexError:
        # A "list index out of range" error occurs when the site has no
        # entries for an artist (observed for GloRilla); treat it as
        # "no results" instead of aborting the whole cell.
        top_10_songs = []

    if not top_10_songs:
        print(f"No chart entries found for {artist!r}; skipping")
        continue

    # One single-key dict per artist: {artist name: list of song dicts}
    artists_top_10_songs.append({artist: top_10_songs})


In [None]:
artists_top_10_songs # only 9 of the 10 artists are returned (the site has no entries for GloRilla)

In [51]:
import json

# Destination file (plain-text extension, JSON content)
text_file = 'top10songs_fortop9artists_byoccurenceincities.txt'

# Serialize the per-artist results as 4-space-indented JSON and write them out
with open(text_file, 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(artists_top_10_songs, indent=4))

#### DEBUGGING SELENIUM

~Identification of Formatting Change  
~Identification of Bold Tags

In [46]:
# Debugging run: same steps as dynamic_top10, but with a default (non-headless)
# Chrome and the raw <tr> elements printed so the row markup can be inspected.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# try/finally guarantees the browser is closed even if a step below raises.
try:
    # Open the URL
    url = "https://chartmasters.org/most-streamed-tracks-on-spotify/"
    driver.get(url)
    time.sleep(5)  # Allow page to fully load

    # Find the search input and enter the artist's name
    search_input = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Artist"]'))
    )
    artist_name = "Lil Wayne"
    search_input.send_keys(artist_name)
    # search_input.send_keys(Keys.RETURN)
    time.sleep(5)  # Wait for the search results to load

    # Snapshot the rendered HTML while the browser is still open
    page_source = driver.page_source
finally:
    driver.quit()

# Parse the snapshot with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Scrape the top 10 songs
table = soup.find('table')  # Adjust this selector based on the actual HTML structure

# Skip the header row and get the top 10 rows; default to an empty list so the
# print below cannot hit an undefined `rows` when no table is found.
rows = table.find_all('tr')[1:11] if table is not None else []

print(rows)

[<tr class="odd wdt-responsive-row-action"><td class="expand numdata integer rankings column-rank">1</td><td class="column-song"><i><b>I'm the One</b></i><br/><a class="styledLink" href="https://chartmasters.org/spotify-streaming-numbers-tool/?artist_name=&amp;rt=tbmsttrck&amp;artist_id=0QHgL1lAIqAw0HtD7YldmP">DJ Khaled ft. Justin Bieber, Quavo, Chance The Rapper &amp; Lil Wayne</a> <a href="https://chartmasters.org/edit-track/?track_id=3DXncPQOG4VBw3QHh3S817"><img height="11" src="https://chartmasters.org/wp-content/uploads/2023/09/Edit-pen.jpg" width="9"/></a></td><td class="numdata integer numbersMain column-playcount sorting_1">1,295,402,255</td></tr>, <tr class="even wdt-responsive-row-action"><td class="expand numdata integer rankings column-rank">2</td><td class="column-song"><i><b>Sucker For Pain</b></i><br/><a class="styledLink" href="https://chartmasters.org/spotify-streaming-numbers-tool/?artist_name=&amp;rt=tbmsttrck&amp;artist_id=55Aa2cqylxrFIXC767Z865">Lil Wayne, Wiz Khal