# Data Scrape
Gathering Spotify song streaming data from https://kworb.net/spotify/songs.html

In [5]:
import pandas as pd
import os
from selenium import webdriver
from selenium.webdriver.common.by import By # used to import different ways to access data in the XML or HTML file
from selenium.webdriver.chrome.service import Service # no longer need to download a driver file, use service
from webdriver_manager.chrome import ChromeDriverManager # used to manage the Chrome driver to emulate a Chrome web browser

import time
import random

In [6]:
# Define Scrolling function
# Initialize the Selenium web driver by creating a Chrome session.
# NOTE(review): Chrome() is called with no arguments, so the Service and
# ChromeDriverManager imports are not used here — presumably the driver is
# resolved automatically; confirm this works in the target environment.
browser = webdriver.Chrome()
# Create function for random scrolling
def random_scroll(browser, total_wait_time):
    """Scroll the page to the bottom in a random number of timed steps.

    The page height is read once up front and divided evenly across a
    randomly chosen number of steps (3 to 10). The window is scrolled by that
    amount on each step, with a randomized pause after every scroll.

    Args:
        browser: Object exposing ``execute_script`` (the Selenium web driver).
        total_wait_time: Nominal number of seconds to spread across all
            pauses. Each pause is drawn uniformly from 0.5x to 1.5x of the
            per-step share, so the real total varies around this value.
    """
    page_height = browser.execute_script("return document.body.scrollHeight")

    # Randomize the step count so consecutive runs do not scroll identically.
    scroll_steps = random.randint(3, 10)
    scroll_increment = page_height // scroll_steps
    time_per_step = total_wait_time / scroll_steps

    # Both the script and the pause bounds are constant for the whole pass.
    scroll_by_script = f"window.scrollBy(0, {scroll_increment});"
    shortest_pause = 0.5 * time_per_step
    longest_pause = 1.5 * time_per_step

    for _ in range(scroll_steps):
        browser.execute_script(scroll_by_script)
        time.sleep(random.uniform(shortest_pause, longest_pause))

    # Integer division can leave the last step short of the bottom, so finish
    # with an absolute jump to the end of the page.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

In [12]:
# Begin scraping actual data
browser.maximize_window()

# Define URL
url = 'https://kworb.net/spotify/songs.html'
# Navigate to url
browser.get(url)
# Pause for a random 3-7 seconds before interacting with the page
time.sleep(random.uniform(3, 7))
# Draw a separate random time budget (3-7 seconds) for the scroll pass
total_wait_time = random.uniform(3, 7)
# Scroll down the page in random increments spread over that budget
random_scroll(browser, total_wait_time)

# Find the table(s) with data: every <table> nested inside a div whose class
# attribute is exactly 'subcontainer'
song_data_table = browser.find_elements(By.XPATH, ".//div[@class='subcontainer']//table")

# find_elements returns an empty list instead of raising when nothing matches.
# Fail here, where the cause is visible, rather than letting later cells build
# and save an empty dataset.
if not song_data_table:
    raise RuntimeError(
        f"No data table found at {url}; the page may not have loaded or its layout changed."
    )


In [17]:
# Imported in this cell so it can be run on its own; this is the only error
# that marks a single row as skippable.
from selenium.common.exceptions import NoSuchElementException

# List of per-row records, one dict per song (despite the name, this is a list)
stream_data_dict = []
# Iterate through the tables
for table in song_data_table:
    rows = table.find_elements(By.XPATH, ".//tbody/tr")  # Select all rows in the table's body
    for row in rows:
        try:
            # Extract artist/song text plus the 2nd and 3rd cells of the row
            artist_song = row.find_element(By.XPATH, ".//td[@class='text']/div").text
            streams = row.find_element(By.XPATH, "./td[2]").text
            daily = row.find_element(By.XPATH, "./td[3]").text
        except NoSuchElementException as e:
            # The row lacks one of the expected cells: report it and move on.
            # Any other failure (e.g. a lost browser session) is not a per-row
            # problem and is allowed to propagate instead of being hidden.
            print(f"Error: {e}")
        else:
            stream_data_dict.append({
                "Artist/Song": artist_song,
                "Streams": streams,
                "Daily": daily,
            })
print("Finished Scraping")

Finished Scraping


In [20]:
# Load the scraped records into a DataFrame.
# Naming the columns explicitly keeps the three-column layout (and therefore
# the CSV header) even when no rows were scraped. For non-empty input the
# result is unchanged, because every record is built with exactly these keys.
stream_data_df = pd.DataFrame(stream_data_dict, columns=["Artist/Song", "Streams", "Daily"])

In [22]:
# Persist the raw scrape as a UTF-8 CSV, without the index column, for cleaning.
# NOTE(review): the path is absolute and machine-specific, and
# "rawstream_dataRaw.csv" may be missing a separator ("raw/stream_dataRaw.csv")
# — confirm before reusing this cell elsewhere.
stream_data_df.to_csv(
    path_or_buf='/Users/joseflemker/Documents/GitHub/3250-data-analytics-template/data/rawstream_dataRaw.csv',
    index=False,
    encoding='utf-8',
)