In [80]:
from selenium import webdriver
import requests
import json
from bs4 import BeautifulSoup
import time
import sys
import pandas as pd
import numpy as np

In [81]:
BASE_URL = 'https://soundcloud.com'
CACHE_FILENAME = 'cache.json'
CACHE_DICT = {}
SCROLL_PAUSE_TIME = 1
CHROMEDRIVER_PATH = './chromedriver'

In [82]:
def open_cache():
    '''
    Opens the cache file if it exists and loads the JSON into
    the CACHE_DICT dictionary.
    if the cache file doesn't exist, creates a new cache dictionary

    Parameters
    ----------
    None

    Returns
    -------
    cache_dict (dict): the cache variable
    '''
    try:
        cache_file = open(CACHE_FILENAME, 'r')
        cache_contents = cache_file.read()
        cache_dict = json.loads(cache_contents)
        cache_file.close()
    except:
        cache_dict = {}

    return cache_dict

def save_cache(cache_dict):
    '''
    Saves the current state of the cache to disk

    Parameters
    ----------
    cache_dict (dict): dictionary to write to disk

    Returns
    -------
    None
    '''
    dumped_json_cache = json.dumps(cache_dict, indent=2)
    fw = open(CACHE_FILENAME,"w")
    fw.write(dumped_json_cache)
    fw.close()

In [83]:
# def cache_page_with_genres(url):

#     CACHE_DICT = open_cache()
#     if url in CACHE_DICT.keys():
#         print("Using Cache")
#         return CACHE_DICT[url]

#     else:
#         print("Fetching")
#         browser = webdriver.Chrome("./chromedriver")
#         browser.get(url)
#         time.sleep(3)
#         xpath = '/html/body/div[1]/div[2]/div[2]/div/div/div[1]/div[2]/div/div[2]/div[4]/button'
#         browser.find_element_by_xpath(xpath).click()
#         time.sleep(3)
#         browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(3)
#         browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#         time.sleep(3)

#         page_source = browser.page_source
#         CACHE_DICT[url] = page_source
#         save_cache(CACHE_DICT)
#         browser.close()
#         return CACHE_DICT[url]

In [84]:
def get_tracks_for_genre(bsObj):
    all_tracks = {}
    reggae_source = bsObj
    all_titles = reggae_source.find_all('li', class_="chartTracks__item")

    for i, track in enumerate(all_titles):
        details = track.find('div', class_='chartTrack__details')
        
        try:
            track_title = details.find('div', class_='chartTrack__title').text.strip()
        except:
            try: 
                track_title = details.find('div', class_='chartTrack__blockedTitle').text.strip()
            except:
                track_title = np.NaN

        try:
            end_url = details.find('div', class_='chartTrack__title').find('a')['href']
            track_url = BASE_URL + end_url
        except:
            track_url = np.NaN

        track_artist = details.find('div', class_='chartTrack__username').text.strip()
        artist_url_end = details.find('div', class_='chartTrack__username').find('a')['href']
        artist_url_full = 'https://soundcloud.com' + artist_url_end
        
        
        all_plays = track.find(
            'div', class_='chartTrack__score').find(
                'div', class_='sc-ministats')

        try:
            track_views_week = all_plays.find(
                'span', class_='chartTrack__scoreWeekPlays').find(
                    'span', class_='sc-visuallyhidden').text
        except:
            track_views_week = np.NaN

        try:
            track_views_all = all_plays.find(
                'span', class_='chartTrack__scoreAllPlays').find(
                    'span', class_='sc-visuallyhidden').text
        except:
            track_views_all = np.NaN

        all_tracks[i+1] = {
            'title': track_title,
            'url': track_url,
            'artist': track_artist,
            'weekly_views': track_views_week,
            'all_views': track_views_all,
            'artist_url': artist_url_full
        }

    return all_tracks

In [85]:
def build_genre_url_dict():
    genre_dict = {}
    url = 'https://soundcloud.com/charts/top?genre=reggae&country=US'
    reggae_source = BeautifulSoup(cache_page_with_genres(url), 'html.parser')

    all_genres = reggae_source.find_all('a', class_="linkMenu__link")
    for genre in all_genres:
        if (genre.text == 'All music genres') or (genre.text == 'All audio genres'):
            continue
        else:
            genre_dict[genre.text] = BASE_URL + genre['href']

    return genre_dict

# Scrape All Genre Data

## Process Flow (Based on Previously Created Functions)
1. Build dictionary that stores each genre and its respective URL for top 50 tracks. If the data exists, the cache will be used; else, it will build URLs using one genre page.
2. Create blank df that will hold data for top tracks in each music or audio genre. Each song has the following attributes: (1) genre, (2) title, (3) url, (4) artist, (5) weekly_views, (6) all_views (all-time views). Note that artist may not necessarily be artist but rather is the Soundcloud user that uploaded the media.
3. For each genre, get the top 50 tracks and relevant attributes. If the HTML for the genre page exists in the cache, it will use the cache; else, it will use Selenium to open a browser and save the page source (this will take ~5 minutes with no cache).
4. Concatenate results to make one large dataframe with all the tracks from all genres and their attributes
5. Cast float datatypes as floats
6. Write output dataframe to csv for further use in database

In [86]:
import requests
from bs4 import BeautifulSoup
import re
from selenium.webdriver.chrome.options import Options

In [107]:
def get_source_scrollable(url):
    """
    Gets the page source for a site that requires multiple scrolls but has a finite end
    """
    CACHE_DICT = open_cache()
    # Setup headless chromedriver
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)
    driver.get(url)
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    html = driver.page_source
    CACHE_DICT[url] = html
    driver.close()
    return html

In [88]:
from datetime import datetime

In [89]:
url = "https://soundcloud.com/charts"

def request_with_cache(url):
    if url in CACHE_DICT.keys():
        print('Fetching from cache...')
        return CACHE_DICT[url]
    else:
        print('Fetching new data...')
        r = requests.get(url)
        html = r.text
        CACHE_DICT[url] = html
        save_cache(CACHE_DICT)
        return html

In [106]:
CACHE_DICT = {}
html = request_with_cache(url)
soup = BeautifulSoup(html, "html.parser")

category_links = soup.find_all('a', href=re.compile("^(/charts/top)((?!all-).)*$"))
print(len(category_links))
category_link = category_links[0]['href']

for category_link in category_links:
    full_path = f"https://soundcloud.com{category_link['href']}"
    print(f"{full_path}")
    html = get_source_scrollable(full_path)

Fetching new data...
41
https://soundcloud.com/charts/top?genre=alternativerock
https://soundcloud.com/charts/top?genre=ambient
https://soundcloud.com/charts/top?genre=classical
https://soundcloud.com/charts/top?genre=country
https://soundcloud.com/charts/top?genre=danceedm
https://soundcloud.com/charts/top?genre=dancehall
https://soundcloud.com/charts/top?genre=deephouse
https://soundcloud.com/charts/top?genre=disco
https://soundcloud.com/charts/top?genre=drumbass
https://soundcloud.com/charts/top?genre=dubstep
https://soundcloud.com/charts/top?genre=electronic
https://soundcloud.com/charts/top?genre=folksingersongwriter
https://soundcloud.com/charts/top?genre=hiphoprap
https://soundcloud.com/charts/top?genre=house
https://soundcloud.com/charts/top?genre=indie
https://soundcloud.com/charts/top?genre=jazzblues
https://soundcloud.com/charts/top?genre=latin
https://soundcloud.com/charts/top?genre=metal
https://soundcloud.com/charts/top?genre=piano
https://soundcloud.com/charts/top?genre=

In [114]:
html = get_source_scrollable(full_path)
soup = BeautifulSoup(html, "html.parser")

In [128]:
links = soup.find('div', {'class': 'chartTracks'}).find_all('a', href=re.compile('^/[^/]*$'))
list(set(links))

[<a class="sc-link-light sc-link-secondary" href="/jimquisition">Jimquisition</a>,
 <a class="sc-link-light sc-link-secondary" href="/gennabain">Genna Bain</a>,
 <a class="sc-link-light sc-link-secondary" href="/playstation">PlayStation</a>,
 <a class="sc-link-light sc-link-secondary" href="/twistartups">This Week in Startups</a>,
 <a class="sc-link-light sc-link-secondary" href="/twit">TWiT</a>,
 <a class="sc-link-light sc-link-secondary" href="/geekhistorylesson">Geek History Lesson</a>,
 <a class="sc-link-light sc-link-secondary" href="/macmagazine">MacMagazine</a>,
 <a class="sc-link-light sc-link-secondary" href="/soundonsound">Sound On Sound</a>,
 <a class="sc-link-light sc-link-secondary" href="/sophossecurity">Naked Security</a>]

In [202]:
link = links[1]
full_path = f"https://soundcloud.com{link['href']}"

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
r = requests.get(full_path, headers=headers)

In [203]:
soup = BeautifulSoup(r.text, "html.parser")
x = soup.find_all('script')[9].string.split('= ')[1].strip('][').split('{')[-4].split(',')
stat_dict = {}
for val in x[6:25]:
    kps = val.split(':')
    stat_dict[kps[0].strip('"')] = kps[1].strip('"')

stat_dict

{'followers_count': '23333',
 'followings_count': '0',
 'first_name': '',
 'full_name': '',
 'groups_count': '0',
 'id': '125332894',
 'kind': 'user',
 'last_modified': '2021-11-02T17',
 'last_name': '',
 'likes_count': '2',
 'playlist_likes_count': '0',
 'permalink': 'jimquisition',
 'permalink_url': 'https',
 'playlist_count': '1',
 'reposts_count': 'null',
 'track_count': '501',
 'uri': 'https',
 'urn': 'soundcloud',
 'username': 'Jimquisition'}

In [93]:
final_df = pd.DataFrame(
    columns=['genre', 'title', 'url', 
             'artist', 'weekly_views', 'all_views'])

for genre_name, genre_link in genre_dict.items():
    genre_source = BeautifulSoup(cache_page_with_genres(url=genre_link), 'html.parser')
    genre_tracks = get_tracks_for_genre(genre_source)
    
    genre_df = pd.DataFrame.from_dict(genre_tracks, orient='index')
    genre_df['genre'] = genre_name
    
    genre_top_10 = genre_df[0:10]
    final_df = pd.concat([final_df, genre_top_10], axis=0)

NameError: name 'genre_dict' is not defined

In [None]:
expected_length = 10*len(genre_dict.keys())
actual_length = len(final_df)
print(f"Expected records: {expected_length}")
print(f"Actual records: {actual_length}")

Expected records: 410
Actual records: 410


In [None]:
final_df = final_df.astype({
    'weekly_views': 'float',
    'all_views': 'float'
})

In [None]:
final_df = final_df.reset_index()

In [95]:
import random

In [96]:
end_value = 10000

start_time = time.time()
dictionary_list = []
for i in range(0, end_value, 1):
    dictionary_data = {k: random.random() for k in range(30)}
    dictionary_list.append(dictionary_data)

df_final = pd.DataFrame.from_dict(dictionary_list)

end_time = time.time()
print('Execution time = %.6f seconds' % (end_time-start_time))

Execution time = 0.127007 seconds


In [101]:
dictionary_list[0]

{0: 0.2919131010767334,
 1: 0.03423444526069064,
 2: 0.8347510902866464,
 3: 0.3732159315225121,
 4: 0.1063969063671607,
 5: 0.8573312895137334,
 6: 0.17600691948742309,
 7: 0.1871182219984816,
 8: 0.0773879505156786,
 9: 0.9759285733664554,
 10: 0.9753133065653093,
 11: 0.24322393402243625,
 12: 0.12167046853260033,
 13: 0.8792171244915454,
 14: 0.9483340532819133,
 15: 0.19936074997621156,
 16: 0.1518833804291958,
 17: 0.0008335712366793757,
 18: 0.25907127519389017,
 19: 0.3353949628545878,
 20: 0.004803078506695813,
 21: 0.5358771767465208,
 22: 0.13843273505599785,
 23: 0.28071113030150785,
 24: 0.2749608736815816,
 25: 0.28918599244807364,
 26: 0.3377816946697342,
 27: 0.458015549969481,
 28: 0.6830041893028056,
 29: 0.7837048385343238}

In [97]:
df_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.291913,0.034234,0.834751,0.373216,0.106397,0.857331,0.176007,0.187118,0.077388,0.975929,...,0.004803,0.535877,0.138433,0.280711,0.274961,0.289186,0.337782,0.458016,0.683004,0.783705
1,0.544737,0.710535,0.636560,0.115241,0.513204,0.829177,0.770377,0.222180,0.686959,0.344631,...,0.173197,0.622980,0.257646,0.840802,0.889660,0.977149,0.604396,0.325661,0.018089,0.883656
2,0.761733,0.868270,0.860566,0.526478,0.033579,0.457551,0.831244,0.686206,0.110584,0.286271,...,0.985279,0.408876,0.884761,0.769910,0.095461,0.398376,0.804848,0.442980,0.234474,0.617080
3,0.001239,0.687389,0.388086,0.475010,0.405230,0.930747,0.005540,0.960681,0.500376,0.183593,...,0.436660,0.276894,0.675306,0.907508,0.076743,0.623425,0.367066,0.549289,0.319905,0.103540
4,0.069888,0.931174,0.022093,0.895974,0.510207,0.054338,0.595210,0.504452,0.488789,0.073213,...,0.071455,0.910737,0.342032,0.202909,0.599020,0.987651,0.912552,0.807461,0.915274,0.594814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.849694,0.612039,0.964885,0.178261,0.506050,0.997768,0.553180,0.414955,0.221598,0.171258,...,0.810442,0.817540,0.167670,0.807882,0.096372,0.402370,0.517119,0.797800,0.063934,0.874364
9996,0.312635,0.191734,0.819061,0.901196,0.366393,0.668443,0.225152,0.995955,0.648941,0.421367,...,0.885576,0.053362,0.066459,0.826640,0.487012,0.825011,0.076195,0.215845,0.439215,0.187213
9997,0.854446,0.573710,0.545661,0.196483,0.267793,0.118307,0.625176,0.601107,0.124808,0.374314,...,0.686978,0.856359,0.401036,0.986818,0.762105,0.814375,0.678492,0.674425,0.605797,0.227190
9998,0.763417,0.100239,0.309172,0.430005,0.411971,0.140382,0.375914,0.187255,0.123082,0.973187,...,0.148479,0.468891,0.440479,0.567390,0.058571,0.532899,0.854768,0.133022,0.639081,0.289492


# Scraping Data for each Artist 

In [14]:
all_artist_urls = list(final_df['artist_url'])

In [15]:
def cache_artist_page(artist_url):

    url = artist_url + '/popular-tracks'
    CACHE_DICT = open_cache()
    if url in CACHE_DICT.keys():
        print("Using Cache")
        return CACHE_DICT[url]

    else:
        print("Fetching")
        browser = webdriver.Chrome("/Users/michael/Downloads/chromedriver")
        browser.get(url)
        time.sleep(3)

        page_source = browser.page_source
        CACHE_DICT[url] = page_source
        save_cache(CACHE_DICT)
        return CACHE_DICT[url]
    

In [16]:
def get_artist_info(artist_url_list):
    '''
    Parameters
    -----------
    artist_url: The URL to an artist page. The function uses 
    cache_artist_page to retrieve the HTML then creates BS 
    object from HTML text of artist page.
    
    '''
    
    all_artists = {}
    
    for i, artist_url in enumerate(artist_url_list):
        
        print(artist_url)
        
        
        artist_source = BeautifulSoup(
        cache_artist_page(artist_url), 'html.parser')
            
        try:
            artist_name = artist_source.find(
            'span', class_='soundTitle__usernameText').text.strip()
            
            top_track_name = artist_source.find(
            'a', class_='soundTitle__title').text.strip()

            top_track_views = artist_source.find(
            'span', class_='sc-ministats-plays').find(
            'span', class_='sc-visuallyhidden').text.strip().split(' ')[0]

            all_tables = artist_source.find_all('td')

            artist_followers = all_tables[0].find(
                'a')['title'].strip().split(' ')[0]

            artist_tracks = all_tables[2].find(
                'a')['title'].strip().split(' ')[0]


            all_artists[i+1] = {
                'artist_name': artist_name,
                'artist_url': artist_url,
                'artist_toptrack': top_track_name,
                'artist_toptrack_views': top_track_views,
                'artist_followers': artist_followers,
                'artist_numtracks': artist_tracks
            }
        except:
            artist_url = artist_url
            artist_name = np.NaN
            top_track_name = np.NaN
            top_track_views = np.NaN
            artist_followers = np.NaN
            artist_tracks = np.NaN
            
            all_artists[i+1] = {
            'artist_name': artist_name,
            'artist_url': artist_url,
            'artist_toptrack': top_track_name,
            'artist_toptrack_views': top_track_views,
            'artist_followers': artist_followers,
            'artist_numtracks': artist_tracks}

    return all_artists

In [17]:
all_artist_info = get_artist_info(all_artist_urls)

https://soundcloud.com/lil_peep
Using Cache
https://soundcloud.com/beachbunnymusic
Using Cache
https://soundcloud.com/vancejoy
Using Cache
https://soundcloud.com/hopelessrecords
Using Cache
https://soundcloud.com/greenday
Using Cache
https://soundcloud.com/tealoversunite
Using Cache
https://soundcloud.com/jorgehl-1
Using Cache
https://soundcloud.com/fueled_by_ramen
Using Cache
https://soundcloud.com/panicatthedisco
Using Cache
https://soundcloud.com/ajrbrothers
Using Cache
https://soundcloud.com/user-908929543
Using Cache
https://soundcloud.com/rjsfoundsounds
Using Cache
https://soundcloud.com/deepsleepbrownnoise-music
Using Cache
https://soundcloud.com/t-mega-40540774
Using Cache
https://soundcloud.com/stardustvibes
Using Cache
https://soundcloud.com/relaxing-white-noise
Using Cache
https://soundcloud.com/therhythmtree
Using Cache
https://soundcloud.com/felixblume
Using Cache
https://soundcloud.com/paul-tobin-6
Using Cache
https://soundcloud.com/oceansoundswhitenoiseforsleep
Using Cac

Using Cache
https://soundcloud.com/slipknot
Using Cache
https://soundcloud.com/slipknot
Using Cache
https://soundcloud.com/slipknot
Using Cache
https://soundcloud.com/roadrunner-usa
Using Cache
https://soundcloud.com/metal-head0-1
Using Cache
https://soundcloud.com/officialshizumaru
Using Cache
https://soundcloud.com/renatoluis-ferreira
Using Cache
https://soundcloud.com/slipknot
Using Cache
https://soundcloud.com/hikariultra
Using Cache
https://soundcloud.com/red-bull-records
Using Cache
https://soundcloud.com/awfulpianosound
Using Cache
https://soundcloud.com/protegemoi-1
Using Cache
https://soundcloud.com/1lychee
Using Cache
https://soundcloud.com/yvpoipoi
Using Cache
https://soundcloud.com/dawelocklear
Using Cache
https://soundcloud.com/sharieingente
Using Cache
https://soundcloud.com/starwarspunk
Using Cache
https://soundcloud.com/myuu
Using Cache
https://soundcloud.com/aouysdgfahsbdfiwegdfsesuf
Using Cache
https://soundcloud.com/rustleteh
Using Cache
https://soundcloud.com/secret

Using Cache
https://soundcloud.com/motivationdays
Using Cache
https://soundcloud.com/lifeleveler
Using Cache
https://soundcloud.com/rousseben
Using Cache
https://soundcloud.com/motivationdays
Using Cache
https://soundcloud.com/motivationdays
Using Cache
https://soundcloud.com/motivationdays
Using Cache
https://soundcloud.com/davidji
Using Cache
https://soundcloud.com/davidji
Using Cache
https://soundcloud.com/davidji
Using Cache
https://soundcloud.com/989wclz
Using Cache
https://soundcloud.com/radioplus-3
Using Cache
https://soundcloud.com/media-roots
Using Cache
https://soundcloud.com/media-roots
Using Cache
https://soundcloud.com/radioplus-3
Using Cache
https://soundcloud.com/pepebillete
Using Cache
https://soundcloud.com/globovision
Using Cache
https://soundcloud.com/shehbazsharif
Using Cache
https://soundcloud.com/taketwoshow
Using Cache
https://soundcloud.com/qudsn
Using Cache
https://soundcloud.com/ouncil1archive
Using Cache
https://soundcloud.com/ouncil1archive
Using Cache
https

In [18]:
artist_df = pd.DataFrame.from_dict(all_artist_info, orient='index')
artist_df = artist_df.reset_index()

In [19]:
print(final_df.columns)
print("")
print(artist_df.columns)

Index(['index', 'genre', 'title', 'url', 'artist', 'weekly_views', 'all_views',
       'artist_url'],
      dtype='object')

Index(['index', 'artist_name', 'artist_url', 'artist_toptrack',
       'artist_toptrack_views', 'artist_followers', 'artist_numtracks'],
      dtype='object')


In [20]:
artist_df = artist_df.rename(columns={'index': 'id'})

In [21]:
final_df = final_df.drop('index', axis=1).reset_index()

In [22]:
final_df = final_df.rename(columns={'index': 'id'})

# Creating SQLite Database with Python

## Create/Connect to SQLite DB and Establish Connection

In [23]:
import sqlite3

In [24]:
conn = sqlite3.connect('soundcloud_data.db')

In [25]:
c = conn.cursor()

## Create Tables

In [26]:
query_artists = '''
CREATE TABLE IF NOT EXISTS soundcloud_artists(
    id integer,
    artist_name text,
    artist_url text PRIMARY KEY,
    artist_toptrack text,
    artist_toptrack_views REAL,
    artist_followers REAL,
    artist_numtracks REAL)
'''

In [27]:
c.execute(query_artists)

<sqlite3.Cursor at 0x7fd750509f80>

In [28]:
query_tracks = '''
CREATE TABLE IF NOT EXISTS soundcloud_tracks (
    id integer PRIMARY KEY,
    track_genre text,
    track_title text,
    track_url text,
    track_artist text,
    track_views_week REAL,
    track_views_all REAL,
    track_artist_url text, 
    FOREIGN KEY (track_artist_url) REFERENCES soundcloud_artists (artist_url)
);
'''

In [29]:
c.execute(query_tracks)

<sqlite3.Cursor at 0x7fd750509f80>

## Put Dataframes with Scraped Data in SQL DB

In [30]:
artist_df.to_sql('soundcloud_artists', conn, if_exists='replace', index=False)

In [31]:
final_df.to_sql('soundcloud_tracks', conn, if_exists='replace', index=False)

In [32]:
conn.close()

In [33]:
artist_df.shape

(410, 7)

In [34]:
final_df.shape

(410, 8)

# Separate Cache to write to Github

In [37]:
cache_full = open_cache()

In [40]:
cache_half1 = dict(list(cache_full.items())[len(cache_full)//2:])
cache_half2 = dict(list(cache_full.items())[:len(cache_full)//2])

In [44]:
save_cache_with_name(cache_half1, 'cache_half1.json')
save_cache_with_name(cache_half2, 'cache_half2.json')

# Combine 2 Cache Files into 1

In [46]:
cache_half1 = open_cache_with_name('cache_half1.json')
cache_half2 = open_cache_with_name('cache_half2.json')

In [53]:
cache_half1.update(cache_half2)

In [56]:
save_cache_with_name(cache_half1, 'sc_cache.json')