In [9]:
import json
import os
import sys
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [10]:
# Root of the SoundCloud site; relative hrefs scraped from pages are appended to it.
BASE_URL = 'https://soundcloud.com'
# JSON file on disk that stores fetched page sources keyed by URL.
CACHE_FILENAME = 'sc_cache.json'
# Module-level placeholder; the fetch helpers in this notebook bind their own
# local CACHE_DICT from open_cache() rather than updating this one.
CACHE_DICT = {}

In [11]:
def open_cache():
    ''' Opens the cache file if it exists and loads its JSON content.
    If the cache file is missing, unreadable, or does not contain
    valid JSON, an empty cache dictionary is returned instead.

    Parameters
    ----------
    None

    Returns
    -------
    The opened cache: dict
    '''
    try:
        # The context manager closes the file even if reading/parsing fails.
        with open(CACHE_FILENAME, 'r') as cache_file:
            return json.load(cache_file)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: invalid JSON (json.JSONDecodeError) or undecodable bytes.
        return {}

In [12]:
def save_cache(cache_dict):
    ''' Saves the current state of the cache to disk

    Parameters
    ----------
    cache_dict: dict
        The dictionary to save

    Returns
    -------
    None
    '''
    # Serialize before opening the file so that a serialization error does
    # not leave a truncated cache file behind.
    dumped_json_cache = json.dumps(cache_dict, indent=2)
    # The context manager guarantees the handle is closed even if write() fails.
    with open(CACHE_FILENAME, "w") as cache_file:
        cache_file.write(dumped_json_cache)

In [13]:
def cache_page_with_genres(url):
    ''' Returns the page source for a chart page, using the on-disk cache
    when the URL has been fetched before and Selenium/Chrome otherwise.

    Parameters
    ----------
    url: str
        The chart page URL; also used as the cache key

    Returns
    -------
    The page source (HTML) stored in the cache for this URL: str
    '''
    cache = open_cache()
    if url in cache:
        print("Using Cache")
        return cache[url]

    print("Fetching")
    # The chromedriver location can be overridden with the CHROMEDRIVER_PATH
    # environment variable; the original hard-coded path remains the default.
    driver_path = os.environ.get(
        "CHROMEDRIVER_PATH", "/Users/michael/Downloads/chromedriver")
    browser = webdriver.Chrome(driver_path)
    try:
        browser.get(url)
        time.sleep(3)
        # NOTE(review): presumably this button opens the genre menu so its
        # links are present in the page source — confirm against the page.
        xpath = '/html/body/div[1]/div[2]/div[2]/div/div/div[1]/div[2]/div/div[2]/div[4]/button'
        # find_element("xpath", ...) works on Selenium 3 and 4, whereas
        # find_element_by_xpath was removed in Selenium 4.
        browser.find_element("xpath", xpath).click()
        time.sleep(3)
        # Scroll to the bottom twice, pausing after each scroll, before
        # capturing the page source.
        for _ in range(2):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(3)

        page_source = browser.page_source
    finally:
        # Always shut the browser and driver down, even when a step above raises.
        browser.quit()

    cache[url] = page_source
    save_cache(cache)
    return page_source

In [14]:
def _hidden_play_count(all_plays, span_class):
    ''' Returns the visually-hidden text inside the play-count span with the
    given class, or NaN when either span is missing. '''
    try:
        return all_plays.find(
            'span', class_=span_class).find(
                'span', class_='sc-visuallyhidden').text
    except AttributeError:
        # find() returned None for one of the spans
        return np.nan


def get_tracks_for_genre(bsObj):
    ''' Extracts the chart tracks from a parsed genre chart page.

    Parameters
    ----------
    bsObj: BeautifulSoup
        The parsed HTML of one genre chart page

    Returns
    -------
    dict
        Maps the 1-based position of each 'chartTracks__item' element to a
        dict with keys 'title', 'url', 'artist', 'weekly_views',
        'all_views' and 'artist_url'. Title, url and view counts are NaN
        when the corresponding element is not found.

    Raises
    ------
    AttributeError / TypeError
        If a track item lacks the username or score elements; these
        lookups are intentionally left unguarded, as in the original.
    '''
    all_tracks = {}
    all_titles = bsObj.find_all('li', class_="chartTracks__item")

    for position, track in enumerate(all_titles, start=1):
        details = track.find('div', class_='chartTrack__details')

        # Prefer the regular title; fall back to the blocked-title element.
        try:
            track_title = details.find('div', class_='chartTrack__title').text.strip()
        except AttributeError:
            try:
                track_title = details.find('div', class_='chartTrack__blockedTitle').text.strip()
            except AttributeError:
                track_title = np.nan

        # The link may be absent (no title div, no <a>, or no href).
        try:
            end_url = details.find('div', class_='chartTrack__title').find('a')['href']
            track_url = BASE_URL + end_url
        except (AttributeError, TypeError, KeyError):
            track_url = np.nan

        username = details.find('div', class_='chartTrack__username')
        track_artist = username.text.strip()
        artist_url_full = BASE_URL + username.find('a')['href']

        all_plays = track.find(
            'div', class_='chartTrack__score').find(
                'div', class_='sc-ministats')

        all_tracks[position] = {
            'title': track_title,
            'url': track_url,
            'artist': track_artist,
            'weekly_views': _hidden_play_count(all_plays, 'chartTrack__scoreWeekPlays'),
            'all_views': _hidden_play_count(all_plays, 'chartTrack__scoreAllPlays'),
            'artist_url': artist_url_full
        }

    return all_tracks

In [15]:
def build_genre_url_dict():
    ''' Builds a mapping from genre label to chart URL.

    The genre links are read from the 'linkMenu__link' anchors of the
    reggae chart page; the two "All ... genres" entries are left out.

    Parameters
    ----------
    None

    Returns
    -------
    dict
        Genre label -> BASE_URL joined with the link's href
    '''
    chart_url = 'https://soundcloud.com/charts/top?genre=reggae&country=US'
    chart_soup = BeautifulSoup(cache_page_with_genres(chart_url), 'html.parser')
    skipped_labels = ('All music genres', 'All audio genres')

    return {
        link.text: BASE_URL + link['href']
        for link in chart_soup.find_all('a', class_="linkMenu__link")
        if link.text not in skipped_labels
    }

# Scrape All Genre Data

## Process Flow (Based on Previously Created Functions)
1. Build a dictionary that maps each genre to the URL of its top-50 tracks chart. The genre links are parsed from a single chart page (reggae); that page's HTML is read from the cache if it exists, otherwise it is fetched with Selenium.
2. Create the df that will hold data for the top tracks in each music or audio genre. Each track has the following attributes: (1) genre, (2) title, (3) url, (4) artist, (5) weekly_views, (6) all_views (all-time views), (7) artist_url. Note that `artist` is not necessarily the performing artist; it is the SoundCloud user that uploaded the media.
3. For each genre, get the top 50 tracks and relevant attributes, then keep only the first 10 tracks of each genre. If the HTML for the genre page exists in the cache, it will use the cache; else, it will use Selenium to open a browser and save the page source (this will take ~5 minutes with no cache).
4. Concatenate results to make one large dataframe with all the tracks from all genres and their attributes
5. Cast the view-count columns (`weekly_views`, `all_views`) to floats
6. Write output dataframe to csv for further use in database

In [16]:
# Map every genre label to its chart URL (the page HTML comes from the cache when available).
genre_dict = build_genre_url_dict()

Using Cache


In [24]:
# Column order of the combined track table. 'artist_url' is produced by
# get_tracks_for_genre and is used later to scrape the artist pages.
track_columns = ['genre', 'title', 'url', 'artist',
                 'weekly_views', 'all_views', 'artist_url']

# Collect one frame per genre and concatenate once at the end, instead of
# re-copying the growing frame on every iteration.
genre_frames = []
for genre_name, genre_link in genre_dict.items():
    genre_source = BeautifulSoup(cache_page_with_genres(url=genre_link), 'html.parser')
    genre_tracks = get_tracks_for_genre(genre_source)

    # Rows are indexed by chart position (1-based), as returned by get_tracks_for_genre.
    genre_df = pd.DataFrame.from_dict(genre_tracks, orient='index')
    genre_df['genre'] = genre_name

    # Keep only the first ten tracks of each genre.
    genre_top_10 = genre_df[0:10]
    genre_frames.append(genre_top_10)

if genre_frames:
    final_df = pd.concat(genre_frames, axis=0).reindex(columns=track_columns)
else:
    # pd.concat raises on an empty list; fall back to an empty table.
    final_df = pd.DataFrame(columns=track_columns)

Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache
Using Cache


In [26]:
# Sanity check: ten rows are kept per genre, so the combined frame should
# hold ten times as many rows as there are genres.
expected_length = len(genre_dict) * 10
actual_length = len(final_df)
print(f"Expected records: {expected_length}",
      f"Actual records: {actual_length}",
      sep="\n")

Expected records: 410
Actual records: 410


In [27]:
# The view counts are scraped as text (or NaN when missing); store both
# columns as floats.
final_df = final_df.astype(
    {view_column: 'float' for view_column in ('weekly_views', 'all_views')})

In [28]:
# final_df['genre-title'] = final_df['genre'] + ':' + final_df['title']

In [29]:
# Move the per-genre chart position out of the index into an 'index' column
# and give the frame a fresh 0-based index.
# NOTE(review): this reassigns final_df in place, so re-running the cell
# changes the frame again — run it exactly once per fresh final_df.
final_df = final_df.reset_index()

In [30]:
# Preview the first rows of the combined track table.
final_df.head()

Unnamed: 0,index,genre,title,url,artist,weekly_views,all_views,artist_url
0,1,Alternative Rock,Lil Peep & XXXTENTACION - Falling Down,https://soundcloud.com/lil_peep/lil-peep-ft-xx...,☆LiL PEEP☆,98414.0,128627075.0,https://soundcloud.com/lil_peep
1,2,Alternative Rock,Cloud 9,https://soundcloud.com/beachbunnymusic/cloud-9,Beach Bunny,77887.0,1988184.0,https://soundcloud.com/beachbunnymusic
2,3,Alternative Rock,Riptide,https://soundcloud.com/vancejoy/02-riptide,Vance Joy,46602.0,50170524.0,https://soundcloud.com/vancejoy
3,4,Alternative Rock,"All Time Low - Dear Maria, Count Me In",https://soundcloud.com/hopelessrecords/dear-ma...,Hopeless Records,18057.0,9802645.0,https://soundcloud.com/hopelessrecords
4,5,Alternative Rock,Brain Stew,https://soundcloud.com/greenday/brain-stew,Green Day,15464.0,2396422.0,https://soundcloud.com/greenday


In [31]:
# Persist the combined track table. With to_csv's defaults the DataFrame
# index is written as the first (unnamed) CSV column.
final_df.to_csv('soundcloud_tracks.csv')

# Scraping Data for each Artist 

In [152]:
# One artist URL per track row, in row order (repeated artists stay repeated).
all_artist_urls = final_df['artist_url'].tolist()

In [141]:
def cache_artist_page(artist_url):
    ''' Returns the page source of an artist's popular-tracks page, using
    the on-disk cache when available and Selenium/Chrome otherwise.

    Parameters
    ----------
    artist_url: str
        The artist's profile URL; '/popular-tracks' is appended to form
        the fetched URL, which is also the cache key

    Returns
    -------
    The page source (HTML) stored in the cache for this URL: str
    '''
    url = artist_url + '/popular-tracks'
    cache = open_cache()
    if url in cache:
        print("Using Cache")
        return cache[url]

    print("Fetching")
    # The chromedriver location can be overridden with the CHROMEDRIVER_PATH
    # environment variable; the original hard-coded path remains the default.
    driver_path = os.environ.get(
        "CHROMEDRIVER_PATH", "/Users/michael/Downloads/chromedriver")
    browser = webdriver.Chrome(driver_path)
    try:
        browser.get(url)
        time.sleep(3)
        page_source = browser.page_source
    finally:
        # The original never closed the browser, leaving one Chrome process
        # behind per fetched artist; always shut it down.
        browser.quit()

    cache[url] = page_source
    save_cache(cache)
    return page_source
    

In [164]:
def _parse_artist_page(artist_source):
    ''' Pulls (name, top track name, top track views, followers, track count)
    out of a parsed artist page; raises if any element is missing. '''
    artist_name = artist_source.find(
        'span', class_='soundTitle__usernameText').text.strip()

    top_track_name = artist_source.find(
        'a', class_='soundTitle__title').text.strip()

    # Only the first space-separated token of the hidden text is kept.
    top_track_views = artist_source.find(
        'span', class_='sc-ministats-plays').find(
            'span', class_='sc-visuallyhidden').text.strip().split(' ')[0]

    # NOTE(review): assumes the first and third <td> cells hold the follower
    # and track counts in their link's title attribute — confirm against the page.
    all_tables = artist_source.find_all('td')
    artist_followers = all_tables[0].find('a')['title'].strip().split(' ')[0]
    artist_tracks = all_tables[2].find('a')['title'].strip().split(' ')[0]

    return (artist_name, top_track_name, top_track_views,
            artist_followers, artist_tracks)


def get_artist_info(artist_url_list):
    ''' Collects summary information for each artist URL.

    Parameters
    -----------
    artist_url_list: list of str
        Artist profile URLs. For each one, cache_artist_page supplies the
        HTML, which is parsed with BeautifulSoup.

    Returns
    -------
    dict
        Maps the 1-based position of each URL in the input to a dict with
        keys 'artist_name', 'artist_url', 'artist_toptrack',
        'artist_toptrack_views', 'artist_followers' and 'artist_numtracks'.
        If any field cannot be parsed, every field except 'artist_url' is
        NaN for that artist.
    '''
    all_artists = {}

    for position, artist_url in enumerate(artist_url_list, start=1):

        print(artist_url)

        artist_source = BeautifulSoup(
            cache_artist_page(artist_url), 'html.parser')

        try:
            (artist_name, top_track_name, top_track_views,
             artist_followers, artist_tracks) = _parse_artist_page(artist_source)
        except (AttributeError, TypeError, KeyError, IndexError):
            # A missing element (None from find, too few <td> cells, or a
            # missing attribute) blanks the whole record, as before.
            artist_name = top_track_name = top_track_views = np.nan
            artist_followers = artist_tracks = np.nan

        all_artists[position] = {
            'artist_name': artist_name,
            'artist_url': artist_url,
            'artist_toptrack': top_track_name,
            'artist_toptrack_views': top_track_views,
            'artist_followers': artist_followers,
            'artist_numtracks': artist_tracks
        }

    return all_artists

In [166]:
# Trial run on the first twenty artist URLs only.
test_artist_list = all_artist_urls[:20]
test_artist_list

['https://soundcloud.com/lil_peep',
 'https://soundcloud.com/beachbunnymusic',
 'https://soundcloud.com/vancejoy',
 'https://soundcloud.com/hopelessrecords',
 'https://soundcloud.com/greenday',
 'https://soundcloud.com/tealoversunite',
 'https://soundcloud.com/fueled_by_ramen',
 'https://soundcloud.com/jorgehl-1',
 'https://soundcloud.com/ajrbrothers',
 'https://soundcloud.com/panicatthedisco',
 'https://soundcloud.com/user-908929543',
 'https://soundcloud.com/rjsfoundsounds',
 'https://soundcloud.com/deepsleepbrownnoise-music',
 'https://soundcloud.com/stardustvibes',
 'https://soundcloud.com/t-mega-40540774',
 'https://soundcloud.com/relaxing-white-noise',
 'https://soundcloud.com/oceansoundswhitenoiseforsleep',
 'https://soundcloud.com/felixblume',
 'https://soundcloud.com/therhythmtree',
 'https://soundcloud.com/paul-tobin-6']

In [167]:
# Scrape (or load from the cache) the artist page for each URL in the trial list.
all_artist_info = get_artist_info(test_artist_list)

https://soundcloud.com/lil_peep
Fetching
https://soundcloud.com/beachbunnymusic
Fetching
https://soundcloud.com/vancejoy
Fetching
https://soundcloud.com/hopelessrecords
Fetching
https://soundcloud.com/greenday
Fetching
https://soundcloud.com/tealoversunite
Fetching
https://soundcloud.com/fueled_by_ramen
Fetching
https://soundcloud.com/jorgehl-1
Fetching
https://soundcloud.com/ajrbrothers
Fetching
https://soundcloud.com/panicatthedisco
Fetching
https://soundcloud.com/user-908929543
Fetching
https://soundcloud.com/rjsfoundsounds
Fetching
https://soundcloud.com/deepsleepbrownnoise-music
Fetching
https://soundcloud.com/stardustvibes
Fetching
https://soundcloud.com/t-mega-40540774
Fetching
https://soundcloud.com/relaxing-white-noise
Fetching
https://soundcloud.com/oceansoundswhitenoiseforsleep
Fetching
https://soundcloud.com/felixblume
Fetching
https://soundcloud.com/therhythmtree
Fetching
https://soundcloud.com/paul-tobin-6
Fetching


In [173]:
# One row per scraped artist; the dict keys (1-based positions) start out as
# the index and are then moved into an 'index' column.
artist_df = (
    pd.DataFrame
    .from_dict(all_artist_info, orient='index')
    .reset_index()
)

Unnamed: 0,index,artist_name,artist_url,artist_toptrack,artist_toptrack_views,artist_followers,artist_numtracks
0,1,☆LiL PEEP☆,https://soundcloud.com/lil_peep,Lil Peep & XXXTENTACION - Falling Down,128671898,2231981,107
1,2,Beach Bunny,https://soundcloud.com/beachbunnymusic,Cloud 9,2011525,21213,35


In [192]:
# Discard the position column created by reset_index, then preview two rows.
artist_df = artist_df.drop(columns='index')
artist_df.head(2)

Unnamed: 0,artist_name,artist_url,artist_toptrack,artist_toptrack_views,artist_followers,artist_numtracks
0,☆LiL PEEP☆,https://soundcloud.com/lil_peep,Lil Peep & XXXTENTACION - Falling Down,128671898,2231981,107
1,Beach Bunny,https://soundcloud.com/beachbunnymusic,Cloud 9,2011525,21213,35


In [193]:
# Discard the chart-position column created by reset_index, then preview two rows.
final_df = final_df.drop(columns='index')
final_df.head(2)

Unnamed: 0,genre,title,url,artist,weekly_views,all_views,artist_url
0,Alternative Rock,Lil Peep & XXXTENTACION - Falling Down,https://soundcloud.com/lil_peep/lil-peep-ft-xx...,☆LiL PEEP☆,98414.0,128627075.0,https://soundcloud.com/lil_peep
1,Alternative Rock,Cloud 9,https://soundcloud.com/beachbunnymusic/cloud-9,Beach Bunny,77887.0,1988184.0,https://soundcloud.com/beachbunnymusic


In [194]:
# Show the column names of both tables, separated by a blank line.
print(final_df.columns, "", artist_df.columns, sep="\n")

Index(['genre', 'title', 'url', 'artist', 'weekly_views', 'all_views',
       'artist_url'],
      dtype='object')

Index(['artist_name', 'artist_url', 'artist_toptrack', 'artist_toptrack_views',
       'artist_followers', 'artist_numtracks'],
      dtype='object')


In [200]:
# Turn the 0-based row index into an 'index' column (renamed to 'id' in a later cell).
# NOTE(review): reassigns artist_df in place, so this cell is not idempotent.
artist_df = artist_df.reset_index()

In [207]:
# Name the row-number column 'id', matching the 'id' column of the soundcloud_artists table.
artist_df = artist_df.rename(columns={'index': 'id'})

In [212]:
# Turn the 0-based row index into an 'index' column (renamed to 'id' in a later cell).
# NOTE(review): reassigns final_df in place, so this cell is not idempotent.
final_df = final_df.reset_index()

In [213]:
# Name the row-number column 'id', matching the 'id' column of the soundcloud_tracks table.
final_df = final_df.rename(columns={'index': 'id'})

# Creating SQLite Database with Python

## Create/Connect to SQLite DB and Establish Connection

In [195]:
import sqlite3

In [196]:
# Open the SQLite database file 'soundcloud_data.db' (path is relative to the working directory).
conn = sqlite3.connect('soundcloud_data.db')

In [197]:
# Cursor used to run the CREATE TABLE statements below.
c = conn.cursor()

## Create Tables

In [210]:
# DDL for the artists table: one row per artist, with the same column names
# as artist_df ('id' plus the six scraped attributes).
# The three count columns are declared REAL although get_artist_info
# produces them as strings (or NaN).
query_artists = '''
CREATE TABLE IF NOT EXISTS soundcloud_artists(
    id integer PRIMARY KEY,
    artist_name text,
    artist_url text,
    artist_toptrack text,
    artist_toptrack_views REAL,
    artist_followers REAL,
    artist_numtracks REAL)
'''

In [211]:
# Create the artists table if it does not exist yet.
c.execute(query_artists)

<sqlite3.Cursor at 0x7fadb66f16c0>

In [214]:
# DDL for the tracks table, linked to the artists table through the artist URL.
# NOTE(review): the column names here (track_genre, track_title, ...) differ
# from final_df's columns (genre, title, ...).
# NOTE(review): the foreign key targets soundcloud_artists(artist_url), which
# is declared as plain text without PRIMARY KEY/UNIQUE; SQLite expects the
# parent key of a foreign key to be unique — confirm the intended key.
query_tracks = '''
CREATE TABLE IF NOT EXISTS soundcloud_tracks (
    id integer PRIMARY KEY,
    track_genre text,
    track_title text,
    track_url text,
    track_artist text,
    track_views_week REAL,
    track_views_all REAL,
    track_artist_url text, 
    FOREIGN KEY (track_artist_url) REFERENCES soundcloud_artists (artist_url)
);
'''

In [215]:
# Create the tracks table if it does not exist yet.
c.execute(query_tracks)

<sqlite3.Cursor at 0x7fadb66f16c0>

## Put Dataframes with Scraped Data in SQL DB

In [216]:
# Write the artist rows to the soundcloud_artists table (DataFrame index not written).
# NOTE(review): per the pandas documentation, if_exists='replace' drops an
# existing table before inserting, so the table made by the CREATE TABLE
# statement (including its PRIMARY KEY) is presumably discarded and rebuilt
# from the DataFrame's columns — confirm this is intended.
artist_df.to_sql('soundcloud_artists', conn, if_exists='replace', index=False)

In [217]:
# Write the track rows to the soundcloud_tracks table (DataFrame index not written).
# NOTE(review): per the pandas documentation, if_exists='replace' drops an
# existing table before inserting, so the stored table presumably takes
# final_df's column names (genre, title, ...) and loses the PRIMARY KEY and
# FOREIGN KEY declared in the CREATE TABLE statement — confirm this is intended.
final_df.to_sql('soundcloud_tracks', conn, if_exists='replace', index=False)

In [218]:
# Close the SQLite connection now that both tables have been written.
conn.close()

# User Interaction

In [111]:
# user_genre = 'Ambient'
# genre_link_complete = None
# for genre in genre_dict.keys():
#     if user_genre.lower() == genre.lower():
#         genre_link_complete = genre_dict[genre]

# if genre_link_complete is not None:
#     print(genre_link_complete)
# else:
#     print("Please try again and enter a valid genre or exit.")