### Scrapes playlists from Open Broadcast Radio

In [28]:
import pprint
import requests
from bs4 import BeautifulSoup as bs

# Shared pretty-printer for inspecting nested scrape results; nested levels
# are indented by four spaces.
_PP_INDENT = 4
pp = pprint.PrettyPrinter(indent=_PP_INDENT)

#### Scrape all playlists

In [93]:
# Crawl the paginated playlist index, following each page's "next" link, and
# collect one dict per playlist card: title, author (name + href), href, tags.
playlists = []
base_url = "https://www.openbroadcast.org"
href = "/content/playlists/?page=1"
max_pages = 44  # failsafe: never follow more than this many index pages
pages_scraped = 0
while True:
    url = base_url + href
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Stop rather than retry: re-requesting the same URL in a tight loop
        # never terminates while the server keeps answering with an error.
        print(f"error while scraping {href}")
        print(f"    error code: {response.status_code}")
        print(f"    URL: {url}")
        break
    # Name the parser explicitly so the parse result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(response.content, 'html.parser')
    for item in soup.find_all('div', class_='card--playlist'):
        # The code relies on the position of the nested <div>s inside the card
        # body: index 0 holds the playlist link, 2 the title, 4 the author.
        body = item.find('div', class_='card__body').find_all('div')
        tags_div = item.find('div', class_='tags')
        playlists.append({
            # Collapse runs of whitespace/newlines in the title to single spaces.
            'title': " ".join(body[2].text.split()),
            'author': {
                'name': body[4].text.strip(),
                'href': body[4].a['href']
            },
            'href': body[0].a['href'],
            # A card without a tags container yields an empty tag list.
            'tags': list(tags_div.stripped_strings) if tags_div else [],
        })
    pages_scraped += 1
    pagination = soup.find('div', class_='pagination')
    if pagination is None:
        print(f"no pagination found on {url}; stopping")
        break
    current_page = pagination.find('span', class_='current page').text
    print(f"scraped page {current_page}")
    next_link = pagination.find('a', class_='next')
    next_href = next_link.get('href') if next_link else None
    # The last page is detected either by the disabled "next" marker or by a
    # missing/empty next href; following an empty href would request base_url.
    if pagination.find('span', class_='disabled next') or not next_href:
        print("finished!")
        print(f"{len(playlists)} playlists in total")
        break
    href = next_href
    print(f"next href: {href}")
    if pages_scraped >= max_pages:
        print("failsafe!")
        break

scraped page 1
next href: /content/playlists/?page=2
scraped page 2
next href: /content/playlists/?page=3
scraped page 3
next href: /content/playlists/?page=4
scraped page 4
next href: /content/playlists/?page=5
scraped page 5
next href: /content/playlists/?page=6
scraped page 6
next href: /content/playlists/?page=7
scraped page 7
next href: /content/playlists/?page=8
scraped page 8
next href: /content/playlists/?page=9
scraped page 9
next href: /content/playlists/?page=10
scraped page 10
next href: /content/playlists/?page=11
scraped page 11
next href: /content/playlists/?page=12
scraped page 12
next href: /content/playlists/?page=13
scraped page 13
next href: /content/playlists/?page=14
scraped page 14
next href: /content/playlists/?page=15
scraped page 15
next href: /content/playlists/?page=16
scraped page 16
next href: /content/playlists/?page=17
scraped page 17
next href: /content/playlists/?page=18
scraped page 18
next href: /content/playlists/?page=19
scraped page 19
next href: 

In [97]:
# Map truncated tag labels (e.g. 'Easy ...ening') to their full spelling;
# tags without an entry in the mapping are kept unchanged.
d_find_and_replace = {'Easy ...ening': 'Easy Listening'}
for playlist in playlists:
    # Slice assignment rewrites the existing list object in place.
    playlist['tags'][:] = [
        d_find_and_replace.get(tag, tag) for tag in playlist['tags']
    ]
playlists

[{'title': 'Sali! #15',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'href': '/content/playlists/coffee-kiss/',
  'tags': ['Alternative', 'Bossa Nova', 'Easy Listening', 'Jazz', 'Neo Soul']},
 {'title': 'Close #11',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'href': '/content/playlists/close-11/',
  'tags': ['Flow', 'Funk', 'Sleep', 'Soul']},
 {'title': 'Stoner #4',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/stoner-4/',
  'tags': ['Psychedelic', 'Sleep']},
 {'title': 'Sun Daze #2',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/sun-daze-2/',
  'tags': ['Chill', 'Flow', 'Hip Hop', 'Pop', 'Soul', 'World']},
 {'title': 'Sleeper #5',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b

In [94]:
import json
import os
# Write the scraped playlist index to disk as pretty-printed UTF-8 JSON.
# `out_dir` (rather than `dir`) avoids shadowing the builtin dir().
out_dir = 'scrapes'
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process.
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, 'openbroadcast-all_playlists.json'), 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(playlists, f, ensure_ascii=False, indent=4)

In [101]:
import json
# Reload the playlist index from disk. The file is written as UTF-8 with
# ensure_ascii=False, so it must be decoded as UTF-8 here too instead of
# relying on the platform's default encoding.
with open(os.path.join('scrapes', 'openbroadcast-all_playlists.json'), encoding='utf-8') as f:
    playlists = json.load(f)
playlists

[{'title': 'Sali! #15',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'href': '/content/playlists/coffee-kiss/',
  'tags': ['Alternative', 'Bossa Nova', 'Easy Listening', 'Jazz', 'Neo Soul']},
 {'title': 'Close #11',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'href': '/content/playlists/close-11/',
  'tags': ['Flow', 'Funk', 'Sleep', 'Soul']},
 {'title': 'Stoner #4',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/stoner-4/',
  'tags': ['Psychedelic', 'Sleep']},
 {'title': 'Sun Daze #2',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/sun-daze-2/',
  'tags': ['Chill', 'Flow', 'Hip Hop', 'Pop', 'Soul', 'World']},
 {'title': 'Sleeper #5',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b

In [102]:
# One summary line per playlist: quoted title, quoted author name, and href.
for entry in playlists:
    title = entry['title']
    author_name = entry['author']['name']
    link = entry['href']
    print(f"'{title}' by '{author_name}' ({link})")

'Sali! #15' by 'Baxter' (/content/playlists/coffee-kiss/)
'Close #11' by 'Pat' (/content/playlists/close-11/)
'Stoner #4' by 'blundetto' (/content/playlists/stoner-4/)
'Sun Daze #2' by 'blundetto' (/content/playlists/sun-daze-2/)
'Sleeper #5' by 'Pat' (/content/playlists/sleeper-5/)
'Early Bird #7' by 'blundetto' (/content/playlists/early-bird-7/)
'Couchsurfing #6' by 'Pat' (/content/playlists/charizza-3/)
'' by 'ohrstrom' (/content/playlists/foooooo/)
'Night Ride #2' by 'blundetto' (/content/playlists/night-ride-2/)
'Yard Tings #12' by 'blundetto' (/content/playlists/yard-tings-12/)
'' by 'Baxter' (/content/playlists/good-vibes-nineteen/)
'Sun Even #1' by 'Pat' (/content/playlists/softplay-weekend/)
'' by 'DJLeo' (/content/playlists/indie-block-7-8-20/)
'Sun Daze #1' by 'blundetto' (/content/playlists/cool-session-week-end/)
'Valentine #13' by 'Baxter' (/content/playlists/love-is-the-key/)
'Valentine #5' by 'blundetto' (/content/playlists/2-valentine-2020/)
'Valentine #4' by 'blundett

In [114]:
# Select the playlists authored by 'blundetto' whose title contains "Stoner",
# then collect their relative links for the track scrape.
lists = []
for entry in playlists:
    if entry['author']['name'] != 'blundetto':
        continue
    if "Stoner" not in entry['title']:
        continue
    lists.append(entry)
hrefs = [entry['href'] for entry in lists]
hrefs

['/content/playlists/stoner-4/',
 '/content/playlists/stoner-3/',
 '/content/playlists/stoner-2/',
 '/content/playlists/stoner-1/']

In [116]:
# Fetch every selected playlist page and collect its tracks as
# {'title': ..., 'artist': ...} dicts, in page order.
tracks = []
for href in hrefs:
    url = base_url + href
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Skip failed pages instead of parsing the error document.
        print(f"error while scraping page {href}: {response}")
        continue
    print(f"scraped page {href}")
    # bs4 is imported in this notebook only under the alias `bs`; the bare
    # name `BeautifulSoup` is undefined on a fresh kernel.
    soup = bs(response.content, 'html.parser')
    for item in soup.find_all('div', class_='list_body_item c2'):
        # The first <li> link provides the track title, the second <li> link
        # the artist name.
        title = item.li.a.contents[0].strip()
        artist = item.find_all('li')[1].a.text.strip()
        tracks.append({'title': title, 'artist': artist})
print(f"{len(tracks)} tracks")

scraped page /content/playlists/stoner-4/
scraped page /content/playlists/stoner-3/
scraped page /content/playlists/stoner-2/
scraped page /content/playlists/stoner-1/
143 tracks


In [118]:
# Bundle the collected tracks with the series name and author for export.
playlist = dict(series="Stoner", author="blundetto", tracks=tracks)

In [119]:
import json
import os
# Write the combined "Stoner" playlist to disk as pretty-printed UTF-8 JSON.
# `out_dir` (rather than `dir`) avoids shadowing the builtin dir().
out_dir = 'scrapes'
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that could race with another process.
os.makedirs(out_dir, exist_ok=True)
with open(os.path.join(out_dir, 'openbroadcast-stoner.json'), 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(playlist, f, ensure_ascii=False, indent=4)