### Scrapes playlists from Open Broadcast Radio

In [1]:
import pprint
import requests
from bs4 import BeautifulSoup as bs

# Pretty-printer instance configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

#### Scrape all playlists

In [25]:
# Crawl the paginated playlist index and collect one dict per playlist
# (title, href, author {name, href}, tags) into `playlists`.
playlists = []
base_url = "https://www.openbroadcast.org"
href = "/content/playlists/?page=1"  # pagination cursor: path of the next index page
max_pages = 100  # failsafe: never request more than this many index pages
pages_scraped = 0
while href:
    url = base_url + href
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Stop rather than `continue`: re-requesting the same URL in a tight
        # loop never terminates while the error persists.
        print(f"error while scraping page {pages_scraped + 1}")
        print(f"    error code: {response.status_code}")
        print(f"    URL: {url}")
        break

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = bs(response.content, 'html.parser')
    for item in soup.find_all('div', class_='list-item'):
        all_links = item.find_all('a')
        if len(all_links) < 2:
            # The first link is read as the playlist, the second as the author.
            print(f"skipping list item without playlist/author links on {url}")
            continue
        playlist_link, author_link = all_links[0], all_links[1]
        playlists.append({
            'title': playlist_link['title'],
            'href': playlist_link['href'],
            'author': {
                'name': author_link.text.strip(),
                'href': author_link['href']
            },
            'tags': [span.text.strip() for span in item.find_all('span', class_='tag')]
        })
    pages_scraped += 1

    pagination = soup.find('div', class_='pagination')
    if pagination is None:
        print(f"no pagination block found on {url}; stopping")
        break
    current = pagination.find('span', class_='current page')
    current_page = current.text.strip() if current else str(pages_scraped)
    print(f"scraped page {current_page}")

    next_link = pagination.find('a', class_='next')
    next_href = next_link.get('href') if next_link else None
    # The last page may expose a disabled "next" marker or a "next" link with
    # an empty href; both mean there is nothing left to fetch.
    if pagination.find('span', class_='disabled next') or not next_href:
        print("finished!")
        print(f"{len(playlists)} playlists in total")
        break
    href = next_href
    print(f"next href: {href}")
    if pages_scraped >= max_pages:
        print("failsafe!")
        break

scraped page 1
next href: /content/playlists/?page=2
scraped page 2
next href: /content/playlists/?page=3
scraped page 3
next href: /content/playlists/?page=4
scraped page 4
next href: /content/playlists/?page=5
scraped page 5
next href: /content/playlists/?page=6
scraped page 6
next href: /content/playlists/?page=7
scraped page 7
next href: /content/playlists/?page=8
scraped page 8
next href: /content/playlists/?page=9
scraped page 9
next href: /content/playlists/?page=10
scraped page 10
next href: /content/playlists/?page=11
scraped page 11
next href: /content/playlists/?page=12
scraped page 12
next href: /content/playlists/?page=13
scraped page 13
next href: /content/playlists/?page=14
scraped page 14
next href: /content/playlists/?page=15
scraped page 15
next href: /content/playlists/?page=16
scraped page 16
next href: /content/playlists/?page=17
scraped page 17
next href: /content/playlists/?page=18
scraped page 18
next href: /content/playlists/?page=19
scraped page 19
next href: 

In [26]:
# Display items 20-24 of the scraped playlists as a spot check.
playlists[20:25]

[{'title': 'Ocean',
  'href': '/content/playlists/edf34a36-1210-4f3f-9ab0-bf0f200d4eab/',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'tags': ['Electronic', 'Experimental', 'Field...rding', 'Jazz']},
 {'title': 'Ajele - on a Weekend too ;-)',
  'href': '/content/playlists/27313549-b5f6-416b-8545-918a87b78997/',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'tags': ['Eclectic', 'Soul', 'World']},
 {'title': 'Charriza',
  'href': '/content/playlists/83c43e12-7280-4797-a65a-295737d8579d/',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'tags': ['Alternative', 'Flow', 'Mellow', 'Sleep']},
 {'title': 'Afternoon Bunny',
  'href': '/content/playlists/7eef7959-5f88-4105-992e-a0223514ac7d/',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'tags': ['Alternative', 'Funk', 'Neo Soul', '

In [27]:
# Map tag labels that were scraped in abbreviated form back to their full text.
d_find_and_replace = {'Easy ...ening': 'Easy Listening'}
for playlist in playlists:
    # Slice assignment rewrites the existing tag list in place, so every
    # reference to it sees the corrected labels.
    playlist['tags'][:] = [
        d_find_and_replace.get(tag, tag) for tag in playlist['tags']
    ]
playlists[20:25]

[{'title': 'Ocean',
  'href': '/content/playlists/edf34a36-1210-4f3f-9ab0-bf0f200d4eab/',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'tags': ['Electronic', 'Experimental', 'Field...rding', 'Jazz']},
 {'title': 'Ajele - on a Weekend too ;-)',
  'href': '/content/playlists/27313549-b5f6-416b-8545-918a87b78997/',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'tags': ['Eclectic', 'Soul', 'World']},
 {'title': 'Charriza',
  'href': '/content/playlists/83c43e12-7280-4797-a65a-295737d8579d/',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'tags': ['Alternative', 'Flow', 'Mellow', 'Sleep']},
 {'title': 'Afternoon Bunny',
  'href': '/content/playlists/7eef7959-5f88-4105-992e-a0223514ac7d/',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'tags': ['Alternative', 'Funk', 'Neo Soul', '

In [29]:
import json
import os
import utils
# Write the full playlist list to data/ as a timestamped, UTF-8 JSON file.
folder = 'data'
filename = f'openbroadcast-all_playlists_{utils.timestamp()}.json'
# exist_ok avoids the check-then-create race and also handles nested paths.
os.makedirs(folder, exist_ok=True)
with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    json.dump(playlists, f, ensure_ascii=False, indent=4)

In [6]:
import json
import os
# Reload a previously saved playlist dump instead of re-scraping the site.
folder = 'data'
filename = 'openbroadcast-all_playlists_2020-04-23-17-34-22.json'
# The dump is written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8; the platform default encoding would garble non-ASCII titles
# on systems where that default is not UTF-8.
with open(os.path.join(folder, filename), encoding='utf-8') as f:
    playlists = json.load(f)
playlists[:5]

[{'title': 'Sun Even',
  'href': '/content/playlists/fb243c57-8f9d-46f8-a688-3c9ddb1fb4e4/',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'tags': ['Experimental', 'Jazz', 'Psychedelic', 'Sleep']},
 {'title': 'NDR Info informiert über Coronavirus',
  'href': '/content/playlists/b32f009a-e03a-4e91-bf57-13707ecf986a/',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'tags': ['corona']},
 {'title': 'Good Morning Fifteen',
  'href': '/content/playlists/706606ee-a927-45f5-82a0-831f71036858/',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'tags': ['Alternative', 'Conte...y R&B', 'Easy Listening', 'Hip Hop']},
 {'title': 'Sleeper',
  'href': '/content/playlists/51c12665-9315-4094-ba82-fa0418775167/',
  'author': {'name': 'Pat',
   'href': '/network/users/5a9bc657-dd53-485e-a4b4-c8f32af354e5/'},
  'tags': ['Ambient', 'Electronic', 'Ja

In [40]:
# Select every playlist by `author` whose title contains `series`.
author = 'Restomas'
series = "Crossfader"
lists = []
for entry in playlists:
    if entry['author']['name'] != author:
        continue
    if series not in entry['title']:
        continue
    lists.append(entry)
# Show the matching titles, leaving out the one unrelated news playlist.
[entry['title'] for entry in lists if entry['title'] != 'NDR Info informiert über Coronavirus']

['Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader ',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader 13',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader',
 'Crossfader']

In [41]:
# Collect the detail-page path of every selected playlist.
hrefs = []
for entry in lists:
    hrefs.append(entry['href'])
hrefs

['/content/playlists/eb8ced11-e7b9-4a9f-80ef-0f639fdfc583/',
 '/content/playlists/3319e56d-3e1c-4d1d-b9a4-0b01865f0021/',
 '/content/playlists/eafd99dd-dd08-43a2-ab0e-b29c4d42323c/',
 '/content/playlists/a940b93e-a66b-43aa-a83b-09bdb7e359f2/',
 '/content/playlists/9bacb7c7-1be1-488c-8bb7-e663c24ec291/',
 '/content/playlists/26bb8445-c25b-4973-b85e-865a4de36084/',
 '/content/playlists/6f59e0a6-e106-499d-9da5-558a7e7805cf/',
 '/content/playlists/a2dabbdb-dcde-4adc-88c8-ca016d62e12f/',
 '/content/playlists/f40c1443-3560-45b4-80a9-33a0944a656a/',
 '/content/playlists/b6f7386a-8b50-4df8-879f-953cdbdd0bc0/',
 '/content/playlists/3a9d98ef-a25f-4f02-a4b7-6035ba8f209a/',
 '/content/playlists/919726a9-ebb7-4e28-b497-bfe6555e972a/',
 '/content/playlists/082be01d-ad91-439c-9062-8bdfd3a77b66/',
 '/content/playlists/ab0151cb-fe81-4b24-b2e6-0ffdd44939c1/',
 '/content/playlists/bd3afd53-8fea-429a-8c92-937f6fda6538/',
 '/content/playlists/ca869ce3-5d18-45f6-bb9a-13f790335834/',
 '/content/playlists/7db

In [42]:
import requests
from bs4 import BeautifulSoup as bs
# Fetch each selected playlist page and collect its tracks as
# {'title', 'artist'} dicts in `tracks`.
base_url = "https://www.openbroadcast.org"
tracks = []
for href in hrefs:
    URL = base_url + href
    response = requests.get(URL, timeout=30)
    if response.status_code != 200:
        # Skip failed pages: parsing an error response would add no tracks at
        # best and wrong data at worst.
        print(f"error while scraping page {href}: {response}")
        continue
    print(f"scraped page {href}")
    soup = bs(response.content, 'html.parser')
    for item in soup.find_all('div', class_='list_body_item c2'):
        # The first <li> link holds the track title, the second <li> link the artist.
        title = item.li.a.contents[0].strip()
        artist = item.find_all('li')[1].a.text.strip()
        tracks.append({'title': title, 'artist': artist})
print(f"{len(tracks)} tracks")

scraped page /content/playlists/eb8ced11-e7b9-4a9f-80ef-0f639fdfc583/
scraped page /content/playlists/3319e56d-3e1c-4d1d-b9a4-0b01865f0021/
scraped page /content/playlists/eafd99dd-dd08-43a2-ab0e-b29c4d42323c/
scraped page /content/playlists/a940b93e-a66b-43aa-a83b-09bdb7e359f2/
scraped page /content/playlists/9bacb7c7-1be1-488c-8bb7-e663c24ec291/
scraped page /content/playlists/26bb8445-c25b-4973-b85e-865a4de36084/
scraped page /content/playlists/6f59e0a6-e106-499d-9da5-558a7e7805cf/
scraped page /content/playlists/a2dabbdb-dcde-4adc-88c8-ca016d62e12f/
scraped page /content/playlists/f40c1443-3560-45b4-80a9-33a0944a656a/
scraped page /content/playlists/b6f7386a-8b50-4df8-879f-953cdbdd0bc0/
scraped page /content/playlists/3a9d98ef-a25f-4f02-a4b7-6035ba8f209a/
scraped page /content/playlists/919726a9-ebb7-4e28-b497-bfe6555e972a/
scraped page /content/playlists/082be01d-ad91-439c-9062-8bdfd3a77b66/
scraped page /content/playlists/ab0151cb-fe81-4b24-b2e6-0ffdd44939c1/
scraped page /conten

In [43]:
# Display the first five scraped tracks as a spot check.
tracks[:5]

[{'title': 'User generated -... breathe and fly', 'artist': 'Roland Widmer'},
 {'title': 'Wasting My Time', 'artist': 'Harry Nilsson'},
 {'title': 'Working for the Man', 'artist': 'PJ Harvey'},
 {'title': 'Angeles', 'artist': 'Elliott Smith'},
 {'title': "Don't Be Cruel", 'artist': 'Billy Swan'}]

In [44]:
# Bundle the series name, its author and all collected tracks into one record.
playlist = dict(series=series, author=author, tracks=tracks)
playlist

{'series': 'Crossfader',
 'author': 'Restomas',
 'tracks': [{'title': 'User generated -... breathe and fly',
   'artist': 'Roland Widmer'},
  {'title': 'Wasting My Time', 'artist': 'Harry Nilsson'},
  {'title': 'Working for the Man', 'artist': 'PJ Harvey'},
  {'title': 'Angeles', 'artist': 'Elliott Smith'},
  {'title': "Don't Be Cruel", 'artist': 'Billy Swan'},
  {'title': 'One Night In Prague', 'artist': 'Benny Sings'},
  {'title': 'STATION-ID - Whisper', 'artist': 'Roland Widmer'},
  {'title': 'GOLD feat. Cengiz Can', 'artist': 'Nu'},
  {'title': '50 Ways to Leave Your Lover', 'artist': 'Paul Simon'},
  {'title': 'Love Song (Helado Negro Remix)', 'artist': 'Devendra Banhart'},
  {'title': 'STATION-ID - Whisper', 'artist': 'Roland Widmer'},
  {'title': 'Close to You', 'artist': 'Antoine Pesle'},
  {'title': 'Send a Message', 'artist': 'Amp Fiddler'},
  {'title': 'Sangria', 'artist': 'Céu'},
  {'title': 'Tonada de luna llena', 'artist': 'Nella Rojas'},
  {'title': 'Luscinia megarhy...m

In [45]:
# Display the series name stored in the bundled record.
playlist['series']

'Crossfader'

In [46]:
import utils
# Preview the output filename: the series name with all whitespace removed,
# followed by a timestamp from utils.timestamp().
series_slug = ''.join(playlist['series'].split())
f"openbroadcast-{series_slug}_{utils.timestamp()}.json"

'openbroadcast-Crossfader_2020-04-23-19-57-45.json'

In [47]:
import json
import os
import utils
# Write the bundled series record to data/ as a timestamped, UTF-8 JSON file.
folder = 'data'
# The series name is stripped of whitespace before it goes into the filename.
filename = f"openbroadcast-{''.join(playlist['series'].split())}_{utils.timestamp()}.json"
# exist_ok avoids the check-then-create race and also handles nested paths.
os.makedirs(folder, exist_ok=True)
with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII titles readable in the file.
    json.dump(playlist, f, ensure_ascii=False, indent=4)