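### Scrape playlists from Open Broadcast Radio

Scrapes the playlist index of openbroadcast.org, saves it as JSON, and then collects the tracks of a single playlist series.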

In [1]:
import json
import os
import pprint

import requests
from bs4 import BeautifulSoup as bs

# Pretty-printer for inspecting nested structures, configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

#### scrape all playlists:

In [4]:
# Crawl the paginated playlist index on openbroadcast.org. Every page is
# parsed into one dict per playlist card (title, author, href, tags) and the
# loop follows the pagination's "next" link until the last page is reached.
MAX_PAGES = 50        # failsafe: never scrape more than this many pages
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the cell

playlists = []
base_url = "https://www.openbroadcast.org"
href = "/content/playlists/?page=1"
pages_scraped = 0
while True:
    url = base_url + href
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        # Stop instead of retrying: re-requesting the same URL in a tight
        # loop never terminates while the server keeps answering with an error.
        print(f"error while scraping page {href}")
        print(f"    error code: {response.status_code}")
        print(f"    URL: {url}")
        break

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = bs(response.content, 'html.parser')
    for item in soup.find_all('div', class_='card--playlist'):
        # The card body's nested <div>s are addressed by position:
        # [0] carries the playlist link, [2] the title, [4] the author link.
        body = item.find('div', class_='card__body').find_all('div')
        tags_div = item.find('div', class_='tags')
        playlists.append({
            'title': " ".join(body[2].text.split()),  # collapse runs of whitespace
            'author': {
                'name': body[4].text.strip(),
                'href': body[4].a['href'],
            },
            'href': body[0].a['href'],
            # A card without a tags container simply has no tags.
            'tags': list(tags_div.stripped_strings) if tags_div else [],
        })
    pages_scraped += 1

    pagination = soup.find('div', class_='pagination')
    current = pagination.find('span', class_='current page') if pagination else None
    current_page = current.text if current else str(pages_scraped)
    print(f"scraped page {current_page}")

    next_link = pagination.find('a', class_='next') if pagination else None
    next_href = next_link.get('href') if next_link else None
    on_last_page = pagination is None or pagination.find('span', class_='disabled next')
    # An empty or missing "next" href also marks the end of the listing;
    # following it would request the bare base URL instead of a playlist page.
    if on_last_page or not next_href:
        print("finished!")
        print(f"{len(playlists)} playlists in total")
        break

    href = next_href
    print(f"next href: {href}")
    if pages_scraped >= MAX_PAGES:
        print("failsafe!")
        break

scraped page 1
next href: /content/playlists/?page=2
scraped page 2
next href: /content/playlists/?page=3
scraped page 3
next href: /content/playlists/?page=4
scraped page 4
next href: /content/playlists/?page=5
scraped page 5
next href: /content/playlists/?page=6
scraped page 6
next href: /content/playlists/?page=7
scraped page 7
next href: /content/playlists/?page=8
scraped page 8
next href: /content/playlists/?page=9
scraped page 9
next href: /content/playlists/?page=10
scraped page 10
next href: /content/playlists/?page=11
scraped page 11
next href: /content/playlists/?page=12
scraped page 12
next href: /content/playlists/?page=13
scraped page 13
next href: /content/playlists/?page=14
scraped page 14
next href: /content/playlists/?page=15
scraped page 15
next href: /content/playlists/?page=16
scraped page 16
next href: /content/playlists/?page=17
scraped page 17
next href: /content/playlists/?page=18
scraped page 18
next href: /content/playlists/?page=19
scraped page 19
next href: 

In [9]:
# Peek at a small slice of the scraped playlists to check the record structure.
playlists[20:25]

[{'title': '-',
  'author': {'name': 'DJLeo',
   'href': '/network/users/0d4df140-23aa-424e-a1dc-4e531a7ed4ab/'},
  'href': '/content/playlists/6c0734dd-98e8-4f44-80fc-556c68ff32f4/',
  'tags': ['New Music']},
 {'title': 'Sweet Morning #1',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/0d6aabbe-8996-4295-9384-09baccd2606b/',
  'tags': ['Eclectic', 'Flow', 'Smooth']},
 {'title': 'Mikro #16',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'href': '/content/playlists/e6757d6f-5ad3-4b40-a784-55c285081e2e/',
  'tags': ['corona']},
 {'title': 'Sali! #16',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'href': '/content/playlists/c05af960-d17e-4f64-ad01-8ffac8cec886/',
  'tags': ['Alternative',
   'Chill',
   'Easy ...ening',
   'Jazz',
   'Neo Soul',
   'Soul']},
 {'title': 'Mikro #15',
  'author': 

In [10]:
# Some tag labels were scraped in abbreviated form (e.g. 'Easy ...ening').
# Map each known abbreviation back to its full label, rewriting every
# playlist's tag list in place.
# NOTE(review): only the abbreviations listed in the mapping are repaired;
# other shortened tags may remain in the data - confirm.
d_find_and_replace = {'Easy ...ening': 'Easy Listening'}
for playlist in playlists:
    tags = playlist['tags']
    # Slice assignment keeps the same list object, so the update stays in place.
    tags[:] = [d_find_and_replace.get(tag, tag) for tag in tags]
playlists[20:25]

[{'title': '-',
  'author': {'name': 'DJLeo',
   'href': '/network/users/0d4df140-23aa-424e-a1dc-4e531a7ed4ab/'},
  'href': '/content/playlists/6c0734dd-98e8-4f44-80fc-556c68ff32f4/',
  'tags': ['New Music']},
 {'title': 'Sweet Morning #1',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/0d6aabbe-8996-4295-9384-09baccd2606b/',
  'tags': ['Eclectic', 'Flow', 'Smooth']},
 {'title': 'Mikro #16',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'href': '/content/playlists/e6757d6f-5ad3-4b40-a784-55c285081e2e/',
  'tags': ['corona']},
 {'title': 'Sali! #16',
  'author': {'name': 'Baxter',
   'href': '/network/users/5be9a833-8338-43cc-b7d7-a4186b19c44d/'},
  'href': '/content/playlists/c05af960-d17e-4f64-ad01-8ffac8cec886/',
  'tags': ['Alternative',
   'Chill',
   'Easy Listening',
   'Jazz',
   'Neo Soul',
   'Soul']},
 {'title': 'Mikro #15',
  'author':

In [14]:
# Persist the scraped playlist index as pretty-printed UTF-8 JSON inside
# the `data` folder; the file name carries a timestamp.
# NOTE(review): `utils` is not imported in any cell shown here - confirm it
# is imported (and provides `timestamp()`) before this cell runs on a fresh kernel.
folder = 'data'
filename = f'openbroadcast-all_playlists_{utils.timestamp()}.json'
# makedirs(exist_ok=True) creates the folder only when needed, without a
# separate existence check that could race with another process.
os.makedirs(folder, exist_ok=True)
with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(playlists, f, ensure_ascii=False, indent=4)

In [15]:
# Reload the playlist index from the JSON file written above.
# The file was written as UTF-8 with non-ASCII characters left unescaped, so
# it must be read back as UTF-8 too; the platform default encoding may differ.
with open(os.path.join(folder, filename), encoding='utf-8') as f:
    playlists = json.load(f)
playlists[:5]

[{'title': 'Easter #8',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'href': '/content/playlists/eb8ced11-e7b9-4a9f-80ef-0f639fdfc583/',
  'tags': ['Flow', 'Folk', 'Funk', 'Indie', 'Mellow', 'Pop']},
 {'title': 'Easter #7',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'href': '/content/playlists/3319e56d-3e1c-4d1d-b9a4-0b01865f0021/',
  'tags': ['Flow', 'Indie', 'Mellow', 'Pop']},
 {'title': 'Ocean #10',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'href': '/content/playlists/edf34a36-1210-4f3f-9ab0-bf0f200d4eab/',
  'tags': ['Electronic',
   'Experimental',
   'Field...rding',
   'Jazz',
   'Night',
   'Sleep']},
 {'title': 'Mikro #23',
  'author': {'name': 'Restomas',
   'href': '/network/users/9e82d32f-a49f-4395-8ba9-f06a1c7d438c/'},
  'href': '/content/playlists/42f2fba5-8715-4e0c-bbc9-80b41993ad3b/',
  'tags':

In [17]:
# Select the playlists authored by 'blundetto' whose title contains "Yard".
lists = [
    entry
    for entry in playlists
    if entry['author']['name'] == 'blundetto' and "Yard" in entry['title']
]
lists

[{'title': 'Yard Tings #12',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/d651e8b6-e7b7-4cb1-866d-81c3108a729c/',
  'tags': ['Reggae']},
 {'title': 'Yard Tings #11',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/08bfab75-add8-4a1a-9aef-50bb937eb764/',
  'tags': ['Flow', 'Jamaica', 'Reggae', 'Rocksteady']},
 {'title': 'Yard Tings #10',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/a40e9961-82c6-4178-92c6-112093d09ae8/',
  'tags': ['Flow', 'Jamaica', 'Reggae', 'Rocksteady']},
 {'title': 'Yard Tings #9',
  'author': {'name': 'blundetto',
   'href': '/network/users/5fd28952-6bca-41c3-b295-f33a6ecf1d2d/'},
  'href': '/content/playlists/01f8cfaa-4889-4ab2-812e-af387616a52e/',
  'tags': ['Dub', 'Jamaica', 'Reggae', 'Roots']},
 {'title': '

In [18]:
# Collect the relative page link of every selected playlist.
hrefs = [entry['href'] for entry in lists]
hrefs

['/content/playlists/d651e8b6-e7b7-4cb1-866d-81c3108a729c/',
 '/content/playlists/08bfab75-add8-4a1a-9aef-50bb937eb764/',
 '/content/playlists/a40e9961-82c6-4178-92c6-112093d09ae8/',
 '/content/playlists/01f8cfaa-4889-4ab2-812e-af387616a52e/',
 '/content/playlists/405ee094-1e95-487d-b670-5d58340e2bbf/',
 '/content/playlists/34781e07-70f9-4d0c-ba53-0b54275a7060/',
 '/content/playlists/64daefc1-c43b-4df2-94c2-478845d758ef/',
 '/content/playlists/a8d93cfb-0091-4fbe-b85c-52a8ee793f49/',
 '/content/playlists/c3d29353-2821-4d44-9bc4-b1f279aab03e/',
 '/content/playlists/a16076f3-b0f0-4c01-b485-d8eedbd358dc/',
 '/content/playlists/641769fb-9232-416c-b23f-44646953f226/',
 '/content/playlists/f9ca5fd3-ae2f-4d0d-ad37-3d9dfea31d50/']

In [20]:
# Download each selected playlist page and collect its tracks as
# {'title', 'artist'} dicts, in the order they appear on the pages.
tracks = []
for href in hrefs:
    URL = base_url + href
    # A timeout keeps one stalled connection from hanging the whole cell.
    response = requests.get(URL, timeout=30)
    if response.status_code != 200:
        print(f"error while scraping page {href}: {response}")
        # Skip the body of a failed response instead of parsing it as a playlist page.
        continue
    print(f"scraped page {href}")
    soup = bs(response.content, 'html.parser')
    for item in soup.find_all('div', class_='list_body_item c2'):
        # The first <li> links to the track; its first child node is the title text.
        title = item.li.a.contents[0].strip()
        # The second <li> links to the artist.
        artist = item.find_all('li')[1].a.text.strip()
        tracks.append({'title': title, 'artist': artist})
print(f"{len(tracks)} tracks")

scraped page /content/playlists/d651e8b6-e7b7-4cb1-866d-81c3108a729c/
scraped page /content/playlists/08bfab75-add8-4a1a-9aef-50bb937eb764/
scraped page /content/playlists/a40e9961-82c6-4178-92c6-112093d09ae8/
scraped page /content/playlists/01f8cfaa-4889-4ab2-812e-af387616a52e/
scraped page /content/playlists/405ee094-1e95-487d-b670-5d58340e2bbf/
scraped page /content/playlists/34781e07-70f9-4d0c-ba53-0b54275a7060/
scraped page /content/playlists/64daefc1-c43b-4df2-94c2-478845d758ef/
scraped page /content/playlists/a8d93cfb-0091-4fbe-b85c-52a8ee793f49/
scraped page /content/playlists/c3d29353-2821-4d44-9bc4-b1f279aab03e/
scraped page /content/playlists/a16076f3-b0f0-4c01-b485-d8eedbd358dc/
scraped page /content/playlists/641769fb-9232-416c-b23f-44646953f226/
scraped page /content/playlists/f9ca5fd3-ae2f-4d0d-ad37-3d9dfea31d50/
291 tracks


In [21]:
# Preview the first few scraped tracks.
tracks[:5]

[{'title': 'STATION-ID - Roar kurz', 'artist': 'Roland Widmer'},
 {'title': 'The Look of Love', 'artist': 'Dennis Brown'},
 {'title': "Alton's Official Daughter", 'artist': 'Alton Ellis'},
 {'title': 'Satisfaction', 'artist': 'Carl Dawkins'},
 {'title': 'STATION-ID - Andavoicepur', 'artist': 'Roland Widmer'}]

In [23]:
# Bundle the collected tracks with the series name and author into one record.
playlist = dict(
    series="Yard Things",
    author="blundetto",
    tracks=tracks,
)
playlist

{'series': 'Yard Things',
 'author': 'blundetto',
 'tracks': [{'title': 'STATION-ID - Roar kurz', 'artist': 'Roland Widmer'},
  {'title': 'The Look of Love', 'artist': 'Dennis Brown'},
  {'title': "Alton's Official Daughter", 'artist': 'Alton Ellis'},
  {'title': 'Satisfaction', 'artist': 'Carl Dawkins'},
  {'title': 'STATION-ID - Andavoicepur', 'artist': 'Roland Widmer'},
  {'title': 'Santic Rock', 'artist': 'Leonard Santic All Stars'},
  {'title': 'Late at Night', 'artist': 'William Shakespeare'},
  {'title': 'Problem #2', 'artist': 'Leonard Santic All Stars'},
  {'title': 'STATION-ID - Whisper', 'artist': 'Roland Widmer'},
  {'title': 'Got to Be Cool', 'artist': 'Wailing Souls'},
  {'title': 'Gideons High', 'artist': 'Max Edwards'},
  {'title': 'If You Ask Me', 'artist': 'Leon Dinero'},
  {'title': 'Bandits', 'artist': 'Screechy Dan'},
  {'title': 'STATION-ID - 123 Knock', 'artist': 'Roland Widmer'},
  {'title': 'Get Together', 'artist': 'Carl Dawkins'},
  {'title': "Ain't No Sunshi

In [25]:
# Look up the series name stored on the combined playlist record.
playlist['series']

'Yard Things'

In [27]:
# Persist the combined series record (series, author, tracks) as
# pretty-printed UTF-8 JSON inside the `data` folder; the file name carries
# a timestamp.
# NOTE(review): `utils` is not imported in any cell shown here - confirm it
# is imported (and provides `timestamp()`) before this cell runs on a fresh kernel.
folder = 'data'
filename = f'openbroadcast-YardThings_{utils.timestamp()}.json'
# makedirs(exist_ok=True) creates the folder only when needed, without a
# separate existence check that could race with another process.
os.makedirs(folder, exist_ok=True)
with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(playlist, f, ensure_ascii=False, indent=4)